# Crawler task queue design in Python

from queue import Queue
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class CrawlQueue(object):
    """Retry-aware task queue for crawl jobs.

    Each input item is wrapped in a job-tracking dict and pushed onto a
    work queue. ``start`` drains the queue: successful jobs are moved to
    ``job_result``; failed jobs are re-queued until their failure count
    exceeds ``max_failure``, after which they land in ``job_fail``.
    """

    def __init__(self, items, args, max_failure=10):
        """Build the queue and seed it with one job per item.

        Args:
            items: sequence of payloads, one job each.
            args: opaque crawler options, stored for use by ``parser``.
            max_failure: retry budget per job before it is routed to the
                failure queue. Previously hard-coded to 10; the keyword
                default keeps existing callers working.
        """
        self.max_failure = max_failure
        self.num_success = 0
        self.num_tol = len(items)  # total number of jobs
        self.args = args

        self.infos = self.initInfos(items)
        self.job_queue = Queue()   # pending / retry jobs
        self.job_fail = Queue()    # jobs that exhausted their retries
        self.job_result = Queue()  # successfully parsed jobs

        # Seed the work queue with every job.
        for info in self.infos:
            self.job_queue.put(info)

    def parser(self, info: dict):
        """Process one job. Subclasses are expected to override this.

        Returns:
            Tuple ``(success, res)``: a success flag and the parsed
            payload. The base implementation is a stub that always
            reports success with a placeholder result.
        """
        success = True
        res = object  # placeholder payload; real subclasses return data
        return success, res

    def initInfos(self, items: list) -> list:
        """Wrap raw items into job-tracking dicts (ids are 1-based)."""
        return [{"id": i + 1, "failure": 0, "item": item, "res": None}
                for i, item in enumerate(items)]

    def handleInfo(self, info: dict):
        """Run ``parser`` on one job and route it by outcome.

        On success the result is stored in ``info['res']`` and the job
        moves to ``job_result``. On failure the job's failure count is
        bumped; it is re-queued unless the count exceeds
        ``max_failure``, in which case it goes to ``job_fail``.

        Returns:
            The ``(success, res)`` pair from ``parser``.
        """
        success, res = self.parser(info)
        if success:
            self.num_success += 1
            info['res'] = res
            self.job_result.put(info)
        else:
            info['failure'] += 1
            if info['failure'] > self.max_failure:
                self.job_fail.put(info)
            else:
                self.job_queue.put(info)  # requeue for another attempt
        return success, res

    def start(self):
        """Drain the work queue, printing progress after each job."""
        while not self.job_queue.empty():
            info = self.job_queue.get()
            success, _ = self.handleInfo(info)
            print("{}/{} : {}".format(self.num_success, self.num_tol, success))

 

# Tags: python, web crawler