Implementing a custom Python crawler framework

# Python 2 script: urllib2 and Queue are the Python 2 names of urllib.request and queue.
import urllib2
from lxml import etree
import Queue
import ssl
import re
import threading
import json


class CrawlThread(threading.Thread):

    def __init__(self, urlQueue, dataQueue, threadName):
        super(CrawlThread, self).__init__()
        self.urlQueue = urlQueue
        self.dataQueue = dataQueue
        self.name = threadName

    def run(self):
        # Keep pulling chapter tasks until the URL queue is drained.
        while not self.urlQueue.empty():
            try:
                task = self.urlQueue.get(block=False)
                suffix_url = task.get('url')
                filename = task.get('filename')
                print threading.current_thread().name + '\t' + filename

                response = opener.open(url + suffix_url)
                html = response.read().decode('gbk')

                # content = etree.HTML(html)
                # text = content.xpath('//*[@id="content"]')[0].text

                # The original pattern was stripped when the post was published;
                # it is reconstructed here from the commented-out XPath above
                # (the chapter text lives in <div id="content">...</div>).
                pattern = re.compile(r'<div id="content">(.*?)</div>', re.S)
                text = pattern.search(html).group(1)
                self.dataQueue.put({'filename': filename, 'content': text})
            except Exception:
                # Queue.Empty from get(), network errors, or a failed match:
                # simply skip this task.
                pass


class WriteThread(threading.Thread):

    def __init__(self, dataQueue, threadName):
        super(WriteThread, self).__init__()
        self.name = threadName
        self.dataQueue = dataQueue

    def run(self):
        while not self.dataQueue.empty():
            try:
                task = self.dataQueue.get(block=False)
                filename = task.get('filename')
                content = task.get('content')
                # print content
                with open('./novel/' + filename.strip() + '.txt', 'w') as f:
                    r = doContent(content)
                    f.write(r.encode('utf-8'))
            except Exception, e:
                print e


def doContent(content):
    # The original substitution pattern was also stripped on publishing; the
    # intent is to turn the HTML line breaks / spacing entities inside the
    # chapter text into plain newlines, so a <br>/&nbsp; pattern is assumed.
    pattern = re.compile(r'(?:<br\s*/?>|&nbsp;)+')
    r = pattern.sub('\n', content)
    return r


if __name__ == "__main__":
    proxy = {
        'http': '***',
        'https': '***'
    }

    # Skip certificate validation so the HTTPS handler works through the proxy.
    ssl_context = ssl._create_unverified_context()
    https_handler = urllib2.HTTPSHandler(context=ssl_context)

    # url and opener are module-level globals read by the crawl threads.
    url = "https://www.i7wx.com/book/0/636/"
    proxy_handler = urllib2.ProxyHandler(proxy)
    opener = urllib2.build_opener(proxy_handler, https_handler)

    response = opener.open(url)
    # print response.read().decode('gbk')

    # The chapter-list pattern was likewise stripped on publishing; it is
    # assumed to capture (href, title) pairs from the chapter index links.
    pattern = re.compile(r'<a href="(.*?)">(.*?)</a>', re.I)
    result = pattern.findall(response.read().decode('gbk'))

    urlQueue = Queue.Queue()
    dataQueue = Queue.Queue()

    # Exit flags from the original code; defined but never used.
    CRAWL_EXIT = False
    SAVE_EXIT = False

    for k, v in result:
        # print k, v
        urlQueue.put({
            'url': k,
            'filename': v
        })

    # Three crawl threads download chapters and push them onto dataQueue.
    crawlThreads = []
    thread = CrawlThread(urlQueue, dataQueue, "crawl thread 1")
    thread2 = CrawlThread(urlQueue, dataQueue, "crawl thread 2")
    thread3 = CrawlThread(urlQueue, dataQueue, "crawl thread 3")
    crawlThreads.append(thread)
    crawlThreads.append(thread2)
    crawlThreads.append(thread3)
    thread.start()
    thread2.start()
    thread3.start()
    for t in crawlThreads:
        t.join()

    # Once crawling has finished, three write threads drain dataQueue to disk.
    # NOTE: the ./novel/ directory must exist before the script is run.
    writeThreads = []
    thread4 = WriteThread(dataQueue, 't4')
    thread5 = WriteThread(dataQueue, 't5')
    thread6 = WriteThread(dataQueue, 't6')
    writeThreads.append(thread4)
    writeThreads.append(thread5)
    writeThreads.append(thread6)
    for t in writeThreads:
        t.start()
    for t in writeThreads:
        t.join()
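
Since urllib2 and Queue exist only on Python 2, the same opener-and-queue setup on Python 3 might look roughly like the sketch below. This is a minimal sketch rather than a full port: the thread classes keep the same structure, only the module names change to urllib.request and queue, and the proxy entries are placeholders exactly as in the original.

import queue
import ssl
import urllib.request

# Placeholder proxy, carried over verbatim from the original script.
proxy = {'http': '***', 'https': '***'}

# Unverified SSL context, mirroring ssl._create_unverified_context() above.
ssl_context = ssl._create_unverified_context()

opener = urllib.request.build_opener(
    urllib.request.ProxyHandler(proxy),
    urllib.request.HTTPSHandler(context=ssl_context),
)

url = "https://www.i7wx.com/book/0/636/"
index_html = opener.open(url).read().decode('gbk')

# The two queues keep the same roles: chapter tasks in, downloaded text out.
urlQueue = queue.Queue()
dataQueue = queue.Queue()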
