拍拍贷 爬虫

#-*- coding:utf-8 -*-

import urllib2
import time
import xlwt
import re
import csv
from lxml import etree
from threading import Thread
from Queue import Queue
import random
import socket
import ip_pool

# Cooperative-shutdown flags for the worker-thread stages. Each thread class
# polls its own flag in its run() loop (see urlCollect.run: `while not URL_EXIT`)
# and exits once the flag is flipped to True by the coordinating code.
# NOTE(review): the stages beyond URL collection (parse/collect/output) are not
# visible in this excerpt — presumably each has a thread class polling its flag.
URL_EXIT = False      # stop signal for urlCollect (URL fetcher) threads
PARSE_EXIT = False    # stop signal for the page-parsing threads
COLLECT_EXIT = False  # stop signal for the data-collection threads
OUTPUT_EXIT = False   # stop signal for the output/writer threads


class urlCollect(Thread):
    def __init__(self, urlQueue, pageQueue):
        """Worker thread that fetches pages for URLs taken from a queue.

        :param urlQueue:  Queue of URLs to download.
        :param pageQueue: Queue onto which fetched page data is pushed.
        """
        Thread.__init__(self)
        # Present a browser-like User-Agent so the target site is less likely
        # to reject the request as an obvious bot.
        ua = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) "
              "AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1")
        self.headers = {"User-Agent": ua}
        self.urlQueue = urlQueue
        self.pageQueue = pageQueue
    def run(self):
        while not URL_EXIT:
            try:
                url = self.urlQueue.get(False)
                request = urllib2.Request(url, headers=self.headers)
                time.sleep(3)
                response = urllib2.urlopen(request)
                text = response.read()
                pattern = re.compile(r'




python编写。多线程只是鸡肋，没什么卵用：请求过于密集时这个网站会封禁 IP，所以只能降低线程数（测试了一下 IP 代理，效果并不好）。

你可能感兴趣的:(python)