Multithreaded Crawling
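This post shows a classic producer-consumer scraper: two crawl threads pull page numbers from a page queue, fetch each listing page with requests, and push the raw HTML onto a data queue; two parse threads pull from the data queue, extract titles with lxml's XPath, and append them to a CSV file under a lock. The full script follows.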

import requests
from lxml import etree
import threading
from queue import Queue, Empty
import csv

# Thread lists
crawl_T_list = []
parse_T_list = []
#################################################
class crawl_T(threading.Thread):

	def __init__(self, name, Page_Queue, Data_Queue):
		super().__init__()
		self.name = name
		self.Page_Queue = Page_Queue
		self.Data_Queue = Data_Queue

	def run(self):
		print('------%s started------' % self.name)
		# Pull page numbers from the page queue until it is drained
		while True:
			try:
				# A non-blocking get avoids the race where another thread
				# empties the queue between an empty() check and get()
				page = self.Page_Queue.get(block=False)
			except Empty:
				break
			# Build the URL and send the request
			url = 'http://www.fanjian.net/jiantu-' + str(page)
			headers = {
				'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
			}
			r = requests.get(url, headers=headers)
			# Put the fetched page source onto the data queue
			self.Data_Queue.put(r.text)
			print('%s: page %s fetched' % (self.name, page))
		print('------%s finished------' % self.name)


class parse_T(threading.Thread):

	def __init__(self, name, Data_Queue, f, suo):
		super().__init__()
		self.name = name
		self.Data_Queue = Data_Queue
		self.f = f
		self.suo = suo

	def run(self):
		i = 0
		print('------%s started------' % self.name)
		# If nothing arrives on the data queue within 3 seconds, get()
		# raises queue.Empty and the thread shuts down
		try:
			while True:
				data = self.Data_Queue.get(True, 3)
				# Parse the page source
				tree = etree.HTML(data)
				li_list = tree.xpath('//li[@class="cont-item"]')
				for li in li_list:
					title = li.xpath('.//h2/a/text()')[0]
					#href = li.xpath('.//div/p/img/@data-src')[0]
					# Write the title to the CSV file; the lock keeps the
					# two parse threads from interleaving rows
					with self.suo:
						writer = csv.writer(self.f)
						writer.writerow([title])
				i += 1
				print('%s: %s pages parsed' % (self.name, i))
		except Empty:
			print('------%s finished------' % self.name)
			
		
####################################################

def Create_Queue():
	# Create the page-number queue (pages 1-500)
	Page_Queue = Queue()
	for i in range(1, 501):
		Page_Queue.put(i)
	# Create the data (page source) queue
	Data_Queue = Queue()
	return Page_Queue, Data_Queue

def create_crawl_T(Page_Queue, Data_Queue):
	# Create the crawl threads
	crawl_name_list = ['Crawl-1', 'Crawl-2']
	for name in crawl_name_list:
		tcrawl = crawl_T(name, Page_Queue, Data_Queue)
		crawl_T_list.append(tcrawl)

def create_parse_T(Data_Queue, f, suo):
	# Create the parse threads
	parse_name_list = ['Parse-1', 'Parse-2']
	for name in parse_name_list:
		tparse = parse_T(name, Data_Queue, f, suo)
		parse_T_list.append(tparse)

#####################################################
def main():
	# Create the two queues
	Page_Queue, Data_Queue = Create_Queue()
	# Create the lock that guards the shared CSV file
	suo = threading.Lock()
	# Open the output file
	f = open('fanjian2.csv', 'a', encoding='utf-8', newline='')
	# Create the threads
	create_crawl_T(Page_Queue, Data_Queue)
	create_parse_T(Data_Queue, f, suo)
	# Start the threads (loop variables renamed so they do not
	# shadow the crawl_T / parse_T class names)
	for t in crawl_T_list:
		t.start()
	for t in parse_T_list:
		t.start()
	# Make sure the main thread exits last
	for t in crawl_T_list:
		t.join()
	for t in parse_T_list:
		t.join()
	# Clean up
	f.close()
	print('All threads finished, program done')

if __name__ == '__main__':
	main()
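
The parse threads above shut down by waiting on Data_Queue with a 3-second timeout and treating queue.Empty as the stop signal, which can end a parser early if the crawlers stall for a few seconds. A more deterministic alternative is to push one sentinel value per consumer after the producers have been joined. Below is a minimal sketch of that pattern; the None sentinel and the parse_worker function are illustrative, not part of the original script:

import threading
from queue import Queue

SENTINEL = None  # marker telling a consumer to stop

def parse_worker(data_queue):
	while True:
		data = data_queue.get()      # block until work or a sentinel arrives
		if data is SENTINEL:
			break                    # deterministic shutdown, no timeout guessing
		# ... parse `data` here ...

data_queue = Queue()
workers = [threading.Thread(target=parse_worker, args=(data_queue,)) for _ in range(2)]
for w in workers:
	w.start()

# ... crawl threads put page sources on data_queue and are joined here ...

for _ in workers:                    # one sentinel per consumer
	data_queue.put(SENTINEL)
for w in workers:
	w.join()

With sentinels, the consumers drain everything the producers put on the queue and then exit exactly once, so no page is lost to an unlucky timeout.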

 
