# coding:utf-8
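"""Multithreaded Bilibili crawler.

Crawl threads fetch search-result pages for the keyword "爬虫" and push each
video's av ID onto a shared queue; parse threads pop the IDs and query
Bilibili's web-interface API for per-video stats (views, coins, likes,
favorites, shares, danmaku), printing a summary for each.

Requires: requests, beautifulsoup4, lxml (used as BeautifulSoup's parser).
"""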
# threading for the worker threads
import threading
# Queue for handing work between threads
from queue import Queue
# requests for HTTP, BeautifulSoup (with the lxml parser) for HTML parsing
import requests
import bs4
# json for optional serialization of the results
import json
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# The requests below use verify=False, so silence insecure-request warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# av IDs collected by the crawl threads
result = []


class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        # Call the parent class initializer
        super(ThreadCrawl, self).__init__()
        # Thread name
        self.threadName = threadName
        # Queue of page numbers to crawl
        self.pageQueue = pageQueue
        # Queue of extracted av IDs, consumed by the parse threads
        self.dataQueue = dataQueue
        # Request headers
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

    def run(self):
        print("Starting " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Pop one page number, FIFO. The optional block argument
                # defaults to True:
                # 1. With block=True, an empty queue does not return; the call
                #    blocks until new data arrives.
                # 2. With block=False, an empty queue raises a queue.Empty
                #    exception immediately.
                page = self.pageQueue.get(False)
                url = ('https://search.bilibili.com/all?keyword=爬虫'
                       '&from_source=nav_search'
                       '&spm_id_from=333.851.b_696e7465726e6174696f6e616c486561646572.11'
                       '&page=' + str(page))
                response = requests.get(url, headers=self.headers, verify=False).text
                time.sleep(0.5)
                try:
                    soup = bs4.BeautifulSoup(response, 'lxml').find('div', attrs={'id': 'all-list'})
                    ul = soup.find('ul', attrs={'class': 'video-list clearfix'}).find_all(
                        'li', attrs={'class': 'video-item matrix'})
                    for item in ul:
                        # The span text looks like "av12345"; strip the "av"
                        # prefix to keep only the numeric ID
                        info = item.find('div', attrs={'class': 'headline clearfix'}).find(
                            'span', attrs={'class': 'type avid'}).get_text()
                        aid = info.replace('av', '')
                        self.dataQueue.put(aid)
                        result.append(aid)
                except Exception as e:
                    print('something is wrong:', e)
            except Exception:
                pass


class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, lock):
        super(ThreadParse, self).__init__()
        # Thread name
        self.threadName = threadName
        # Queue of av IDs produced by the crawl threads
        self.dataQueue = dataQueue
        # Lock serializing console output across parse threads
        self.lock = lock
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

    def run(self):
        print("Starting " + self.threadName)
        while not PARSE_EXIT:
            try:
                aid = self.dataQueue.get(False)
                url = 'https://api.bilibili.com/x/web-interface/view?aid=' + str(aid)
                self.parse(url)
            except Exception:
                pass
        print("Exiting " + self.threadName)
    def parse(self, url):
        response = requests.get(url=url, headers=self.headers, verify=False).json()
        time.sleep(1)
        try:
            data = response['data']['stat']
            # Hold the lock so each video's stats print as one uninterrupted block
            with self.lock:
                print('av ID    ', data['aid'])
                print('views    ', data['view'])
                print('coins    ', data['coin'])
                print('favorites', data['favorite'])
                print('likes    ', data['like'])
                print('shares   ', data['share'])
                print('danmaku  ', data['danmaku'])
                print('------------')
        except Exception as e:
            print('failed to read stats from', url, '-', e)
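

# Shutdown flags for the worker loops; main() flips each to True once the
# corresponding queue has drained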
CRAWL_EXIT = False
PARSE_EXIT = False


def main():
    # Queue of page numbers, one entry per search-result page
    pageQueue = Queue(50)
    # Enqueue pages 1 through 50, FIFO
    for i in range(1, 51):
        pageQueue.put(i)
    # Queue of av IDs handed from crawl threads to parse threads;
    # no maxsize argument means unbounded
    dataQueue = Queue()
    # Lock shared by the parse threads to serialize their output
    lock = threading.Lock()
    # Names of the five crawl threads
    crawlList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3",
                 "Crawl thread 4", "Crawl thread 5"]
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)
    # Names of the six parse threads
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3",
                 "Parse thread 4", "Parse thread 5", "Parse thread 6"]
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, lock)
        thread.start()
        threadparse.append(thread)
    # Wait until every page number has been claimed
    while not pageQueue.empty():
        time.sleep(0.1)
    # pageQueue is empty, so let the crawl threads leave their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("pageQueue is empty")
    for thread in threadcrawl:
        thread.join()
    # Likewise wait for dataQueue to drain before stopping the parse threads
    while not dataQueue.empty():
        time.sleep(0.1)
    global PARSE_EXIT
    PARSE_EXIT = True
    print("dataQueue is empty")
    for thread in threadparse:
        thread.join()
    print("Done. Thanks for using!")


if __name__ == "__main__":
    main()