# from http://pic.netbian.com/index_7.html
# crawl_object http://pic.netbian.com/
import re
import requests
import multiprocessing
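# Note: requests is the only third-party dependency here; re and multiprocessing are standard library.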
# Request headers
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
"cookie": "__cfduid=d319174bbe8b2c487343207e2147bf00e1524465301; Hm_lvt_14b14198b6e26157b7eba06b390ab763=1524465302; Hm_lpvt_14b14198b6e26157b7eba06b390ab763=1524465302; yjs_id=d27113c49a8ec2f5315ccd693aabda35; ctrl_time=1; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1524465305; zkhanecookieclassrecord=%2C53%2C; Hm_lpvt_526caf4e20c21f06a4e9209712d6a20e=1524473323",
}
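# Note: the Cookie value above is a captured browser session for pic.netbian.com;
# if the site starts rejecting requests it probably needs to be refreshed from a new visit.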
# Fetch the HTML source of the target url
def crawl_netbian(url, header, coding):
    '''
    :param url: target url to crawl
    :param header: request headers
    :param coding: response encoding
    :return: page source with all whitespace stripped
    '''
    netbian_list_url = url
    response = requests.get(netbian_list_url, headers=header)
    response.encoding = coding
    url_text = response.text
    # collapse tabs/newlines/spaces, then drop the remaining spaces entirely
    url_text = re.sub(r'\t+', " ", url_text)
    url_text = re.sub(r'\n+', " ", url_text)
    url_text = re.sub(r' +', " ", url_text)
    url_text = re.sub(' ', "", url_text)
    # print(url_text)  # for testing
    return url_text
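# Illustration (hypothetical markup): after the whitespace stripping above, a listing thumbnail such as
#   <img src="/uploads/allimg/180503/small0001.jpg" alt="sample title" />
# collapses to
#   <imgsrc="/uploads/allimg/180503/small0001.jpg"alt="sampletitle"/>
# (any spaces inside the alt text are removed as well); this is the shape the regexes
# in the cleaning functions below rely on.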
# First-pass data cleaning
def one_cleanout_data(data):
    '''
    :param data: page source fetched above
    :return: list of data after the first cleaning pass
    '''
    # data waiting to be cleaned
    data_old = data
    # print(data_old)  # for testing
    # pull out the target fragments with a regex
    # (pattern reconstructed: it matches the space-stripped src="..."alt="..."/> fragments)
    num = 1
    data_list = []
    for i in re.findall(r'src="[^"]+"alt="[^"]+"/>', data_old):
        data_list.append(i)
        print('Count: %d, data cleaned: %s' % (num, i))
        num += 1
    # Earlier draft kept for reference: consume one match at a time with re.search/re.sub.
    # data_new = re.search(r'src="[^"]+"alt="[^"]+"/>', data_old)
    # if data_new:
    #     # grab the matched fragment
    #     data_group = data_new.group()
    #     if data_group:
    #         print('Count: %d, data cleaned: %s' % (num, data_group))
    #         # blank out the match so the next search finds the following one
    #         data_old = re.sub(data_group, '789', data_old)
    #         # print(data_old)
    #         data_list.append(data_group)
    #         num += 1
    #         # time.sleep(0)
    #     else:
    #         print('First-pass cleaning finished')
    #         break
    # print(data_list)  # for testing
    return data_list
# Second-pass data cleaning
def two_cleanout_data(data):
    '''
    :param data: list produced by the first pass
    :return: dict of data after the second cleaning pass
    '''
    old_data = data
    new_data_list = {}
    for i in old_data:
        # strip the src=" / alt= / "/> pieces so only  path""title  remains (pattern reconstructed)
        one_datas = re.sub('src="|alt=|"/>', '', i)
        # print(one_datas)  # for testing
        ret = re.split('""', one_datas)
        new_data_list[ret[1]] = ret[0]
    return new_data_list
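# Illustration (hypothetical values): for the collapsed tag shown earlier, two_cleanout_data
# would return something like {'sampletitle': '/uploads/allimg/180503/small0001.jpg'}.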
# Recombine the data
def regroup_data(data):
    old_data = data
    new_data = {}
    for key, value in old_data.items():
        # print('key:%s,values:%s' % (key, value))  # for testing
        new_data[key] = 'http://pic.netbian.com' + value
    return new_data
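# Illustration (hypothetical values): regroup_data turns the relative path into a full url,
# e.g. {'sampletitle': 'http://pic.netbian.com/uploads/allimg/180503/small0001.jpg'}.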
# Download the data
def download_data(data, header):
    download_dictionaries = data
    for key, value in download_dictionaries.items():
        response = requests.get(value, headers=header)
        print('Downloading: %s' % key)
        # the image title is used as the file name
        with open(key + '.jpg', 'wb') as f:
            f.write(response.content)
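# Note: because the image title doubles as the file name, a title containing characters that are
# illegal in file names (e.g. '/' or '?') would make open() fail; sanitising it first may be needed.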
# Run the full pipeline for one listing page
def run_main(url):
    data = crawl_netbian(url, header, 'gbk')
    one_data = one_cleanout_data(data)
    two_data = two_cleanout_data(one_data)
    regroup_over = regroup_data(two_data)
    download_data(regroup_over, header)
# Program entry point
if __name__ == '__main__':
    # run_main(url='http://pic.netbian.com/index_7.html')
    num = range(1, 949)
    pool = multiprocessing.Pool(10)
    for i in num:
        url = 'http://pic.netbian.com/index_%s.html' % i
        pool.apply_async(run_main, args=(url,))
    pool.close()
    pool.join()
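# Note: the statistics version below skips index_1.html (see Page_Count), so starting this
# range at 1 may request a listing page that does not exist on the site.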
Multiprocessing version with download statistics
# from http://pic.netbian.com/index_7.html
# crawl_object http://pic.netbian.com/
import re
import requests
import multiprocessing
# Request headers
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
"cookie": "__cfduid=d319174bbe8b2c487343207e2147bf00e1524465301; Hm_lvt_14b14198b6e26157b7eba06b390ab763=1524465302; Hm_lpvt_14b14198b6e26157b7eba06b390ab763=1524465302; yjs_id=d27113c49a8ec2f5315ccd693aabda35; ctrl_time=1; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1524465305; zkhanecookieclassrecord=%2C53%2C; Hm_lpvt_526caf4e20c21f06a4e9209712d6a20e=1524473323",
}
# 1. Get the page range from the user
def Page_Count():
    '''
    Ask the user which listing pages to crawl
    :return: the first and last page numbers
    '''
    begin_num = int(input('Enter the first page to crawl: '))
    # skip index_1.html (page 1 presumably lives at a different url)
    if begin_num == 1:
        begin_num += 1
    over_num = int(input('Enter the last page to crawl: '))
    return begin_num, over_num
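# Example (hypothetical input): entering 3 and 5 makes the main block below crawl
# index_3.html, index_4.html and index_5.html.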
# 2. Fetch the HTML source of the target url
def crawl_netbian(url, header, queue, queue_html):
    '''
    :param url: target url to crawl
    :param header: request headers
    :param queue: queue that receives the cleaned data for downloading
    :param queue_html: queue used to count processed pages
    :return:
    '''
    netbian_list_url = url
    response = requests.get(netbian_list_url, headers=header)
    response.encoding = 'gbk'
    url_text = response.text
    over_data = Data_Cleansing(url_text)
    queue.put(over_data)
    queue_html.put(over_data)
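# Design note: the cleaned data is pushed onto two queues: `queue` feeds download_data,
# while `queue_html` is only read with qsize() at the end to count processed pages.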
# 3. Data processing
def Data_Cleansing(data):
    '''
    :param data: raw page source to process
    :return: processed data, a dict of {title: full image url}
    '''
    one_data = one_cleanout_data(data)
    two_data = two_cleanout_data(one_data)
    regroup_over = regroup_data(two_data)
    return regroup_over
# 4. Data download
def download_data(queue, header, queue_num):
    '''
    :param queue: queue holding the cleaned data to download
    :param header: request headers
    :param queue_num: queue used to count downloaded images
    :return: None
    '''
    # take one page's worth of cleaned data off the queue
    dictionaries = queue.get()
    if dictionaries:
        for key, value in dictionaries.items():
            response = requests.get(value, headers=header)
            print('Downloading: %s' % key)
            # the image title is used as the file name
            with open(key + '.jpg', 'wb') as f:
                f.write(response.content)
            queue_num.put(key)
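# Design note: each download_data call consumes exactly one page's dict from `queue`, and the
# main block schedules one download task per crawl task, so queue.get() simply blocks until
# some crawl task has produced a page.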
# First-pass data cleaning
def one_cleanout_data(data):
    '''
    :param data: page source fetched above
    :return: list of data after the first cleaning pass
    '''
    # data waiting to be cleaned
    data_old = data
    # collapse tabs/newlines/spaces, then drop the remaining spaces entirely
    data_old = re.sub(r'\t+', " ", data_old)
    data_old = re.sub(r'\n+', " ", data_old)
    data_old = re.sub(r' +', " ", data_old)
    data_old = re.sub(' ', "", data_old)
    # print(data_old)  # for testing
    # pull out the target fragments with a regex (pattern reconstructed, as in the first version)
    data_list = []
    for i in re.findall(r'src="[^"]+"alt="[^"]+"/>', data_old):
        data_list.append(i)
        print('Data cleaned: %s' % i)
    return data_list
# Second-pass data cleaning
def two_cleanout_data(data):
    '''
    :param data: list produced by the first pass
    :return: dict of data after the second cleaning pass
    '''
    old_data = data
    new_data_list = {}
    for i in old_data:
        # strip the src=" / alt= / "/> pieces so only  path""title  remains (pattern reconstructed)
        one_datas = re.sub('src="|alt=|"/>', '', i)
        # print(one_datas)  # for testing
        ret = re.split('""', one_datas)
        new_data_list[ret[1]] = ret[0]
    return new_data_list
# Recombine the data
def regroup_data(data):
    '''
    :param data: data to recombine
    :return: dict mapping each title to the full image url
    '''
    old_data = data
    new_data = {}
    for key, value in old_data.items():
        # print('key:%s,values:%s' % (key, value))  # for testing
        new_data[key] = 'http://pic.netbian.com' + value
    return new_data
# Kick off the data processing for one page
def run_main(queue, url, header, queue_html):
    '''
    :param queue: queue that receives the cleaned data
    :param url: url to crawl
    :param header: request headers
    :param queue_html: queue used to count processed pages
    :return:
    '''
    crawl_netbian(url, header, queue, queue_html)
# Print the final statistics
def print_sum_total():
    print('\n\n\n\n\n\n\n\n ***Processing finished***')
    print('*' * 30)
    print(' Pages processed in total: %s' % queue_html.qsize())
    print(' Images downloaded in total: %s' % queue_num.qsize())
    print('*' * 30)
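# Note: print_sum_total is only called after pool.join() in the main process, so the
# qsize() readings reflect the final counts.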
# Program entry point
if __name__ == '__main__':
    # image download counter variable (unused; the counts come from the queues below)
    num_a = 0
    # get the page range from the user
    begin_num, over_num = Page_Count()
    # pages to crawl
    num = range(int(begin_num), int(over_num) + 1)
    # message queues shared between the pool workers
    queue = multiprocessing.Manager().Queue()
    queue_html = multiprocessing.Manager().Queue()
    queue_num = multiprocessing.Manager().Queue()
    # create the process pool
    pool = multiprocessing.Pool(10)
    # one crawl task and one download task per page
    for i in num:
        url = 'http://pic.netbian.com/index_%s.html' % i
        pool.apply_async(run_main, args=(queue, url, header, queue_html))
        pool.apply_async(download_data, args=(queue, header, queue_num))
    # close the pool
    pool.close()
    # wait for every task to finish
    pool.join()
    # print the final statistics
    print_sum_total()