Python Requests multiprocessing crawler with data cleaning: pic.netbian.com (彼岸图网) thumbnail demo



# from http://pic.netbian.com/index_7.html
# crawl_object http://pic.netbian.com/
import re
import requests
import multiprocessing

# Request headers
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    "cookie": "__cfduid=d319174bbe8b2c487343207e2147bf00e1524465301; Hm_lvt_14b14198b6e26157b7eba06b390ab763=1524465302; Hm_lpvt_14b14198b6e26157b7eba06b390ab763=1524465302; yjs_id=d27113c49a8ec2f5315ccd693aabda35; ctrl_time=1; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1524465305; zkhanecookieclassrecord=%2C53%2C; Hm_lpvt_526caf4e20c21f06a4e9209712d6a20e=1524473323",
    }

# Fetch the page source of the target URL
def crawl_netbian(url, header, coding):
    '''
    :param url: target URL to crawl
    :param header: request headers
    :param coding: response encoding
    :return: page source with all whitespace stripped
    '''
    response = requests.get(url, headers=header)
    response.encoding = coding
    url_text = response.text
    # Strip all whitespace so the <img ...> tags can be matched without spaces
    url_text = re.sub('\t+', " ", url_text)
    url_text = re.sub('\n+', " ", url_text)
    url_text = re.sub(' +', " ", url_text)
    url_text = re.sub(' ', "", url_text)
    # print(url_text)  # test
    return url_text
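This fetch has no timeout and never checks the HTTP status, so one stalled or failed request can hang a pool worker. A hardened variant is sketched below; the helper name, the 10-second timeout and the empty-string fallback are assumptions, not part of the original demo.

def crawl_netbian_safe(url, header, coding, timeout=10):
    '''Sketch: like crawl_netbian, but with a timeout and basic error handling.'''
    try:
        response = requests.get(url, headers=header, timeout=timeout)
        response.raise_for_status()  # fail fast on 4xx/5xx responses
    except requests.RequestException as exc:
        print('fetch failed for %s: %s' % (url, exc))
        return ''  # an empty string means there is nothing to clean downstream
    response.encoding = coding
    # Strip whitespace (roughly the same effect as the original sequence of re.sub calls)
    return re.sub(r'\s+', '', response.text)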

# First-pass data cleaning
def one_cleanout_data(data):
    '''
    :param data: raw page source to clean (whitespace already stripped)
    :return: list of matched <img ...> tag strings
    '''
    # Load the data waiting to be cleaned
    data_old = data
    # print(data_old)  # test
    # Pull out the thumbnail <img ...> tags with a regex
    num = 1
    data_list = []
    for i in re.findall(r'<imgsrc="[^>]*/>', data_old):
        data_list.append(i)
        print('count: %d, cleaned: %s' % (num, i))
        num += 1
    # print(data_list)  # test
    return data_list

# Second-pass data cleaning
def two_cleanout_data(data):
    '''
    :param data: list of <img ...> tag strings from the first pass
    :return: dict mapping image title (alt text) to thumbnail path
    '''
    old_data = data
    new_data_list = {}
    for i in old_data:
        # Strip the tag markup, leaving 'path""title'
        one_datas = re.sub('<imgsrc="|alt=|"/>', '', i)
        # print(one_datas)  # test
        ret = re.split('""', one_datas)
        new_data_list[ret[1]] = ret[0]
    return new_data_list

# Rebuild the data: prepend the site root to each thumbnail path
def regroup_data(data):
    old_data = data
    new_data = {}
    for key, value in old_data.items():
        # print('key:%s,values:%s' % (key, value))  # test
        new_data[key] = 'http://pic.netbian.com' + value
    return new_data

# Download the images
def download_data(data, header):
    download_dictionaries = data
    for key, value in download_dictionaries.items():
        response = requests.get(value, headers=header)
        print('downloading: %s' % key)
        with open(key + '.jpg', 'wb') as f:
            f.write(response.content)
# Run the full pipeline for one listing page
def run_main(url):
    data = crawl_netbian(url, header, 'gbk')
    one_data = one_cleanout_data(data)
    two_data = two_cleanout_data(one_data)
    regroup_over = regroup_data(two_data)
    download_data(regroup_over, header)

# Entry point
if __name__ == '__main__':
    # run_main(url='http://pic.netbian.com/index_7.html')
    num = range(1, 949)
    pool = multiprocessing.Pool(10)
    for i in num:
        url = 'http://pic.netbian.com/index_%s.html' % i
        pool.apply_async(run_main, args=(url,))
    pool.close()
    pool.join()
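To see what the cleaning pipeline produces end to end, here is a quick standalone walk-through on a single made-up thumbnail tag; the path and title are invented purely for illustration.

# Hypothetical tag, already whitespace-stripped as crawl_netbian would return it
sample = '<imgsrc="/uploads/allimg/demo/thumb.jpg"alt="demo-title"/>'
tags = one_cleanout_data(sample)   # ['<imgsrc="/uploads/allimg/demo/thumb.jpg"alt="demo-title"/>']
pairs = two_cleanout_data(tags)    # {'demo-title': '/uploads/allimg/demo/thumb.jpg'}
full = regroup_data(pairs)         # {'demo-title': 'http://pic.netbian.com/uploads/allimg/demo/thumb.jpg'}
print(full)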

Multiprocessing version with statistics

# from http://pic.netbian.com/index_7.html
# crawl_object http://pic.netbian.com/
import re
import requests
import multiprocessing

# Request headers
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    "cookie": "__cfduid=d319174bbe8b2c487343207e2147bf00e1524465301; Hm_lvt_14b14198b6e26157b7eba06b390ab763=1524465302; Hm_lpvt_14b14198b6e26157b7eba06b390ab763=1524465302; yjs_id=d27113c49a8ec2f5315ccd693aabda35; ctrl_time=1; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1524465305; zkhanecookieclassrecord=%2C53%2C; Hm_lpvt_526caf4e20c21f06a4e9209712d6a20e=1524473323",
    }

# 1. Get the page range from the user
def Page_Count():
    '''
    Ask the user which pages to crawl.
    :return: first page number and last page number, as ints
    '''
    begin_num = int(input('Enter the first page to crawl: '))
    # Skip page 1, which does not follow the index_N.html URL pattern
    if begin_num == 1:
        begin_num += 1
    over_num = int(input('Enter the last page to crawl: '))
    return begin_num, over_num

# 2. Fetch the page source of the target URL
def crawl_netbian(url, header, queue, queue_html):
    '''
    :param url: target URL to crawl
    :param header: request headers
    :param queue: queue that receives the cleaned data for downloading
    :param queue_html: queue used to count processed pages
    :return:
    '''
    response = requests.get(url, headers=header)
    response.encoding = 'gbk'
    url_text = response.text
    over_data = Data_Cleansing(url_text)
    queue.put(over_data)
    queue_html.put(over_data)

# 3. Data processing
def Data_Cleansing(data):
    '''
    :param data: raw page source to process
    :return: cleaned data, ready for download
    '''
    one_data = one_cleanout_data(data)
    two_data = two_cleanout_data(one_data)
    regroup_over = regroup_data(two_data)
    return regroup_over

# 4. Download the images
def download_data(queue, header, queue_num):
    '''
    :param queue: queue carrying the dicts of images to download
    :param header: request headers
    :param queue_num: queue used to count downloaded images
    :return: None
    '''
    dictionaries = queue.get()
    if dictionaries:
        for key, value in dictionaries.items():
            response = requests.get(value, headers=header)
            print('downloading: %s' % key)
            with open(key + '.jpg', 'wb') as f:
                f.write(response.content)
            queue_num.put(key)
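Note that queue.get() blocks indefinitely, so if the paired crawl task dies (for example on a network error that apply_async swallows silently), the matching download task hangs forever. A bounded variant is sketched below; the helper name and the 60-second timeout are assumptions, not part of the original script.

import queue as queue_module  # only needed for the Empty exception

def download_data_bounded(queue, header, queue_num, timeout=60):
    '''Sketch: same as download_data, but gives up if no data arrives in time.'''
    try:
        dictionaries = queue.get(timeout=timeout)
    except queue_module.Empty:
        print('no data arrived within %s seconds, skipping' % timeout)
        return
    for key, value in dictionaries.items():
        response = requests.get(value, headers=header)
        print('downloading: %s' % key)
        with open(key + '.jpg', 'wb') as f:
            f.write(response.content)
        queue_num.put(key)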

# First-pass data cleaning
def one_cleanout_data(data):
    '''
    :param data: raw page source to clean
    :return: list of matched <img ...> tag strings
    '''
    # Load the data waiting to be cleaned
    data_old = data
    # Strip all whitespace so the <img ...> tags can be matched without spaces
    data_old = re.sub('\t+', " ", data_old)
    data_old = re.sub('\n+', " ", data_old)
    data_old = re.sub(' +', " ", data_old)
    data_old = re.sub(' ', "", data_old)
    # print(data_old)  # test
    # Pull out the thumbnail <img ...> tags with a regex
    data_list = []
    for i in re.findall(r'<imgsrc="[^>]*/>', data_old):
        data_list.append(i)
        print('cleaned: %s' % i)
    return data_list

# Second-pass data cleaning
def two_cleanout_data(data):
    '''
    :param data: list of <img ...> tag strings from the first pass
    :return: dict mapping image title (alt text) to thumbnail path
    '''
    old_data = data
    new_data_list = {}
    for i in old_data:
        # Strip the tag markup, leaving 'path""title'
        one_datas = re.sub('<imgsrc="|alt=|"/>', '', i)
        # print(one_datas)  # test
        ret = re.split('""', one_datas)
        new_data_list[ret[1]] = ret[0]
    return new_data_list

# Rebuild the data
def regroup_data(data):
    '''
    :param data: dict mapping title to thumbnail path
    :return: dict mapping title to full thumbnail URL
    '''
    old_data = data
    new_data = {}
    for key, value in old_data.items():
        # print('key:%s,values:%s' % (key, value))  # test
        new_data[key] = 'http://pic.netbian.com' + value
    return new_data

# Dispatch one crawl task
def run_main(queue, url, header, queue_html):
    '''
    :param queue: queue that receives the cleaned data
    :param url: URL to crawl
    :param header: request headers
    :param queue_html: queue used to count processed pages
    :return:
    '''
    crawl_netbian(url, header, queue, queue_html)

# Print the final statistics
def print_sum_total():
    print('\n\n\n\n\n\n\n\n        *** All done ***')
    print('*' * 30)
    print('    Pages processed: %s' % queue_html.qsize())
    print('    Images downloaded: %s' % queue_num.qsize())
    print('*' * 30)

# Entry point
if __name__ == '__main__':
    # Ask the user for the page range
    begin_num, over_num = Page_Count()
    # Pages to crawl
    num = range(begin_num, over_num + 1)
    # Message queues (manager queues, so they can be passed to pool workers)
    queue = multiprocessing.Manager().Queue()
    queue_html = multiprocessing.Manager().Queue()
    queue_num = multiprocessing.Manager().Queue()
    # Create the process pool
    pool = multiprocessing.Pool(10)
    # Submit one crawl task and one download task per page
    for i in num:
        url = 'http://pic.netbian.com/index_%s.html' % i
        pool.apply_async(run_main, args=(queue, url, header, queue_html))
        pool.apply_async(download_data, args=(queue, header, queue_num))
    # Close the pool and wait for every task to finish
    pool.close()
    pool.join()
    # Print the final statistics
    print_sum_total()
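One design point worth calling out: the script uses multiprocessing.Manager().Queue() rather than multiprocessing.Queue() because a plain Queue cannot be passed to pool workers as an argument (it can only be shared through inheritance), while a manager queue proxy pickles cleanly. A tiny standalone sketch, illustrative only and not part of the demo, shows the working pattern:

import multiprocessing

def worker(q):
    q.put('ok')

if __name__ == '__main__':
    pool = multiprocessing.Pool(2)
    managed = multiprocessing.Manager().Queue()
    pool.apply_async(worker, args=(managed,)).get()  # works: the proxy can be pickled into the task
    print(managed.get())  # -> 'ok'
    # plain = multiprocessing.Queue()
    # pool.apply_async(worker, args=(plain,)).get()  # fails: a plain Queue cannot be pickled into a pool task
    pool.close()
    pool.join()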
