爬虫爬取百度图片

具体分析过程就不写了,给出我学习的链接:https://blog.csdn.net/qq_35371031/article/details/81207966
上代码

import requests
import os
import threading
import urllib.parse
import time
import re
import hashlib
class picture:
    """Download thumbnail images from Baidu image search.

    Constructing an instance runs the whole crawl immediately: the result
    pages for ``picture_name`` are requested ``PAGE_SIZE`` entries at a time
    and each thumbnail is written into a directory named after the keyword.
    """

    # Results requested per search page (sent as the ``rn`` query parameter).
    PAGE_SIZE = 60

    # Thumbnail URLs appear in the response text as "thumbURL":"<url>".
    # ``[^"]*?`` keeps a match inside its own quoted value and ``\.`` matches
    # a literal dot, so only values containing a real ".jpg" are returned.
    _THUMB_URL_RE = re.compile(r'(?<=thumbURL":")[^"]*?\.jpg')

    def __init__(self, picture_name, picture_number=100, path='picture'):
        """Prepare the output directory and start downloading.

        Args:
            picture_name: Search keyword; also used as the output directory.
            picture_number: How many images to fetch (anything ``int()``
                accepts, e.g. the string returned by ``input()``).
            path: Currently unused; kept so existing callers keep working.
        """
        self.save_path = picture_name
        self.picture_number = int(picture_number)
        self.start_time = time.time()
        self.picture_name = picture_name
        self.header = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        }
        # exist_ok avoids the listdir('.') check, which missed nested or
        # absolute paths and then failed on the second run.
        os.makedirs(self.save_path, exist_ok=True)
        self.start()

    def start(self):
        """Request every result page needed to reach ``picture_number``."""
        for offset in range(0, self.picture_number, self.PAGE_SIZE):
            self.get_picture_content(offset)

    def _batch_size(self, count):
        """Return how many images are still wanted from the page at ``count``.

        This is a full page while more than a page remains, otherwise the
        remainder ``picture_number - count`` (never negative).
        """
        return max(0, min(self.PAGE_SIZE, self.picture_number - count))

    @classmethod
    def _extract_thumb_urls(cls, text):
        """Return the ``.jpg`` thumbnail URLs found in a result page's text."""
        return cls._THUMB_URL_RE.findall(text)

    def get_picture_content(self, count):
        """Fetch the result page starting at offset ``count`` and save its images.

        Exits the interpreter (``SystemExit``) when the result page or any
        image request does not answer with HTTP 200.
        """
        url = (
            'https://image.baidu.com/search/acjson'
            '?tn=resultjson_com&ipn=rj&rn={2}&word={0}&pn={1}'
        ).format(urllib.parse.quote(self.picture_name), str(count), self.PAGE_SIZE)
        print(url)
        r = requests.get(url, headers=self.header)
        if r.status_code != 200:
            exit("访问百度图库错误")
        links = self._extract_thumb_urls(r.text)
        # Slicing (rather than indexing up to the batch size) tolerates pages
        # that return fewer thumbnails than requested.
        for link in links[:self._batch_size(count)]:
            res = requests.get(link, headers=self.header)
            if res.status_code != 200:
                exit('访问图片链接错误')
            self.save_picture(res.content, link)

    def save_picture(self, content, picture_name):
        """Write ``content`` to ``<save_path>/<md5 of picture_name>.jpg``.

        Args:
            content: Raw image bytes.
            picture_name: Source URL of the image; its MD5 hex digest is used
                as the file name.
        """
        file_name = hashlib.md5(picture_name.encode()).hexdigest() + '.jpg'
        with open(os.path.join(self.save_path, file_name), 'wb') as f:
            f.write(content)

    def __del__(self):
        """Print the elapsed time since construction when the object dies."""
        # start_time is missing if __init__ failed early; stay silent then
        # instead of raising inside a finalizer.
        started = getattr(self, 'start_time', None)
        if started is not None:
            print("花费了{}s时间".format(str(time.time() - started)))
if __name__ == "__main__":
    # Ask for the search keyword and the number of images, then run the
    # crawl; constructing the object performs all the downloading.
    keyword = input("输入你要爬取的图片类型    ")
    amount = input('输入你想爬取的数量   ')
    pic = picture(keyword, amount)

我没有写多线程,在我本地测试中爬取了1000张图片。
爬虫爬取百度图片_第1张图片

PS:
就这(狗头)??

你可能感兴趣的:(python,爬虫)