Python 3.6 web crawling with requests: batch-downloading images

A while ago I came across a website full of super-high-resolution images, and since I had some spare time I wrote a crawler to batch-download them and rotate them as wallpapers. I'm quite happy with the result, so let's get to it...

The pages are parsed with lxml, which usually has to be installed manually first. The comments were added afterwards to make the code easier to study and understand. My full source code follows below; I tried to keep the variable naming consistent with the usual conventions throughout.
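As a quick warm-up before the full script, here is a minimal sketch of the fetch-and-parse pattern it relies on: install requests and lxml with pip, download one toplist page, and pull out the detail-page links with the same XPath expression used later. This assumes the page structure is unchanged; the User-Agent value here is just a placeholder.

# pip install requests lxml
import requests
from lxml import etree

# fetch one toplist page and list the links to each wallpaper's detail page
html = requests.get('https://wallhaven.cc/toplist?page=2',
                    headers={'User-Agent': 'Mozilla/5.0'}).text
parser = etree.HTML(html)
links = parser.xpath('//section[@class="thumb-listing-page"]/ul'
                     '/li/figure/a[@class="preview"]/@href')
print(len(links), 'wallpapers found on this page')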

#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# @Time: 2020/2/28  19:39:26
# @File: wallhaven2.py

DOWN_URL = 'https://wallhaven.cc/toplist?page='
SAVE_PATH = 'photo_in_wallhaven/'  # directory the downloaded images are saved into
COUNT = 0  # global page counter
Temp_DATA = """
fdsfds General 1920x1080 nature landscape trees forest rocks mountains mist clouds Monsoon town fall river rock formation Kalampaka Greece
"""  # leftover sample of an image's alt text; not used anywhere below
import requests
from lxml import etree
from time import sleep
from random import randint
from os import mkdir
from os.path import exists

# Two request headers, one picked at random for each request; since quite a few
# pages get hit, you can also add a few more.
headers = [{
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0"
}, {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'
}]

if not exists(SAVE_PATH):
    mkdir(SAVE_PATH)


class DownWallhaven(object):
    def __init__(self, number, img_group: list = None):
        self.number = number
        self.img_group = img_group  # lxml's xpath returns a list; a for loop later walks it and downloads each image
        self.folder = None
        self.new_img_folder = None
        self.headers = None

    def askUrl(self):
        global COUNT
        COUNT += 1
        at_url = DOWN_URL + str(self.number)
        folder_name = at_url.split('https://')[1].replace('/', '_') + '/'
        self.new_img_folder = SAVE_PATH + folder_name
        if not exists(self.new_img_folder):
            mkdir(self.new_img_folder)
        self.headers = headers[randint(1, len(headers)) - 1]
        http = requests.get(at_url, headers=self.headers)
        cont = http.content.decode('utf-8')
        # parse the listing page for image links
        parser = etree.HTML(cont)
        ul = parser.xpath('//section[@class="thumb-listing-page"]/ul')[0]
        print('%s ' % str(COUNT) + str(ul))  # no infinite scroll here, so there is only one listing <ul>
        # collect every picture's detail-page URL
        li = ul.xpath('./li/figure/a[@class="preview"]/@href')  # xpath returns a list of links
        self.img_group = li
        print(li)
        return li

    def download_every_url(self):
        count = 0
        # visit every detail-page URL and download the image it shows
        for i in range(len(self.img_group)):
            count += 1
            # _url = 'https://wallhaven.cc/w/96y88d'
            _url = self.img_group[i]
            self.headers = headers[randint(1, len(headers)) - 1]
            _response = requests.get(_url, headers=self.headers)
            # Note: saving _response.content directly with 'wb' fails a lot, because that is the
            # detail page's HTML rather than the image, and the resulting file will not open.
            _html = _response.content.decode('utf-8')
            _parser = etree.HTML(_html)
            img_down_url = _parser.xpath('//div[@class="scrollbox"]/img/@src')[0]
            img_name = _parser.xpath('//div[@class="scrollbox"]/img/@alt')[0]
            sleep(randint(1, 3))  # wait a random moment before the next request; comment this out if you only download a page or two
            # if 'Traceback' in img_down_url:  # did not help
            #     img_down_url = img_down_url.replace('Traceback', '')
            #     print('it happened!')
            # File extension: it turned out there are two formats; with the wrong one the file will not open.
            extension_name = img_down_url[-4:]
            if extension_name != '.jpg':
                extension_name = '.png'
            # Strip any trailing words after the extension in the download URL.
            extra_words = img_down_url.split(extension_name)
            if len(extra_words) > 1 and extra_words[1] != '':  # most links end with the extension, so this part is usually empty
                print('extra_words: ' + extra_words[1])
                img_down_url = extra_words[0] + extension_name
                print('After split url: ' + img_down_url)
            # The two counters are purely for numbering the output, page.image, in order.
            print('{}.{}_'.format(COUNT, count), end='')
            # download the actual image bytes from the full-size URL
            data_img = requests.get(img_down_url, headers=self.headers).content
            print('CurrentUrl: ' + img_down_url, end='')
            try:
                with open(self.new_img_folder + img_name + extension_name, 'wb') as img:
                    img.write(data_img)
            except Exception:  # I could not get past some of these after several tries, so the retry below is commented out
                print('failed, split and retry...', end='\t')
                # try:  # tried this as well, still no luck...
                #     data_img = requests.get(img_down_url, headers=self.headers).content
                #     with open(self.new_img_folder + img_name + extension_name, 'wb') as img:
                #         img.write(data_img)
                #     print('download finally')
                # except:
                #     print('tried once more, still failing')
            else:
                print(', <--' + img_name + '--> downloaded')


if __name__ == '__main__':
    # You can paste this into a new .py file in PyCharm and run it there; I run it straight from a terminal.
    # Start with 1 page to try it out: some of the image files are quite large.
    page = int(input("How many pages do you want to download?: "))
    n = page
    COUNT = 0  # page counter
    while page > 0:
        # The toplist is crawled starting from page 2, so an input of 1 downloads that first page.
        app = DownWallhaven(number=n - page + 2)
        app.askUrl()
        app.download_every_url()
        page -= 1
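For reference, the per-image step boils down to two requests: one for the wallpaper's detail page and one for the full-size image it links to. The following is a minimal standalone sketch of that step, reusing the example detail-page URL from the commented-out line in download_every_url(); it assumes the same page structure as above, uses a placeholder User-Agent, and writes the file into the current directory.

import requests
from lxml import etree

detail_url = 'https://wallhaven.cc/w/96y88d'  # example detail page from the script above
headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder User-Agent

# first request: the detail page, which contains the full-size image URL and its alt text
page = requests.get(detail_url, headers=headers)
parser = etree.HTML(page.text)
img_url = parser.xpath('//div[@class="scrollbox"]/img/@src')[0]
img_name = parser.xpath('//div[@class="scrollbox"]/img/@alt')[0]

# second request: the image itself, written as raw bytes with whatever extension the URL carries
data = requests.get(img_url, headers=headers).content
with open(img_name + img_url[img_url.rfind('.'):], 'wb') as f:
    f.write(data)
print('saved', img_name)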

=== Download results ===
[Screenshots: three images of the downloaded wallpapers]
This is meant for learning and personal use only; please don't use it for anything else. If you run into any problems, please leave a comment. Thanks!
