爬取趣图网的素材

  • 所需环境:Python 3.7
  • 用到的包:requests、lxml
  • 代码:
import os
import random
import time

import requests
from lxml import etree


class Scai:
    """Crawler that downloads image assets from the tuquu.com listing pages.

    Starting at ``self.url`` it walks the paginated listing, extracts the
    title and image URL of every card, and streams each image to disk.
    """

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout ``requests`` may block forever on a stalled connection.
    _TIMEOUT = 30

    # Characters that are not allowed in Windows file names, mapped to "_".
    _UNSAFE_CHARS = str.maketrans({c: "_" for c in '\\/:*?"<>|\r\n\t'})

    def __init__(self):
        self.url = "http://www.tuquu.com/Sucai/"

    @staticmethod
    def _safe_name(name):
        """Return *name* with file-system-hostile characters replaced by "_".

        Surrounding whitespace is stripped; an empty result becomes
        ``"unnamed"`` so a usable file name is always produced.
        """
        cleaned = name.strip().translate(Scai._UNSAFE_CHARS)
        return cleaned or "unnamed"

    def parse(self, url, headers):
        """Send a GET request and parse the response body as HTML.

        Args:
            url: Address of the page to fetch.
            headers: Mapping of HTTP headers. A bare string is also accepted
                and is treated as the ``User-Agent`` value.

        Returns:
            A ``(raw_bytes, html_tree)`` tuple: the response body and the
            result of ``etree.HTML`` applied to it.
        """
        if isinstance(headers, str):
            headers = {"User-Agent": headers}
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        resp = requests.get(url, headers=headers, timeout=self._TIMEOUT).content
        html = etree.HTML(resp)
        return resp, html

    def get_page_info(self, html):
        """Extract the title and image URL of every card on a listing page.

        Args:
            html: Parsed page tree (or ``None`` if parsing produced nothing).

        Returns:
            A list of ``{"name": ..., "src": ...}`` dicts. Cards missing
            either a title or an image are skipped.
        """
        if html is None:
            return []
        info_list = []
        for div in html.xpath("//div[@id='masonry']/div"):
            names = div.xpath(".//h3/a/text()")
            sources = div.xpath(".//img/@src")
            if not names or not sources:
                # Incomplete card (e.g. an ad or placeholder): nothing to save.
                continue
            info_list.append({"name": names[0], "src": sources[0]})
        return info_list

    def get_next_page(self, html):
        """Return the href of the "next page" link, or ``None`` if absent."""
        if html is None:
            return None
        hrefs = html.xpath("//a[text()='下一页']/@href")
        return hrefs[0] if hrefs else None

    def get_ua(self):
        """Build a randomized Chrome-like User-Agent string."""
        first_num = random.randint(55, 62)
        third_num = random.randint(0, 3200)
        fourth_num = random.randint(0, 140)
        os_type = [
            '(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)', '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)'
        ]
        chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

        return ' '.join([
            'Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36',
            '(KHTML, like Gecko)', chrome_version, 'Safari/537.36',
        ])

    def downloader(self, url, path):
        """Stream the resource at *url* into the file *path* with a progress bar.

        Args:
            url: Address of the file to download.
            path: Destination file path; an existing file is overwritten.

        Nothing is written when the server does not answer with status 200.
        """
        start = time.time()
        chunk_size = 1024  # bytes fetched per iteration
        size = 0
        # The context manager releases the streamed connection even when the
        # download is aborted by an error.
        with requests.get(url, stream=True, timeout=self._TIMEOUT) as response:
            if response.status_code != 200:
                print("\033[1;31m" + "下载失败,状态码:%d" % response.status_code)
                return
            # Content-Length is optional (e.g. chunked responses); fall back
            # to 0, which switches the progress display to a byte counter.
            try:
                content_size = int(response.headers.get('content-length', 0))
            except ValueError:
                content_size = 0
            if content_size > 0:
                print('\033[1;32m' + "[文件大小]:%0.2f MB" % (content_size / chunk_size / 1024))
            with open(path, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    size += len(data)
                    # ANSI escape format: \033[<style>;<foreground>;<background>m
                    if content_size > 0:
                        # Clamp: decoded bytes can exceed the advertised
                        # length when the body is transfer-compressed.
                        done = min(size, content_size)
                        print("\033[1;32m" + "\r" + '[下载进度]:%s%.2f%%' % (
                            '#' * int(done * 50 / content_size), float(done / content_size * 100)), end='')
                    else:
                        print("\033[1;32m" + "\r" + '[下载进度]:%.2f KB' % (size / 1024), end='')
        end = time.time()
        print("\033[32;1m" + "\n" + "全部下载完成!用时%.2f秒" % (end - start))

    def run(self):
        """Crawl every listing page and download all images found on it."""
        page_url = self.url
        save_dir = "C:\\图片\\"
        os.makedirs(save_dir, exist_ok=True)
        while True:
            # A fresh random User-Agent is used for every listing page.
            headers = {"User-Agent": self.get_ua()}
            _, html = self.parse(page_url, headers)
            for item in self.get_page_info(html):
                path = save_dir + self._safe_name(item["name"]) + ".png"
                self.downloader(item["src"], path)

            temp_url = self.get_next_page(html)
            # The last page has no link or a "javascript:;" placeholder href.
            if not temp_url or temp_url == "javascript:;":
                break
            page_url = self.url + temp_url


if __name__ == '__main__':
    # Script entry point: print the banner, then crawl until the last page.
    print("***********仅供学习使用**********")
    print("***********author:afei**********")
    print("图片下载路径c://图片")
    sc = Scai()
    try:
        sc.run()
    except Exception as exc:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # Ctrl-C (KeyboardInterrupt) and SystemExit still stop the program,
        # and show the actual error instead of hiding it.
        print("请检查网络连接。。。。。。。。。。。")
        print("错误信息: {!r}".format(exc))

  • 效果:

爬取趣图网的素材_第1张图片

你可能感兴趣的:(爬虫)