python3 爬取 妹子图 自拍栏 图片

python3 爬取 妹子图 自拍栏 图片


简介:
python3 爬取妹子图自拍栏目图片(https://www.mzitu.com/zipai/)

import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class MZiTu(object):
    """Crawler that downloads the images listed on mzitu.com's "zipai" pages.

    A single ``requests`` session is reused for every request.  ``main`` is
    the entry point: it walks the comment pages one by one, collects the
    image URLs found on each page and downloads them with one thread per
    image into an ``image`` directory under the current working directory.
    """

    # Seconds to wait for a server response before giving up on a request.
    # Kept at class level so it is available even on instances that were
    # created without running ``__init__``.
    request_timeout = 10

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # pause between pages, in seconds

    def get_status(self, url):
        """Fetch *url* and return the response, or ``False`` on failure.

        A request counts as failed when the HTTP status is not 200 or when
        ``requests`` raises (connection error, timeout, ...).  The failure
        is reported on stdout.
        """
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.request_timeout
            )
        except requests.RequestException:
            # Treat transport-level errors like a bad status so that callers
            # only ever have to handle the ``False`` return value.
            print("ERROR: 网络连接失败!")
            return False
        if response.status_code == 200:
            return response
        print("ERROR: 网络连接失败!")
        return False

    def get_index(self, url):
        """Visit the home page to establish the session.

        Returns ``True`` when the page could be fetched, ``False`` otherwise.
        """
        if self.get_status(url):
            return True
        print("ERROR: 首页访问失败!")
        return False

    def get_last_page(self, url):
        """Return the number of the last comment page, or ``None`` on failure.

        The number is read from the pager element marked as current on *url*.
        NOTE(review): this presumes the landing page shows the newest (i.e.
        highest-numbered) comment page -- confirm against the live site.
        """
        response = self.get_status(url)
        if not response:
            print("ERROR: 获取页数失败!")
            return None
        soup = BeautifulSoup(response.content, "html5lib")
        span = soup.select(".pagenavi-cm > .page-numbers.current")
        if not span:
            print("ERROR: 获取页数失败!")
            return None
        try:
            return int(span[0].text.strip())
        except ValueError:
            # The pager text was not a plain number.
            print("ERROR: 获取页数失败!")
            return None

    def parse(self, url):
        """Collect the images referenced on the comment page *url*.

        Returns a dict mapping file name (the basename of the image URL) to
        the image URL, or ``None`` when the page could not be fetched.
        """
        response = self.get_status(url)
        if not response:
            return None
        soup = BeautifulSoup(response.text, "html5lib")
        title_url = {}
        for li in soup.select("#comments ul li"):
            lazy_imgs = li.select(".comment-body p > .lazy")
            if not lazy_imgs:
                continue
            img_url = lazy_imgs[0].get("data-original")
            if not img_url:
                # Element without a usable source attribute: nothing to fetch.
                continue
            title = os.path.basename(img_url)
            if not title:
                # A URL ending in "/" yields no file name to save under.
                continue
            title_url[title] = img_url
        return title_url

    def download(self, path, url):
        """Download *url* and write its body to the file *path*.

        The request is made before the file is opened so that a failed
        download does not leave an empty file behind.  Returns ``True`` when
        the file was written, ``False`` when the download failed.
        """
        response = self.get_status(url)
        if not response:
            return False
        with open(path, "wb") as f:
            f.write(response.content)
        return True

    @staticmethod
    def next_page(last_page):
        """Yield the URL of every comment page from 1 to *last_page*.

        Each URL is printed before it is yielded as a progress indicator.
        """
        for i in range(1, last_page + 1):
            url = "https://www.mzitu.com/zipai/comment-page-{}".format(i)
            print(url)
            yield url

    def main_(self):
        """Run the whole crawl; returns ``None``.

        Stops early when the home page or the page count cannot be fetched.
        A comment page that fails to load is skipped instead of aborting the
        remaining pages.
        """
        # Home page first, to establish the session.
        if not self.get_index("https://www.mzitu.com"):
            return None

        last_page = self.get_last_page("https://www.mzitu.com/zipai/")
        if not last_page:
            return None

        image_dir = os.path.abspath(os.path.join(os.getcwd(), "image"))
        os.makedirs(image_dir, exist_ok=True)

        for page_url in self.next_page(last_page):
            # ``parse`` returns None on failure; treat that as "no images".
            title_url = self.parse(page_url) or {}

            # One download thread per image on this page.
            thread_list = [
                threading.Thread(
                    target=self.download,
                    args=(os.path.join(image_dir, title), img_url),
                )
                for title, img_url in title_url.items()
            ]
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()

            # Pause between pages.
            time.sleep(self.time)

    def main(self):
        """Run ``main_`` in a daemon thread and wait for it to finish."""
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


# Script entry point: build the crawler and run it to completion.
if __name__ == "__main__":
    MZiTu().main()

你可能感兴趣的:(Python案例)