Python多线程案例一爬取美女图片

构建2个队列

一个存放所有的页面链接
一个存放所有详情页链接
urllib.request 的 urljoin 拼接“下一页”的相对链接
requests 下载网页和图片
os创建文件夹
threading多线程
借助contains精确查找标签内容为“下一页”的a标签

        next_url = html.xpath('//div[@class="page"]/ul/a[contains(string(),"下一页")]/@href')

工具pycharm、chrome浏览器、xpath查找插件(特别好用)

def main():
    # The category has 6 listing pages in total.
    page_queue = Queue(6)
    img_queue = Queue(1000)
    # Queue one URL per listing page.
    for i in range(1,7):
        url = 'https://www.meinvtu123.net/a/47/list_47_{}.html'.format(i)
        page_queue.put(url)
    # Start 2 threads that parse album titles and detail-page links.
    for x in range(2):
        t = Finder(page_queue,img_queue)
        t.start()
    # Start 5 threads that parse detail pages and download the images.
    for i in range(5):
        t = Downder(page_queue,img_queue)
        t.start()

代码如下

import os
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

class Finder(threading.Thread):
    """Producer thread: fetches listing pages from page_queue and pushes
    (titles, detail_urls) tuples onto img_queue for the downloader threads."""

    # Shared by all instances; presents the crawler as a desktop browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue  # input: listing-page URLs
        self.img_queue = img_queue    # output: (titles, urls) batches

    def run(self):
        # Drain the page queue. get_nowait() fixes the empty()-then-get()
        # race of the original: with two Finder threads, both could pass the
        # empty() check while only one URL remained, leaving the loser
        # blocked forever on a blocking get().
        while True:
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.jiexi(url)
            except requests.RequestException as exc:
                # Skip pages that fail to download instead of killing the thread.
                print('failed to fetch {}: {}'.format(url, exc))

    def jiexi(self, url):
        """Parse one listing page: extract album titles and detail-page links."""
        res = requests.get(url, headers=self.headers)
        res.encoding = 'gbk'  # the site serves GBK-encoded HTML
        html = etree.HTML(res.text)
        titles = html.xpath('//div[@class="con"]/div/a/@title')
        urls = html.xpath('//div[@class="con"]/div/a/@href')
        self.img_queue.put((titles, urls))

class Downder(threading.Thread):
    """Consumer thread: takes (titles, urls) batches from img_queue and
    downloads every image album, following each album's "next page" links."""

    # Root directory for downloads; one sub-directory is created per album.
    base_dir = 'E:\\Picture'

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue  # watched only to decide when to stop
        self.img_queue = img_queue    # (titles, urls) batches from Finder

    def run(self):
        # Keep consuming until BOTH queues are empty: an empty img_queue on
        # its own may just mean the Finder threads have not finished parsing
        # a page yet. get_nowait() avoids the original's race where both
        # queues could empty between the check and a blocking get(), hanging
        # the thread forever.
        while True:
            try:
                titles, urls = self.img_queue.get_nowait()
            except Empty:
                if self.page_queue.empty():
                    break  # producers are done and nothing is buffered
                continue   # producers still working; poll again
            for title, url in zip(titles, urls):
                print(title, url)
                self.img_xiazai(title, url)

    def img_xiazai(self, title, url):
        """Download every image of one album page at `url`, then recurse into
        the album's 下一页 ("next page") link if present."""
        res = requests.get(url)
        res.encoding = 'gbk'  # the site serves GBK-encoded HTML
        html = etree.HTML(res.text)
        pic_urls = html.xpath('//div[@class="contenta"]/img/@src')

        # One folder per album title under base_dir.
        album_dir = os.path.join(self.base_dir, title)
        if not os.path.exists(album_dir):
            os.makedirs(album_dir)

        # NOTE: loop variable renamed from `url` — the original shadowed the
        # page URL, so the urljoin below was resolved against the LAST IMAGE
        # URL instead of the page URL.
        for pic_url in pic_urls:
            # Use the tail of the image URL as the file name.
            name = pic_url.split('-')[-1]
            print(name)
            pic = requests.get(pic_url)
            with open(os.path.join(album_dir, name), 'wb') as f:
                f.write(pic.content)

        # xpath() returns a list; the original passed the whole list to
        # urljoin (a TypeError) — take the first match instead.
        next_url = html.xpath('//div[@class="page"]/ul/a[contains(string(),"下一页")]/@href')
        if next_url:
            self.img_xiazai(title, request.urljoin(url, next_url[0]))

def main():
    """Fill the page queue with the six listing pages, start 2 parser threads
    and 5 downloader threads, and wait for all of them to finish."""
    page_queue = Queue(6)    # the category has exactly 6 listing pages
    img_queue = Queue(1000)  # generous buffer for (titles, urls) batches

    for page in range(1, 7):
        page_queue.put(
            'https://www.meinvtu123.net/a/47/list_47_{}.html'.format(page))

    threads = []
    # 2 threads parse listing pages, 5 threads download the images.
    for _ in range(2):
        t = Finder(page_queue, img_queue)
        t.start()
        threads.append(t)
    for _ in range(5):
        t = Downder(page_queue, img_queue)
        t.start()
        threads.append(t)

    # The original returned immediately after start(); join so main() does
    # not exit while downloads are still in flight.
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()

你可能感兴趣的:(Python多线程案例一爬取美女图片)