Python crawler (Scrapy) - Meizitu

I recently started learning web scraping, and this classic project is a good one to practice on. The site protects its images with hotlink (Referer) checks, and its content is spread over many pages, so the crawl has to follow several levels of links; Scrapy's CrawlSpider handles that with little effort. Without further ado, here is the code. Adjust the settings and paths for your own environment; this is only the main spider, provided for reference.
# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MeiziSpider(CrawlSpider):
    name = 'meizi'
    allowed_domains = ['www.mzitu.com']
    start_urls = ['http://www.mzitu.com/']

    rules = (
        # links from the list page into each image set
        Rule(LinkExtractor(restrict_xpaths='//ul[@id="pins"]/li/a'), follow=True),
        # page-number links inside an image set's detail pages
        Rule(LinkExtractor(restrict_xpaths='//div[@class="pagenavi"]/a'), callback='parse_item', follow=True),
        # page-number links of the list pages
        Rule(LinkExtractor(restrict_xpaths='//nav[@class="navigation pagination"]/div[@class="nav-links"]/a'), follow=True),
    )

    def parse_item(self, response):
        img_src = response.xpath('//div[@class="content"]/div[@class="main-image"]/p/a/img/@src').extract_first()
        imgName = response.xpath('//div[@class="content"]/h2/text()').extract_first()
        # the site checks the Referer header on image requests (hotlink protection),
        # so send the current detail page as the Referer when fetching the image
        yield scrapy.Request(url=img_src, callback=self.downloadImage,
                             headers={'Referer': response.url},
                             meta={'name': imgName}, dont_filter=True)

    def downloadImage(self, response):
        imgName = response.meta['name'] + '.jpg'
        filepath = r'F:\1Python\code\pythonlianxi\0609\meizitu'
        os.makedirs(filepath, exist_ok=True)  # make sure the target folder exists
        imgpath = os.path.join(filepath, imgName)
        with open(imgpath, 'wb') as fp:
            fp.write(response.body)
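
Because the site rejects hotlinked image requests and will throttle aggressive clients, it also helps to adjust the project configuration. Below is a minimal settings.py sketch, not the exact configuration used here; the user agent string, delay, and concurrency values are assumptions to tune for your own setup.

# settings.py -- minimal sketch; the values below are assumptions, adjust as needed
BOT_NAME = 'meizitu'

# the site's robots.txt may disallow crawling; only disable this if you accept that
ROBOTSTXT_OBEY = False

# be gentle with the site: limit concurrency and add a download delay
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS = 8

# browser-like default headers; the Referer for image requests is set
# per request in the spider, and this user agent is just a placeholder
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

With the spider and settings in place, run the crawl from the project root with: scrapy crawl meizi. Scrapy's built-in ImagesPipeline could also take over the downloading and deduplication, but writing the files directly in the callback keeps this example self-contained.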
