Crawling a site with Scrapy and saving pages in a mirrored directory structure

Sharing a piece of code that I hope others will find useful. It recursively crawls the pages of a given site and stores the results on disk in a directory structure that mirrors the site's URL layout:


# Written for the legacy Scrapy 0.x API on Python 2: scrapy.contrib,
# SgmlLinkExtractor, and the scrapy.log module were deprecated/removed
# in Scrapy 1.x.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy import log
from urlparse import urlparse  # Python 2 stdlib

import os

class BookSpider(CrawlSpider):
    name = 'Book'
    #allowed_domains = ['xx.com']
    #start_urls = ['http://www.xx.com/']

    #rules = (
    #    Rule(SgmlLinkExtractor(allow=r'\.html'), callback='parse_item', follow=True),
    #)  
    def __init__(self, start_url, output_dir="./", *args, **kwargs):
        super(BookSpider, self).__init__(*args, **kwargs)
        self.start_urls = [start_url]
        self.output_dir = output_dir
        # restrict the crawl to the domain(s) of the start URL(s)
        self.allowed_domains = map(self._get_domain, self.start_urls)

    def _get_domain(self, url):
        # Take the host part of the URL for allowed_domains; dropping a
        # leading "www." keeps sibling subdomain links crawlable.
        host = urlparse(url).netloc.split(':')[0]
        if host.startswith('www.'):
            host = host[len('www.'):]
        return host or None

    def parse(self, response):
        # The response for start_url lands here first; hand it straight to
        # parse_item, which both saves the page and follows its links.
        for request in self.parse_item(response):
            yield request

    def parse_item(self, response):
        # Save the current page, then recurse into every .html link.  Using
        # a single request per URL (instead of one per callback) keeps the
        # duplicate filter from dropping the follow-up crawl.
        self.parse_detail(response)
        page_links = SgmlLinkExtractor(allow=r'\.html').extract_links(response)
        for link in page_links:
            yield Request(link.url, callback=self.parse_item)

    def parse_detail(self, response):
        # Write the raw response body to a file that mirrors the URL path.
        outputfile = self._rtouch(response.url)
        if not outputfile:
            log.msg("download %s fail" % response.url, level=log.WARNING, spider=self)
            return

        with open(outputfile, 'w') as f:
            f.write(response.body)
        log.msg("download file: %s" % outputfile, level=log.INFO, spider=self)

    def _rtouch(self, filepath):
        # Map a URL to an output path and create its parent directories,
        # e.g. http://www.xx.com/a/b.html -> <output_dir>/www.xx.com/a/b.html
        pos = filepath.find('://')
        if -1 != pos:
            filepath = filepath[pos + 3:]
        if not filepath.endswith(".html"):
            # directory-style URL: store it as .../index.html
            filepath += "/index.html"
        opath = os.path.abspath(os.path.join(self.output_dir, filepath))
        basedir = os.path.dirname(opath)
        if not os.path.exists(basedir):
            try:
                os.makedirs(basedir)
            except OSError as msg:
                log.msg(str(msg), level=log.WARNING, spider=self)
                return None
        return opath
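
To run it, save the spider in a Scrapy project and pass the two constructor arguments on the command line with -a (the URL and output directory below are placeholders):

scrapy crawl Book -a start_url=http://www.xx.com/ -a output_dir=./books

With that start URL, the output would be laid out along these lines (a sketch of _rtouch's URL-to-path mapping, not captured output):

./books/
    www.xx.com/
        index.html
        a.html
        sub/b.html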
