(python2.7)实现糗百爬虫

先贴代码,文章以后有空再写,注释也以后再写,bug也以后再调,参考文献也以后再贴吧,就这样了(葛优躺)

文件1:main.py

# -*- coding:utf-8 -*-
import urllib
import qsbk

spider = qsbk.QsbkSpider()
spider.section='8hr'
spider.loadSomePages(10)
while True:
    article = spider.getRandomArticle()
    if not article:
        break
    print '[ page',article['pageIndex'],'artical',article['articleIndex'],']\n',\
        '< Article by', article['author'], '>\n', article['text'],'\n< God Comment >\n',\
        article['cmtMan'], article['cmt']
    print 'pause enter to get next article'
    input = raw_input()
    if(input in ['q','Q']):
        break

文件2:qsbk.py

__author__ = 'ssins'
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import random
from bs4 import BeautifulSoup

class QsbkSpider:
    """Crawler for qiushibaike.com.

    Downloads paginated joke listings for one site section, parses each
    article (author, text, optional top "God Comment") with BeautifulSoup,
    caches them per page, and serves random articles from the cache.
    """

    def __init__(self):
        # Next page number to fetch (1-based).
        self._pageIndex = 1
        # The site paginates each section up to 35 pages.
        self.maxPageIndex = 35
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self._qsbkUrl = 'http://www.qiushibaike.com/'
        # Section to crawl; must be one of self._sections.
        self.section = '8hr'
        self._sections = ['8hr','hot','imgrank','text','history','pic','textnew']

        self._headers = {'User-Agent' : self.user_agent}
        # Cache: one list of article dicts per successfully loaded page.
        self._stories = []
        self.enable = False

    def getPageUrl(self, section, pageIndex):
        """Return the URL of page *pageIndex* of *section*.

        Returns None when the section is unknown or the index falls
        outside [1, maxPageIndex].
        """
        if section not in self._sections or pageIndex < 1 or pageIndex > self.maxPageIndex:
            return None
        return self._qsbkUrl + section + '/page/' + str(pageIndex)

    def getPageInfo(self, url):
        """Fetch *url* and return its raw HTML, or None on any fetch error."""
        try:
            request = urllib2.Request(url, headers=self._headers)
            response = urllib2.urlopen(request)
            return response.read()
        except Exception:
            # Best-effort: any network/HTTP failure means "page unavailable".
            return None

    def find_article_span(self, tag):
        """BeautifulSoup filter: match <span> tags whose direct children
        contain neither <img> nor <h2> — i.e. the article-body span."""
        if tag.name != 'span':
            return False
        for child in tag.children:
            if child.name in ['img', 'h2']:
                return False
        return True

    def getPageArticles(self, section, pageIndex):
        """Download and parse one listing page; append its articles to the cache.

        Returns True on success, False on a parse error, and None when the
        page could not be fetched at all.
        """
        pageCode = self.getPageInfo(self.getPageUrl(section, pageIndex))
        if not pageCode:
            return None
        soup = BeautifulSoup(str(pageCode), 'lxml')
        articles = soup.find_all('div', class_='article block untagged mb15')
        articlesDictionaryList = []
        # Turn HTML line breaks into real newlines. (The published listing
        # garbled this literal into an actual line break; restored here.)
        # Compiled once, outside the per-article loop.
        replaceBr = re.compile('<br/>')
        try:
            for tmpArt in articles:
                article = str(tmpArt)
                # Skip articles that carry a picture thumbnail.
                if re.search('class="thumb"', article):
                    continue
                article = re.sub(replaceBr, "\n", article)
                soupArticle = BeautifulSoup(article, 'lxml')
                author = soupArticle.h2.string
                text = soupArticle.find(self.find_article_span).string
                cmtMan = 'no God Comment'
                cmt = ''
                try:
                    cmtMan = soupArticle.find('span', class_='cmt-name').string
                    cmt = soupArticle.find('div', class_='main-text').string
                except AttributeError:
                    # find() returned None: no top comment; keep the defaults.
                    pass
                articlesDictionaryList.append({
                    'author': author,
                    'text': text,
                    'cmtMan': cmtMan,
                    'cmt': cmt,
                })
            self._stories.append(articlesDictionaryList)
        except Exception:
            # Unexpected page structure: report failure, cache nothing.
            return False
        return True

    def loadNextPage(self):
        """Load the next unvisited page of the configured section.

        Returns True when a page was loaded and the cursor advanced,
        False otherwise.
        """
        if self._pageIndex > self.maxPageIndex:
            return False
        if self.getPageArticles(self.section, self._pageIndex):
            self._pageIndex += 1
            return True
        return False

    def loadSomePages(self, pageNums):
        """Attempt to load *pageNums* consecutive pages."""
        for _ in range(pageNums):
            self.loadNextPage()

    def getRandomArticle(self):
        """Return a random cached article dict, or None when the cache is empty.

        The returned dict is annotated in place with 1-based 'pageIndex'
        and 'articleIndex' keys.
        """
        if len(self._stories) < 1:
            return None
        pageIndex = random.randint(0, len(self._stories) - 1)
        articleIndex = random.randint(0, len(self._stories[pageIndex]) - 1)
        article = self._stories[pageIndex][articleIndex]
        article['pageIndex'] = pageIndex + 1
        article['articleIndex'] = articleIndex + 1
        return article

你可能感兴趣的:((python2.7)实现糗百爬虫)