(python2.7)实现糗百爬虫

先贴代码,文章以后有空再写,注释也以后再写,bug也以后再调,参考文献也以后再贴吧,就这样了(葛优躺)

文件1:main.py

# -*- coding:utf-8 -*-
import urllib
import qsbk

spider = qsbk.QsbkSpider()
spider.section='8hr'
spider.loadSomePages(10)
while True:
    article = spider.getRandomArticle()
    if not article:
        break
    print '[ page',article['pageIndex'],'artical',article['articleIndex'],']\n',\
        '< Article by', article['author'], '>\n', article['text'],'\n< God Comment >\n',\
        article['cmtMan'], article['cmt']
    print 'pause enter to get next article'
    input = raw_input()
    if(input in ['q','Q']):
        break

文件2:qsbk.py

__author__ = 'ssins'
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import random
from bs4 import BeautifulSoup

class QsbkSpider:
    """Crawler for qiushibaike.com.

    Downloads paginated joke listings for one site section, parses each
    article (author, text, optional top "God Comment") with BeautifulSoup,
    caches them per page, and serves random articles from the cache.
    """

    def __init__(self):
        # Next page number to fetch (1-based).
        self._pageIndex = 1
        # The site paginates each section up to 35 pages.
        self.maxPageIndex = 35
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self._qsbkUrl = 'http://www.qiushibaike.com/'
        # Section to crawl; must be one of self._sections.
        self.section = '8hr'
        self._sections = ['8hr','hot','imgrank','text','history','pic','textnew']

        self._headers = {'User-Agent' : self.user_agent}
        # Cache: one list of article dicts per successfully loaded page.
        self._stories = []
        self.enable = False

    def getPageUrl(self, section, pageIndex):
        """Return the URL of page *pageIndex* of *section*.

        Returns None when the section is unknown or the index falls
        outside [1, maxPageIndex].
        """
        if section not in self._sections or pageIndex < 1 or pageIndex > self.maxPageIndex:
            return None
        return self._qsbkUrl + section + '/page/' + str(pageIndex)

    def getPageInfo(self, url):
        """Fetch *url* and return its raw HTML, or None on any fetch error."""
        try:
            request = urllib2.Request(url, headers=self._headers)
            response = urllib2.urlopen(request)
            return response.read()
        except Exception:
            # Best-effort: any network/HTTP failure means "page unavailable".
            return None

    def find_article_span(self, tag):
        """BeautifulSoup filter: match <span> tags whose direct children
        contain neither <img> nor <h2> — i.e. the article-body span."""
        if tag.name != 'span':
            return False
        for child in tag.children:
            if child.name in ['img', 'h2']:
                return False
        return True

    def getPageArticles(self, section, pageIndex):
        """Download and parse one listing page; append its articles to the cache.

        Returns True on success, False on a parse error, and None when the
        page could not be fetched at all.
        """
        pageCode = self.getPageInfo(self.getPageUrl(section, pageIndex))
        if not pageCode:
            return None
        soup = BeautifulSoup(str(pageCode), 'lxml')
        articles = soup.find_all('div', class_='article block untagged mb15')
        articlesDictionaryList = []
        # Turn HTML line breaks into real newlines. (The published listing
        # garbled this literal into an actual line break; restored here.)
        # Compiled once, outside the per-article loop.
        replaceBr = re.compile('<br/>')
        try:
            for tmpArt in articles:
                article = str(tmpArt)
                # Skip articles that carry a picture thumbnail.
                if re.search('class="thumb"', article):
                    continue
                article = re.sub(replaceBr, "\n", article)
                soupArticle = BeautifulSoup(article, 'lxml')
                author = soupArticle.h2.string
                text = soupArticle.find(self.find_article_span).string
                cmtMan = 'no God Comment'
                cmt = ''
                try:
                    cmtMan = soupArticle.find('span', class_='cmt-name').string
                    cmt = soupArticle.find('div', class_='main-text').string
                except AttributeError:
                    # find() returned None: no top comment; keep the defaults.
                    pass
                articlesDictionaryList.append({
                    'author': author,
                    'text': text,
                    'cmtMan': cmtMan,
                    'cmt': cmt,
                })
            self._stories.append(articlesDictionaryList)
        except Exception:
            # Unexpected page structure: report failure, cache nothing.
            return False
        return True

    def loadNextPage(self):
        """Load the next unvisited page of the configured section.

        Returns True when a page was loaded and the cursor advanced,
        False otherwise.
        """
        if self._pageIndex > self.maxPageIndex:
            return False
        if self.getPageArticles(self.section, self._pageIndex):
            self._pageIndex += 1
            return True
        return False

    def loadSomePages(self, pageNums):
        """Attempt to load *pageNums* consecutive pages."""
        for _ in range(pageNums):
            self.loadNextPage()

    def getRandomArticle(self):
        """Return a random cached article dict, or None when the cache is empty.

        The returned dict is annotated in place with 1-based 'pageIndex'
        and 'articleIndex' keys.
        """
        if len(self._stories) < 1:
            return None
        pageIndex = random.randint(0, len(self._stories) - 1)
        articleIndex = random.randint(0, len(self._stories[pageIndex]) - 1)
        article = self._stories[pageIndex][articleIndex]
        article['pageIndex'] = pageIndex + 1
        article['articleIndex'] = articleIndex + 1
        return article

你可能感兴趣的:((python2.7)实现糗百爬虫)