Scraping Qiushibaike (糗事百科)

A plain-text scraper for Qiushibaike (Python 2): it captures the user id, the user's avatar URL, and the joke text, and stores them in SQLite.

#coding:utf-8
import sys
import urllib2
import sqlite3
import re
import time


class HTMLTool:
    # Non-greedy patterns: match \t, \n, spaces, hyperlinks, or images
    BgnCharToNoneRex = re.compile(r"(\t|\n| |<a.*?>|<img.*?>)")
    # Non-greedy pattern: match any <...> tag
    EndCharToNoneRex = re.compile(r"<.*?>")
    # Non-greedy pattern: match any <p> tag
    BgnPartRex = re.compile(r"<p.*?>")
    CharToNewLineRex = re.compile(r"(<br/>|</p>|<tr>|<div>|</div>)")
    CharToNextTabRex = re.compile(r"<td>")
    # Map common HTML character entities back to their literal characters
    replaceTab = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&"),
                  ("&quot;", "\""), ("&nbsp;", " ")]

    def replace_char(self, x):
        x = self.BgnCharToNoneRex.sub("", x)
        x = self.BgnPartRex.sub("\n    ", x)
        x = self.CharToNewLineRex.sub("\n", x)
        x = self.CharToNextTabRex.sub("\t", x)
        x = self.EndCharToNoneRex.sub("", x)
        for t in self.replaceTab:
            x = x.replace(t[0], t[1])
        return x


class SQLITETool:
    def __init__(self, databaseName):
        self.databaseName = databaseName
        self.create_db()

    def create_db(self):
        conn = sqlite3.connect(self.databaseName)
        conn.close()

    def execute_table(self, sql, params=()):
        # Parameterized execution, so quotes in scraped text cannot break the SQL
        conn = sqlite3.connect(self.databaseName)
        cursor = conn.cursor()
        try:
            cursor.execute(sql, params)
        except Exception, e:
            print(Exception, ":", e)
        finally:
            cursor.close()
            conn.commit()
            conn.close()


class QiuBaiSpider:
    def __init__(self):
        self.myTool = HTMLTool()
        self.sqlTool = SQLITETool("qiubai.db")
        self.nowPage = ""
        self.pageNumber = 1
        print("create")

    def pageHandle(self, page):
        # Each match yields (user_id, icon_url, content). The tags below are a
        # plausible reconstruction of the site's markup at the time; adjust them
        # if the page structure has changed.
        myItems = re.findall(r'<div class="author">.*?<a.*?>(.*?)</a>.*?'
                             r'<img src="(.*?)".*?'
                             r'<div class="content">(.*?)</div>', page, re.S)
        for item in myItems:
            user_id = item[0]
            icon_url = item[1]
            content = self.myTool.replace_char(item[2]).decode('utf-8')
            sql = "insert into text_table(user_id, icon_url, content) values(?,?,?);"
            self.sqlTool.execute_table(sql, (user_id, icon_url, content))

    def getPageNumber(self, page):
        # Grab the pagination block, then read the total page count from the
        # seventh <li> link (tags again reconstructed from the rendered page)
        myMatch = re.search(r'<div class="pagenumber">(.*?)</div>', page, re.S)
        myItems = re.findall(r'<li>.*?<a.*?>(.*?)</a>.*?</li>', myMatch.group(1), re.S)
        value = self.myTool.replace_char(myItems[6])
        print("count=" + value)
        if value.isdigit():
            return int(value)
        else:
            return 0

    def getSinglePage(self, kindName, page):
        myUrl = "http://m.qiushibaike.com/{}/page/".format(kindName) + str(page)
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        try:
            req = urllib2.Request(myUrl, headers=headers)
            myResponse = urllib2.urlopen(req)
            self.nowPage = myResponse.read()
            return self.nowPage
        except Exception, e:
            print(Exception, ":", e)
            return

    def getAllPageOfKind(self, kindName):
        try:
            # Fetch the first page to learn the total page count
            self.getSinglePage(kindName, 1)
            pageNum = self.getPageNumber(self.nowPage)
            self.pageHandle(self.nowPage)
            for i in range(2, pageNum + 1):
                print("page=" + str(i))
                self.getSinglePage(kindName, i)
                self.pageHandle(self.nowPage)
                # Be polite: pause between requests
                time.sleep(0.5)
        except Exception, e:
            print(Exception, ":", e)


if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf-8')
    spider = QiuBaiSpider()
    sql = ("CREATE TABLE text_table("
           "caseid integer PRIMARY KEY autoincrement, "
           "user_id char(15), icon_url char(128), content char(512));")
    spider.sqlTool.execute_table(sql)
    spider.getAllPageOfKind("hot")
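Once a run finishes, everything lands in qiubai.db. A minimal sketch for sanity-checking the result, assuming only the text_table schema created above:

#coding:utf-8
import sqlite3

conn = sqlite3.connect("qiubai.db")
cursor = conn.cursor()
# How many jokes were stored?
cursor.execute("SELECT COUNT(*) FROM text_table")
print("rows: %d" % cursor.fetchone()[0])
# Peek at the first few rows
cursor.execute("SELECT user_id, icon_url, content FROM text_table LIMIT 3")
for user_id, icon_url, content in cursor.fetchall():
    print("%s | %s | %s" % (user_id, icon_url, content[:40]))
cursor.close()
conn.close()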

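Note that urllib2, the "except Exception, e" syntax, and reload(sys)/sys.setdefaultencoding are Python 2-only. On Python 3 the fetch step would look roughly like this sketch, using the stdlib urllib.request in place of urllib2 (same URL and User-Agent as above):

#coding:utf-8
import urllib.request

def get_single_page(kind_name, page):
    # Python 3 counterpart of getSinglePage above
    url = "http://m.qiushibaike.com/{}/page/{}".format(kind_name, page)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        # read() returns bytes on Python 3; decode before running the regexes
        return response.read().decode('utf-8')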