python简单爬取某网站python教程内容

一般的小白python新手可能都知道廖雪峰网站吧。由于自己也是个小白,所以就想能不能将该教程爬取下来呢。说做就做。好了不多说,直接上代码:

#coding:utf-8
#autor:myndtt
import urllib2
import requests
import os
import multiprocessing
import sys
from bs4 import BeautifulSoup
from lxml import etree
import pdfkit

# Python 2-only hack: site.py deletes sys.setdefaultencoding at startup, and
# reload(sys) restores it so the process-wide str<->unicode codec can be
# forced to UTF-8 (avoids UnicodeDecodeError on the Chinese page content).
# NOTE(review): this API does not exist in Python 3; the whole script is
# Python 2 (urllib2, print statements, unicode()).
reload(sys)
sys.setdefaultencoding('utf-8')

# Index page of the tutorial; every chapter link is scraped from here.
url='http://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'

def geturl(url):
    """Return the absolute URLs of all chapters listed on the index page.

    Best-effort: on an HTTP error the (possibly empty) list collected so
    far is returned instead of raising.
    """
    links = []
    try:
        html = urllib2.urlopen(url).read()
        tree = etree.HTML(html)
        hrefs = tree.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/@href')
        links = ['http://www.liaoxuefeng.com' + href.strip() for href in hrefs]
    except urllib2.HTTPError:
        pass
    return links
def gethtml():
     text=u'

廖雪峰Python教程

'+u'
' a=1 re=urllib2.urlopen(url).read() selector=etree.HTML(re) conten=selector.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/text()') #获取目录 for con in conten: text=text+u'

'+unicode(a)+u':'+unicode(con)+u'

'+u'
' a=a+1 return text def getothers(urllist): n = 0 text=gethtml() pool = multiprocessing.Pool(multiprocessing.cpu_count()) #获取每份网页要的东西 for ur in urllist: n=n+1 m=pool.apply_async(getpage,(ur,n,)).get() text=text+unicode(m) #提示打印的页数 print n pool.close() pool.join() file = open("pdf.html", "a") file.write(u'' + unicode(text) + u'') pdfcreate() print "ok!!!" #实际获取每份网页要的 def getpage(ur,n): page=u'' rep=urllib2.urlopen(ur).read() soup=BeautifulSoup(rep,"lxml",from_encoding='utf8') sou=soup.find("div",{"class":"x-wiki-content"}) smu=soup.find("h4").get_text() page=page+u'

'+unicode(n)+u':'+unicode(smu)+u'

' #找到img标签 将其src属性值补全 so=sou.find_all("img") for s in so: if str(s).find("http:")== -1: sou=unicode(sou).replace(s.get('src'),'http://www.liaoxuefeng.com'+s.get('src')) page=page+unicode(sou) return unicode(page) #打印成pdf(其实可有可无) def pdfcreate(): path_wkthmltopdf = r'C:\Windows\System32\wkhtmltopdf.exe' config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf) pdfkit.from_url("pdf.html", "hello.pdf", configuration=config) if __name__=='__main__': urllis=geturl(url) getothers(urllis)

其实这是大半年前写的代码,写的有点糟糕,同时用了xpath和beautifulsoup,真是汗颜,也懒的改了(哈哈)


本博客仅记录一下自己的学习生活,如果能对大家有点借鉴作用,那也是极好的。



你可能感兴趣的:(python)