A template for crawling non-dynamic (static) web pages

#coding=utf-8
import urllib2
import zlib

from pybloomfilter import BloomFilter
from bs4 import BeautifulSoup  # the 'lxml' parser used below also needs the lxml package installed

# Headers that mimic mobile Chrome, matching the mobile (3g.163.com) pages requested below
request_headers = {
    'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
    'Accept-Encoding': "gzip, deflate",
    'Accept-Language': "zh-CN,zh;q=0.8",
    'Connection': "keep-alive",
    'Referer': "http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard",
    'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Mobile Safari/537.36",
}


# Create a Bloom filter (capacity ~16M entries, 1% false-positive rate).
# Unlike a set, it uses a fixed amount of memory, which suits large crawls;
# it is used at the end of this template to avoid re-downloading pages.
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)
url = 'http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard'

req = urllib2.Request(url, headers=request_headers)
response = urllib2.urlopen(req)
htmlcontent = response.read()

# If the response is gzip-compressed, decompress it before use,
# otherwise the raw bytes come out garbled
gzipped = response.headers.get('Content-Encoding')
if gzipped == 'gzip':
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer
    htmlcontent = zlib.decompress(htmlcontent, 16 + zlib.MAX_WBITS)
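The 16 + zlib.MAX_WBITS argument is equivalent to wrapping the bytes in a file object and handing them to the standard gzip module; a sketch of that variant (kept commented out so the template runs as-is):

# import gzip, StringIO
# htmlcontent = gzip.GzipFile(fileobj=StringIO.StringIO(htmlcontent)).read()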

print htmlcontent

soup = BeautifulSoup(htmlcontent, 'lxml')
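If the lxml package is not installed, BeautifulSoup can fall back to the parser bundled with the standard library; the rest of the code is unchanged, only parsing speed differs:

# soup = BeautifulSoup(htmlcontent, 'html.parser')  # pure-Python fallback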
urls = []
news_content = []
# Selectors tried while reverse-engineering the page layout, kept for reference:
# soup.select('div.cm_news_main > ul.cm_ul_round > li > a')
# soup.select('ul[class="cm_ul_round ul_page1"] > li > a')
# soup.select('div.aslide > a')

for link in soup.select('div.ndi_main > h3 > a'):
    urls.append(link.get('href'))
    news_content.append(link.text)
print urls
print len(news_content)
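Collecting the links is only half of the job; the natural next step is to download each article, and this is where the Bloom filter created at the top earns its keep, so that no URL is ever fetched twice. A minimal sketch of that step, reusing the fetch-and-decompress pattern from above (it assumes the collected hrefs are absolute URLs, and only prints each page's <title>, since a body selector would depend on article-page markup this template does not inspect):

for article_url in urls:
    # probabilistic membership test: ~1% false positives, no false negatives
    if article_url in download_bf:
        continue
    download_bf.add(article_url)
    # relative hrefs would need urlparse.urljoin(url, article_url) first
    article_resp = urllib2.urlopen(urllib2.Request(article_url, headers=request_headers))
    article_html = article_resp.read()
    if article_resp.headers.get('Content-Encoding') == 'gzip':
        article_html = zlib.decompress(article_html, 16 + zlib.MAX_WBITS)
    article_soup = BeautifulSoup(article_html, 'lxml')
    print article_soup.title  # body extraction would need a page-specific selector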
