A crawler for Taobao search results, written with the requests and re libraries.
import re
import requests

def getHTMLText(url):
    """Fetch the raw HTML of a Taobao search result page."""
    try:
        # The header (especially the cookie) must be copied from a logged-in
        # browser session, otherwise Taobao redirects the request to its login page.
        header = {
            'authority': 's.taobao.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://www.taobao.com/',
            'accept-language': 'en,en-GB;q=0.9,zh;q=0.8,zh-CN;q=0.7,en-US;q=0.6',
            'cookie': 'cna=d21JF1QyZFsCAZ9BYR71xdyg; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; sgcookie=E4i9L9gKHEpxQIMjiPBeH; uc3=nk2=FPjangLZtTxJ6OM2&lg2=UIHiLt3xD8xYTw%3D%3D&vt3=F8dBxGJtZZ4KKxfRhRY%3D&id2=UojWlSfz7FajKg%3D%3D; lgc=wangyu155465; uc4=nk4=0%40FnNat4I5QG8jxg%2F1nEmRVlUqUVhZX6Y%3D&id4=0%40UOBStweVBZ3pFeiDJbNe%2BHoGKK3u; tracknick=wangyu155465; _cc_=UtASsssmfA%3D%3D; enc=I9pt99Wj0gIbiJBOqucj2SGIavHM0lSp8O0UGT6cnAlyWqe65G9jSwalAVHy8UBl21V80Ih0C2%2BMPXaRHMlWOA%3D%3D; tfstk=c-U1BQVTZdvs9EI2751EQrwW60uCaDASGGMg1k8TE1kpudNspsvBzY6kSovvJ0hC.; v=0; mt=ci=-1_0; cookie2=1f1553f3a651d86583719cc43a71e099; t=66fb8e3c9c82ae03405abd9e6e2e2fe2; _tb_token_=53e88a105388e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _nk_=wangyu155465; JSESSIONID=1C4852C4B39774C761FA4496C04E845C; uc1=cookie14=UoTV6eyS7JbKRA%3D%3D; l=eBQnLhL4Q05zmuICBOfanurza77OSIRYSuPzaNbMiOCPOM5p5wIhWZkDqJ89C3GNh6RXR3oIr-vXBeYBqIv4n5U62j-la_kmn; isg=BC8v83o_rLX5i6lJLZZvok9tvkM51IP2TTLHVEG8yx6lkE-SSaQTRi1CFoiu7Vtu',
        }
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "页面爬取失败"  # "failed to fetch the page"
def parsePage(ilt, html):
    """Extract (price, title) pairs from the JSON embedded in the page."""
    try:
        # Prices appear as "view_price":"99.00" and titles as "raw_title":"..."
        plt = re.findall(r'"view_price":"\d+\.\d*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval strips the surrounding quotes
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except Exception:
        print("解析出错")  # "parsing error"
def printGoodsList(ilt):
    """Print the collected goods as an aligned, numbered table."""
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))  # index, price, product title
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))
def main():
    goods = "书包"  # search keyword ("schoolbag")
    depth = 2       # number of result pages to crawl
    start_url = "https://s.taobao.com/search?q=" + goods
    infolist = []
    for i in range(depth):
        try:
            # Each result page holds 44 items, so the offset parameter is s = 44 * i
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infolist, html)
        except Exception:
            continue
    printGoodsList(infolist)

main()
Note that Taobao added an anti-crawling mechanism in 2019, so you have to log in first and only then crawl: rewrite the header information in headers (in particular the cookie) with the values copied from your own logged-in browser session. For details, see this blog post: https://blog.csdn.net/Guanhai1617/article/details/104120581.
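One way to avoid pasting the long cookie string directly into the script is to keep it in a local file and build the header from it. The following is only a minimal sketch: the file name cookie.txt and the helper build_headers are illustrative names, not part of the original script.

# Hypothetical helper: load the cookie copied from a logged-in browser session.
# The file name "cookie.txt" is an assumption used only for illustration.
def build_headers(cookie_path="cookie.txt"):
    with open(cookie_path, encoding="utf-8") as f:
        cookie = f.read().strip()
    return {
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/84.0.4147.89 Safari/537.36'),
        'referer': 'https://www.taobao.com/',
        'cookie': cookie,
    }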
getHTMLText(url): fetches the raw HTML of a page.
parsePage(ilt, html): parses the returned page (see the small demonstration after this list).
printGoodsList(ilt): prints the extracted information in a formatted table.
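To illustrate what parsePage(ilt, html) relies on, here is a tiny standalone demonstration. The sample string below is made up for this example; only the field names view_price and raw_title and the regex/eval logic come from the script above.

import re

# A made-up fragment shaped like the JSON embedded in the search result page.
sample = '"raw_title":"双肩书包","view_price":"59.00","raw_title":"帆布书包","view_price":"35.50"'

prices = re.findall(r'"view_price":"\d+\.\d*"', sample)
titles = re.findall(r'"raw_title":".*?"', sample)
for p, t in zip(prices, titles):
    # split on ':' and eval to strip the surrounding quotes, as in parsePage
    print(eval(p.split(':')[1]), eval(t.split(':')[1]))
# 59.00 双肩书包
# 35.50 帆布书包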
In main(), note that depth controls the crawl depth, i.e. how many result pages are fetched.
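For example, with depth = 2 the loop in main() requests the following two URLs; this small sketch simply mirrors the URL construction in the script:

goods = "书包"
depth = 2
start_url = "https://s.taobao.com/search?q=" + goods
for i in range(depth):
    # Taobao lists 44 items per result page, so the offset s advances in steps of 44.
    print(start_url + '&s=' + str(44 * i))
# https://s.taobao.com/search?q=书包&s=0
# https://s.taobao.com/search?q=书包&s=44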