The requests library covers the same ground as the urllib library studied earlier, but it is more powerful and more concise to use. The basic usage is shown below.
import requests
r=requests.get("https://www.baidu.com/")
print(type(r)) # print the type of the Response object
print("---------------------")
print(r.status_code) # print the response status code
print("---------------")
print(type(r.text)) # print the type of the response body
print("---------------")
#print(r.text) # print the response body
print("---------------")
print(r.cookies) # print the response cookies
<class 'requests.models.Response'>
---------------------
200
---------------
<class 'str'>
---------------
---------------
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
import requests
data={
'name':'germey',
'age':22,
}
# To attach extra query information, build a dict and pass it in via the params argument
r=requests.get('http://httpbin.org/get',params=data)
print(type(r.text))
print(r.text)
print("***************************")
# r.text returns a str, but the content is in JSON format.
# Calling the json() method parses the JSON string into a dict.
print(type(r.json()))
print(r.json())
<class 'str'>
{
  "args": {
    "age": "22",
    "name": "germey"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "171.209.79.31",
  "url": "http://httpbin.org/get?name=germey&age=22"
}
***************************
<class 'dict'>
{'args': {'age': '22', 'name': 'germey'}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'}, 'origin': '171.209.79.31', 'url': 'http://httpbin.org/get?name=germey&age=22'}
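Note that r.json() only succeeds when the body really is JSON; on a non-JSON body it raises an exception. A minimal defensive sketch (the URL is just a placeholder):

import requests
r=requests.get('http://httpbin.org/get')
try:
    data=r.json() # parse the JSON body into a dict
except ValueError: # the JSON decode error raised by requests is a ValueError subclass
    data=None
print(data)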
import requests
import re
# Build request headers to disguise the request as a normal browser visit
headers={
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
r=requests.get("https://www.zhihu.com/explore",headers=headers)
# Use a regular expression to extract the question titles
pattern=re.compile('explore-feed.*?question_link.*?>(.*?)</a>',re.S)
titles=re.findall(pattern,r.text)
print(titles)
['\n如何看待美国五角大楼首次接受审计?\n', '\n李现和杨紫有可能产生火花吗?双方适合吗?\n', '\n你最欣赏的性格是什么样的?\n', '\n如何评价新 iPad Pro 宣称有 Xbox One S 一般的图形性能?\n', '\n你有亲历过道德绑架吗?\n', '\n外国人对中国的哪些刻板印象会令中国人大吃一惊?\n', '\n有哪些东西你以为很贵,但其实很便宜?\n', '\n如何看待靳东粉丝团官博靳东影视天地因王凯献血而造谣,以及后续处理?\n', '\n你家猫咪给过你哪些礼物或者回礼?\n', '\n怎么评价朱一龙幻乐之城的表现?\n']
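The captured titles still carry the surrounding newlines, so a quick list comprehension tidies them up:

titles=[t.strip() for t in titles] # strip leading/trailing whitespace from each title
print(titles)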
Scraping a page gives back an HTML document, while the images, audio, and video in a page are made up of binary data; to scrape those, the first step is getting their binary content.
import requests
r=requests.get("https://github.com/favicon.ico")
#print(r.text) # returns the body as a str; an image converted straight to text comes out garbled
#print(r.content) # returns the body as bytes
import requests
r=requests.get("https://github.com/favicon.ico")
# open() takes the filename to save as its first argument; the second argument 'wb' opens the file for binary writing so bytes can be written to it
with open('favicon.ico','wb') as f:
    f.write(r.content)
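For large binary files, loading the whole body into memory through r.content can be wasteful. requests can also stream the download; a minimal sketch (the 1 KB chunk size is an arbitrary choice):

import requests
r=requests.get("https://github.com/favicon.ico",stream=True) # stream=True defers downloading the body
with open('favicon.ico','wb') as f:
    for chunk in r.iter_content(chunk_size=1024): # read the body in 1 KB chunks
        f.write(chunk)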
import requests
# Without the right headers the request is not served properly
r=requests.get("https://www.zhihu.com/explore")
print(r.text)
<html>
<head><title>400 Bad Request</title></head>
<body bgcolor="white">
<center><h1>400 Bad Request</h1></center>
<hr><center>openresty</center>
</body>
</html>
import requests
# Build the request headers
headers={
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
}
r=requests.get("https://www.zhihu.com/explore",headers=headers)
#print(r.text)
import requests
data={
'name':'germey',
'age':'22'
}
r=requests.post("http://httpbin.org/post",data=data)
print(r.text)
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "age": "22",
    "name": "germey"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Content-Length": "18",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "json": null,
  "origin": "171.209.79.31",
  "url": "http://httpbin.org/post"
}
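Besides form data, requests can send a JSON body directly through the json parameter, which serializes the dict and sets the Content-Type header to application/json; a small sketch:

import requests
data={'name':'germey','age':22}
r=requests.post("http://httpbin.org/post",json=data) # body is sent as JSON rather than form data
print(r.json()['json']) # httpbin echoes the parsed JSON body under the 'json' key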
import requests
r=requests.get("http://www.jianshu.com")
print(type(r.status_code),r.status_code) # the status code
print(type(r.headers),r.headers) # the response headers
print(type(r.cookies),r.cookies) # the cookies
print(type(r.url),r.url) # the URL
print(type(r.history),r.history) # the request history
<class 'int'> 403
<class 'requests.structures.CaseInsensitiveDict'> {'Date': 'Sat, 03 Nov 2018 01:35:11 GMT', 'Content-Type': 'text/html', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'Tengine', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Content-Encoding': 'gzip', 'X-Via': '1.1 dianxinxiazai180:5 (Cdn Cache Server V2.0), 1.1 PSscnjdx3gy39:6 (Cdn Cache Server V2.0)'}
<class 'requests.cookies.RequestsCookieJar'> <RequestsCookieJar[]>
<class 'str'> https://www.jianshu.com/
<class 'list'> []
The requests library provides a status-code lookup object, requests.codes.
import requests
r=requests.get("http://www.jiansh.com")
# 如果返回的状态码不是200(requests.conde.ok返回的是请求成功的状态码),终止程序,否则输出Successfully
exit() if not r.status_code==requests.codes.ok else print("Successfully")
Successfully
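requests.codes maps readable names to the numeric status codes, so comparisons need no magic numbers; a few examples:

import requests
print(requests.codes.ok) # 200
print(requests.codes.not_found) # 404
print(requests.codes.internal_server_error) # 500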
With requests we can also simulate submitting data and uploading files.
import requests
files={'file':open('favicon.ico','rb')} # pass the upload as a dict whose value is a local file opened with open()
r=requests.post("http://www.httpbin.org/post",files=files)
#print(r.text)
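If you need to control the uploaded filename or content type, the dict value may also be a tuple of (filename, file object, content type); a sketch of that form:

import requests
files={'file':('favicon.ico',open('favicon.ico','rb'),'image/x-icon')} # explicit filename and MIME type
r=requests.post("http://httpbin.org/post",files=files)
#print(r.text)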
import requests
r=requests.get("http://www.baidu.com")
print(r.cookies)
# The returned cookies are a RequestsCookieJar; items() turns them into a list of (key, value) tuples for iteration
for key,value in r.cookies.items():
    print(key+"="+value)
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
BDORZ=27315
We can copy the cookies of a logged-in Zhihu session into the Cookie field of our request headers, and send requests with them to maintain the login state.
import requests
headers={
'Cookie':'_zap=a977c302-a5b4-4cf9-89de-9c44d5f8d0ae;\
d_c0="ADAooIQzaA6PTnyPSh-yO0R7n8-EwQhfcSk=|1540290995";\
q_c1=864dfba067974af29b460fb2158c25d2|1540290996000|1540290996000; \
l_cap_id="ZjkzOGI3MThjYzIyNGZlNGFmYjY3ZDUzNWM0NWY3YWQ=|1540542856|b8797500de95d7f50dee23a5386d252e3ec3e97b";\
r_cap_id="ODgxNGQ0ZmYyYTQyNDgwMDk2NDJmMzhiZDk3MGI3Njc=|1540542856|754b0a42d380d5f9c2f07e830da3a9e16212f76b"; \
cap_id="ZjRmZWZkYjcwOTc1NGRkYzllYjY5MGYyMDAwMzY3ZmM=|1540542856|0578b4234ea73adafacbf5c1e932c1288f62f358";\
tst=r; __gads=ID=a7a78c5ddc1f7e94:T=1540611321:S=ALNI_Mbgv0Vp_utqpBpA7F8HzEWx8uFEFA;\
tgw_l7_route=61066e97b5b7b3b0daad1bff47134a22;\
_xsrf=cqXGAOua1myL8pz9OD0y5Jc8AkaOp4Hg; \
capsion_ticket=\
"2|1:0|10:1541210996|14:capsion_ticket|44:\
YmY1MTAxMjJkNjlkNGUxNGIxOGNmYTk1YjI5MDUwMWU=\
|84ca07ff47cec6751101e7639bf6b73bea6c5da14337fb0779f73b7b7a99cbdc"; \
z_c0="2|1:0|10:1541210997|4:z_c0|92:Mi4xYVlhNUJRQUFBQUFBTUNpZ2hETm9Ea\
VlBQUFCZ0FsVk5kVkhLWEFBMi1fNy1Yb0hwNTY2bXVobGRWZUNabDFpX0ZR|3ffa6324aa\
b11684be5e0ee557421de9e0923b46f9c339d69a7828e3d758cdc5"',
'Host':'www.zhihu.com',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
r=requests.get("https://www.zhihu.com",headers=headers)
#print(r.text)
import requests
cookies='_zap=a977c302-a5b4-4cf9-89de-9c44d5f8d0ae; d_c0="ADAooIQzaA6PTnyPSh-yO0R7n8-EwQhfcSk=|1540290995"; q_c1=864dfba067974af29b460fb2158c25d2|1540290996000|1540290996000; l_cap_id="ZjkzOGI3MThjYzIyNGZlNGFmYjY3ZDUzNWM0NWY3YWQ=|1540542856|b8797500de95d7f50dee23a5386d252e3ec3e97b"; r_cap_id="ODgxNGQ0ZmYyYTQyNDgwMDk2NDJmMzhiZDk3MGI3Njc=|1540542856|754b0a42d380d5f9c2f07e830da3a9e16212f76b"; cap_id="ZjRmZWZkYjcwOTc1NGRkYzllYjY5MGYyMDAwMzY3ZmM=|1540542856|0578b4234ea73adafacbf5c1e932c1288f62f358"; tst=r; __gads=ID=a7a78c5ddc1f7e94:T=1540611321:S=ALNI_Mbgv0Vp_utqpBpA7F8HzEWx8uFEFA; _xsrf=cqXGAOua1myL8pz9OD0y5Jc8AkaOp4Hg; tgw_l7_route=ec452307db92a7f0fdb158e41da8e5d8; capsion_ticket="2|1:0|10:1541212495|14:capsion_ticket|44:NjM3NWRjOTgzNTY3NDdmM2IyZGM3MmUyYzYxMmQwOGE=|53f5bcd729eff1b682d405c5194ec1686ee730c53ead793920b4d5de3299ef17"; z_c0="2|1:0|10:1541212497|4:z_c0|92:Mi4xYVlhNUJRQUFBQUFBTUNpZ2hETm9EaVlBQUFCZ0FsVk5VVmZLWEFDWGlyb1BwOGJqR2ZTamtiWEFBWDJSRG9ubW53|7b00132e6477871c34a19a4655b612a0e9ea64b6e14914044e06380b97453347"'
jar=requests.cookies.RequestsCookieJar()
headers={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
"Host":"www.zhihu.com"
}
# Split the cookie string on ";" into individual cookies, then use set() to store each cookie's key and value in the jar
for cookie in cookies.split(";"):
    key,value=cookie.strip().split('=',1) # strip() drops the space left after each ";"
    jar.set(key,value)
r=requests.get("https://www.zhihu.com",headers=headers,cookies=jar)
#print(r.text)
Requesting pages directly with get() or post() treats each call as a separate session. In other words, if a first post() call logs in and a second call then tries to fetch the personal page behind that login, it is as if the two pages were opened in two different browsers' sessions: the second request fails.
import requests
r=requests.get('http://httpbin.org/cookies/set/number/123456789')
r=requests.get("http://httpbin.org/cookies")
print(r.text)
{
  "cookies": {}
}
The Session object in requests maintains one session across requests; it is typically used for the steps that follow a simulated login.
import requests
s=requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456789')
r=s.get("http://httpbin.org/cookies")
print(r.text)
{
  "cookies": {
    "number": "123456789"
  }
}
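A Session also keeps default headers alongside cookies, so values set once are sent with every subsequent request; a minimal sketch with a placeholder User-Agent:

import requests
s=requests.Session()
s.headers.update({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}) # sent with all requests on this session
r=s.get('http://httpbin.org/headers')
print(r.json()['headers']['User-Agent'])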
Some sites' certificates are not trusted by an official CA, so a certificate verification error is raised; the verify parameter controls whether the certificate is checked.
import requests
from requests.packages import urllib3
# Disable urllib3 warnings to silence the InsecureRequestWarning
urllib3.disable_warnings()
r=requests.get("https://www.12306.cn",verify=False)
print(r.status_code)
200
import logging
import requests
# Ignore the warning by capturing warnings into the logging system
logging.captureWarnings(True)
r=requests.get("https://www.12306.cn",verify=False)
print(r.status_code)
200
import requests
# You can also supply a local certificate as the client certificate: either a single file (containing the key and the certificate) or a tuple of two file paths
r=requests.get("https://www.12306.cn",cert=('/path/server.crt','/path/key'))
print(r.status_code)
When we crawl at scale and request a site frequently, the site may start popping up captchas, redirect to a login page, or even ban the client outright. In such cases we set up a proxy through the proxies parameter.
import requests
proxies={
"http":"http://www.10.10.1.10:3128",
"https":"http://www.10.10.1.10:1080"
}
r=requests.get("https://www.taobao.com",proxies=proxies)
print(r.status_code)
If the proxy requires HTTP Basic Auth, the credentials can be embedded using syntax like http://user:password@host:port.
import requests
proxies={
"http":"http://user:[email protected]:3128/",
}
r=requests.get("https://www.taobao.com",proxies=proxies)
print(r.status_code)
200
import requests
# requests also supports SOCKS proxies; install the extra dependency first (pip install requests[socks])
proxies={
"http":"socks5://user:password@host:port",
"https":"socks5://user:password@host:port"
}
r=requests.get("https://www.taobao.com",proxies=proxies)
print(r.status_code)
import requests
# A request has two phases, connect and read; a single timeout value applies to both, and a tuple can set them separately
r=requests.get("http://www.baidu.com",timeout=1)
print(r.status_code)
200
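A sketch of the tuple form together with timeout handling (the 5 s/30 s values are arbitrary):

import requests
try:
    # allow 5 seconds to connect and 30 seconds to read the response
    r=requests.get("http://www.baidu.com",timeout=(5,30))
    print(r.status_code)
except requests.exceptions.Timeout:
    print("request timed out")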
requests has authentication support built in: pass an HTTPBasicAuth object from the auth module, or pass a (username, password) tuple directly to the auth parameter.
import requests
from requests.auth import HTTPBasicAuth
r=requests.get("http://localhost:5000",auth=HTTPBasicAuth('username','password'))
print(r.status_code)
import requests
r=requests.get("http://localhost:5000",auth=('username','password'))
print(r.status_code)
OAuth1 authentication is also available (via the requests_oauthlib package).
import requests
from requests_oauthlib import OAuth1
url="https://api.twitter.com/1.1/account/verify_credentials.json"
auth=OAuth1("YOUR_APP_KEY","YOUR_APP_SECRET","USER_OAUTH_TOKEN","USER_OAUTH_TOKEN_SECRET")
r=requests.get(url,auth=auth)
print(r.status_code)
As in urllib, a request can be represented as a data structure in which every parameter is carried by a Request object; in requests this data structure is called a PreparedRequest. Treating each request as an independent object like this is convenient for queue scheduling.
from requests import Request,Session
url="http://httpbin.org/post"
data={
'name':'jake',
}
headers={
'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
s=Session()
req=Request('POST',url,data=data,headers=headers) # build a Request object
prepped=s.prepare_request(req) # the Session's prepare_request() turns it into a PreparedRequest
r=s.send(prepped) # call send() to issue it
print(r.text)
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "name": "jake"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Content-Length": "9",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
  },
  "json": null,
  "origin": "139.207.99.242",
  "url": "http://httpbin.org/post"
}
Reference: 崔庆才, 《Python3网络爬虫开发实战》 (Cui Qingcai, Python 3 Web Scraping Development in Practice).