Web Crawler Study Notes, Part 2: Using the requests Library

Using the requests Library

The requests library covers similar ground to the urllib library we studied earlier, but it is more powerful and more concise to use. The basic usage is shown below.

import requests

r=requests.get("https://www.baidu.com/")
print(type(r)) # print the type of the Response object
print("---------------------")
print(r.status_code) # print the response status code
print("---------------")
print(type(r.text)) # print the type of the response body
print("---------------")
#print(r.text) # print the response body
print("---------------")
print(r.cookies) # print the response cookies

<class 'requests.models.Response'>
---------------------
200
---------------
<class 'str'>
---------------
---------------
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>

GET Requests

import requests

data={
    'name':'germey',
    'age':22,
}
# To attach extra parameters, simply build a dict to hold them and pass it in via the params argument
r=requests.get('http://httpbin.org/get',params=data)
print(type(r.text))
print(r.text)
print("***************************")
# r.text returns a str, but the content is in JSON format.
# The json() method converts the JSON string into a dict
print(type(r.json()))
print(r.json())

<class 'str'>
{
  "args": {
    "age": "22", 
    "name": "germey"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.18.4"
  }, 
  "origin": "171.209.79.31", 
  "url": "http://httpbin.org/get?name=germey&age=22"
}

***************************
<class 'dict'>
{'args': {'age': '22', 'name': 'germey'}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'}, 'origin': '171.209.79.31', 'url': 'http://httpbin.org/get?name=germey&age=22'}

Scraping a Web Page

import requests
import re

# Build request headers to make the request look like it comes from a browser
headers={
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)\
    Chrome/52.0.2743.116 Safari/537.36'
}
r=requests.get("https://www.zhihu.com/explore",headers=headers)
# Use a regular expression to extract the question titles
pattern=re.compile('explore-feed.*?question_link.*?>(.*?)</a>',re.S)
titles=re.findall(pattern,r.text)
print(titles)
['\n如何看待美国五角大楼首次接受审计?\n', '\n李现和杨紫有可能产生火花吗?双方适合吗?\n', '\n你最欣赏的性格是什么样的?\n', '\n如何评价新 iPad Pro 宣称有 Xbox One S 一般的图形性能?\n', '\n你有亲历过道德绑架吗?\n', '\n外国人对中国的哪些刻板印象会令中国人大吃一惊?\n', '\n有哪些东西你以为很贵,但其实很便宜?\n', '\n如何看待靳东粉丝团官博靳东影视天地因王凯献血而造谣,以及后续处理?\n', '\n你家猫咪给过你哪些礼物或者回礼?\n', '\n怎么评价朱一龙幻乐之城的表现?\n']

Scraping Binary Data

Scraping a page returns an HTML document, whereas the images, audio, and video on a web page are made up of binary data. To scrape them, the first step is to obtain their binary content.

import requests

r=requests.get("https://github.com/favicon.ico")
#print(r.text) # returns str content: the image bytes are decoded as text, so the output looks garbled
#print(r.content) # returns the content as bytes
import requests

r=requests.get("https://github.com/favicon.ico")
# open(): the first argument is the file name to save to, the second opens the file in binary write mode so bytes can be written to it
with open('favicon.ico','wb') as f:
    f.write(r.content)
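
For larger binary files such as audio or video, it can help to stream the download instead of loading the whole body into memory at once. A minimal sketch, assuming a placeholder URL:

import requests

# Hypothetical example URL -- replace it with a real media file
url="https://example.com/video.mp4"
# stream=True defers downloading the body until iter_content() is called
r=requests.get(url,stream=True)
with open('video.mp4','wb') as f:
    for chunk in r.iter_content(chunk_size=8192): # write the response in 8 KB chunks
        if chunk:
            f.write(chunk)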

Adding headers

import requests
# Without passing headers, the request cannot be completed successfully
r=requests.get("https://www.zhihu.com/explore")
print(r.text)

400 Bad Request

openresty
import requests
# Build the headers
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko)\
    Chrome/52.0.2743.116 Safari/537.36"
}
}
r=requests.get("https://www.zhihu.com/explore",headers=headers)
#print(r.text)

POST Requests

import requests 

data={
    'name':'germey',
    'age':'22'
}

r=requests.post("http://httpbin.org/post",data=data)
print(r.text)
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "age": "22", 
    "name": "germey"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "18", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.18.4"
  }, 
  "json": null, 
  "origin": "171.209.79.31", 
  "url": "http://httpbin.org/post"
}

Response

import requests

r=requests.get("http://www.jianshu.com")
print(type(r.status_code),r.status_code) # status code
print(type(r.headers),r.headers) # response headers
print(type(r.cookies),r.cookies) # cookies
print(type(r.url),r.url) # URL
print(type(r.history),r.history) # request history
<class 'int'> 403
<class 'requests.structures.CaseInsensitiveDict'> {'Date': 'Sat, 03 Nov 2018 01:35:11 GMT', 'Content-Type': 'text/html', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'Tengine', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Content-Encoding': 'gzip', 'X-Via': '1.1 dianxinxiazai180:5 (Cdn Cache Server V2.0), 1.1 PSscnjdx3gy39:6 (Cdn Cache Server V2.0)'}
<class 'requests.cookies.RequestsCookieJar'> <RequestsCookieJar[]>
<class 'str'> https://www.jianshu.com/
<class 'list'> []

The requests library provides requests.codes, a lookup object for status codes.

import requests 

r=requests.get("http://www.jianshu.com")
# If the status code is not 200 (requests.codes.ok is the code for a successful request), exit the program; otherwise print Successfully
exit() if not r.status_code==requests.codes.ok else print("Successfully")
Successfully
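
requests.codes maps readable names to status codes, so comparisons do not need magic numbers. A few of the named entries:

import requests

print(requests.codes.ok)                    # 200
print(requests.codes.not_found)             # 404
print(requests.codes.internal_server_error) # 500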

Advanced Usage

File Upload

With requests we can simulate submitting data and files.

import requests

files={'file':open('favicon.ico','rb')} # pass the files as a dict; the value is a local file opened with open()
r=requests.post("http://www.httpbin.org/post",files=files)
#print(r.text)
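
The files argument can also carry an explicit filename and content type, and it can be combined with form data. A small sketch (the field names here are only examples):

import requests

# A (filename, file object, content type) tuple gives the server more metadata about the upload
files={'file':('favicon.ico',open('favicon.ico','rb'),'image/x-icon')}
data={'name':'germey'}
r=requests.post("http://httpbin.org/post",files=files,data=data)
#print(r.text) # in the httpbin response, the form fields appear under "form" and the upload under "files"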

Cookies

import requests

r=requests.get("http://www.baidu.com")
print(r.cookies)
# The cookies come back as a RequestsCookieJar; items() turns them into a list of (key, value) tuples we can iterate over
for key,value in r.cookies.items():
    print(key+"="+value)
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
BDORZ=27315

We can copy the cookies from a browser session that is logged in to Zhihu, place them in the Cookie field of our own request headers, and send requests with them to keep the logged-in state.

import requests

headers={
    'Cookie':'_zap=a977c302-a5b4-4cf9-89de-9c44d5f8d0ae;\
    d_c0="ADAooIQzaA6PTnyPSh-yO0R7n8-EwQhfcSk=|1540290995";\
    q_c1=864dfba067974af29b460fb2158c25d2|1540290996000|1540290996000; \
    l_cap_id="ZjkzOGI3MThjYzIyNGZlNGFmYjY3ZDUzNWM0NWY3YWQ=|1540542856|b8797500de95d7f50dee23a5386d252e3ec3e97b";\
    r_cap_id="ODgxNGQ0ZmYyYTQyNDgwMDk2NDJmMzhiZDk3MGI3Njc=|1540542856|754b0a42d380d5f9c2f07e830da3a9e16212f76b"; \
    cap_id="ZjRmZWZkYjcwOTc1NGRkYzllYjY5MGYyMDAwMzY3ZmM=|1540542856|0578b4234ea73adafacbf5c1e932c1288f62f358";\
    tst=r; __gads=ID=a7a78c5ddc1f7e94:T=1540611321:S=ALNI_Mbgv0Vp_utqpBpA7F8HzEWx8uFEFA;\
    tgw_l7_route=61066e97b5b7b3b0daad1bff47134a22;\
    _xsrf=cqXGAOua1myL8pz9OD0y5Jc8AkaOp4Hg; \
    capsion_ticket=\
    "2|1:0|10:1541210996|14:capsion_ticket|44:\
    YmY1MTAxMjJkNjlkNGUxNGIxOGNmYTk1YjI5MDUwMWU=\
    |84ca07ff47cec6751101e7639bf6b73bea6c5da14337fb0779f73b7b7a99cbdc"; \
    z_c0="2|1:0|10:1541210997|4:z_c0|92:Mi4xYVlhNUJRQUFBQUFBTUNpZ2hETm9Ea\
    VlBQUFCZ0FsVk5kVkhLWEFBMi1fNy1Yb0hwNTY2bXVobGRWZUNabDFpX0ZR|3ffa6324aa\
    b11684be5e0ee557421de9e0923b46f9c339d69a7828e3d758cdc5"',
    'Host':'www.zhihu.com',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
r=requests.get("https://www.zhihu.com",headers=headers)
#print(r.text)
import requests

cookies='_zap=a977c302-a5b4-4cf9-89de-9c44d5f8d0ae; d_c0="ADAooIQzaA6PTnyPSh-yO0R7n8-EwQhfcSk=|1540290995"; q_c1=864dfba067974af29b460fb2158c25d2|1540290996000|1540290996000; l_cap_id="ZjkzOGI3MThjYzIyNGZlNGFmYjY3ZDUzNWM0NWY3YWQ=|1540542856|b8797500de95d7f50dee23a5386d252e3ec3e97b"; r_cap_id="ODgxNGQ0ZmYyYTQyNDgwMDk2NDJmMzhiZDk3MGI3Njc=|1540542856|754b0a42d380d5f9c2f07e830da3a9e16212f76b"; cap_id="ZjRmZWZkYjcwOTc1NGRkYzllYjY5MGYyMDAwMzY3ZmM=|1540542856|0578b4234ea73adafacbf5c1e932c1288f62f358"; tst=r; __gads=ID=a7a78c5ddc1f7e94:T=1540611321:S=ALNI_Mbgv0Vp_utqpBpA7F8HzEWx8uFEFA; _xsrf=cqXGAOua1myL8pz9OD0y5Jc8AkaOp4Hg; tgw_l7_route=ec452307db92a7f0fdb158e41da8e5d8; capsion_ticket="2|1:0|10:1541212495|14:capsion_ticket|44:NjM3NWRjOTgzNTY3NDdmM2IyZGM3MmUyYzYxMmQwOGE=|53f5bcd729eff1b682d405c5194ec1686ee730c53ead793920b4d5de3299ef17"; z_c0="2|1:0|10:1541212497|4:z_c0|92:Mi4xYVlhNUJRQUFBQUFBTUNpZ2hETm9EaVlBQUFCZ0FsVk5VVmZLWEFDWGlyb1BwOGJqR2ZTamtiWEFBWDJSRG9ubW53|7b00132e6477871c34a19a4655b612a0e9ea64b6e14914044e06380b97453347"'
jar=requests.cookies.RequestsCookieJar()
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "Host":"www.zhihu.com"
}
# Split the cookie string into individual cookies, then use set() to store each cookie's key and value in the jar
for cookie in cookies.split(";"):
    key,value=cookie.split('=',1)
    jar.set(key,value)
r=requests.get("https://www.zhihu.com",headers=headers,cookies=jar)
#print(r.text)

Maintaining a Session

Making page requests by calling get() or post() directly means each request is effectively a separate session. In other words, if the first post() logs in to a site and a second request then tries to fetch the personal profile page that the login unlocks, it is like opening the site in two different browsers: the second fetch will fail.

import requests 

r=requests.get('http://httpbin.org/cookies/set/number/123456789')
r=requests.get("http://httpbin.org/cookies")
print(r.text)
{
  "cookies": {}
}

The Session object in requests maintains a session across requests; it is typically used for the follow-up operations after a simulated login succeeds.

import requests

s=requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456789')
r=s.get("http://httpbin.org/cookies")
print(r.text)
{
  "cookies": {
    "number": "123456789"
  }
}
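
A sketch of the typical login flow with Session, using placeholder URLs and form field names rather than a real site:

import requests

s=requests.Session()
# Hypothetical login endpoint and form fields -- adapt them to the target site
login_data={'username':'user','password':'pass'}
s.post('https://example.com/login',data=login_data)
# The same Session automatically carries the login cookies on subsequent requests
r=s.get('https://example.com/profile')
print(r.status_code)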

SSL Certificate Verification

Some sites use certificates that are not trusted by an official CA, so requesting them raises a certificate verification error. The verify parameter controls whether the certificate is checked.

import requests 
from requests.packages import urllib3

# Suppress the InsecureRequestWarning by disabling urllib3 warnings
urllib3.disable_warnings()
r=requests.get("https://www.12306.cn",verify=False)
print(r.status_code)
200
import logging
import requests

# Ignore the warning by capturing warnings into the logging system
logging.captureWarnings(True)
r=requests.get("https://www.12306.cn",verify=False)
print(r.status_code)
200
import requests
# You can also specify a local certificate as the client certificate, either a single file (containing the key and the certificate) or a tuple of two file paths

r=requests.get("https://www.12306.cn",cert=('/path/server.crt','/path/key'))
print(r.status_code)

Proxy Settings

When we crawl on a large scale and send frequent requests, a site may start showing captchas, redirect us to a login page, or even ban the client outright. To deal with this we set up proxies, using the proxies parameter.

import requests

proxies={
    "http":"http://10.10.1.10:3128",
    "https":"http://10.10.1.10:1080"
}
r=requests.get("https://www.taobao.com",proxies=proxies)
print(r.status_code)

If the proxy requires HTTP Basic Auth, it can be configured with a URL of the form http://user:password@host:port

import requests

proxies={
    "http":"http://user:[email protected]:3128/",
}
r=requests.get("https://www.taobao.com",proxies=proxies)
print(r.status_code)
200
import requests
# requests also supports SOCKS proxies (install the extra dependency with: pip install 'requests[socks]')
proxies={
    "http":"socks5://user:password@host:port",
    "https":"socks5://user:password@host:port"
}
r=requests.get("https://www.taobao.com",proxies=proxies)
print(r.status_code)

Timeout Settings

import requests
# A request has two phases, connect and read; you can also pass a tuple to set the two timeouts separately
r=requests.get("http://www.baidu.com",timeout=1)
print(r.status_code)
200
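
As the comment above notes, a (connect, read) tuple sets the two timeouts separately; a minimal sketch:

import requests
from requests.exceptions import Timeout

try:
    # Allow 5 seconds to establish the connection and 30 seconds to read the response
    r=requests.get("http://www.baidu.com",timeout=(5,30))
    print(r.status_code)
except Timeout:
    print("the request timed out")
# timeout=None (the default) means wait indefinitely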

Authentication

requests has built-in support for HTTP authentication: you can pass an HTTPBasicAuth object from the auth module, or simply pass a (username, password) tuple to the auth parameter.

import requests
from requests.auth import HTTPBasicAuth

r=requests.get("http://localhost:5000",auth=HTTPBasicAuth('username','password'))
print(r.status_code)
import requests

r=requests.get("http://localhost:5000",auth=('username','password'))
print(r.status_code)

OAuth1 authentication is also possible; it relies on the requests_oauthlib package.

import requests
from requests_oauthlib import OAuth1

url="https://api.twitter.com/1.1/account/verify_credentials.json"
auth=OAuth1("YOUR_APP_KEY","YOUR_APP_SECRET","USER_OAUTH_TOKEN","USER_OAUTH_TOKEN_SECRET")
r=requests.get(url,auth=auth)
print(r.status_code)

Prepared Request

As in urllib, a request can be represented as a data structure in which each parameter is carried by a Request object; in requests this data structure is called a Prepared Request.

from requests import Request,Session

url="http://httpbin.org/post"
data={
    'name':'jake',
}
headers={
    'User-Agent':'Mozilla/4.0(compatible;MSIE 5.5;Windows NT',
}

s=Session()
req=Request('POST',url,data=data,headers=headers) # build a Request object
prepped=s.prepare_request(req) # use the Session's prepare_request() method to turn it into a PreparedRequest object
r=s.send(prepped) # call send() to send the prepared request
print(r.text)
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "jake"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "9", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0(compatible;MSIE 5.5;Windows NT"
  }, 
  "json": null, 
  "origin": "139.207.99.242", 
  "url": "http://httpbin.org/post"
}

Reference: Cui Qingcai, 《Python3网络爬虫开发实战》 (Python 3 Web Crawler Development in Practice)
