python爬虫设置代理(UA, IP)

避免请求频率过高,被访问网站禁止,故设置代理池

1.设置用户代理User-Agent

import requests
from lxml import etree
# NOTE(review): everything between the triple quotes below is a bare string
# literal used as a "commented-out" demo block (User-Agent spoofing with
# requests/lxml, then fake-useragent and a simple UA pool). It is evaluated
# and discarded at import time and never executes. Its contents are left
# byte-identical on purpose — the text is data, not live code.
'''

# 1.查看浏览器内核版本检测https://ie.icoa.cn/
# 访问网站时,浏览器会自动发送User-Agent

# 浏览器内核检测
headers = {"X-Requested-With": "XMLHttpRequest",
           "authority": "ie.icoa.cn"}
res = requests.get("https://ie.icoa.cn/", headers=headers, verify=False)
res.encoding = "utf8"

root = etree.HTML(res.text)
ua = root.xpath('//table[@class="zebra"]/tr/td/i/text()')
print(ua)  # ['python-requests/2.22.0']
# 如何伪装浏览器,headers中设置User-Agent
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
           "X-Requested-With": "XMLHttpRequest",
           "authority": "ie.icoa.cn"}

res = requests.get("https://ie.icoa.cn/", headers=headers, verify=False)
res.encoding = "utf8"

root = etree.HTML(res.text)
ua = root.xpath('//table[@class="zebra"]/tr/td/i/text()')
print(ua)  #['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36']

# 2.生成ua工具:fake-useragent
# 安装
# pip install fake-useragent
#官方文档:https://pypi.org/project/fake-useragent/
from fake_useragent import UserAgent

# 常见用法
# 1.ua.random
ua = UserAgent()

u1 = ua.random
print(u1)       #Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)


# 2.生成指定浏览器
print(ua.ie)    #Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)
print(ua.chrome)    #Mozilla/5.0 (X11; Ubuntu; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2820.59 Safari/537.36
print(ua.firefox)   #Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0

#设置浏览器代理池(用户代理池)
import random

ua_list = []
for i in range(5):
    ua_list.append(ua.random)
print(ua_list)

for i in range(5):
    user_agent = random.choice(ua_list)
    headers = {"User-Agent":user_agent}
    res = requests.get("http://baidu.com", headers=headers)
    print(res.request.headers["User-Agent"])
'''

2. 设置代理IP

# 1. Display this machine's public IP: a Baidu search for "ip" echoes it back.
import requests
import re

url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip"
# requests has NO default timeout — without one a stalled connection
# blocks the script forever, so always bound the wait.
res = requests.get(url, timeout=10)
res.encoding = "utf8"
# The results page renders the visitor's address as "本机IP: x.x.x.x".
pat = '本机IP: (.*)'
ip = re.findall(pat, res.text)
print(ip)   # e.g. ['113.200.157.34']
# 2. Route the request through a proxy IP.
proxies = {
  "http": "http://171.15.153.156:9999",
  "https": "http://171.15.153.156:9999",
}

# Free proxies are unreliable: bound the wait with `timeout` and catch the
# request exceptions instead of letting the whole script crash — this is
# exactly the advice given in note 3.2 below, now applied to the code.
pat = '本机IP: (.*)'
ip = []  # stays empty if the proxy fails, so the print below is always safe
try:
    res = requests.get("http://www.baidu.com", proxies=proxies, timeout=10)
    res.encoding = "utf8"
    ip = re.findall(pat, res.text)
except requests.exceptions.RequestException as err:
    # covers ProxyError, ConnectTimeout, ReadTimeout, ConnectionError, ...
    print("proxy request failed:", err)
print(ip)   # ['113.200.157.34']

# 3. Building an IP proxy pool
# 3.1 proxy-IP listing sites: ****
#     same rotation pattern as the User-Agent pool
# 3.2 timeout errors (a proxy IP may stall):
#     option 1: raise the `timeout` value
#     option 2: wrap the call in try ... except (shown above)

你可能感兴趣的:(爬虫,IP代理)