要点:(环境Python3.5,额外库bs4)
1.从免费代理ip获取ip和端口号 http://www.xicidaili.com/nn/
2.使用代理
import urllib.request
proxy_support = urllib.request.ProxyHandler({'http': 'ip:port'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
3.urlopen测试ip能否使用 http://ip.chinaz.com/getip.aspx
4.返回包含可以使用的ip和端口组成的字典的list
贴代码:
from urllib.request import urlopen
import re
import requests
from bs4 import BeautifulSoup as bs
from urllib import request
import socket
# Global socket timeout (seconds) so connections to dead proxies fail fast.
socket.setdefaulttimeout(3)
#request the xiciURL and get the response
def request_to_get(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup object.

    Sends browser-like headers (Host/Referer/User-Agent) so the
    xicidaili proxy-list site does not reject the request.
    """
    headers = {  # fixed typo: was "hearder"
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "www.xicidaili.com",
        "Referer": "http://www.xicidaili.com/",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    }
    response = requests.get(url, headers=headers)
    # Force the same decoding the original code assumed, but let requests
    # do the bytes->str conversion instead of str(content, encoding=...).
    response.encoding = "utf-8"
    return bs(response.text, "html.parser")
#get ip and port; return a list of dicts, each formatted {"http": "http://ip:port"}
def find_ip_port(bs_obj):
    """Extract proxies from the parsed xicidaili list page.

    Returns a list of dicts of the form {"http": "http://ip:port"}.
    The first <tr> (table header) is skipped; rows without at least
    three <td> cells are ignored instead of raising IndexError.
    """
    proxys = []
    for row in bs_obj.findAll('tr')[1:]:  # row 0 is the header row
        tds = row.findAll("td")
        if len(tds) < 3:  # malformed/section row: no ip/port cells
            continue
        # td[1] is the IP, td[2] is the port on this site's layout.
        proxys.append({"http": "http://" + tds[1].text + ":" + tds[2].text})
    return proxys
#test the proxy and return proxy that can be used
def return_ok_proxys(proxys):
    """Probe each proxy against an IP-echo URL; return the ones that work.

    A proxy counts as working when urlopen() through it succeeds before
    the global socket timeout; failures are printed and skipped.
    """
    test_url = "http://ip.chinaz.com/getip.aspx"
    alright_proxys = []
    for proxy in proxys:  # iterate directly instead of range(len(...))
        try:
            proxy_support = request.ProxyHandler(proxy)
            opener = request.build_opener(proxy_support)
            opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
            request.install_opener(opener)
            # Close the response so the socket is not leaked; the body
            # itself is irrelevant, reaching the server is the test.
            with request.urlopen(test_url):
                pass
            alright_proxys.append(proxy)
            print(proxy)
            print("is alright")
        except Exception as e:  # best-effort: report the dead proxy, keep going
            print(proxy)
            print(e)
    return alright_proxys
#main function
def main_fun():
    """Scrape the proxy list, validate every entry, print the survivors."""
    url = "http://www.xicidaili.com/nn/"
    bs_obj = request_to_get(url)
    proxys = find_ip_port(bs_obj)
    alright_proxys = return_ok_proxys(proxys)
    print(alright_proxys)

if __name__ == "__main__":  # don't fire network requests on import
    main_fun()
补充多线程版本:
from urllib.request import urlopen
import re
import requests
from bs4 import BeautifulSoup as bs
from urllib import request
import socket
import threading
import time
# Global socket timeout in seconds (value is 5, not 3 as the old comment said)
# so connections to dead proxies fail fast.
socket.setdefaulttimeout(5)
# IP-echo service used to verify that a proxy actually relays traffic.
test_url = "http://ip.chinaz.com/getip.aspx"
#request the xiciURL and get the response
def request_to_get(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup object.

    Sends browser-like headers (Host/Referer/User-Agent) so the
    xicidaili proxy-list site does not reject the request.
    """
    headers = {  # fixed typo: was "hearder"
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "www.xicidaili.com",
        "Referer": "http://www.xicidaili.com/",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    }
    response = requests.get(url, headers=headers)
    # Force the same decoding the original code assumed, but let requests
    # do the bytes->str conversion instead of str(content, encoding=...).
    response.encoding = "utf-8"
    return bs(response.text, "html.parser")
#get ip and port; return a list of dicts, each formatted {"http": "http://ip:port"}
def find_ip_port(bs_obj):
    """Extract proxies from the parsed xicidaili list page.

    Returns a list of dicts of the form {"http": "http://ip:port"}.
    The first <tr> (table header) is skipped; rows without at least
    three <td> cells are ignored instead of raising IndexError.
    """
    proxys = []
    for row in bs_obj.findAll('tr')[1:]:  # row 0 is the header row
        tds = row.findAll("td")
        if len(tds) < 3:  # malformed/section row: no ip/port cells
            continue
        # td[1] is the IP, td[2] is the port on this site's layout.
        proxys.append({"http": "http://" + tds[1].text + ":" + tds[2].text})
    return proxys
#check ip alright
def check_ip(alright_proxys, proxy):
    """Probe *proxy*; append it to *alright_proxys* if it relays traffic.

    Runs in a worker thread. list.append is atomic in CPython, so the
    shared result list needs no extra locking.
    """
    try:
        proxy_support = request.ProxyHandler(proxy)
        opener = request.build_opener(proxy_support)
        opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
        # Use opener.open() directly: request.install_opener() mutates a
        # process-wide global opener, which races when many checker
        # threads configure different proxies concurrently.
        with opener.open(test_url) as response:
            # Reading the body proves the proxy actually relays data;
            # the context manager closes the socket (was leaked before).
            response.read()
        alright_proxys.append(proxy)
    except Exception:  # best-effort: a dead proxy is simply skipped
        pass
#test the proxy and return proxy that can be used
def return_ok_proxys(proxys):
    """Spawn one checker thread per proxy and return the working ones.

    Joins every worker (with a bounded wait) instead of the old fixed
    time.sleep(5), which silently dropped any proxy whose check was
    still in flight after five seconds.
    """
    alright_proxys = []
    threads = []
    for proxy in proxys:
        t = threading.Thread(target=check_ip, args=(alright_proxys, proxy))
        t.start()
        threads.append(t)
    for t in threads:
        # Socket timeout is 5 s, so 10 s per thread is a safe upper bound.
        t.join(10)
    return alright_proxys
#main function
def main_function():
    """Scrape the proxy list, validate entries concurrently, return survivors."""
    url = "http://www.xicidaili.com/nn/"
    bs_obj = request_to_get(url)
    proxys = find_ip_port(bs_obj)
    alright_proxys = return_ok_proxys(proxys)
    return alright_proxys

if __name__ == "__main__":  # don't fire network requests on import
    print(main_function())
效果图: