import pymysql
from random import choice
from warnings import filterwarnings
import traceback
import requests
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import time
# Work around the event loop that is already running inside IPython/Jupyter
import nest_asyncio
from flask import Flask, g, render_template
import pandas as pd
nest_asyncio.apply()
# Escalate pymysql warnings to errors so they can be caught explicitly
filterwarnings("error", category=pymysql.Warning)
This is the core of the whole proxy pool, responsible for receiving, storing, and handing out proxies. The table has two columns: ip, and a score from 0 to 100 that indicates how usable the ip is.
# Score settings for each proxy
MAX_SCORE = 100
MIN_SCORE = 0
INITIAL_SCORE = 10
class MysqlClient(object):
    # Constructor
def __init__(self,
mhost='localhost',
muser='root',
mpassword='1234',
mport=3306,
mdb="exercise"):
        # Note: __init__ has two underscores on each side
        # Connect to the database
self.db = pymysql.connect(host=mhost,
user=muser,
password=mpassword,
port=mport,
db=mdb)
        # Get a cursor for executing statements
self.cursor = self.db.cursor()
self.cursor.execute('SELECT VERSION()')
print('database version', self.cursor.fetchone())
self.table = 'pools'
        # Note: internal methods must be called through self
self.create_pools()
def create_pools(self):
        # Create the table if it does not already exist
sql = 'CREATE TABLE IF NOT EXISTS pools (ip VARCHAR(255) NOT NULL, score INT NOT NULL, PRIMARY KEY (ip))'
try:
self.cursor.execute(sql)
except:
pass
def insert_ip(self, ip, score=INITIAL_SCORE):
        # Insert a new ip with its score
data = {'ip': ip, 'score': score}
keys = ', '.join(data.keys())
values = ', '.join(['%s'] * len(data))
sql = 'INSERT INTO {table}({keys}) VALUES ({values}) '.format(
table=self.table, keys=keys, values=values)
        # Writes need commit on success and rollback on failure
try:
self.cursor.execute(sql, tuple(data.values()))
self.db.commit()
except:
# traceback.print_exc()
self.db.rollback()
def delete_ip(self, ip):
        # Remove an ip from the pool
condition = "ip = '" + ip + "'"
sql = 'DELETE FROM {table} WHERE {condition}'.format(
table=self.table, condition=condition)
try:
self.cursor.execute(sql)
self.db.commit()
except:
self.db.rollback()
def get_score(self, ip):
        # Get the score of an ip; note that string values must be single-quoted in the SQL
sql = 'SELECT score FROM {table} WHERE {condition}'.format(
table=self.table, condition="ip = '" + ip + "'")
try:
self.cursor.execute(sql)
self.db.commit()
return self.cursor.fetchall()[0][0]
except:
traceback.print_exc()
self.db.rollback()
def get_ip(self):
        # Return a random usable proxy
sql = 'SELECT ip FROM {table} WHERE {condition}'.format(
table=self.table, condition="score = " + str(MAX_SCORE))
best_results = []
        # Look for full-score proxies first; fall back to high-score ones if there are none
try:
self.cursor.execute(sql)
self.db.commit()
best_results = self.cursor.fetchall()
best_results = [x[0] for x in best_results]
except:
traceback.print_exc()
self.db.rollback()
if len(best_results) == 0:
            # Otherwise pick randomly from the top 20% (at least 10) highest-scoring ips
            sql = 'SELECT ip from {table} order by score desc limit {num}'.format(
                table=self.table,
                num=str(max(10, int(0.2 * self.get_num()))))
try:
self.cursor.execute(sql)
self.db.commit()
best_results = self.cursor.fetchall()
best_results = [x[0] for x in best_results]
except:
traceback.print_exc()
self.db.rollback()
if len(best_results) > 0:
return choice(best_results)
def get_ip_by_score(self, num):
        # Get the top-N proxies ranked by score
sql = 'SELECT * from {table} order by score desc limit {num}'.format(
table=self.table,
num=str(num))
results = []
try:
self.cursor.execute(sql)
self.db.commit()
results = self.cursor.fetchall()
except:
traceback.print_exc()
self.db.rollback()
return results
def change_score(self, ip, action):
        # action is either "decrease" (score minus one) or "max" (reset to the maximum score)
        old_score = self.get_score(ip)
        if old_score is None:
return
new_score = MAX_SCORE
if action == "decrease":
if old_score <= MIN_SCORE + 1:
self.delete_ip(ip)
return
else:
new_score = old_score - 1
data = {'ip': ip, 'score': new_score}
keys = ', '.join(data.keys())
values = ', '.join(['%s'] * len(data))
        # Assemble an upsert: INSERT ... ON DUPLICATE KEY UPDATE ip = %s, score = %s
        # (the parameter tuple is therefore passed twice below)
        sql = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE'.format(
            table=self.table, keys=keys, values=values)
        update = ','.join([" {key} = %s".format(key=key) for key in data])
        sql += update
try:
self.cursor.execute(sql, tuple(data.values()) * 2)
self.db.commit()
except:
traceback.print_exc()
self.db.rollback()
def show_all(self):
        # Print all rows
sql = 'SELECT * FROM {table}'.format(table=self.table)
try:
self.cursor.execute(sql)
self.db.commit()
data = self.cursor.fetchall()
print('ip\t \t\tscore')
for d in data:
print(d[0] + '\t' + str(d[1]))
except:
self.db.rollback()
def get_num(self):
        # Return the total number of proxies
sql = 'SELECT * FROM {table}'.format(table=self.table)
try:
self.cursor.execute(sql)
self.db.commit()
data = self.cursor.fetchall()
return len(data)
except:
self.db.rollback()
def get_all(self):
        # Return all stored ips
sql = 'SELECT * FROM {table}'.format(table=self.table)
try:
self.cursor.execute(sql)
self.db.commit()
data = self.cursor.fetchall()
return [d[0] for d in data]
except:
self.db.rollback()
def delete_all(self):
        # Delete all rows
sql = 'truncate table {table};'.format(table=self.table)
try:
self.cursor.execute(sql)
self.db.commit()
except:
self.db.rollback()
    # Destructor
def __del__(self):
        # Close the cursor
self.cursor.close()
        # Close the database connection
self.db.close()
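For reference, a minimal usage sketch of the client (assuming a local MySQL server reachable with the default credentials in __init__; the address below is only a placeholder, not a real proxy):
client = MysqlClient()
client.insert_ip('127.0.0.1:8888')             # new entries start at INITIAL_SCORE
client.change_score('127.0.0.1:8888', 'max')   # promote after a successful check
print(client.get_score('127.0.0.1:8888'))      # -> 100
print(client.get_ip())                         # a random full-score proxy
client.delete_ip('127.0.0.1:8888')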
The getter module scrapes proxies from several public proxy sites and feeds them to the storage module.
# Note: a metaclass must inherit from type
class SpiderMetaClass(type):
    # Modify the class being created and return the new class
def __new__(cls, name, bases, attrs):
        # Collect the names of all methods that crawl a site
attrs['__ProxyFunc__'] = []
for k, v in attrs.items():
if 'Spider' in k:
attrs['__ProxyFunc__'].append(k)
return type.__new__(cls, name, bases, attrs)
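To see what SpiderMetaClass actually does, here is a small illustration (Toy is a hypothetical class used only for this example): every attribute whose name contains 'Spider' is recorded in __ProxyFunc__, which is how get_proxy later finds all crawling methods.
class Toy(object, metaclass=SpiderMetaClass):
    # hypothetical example class, not part of the pool
    def Spider_a(self):
        yield '1.2.3.4:80'
    def helper(self):
        pass

print(Toy.__ProxyFunc__)  # ['Spider_a'] -- only names containing 'Spider' are collected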
class Spider(object, metaclass=SpiderMetaClass):
def __init__(self, xila_page = 10, xici_page = 10):
self.header = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
self.xila_page = xila_page
self.xici_page = xici_page
    # Crawl proxydb.net
def Spider_proxydb(self):
url = 'http://proxydb.net/?protocol=https&anonlvl=4&country=CN'
r = requests.get(url, headers=self.header)
html = r.text
soup = BeautifulSoup(html)
Table = soup.select('div.table-responsive')[0]
for proxy in Table.select('a'):
yield proxy.string
    # Crawl the first few pages of xiladaili.com
def Spider_xila(self):
urls = ['http://www.xiladaili.com/https/']
if self.xila_page:
urls = urls + ['http://www.xiladaili.com/https/'+str(i+1)+'/' for i in range(1,self.xila_page)]
for url in urls:
r = requests.get(url, headers=self.header)
html = r.text
soup = BeautifulSoup(html)
# print(soup.select('div.mt-4'))
Table = soup.select('div.mt-4')[0]
for proxy in Table.select('tr')[1:]:
yield proxy.select('td')[0].string
    # Crawl the first few pages of xicidaili.com
def Spider_xici(self):
urls = ['https://www.xicidaili.com/nn/'+str(i+1) for i in range(self.xici_page)]
for url in urls:
r = requests.get(url, headers=self.header)
html = r.text
soup = BeautifulSoup(html)
Table = soup.select('table#ip_list')[0]
for proxy in Table.select('tr')[1:]:
yield proxy.select('td')[1].string
    # Aggregate proxies from all spider methods; the argument is the
    # __ProxyFunc__ list that the metaclass added
    def get_proxy(self, function_names):
        proxies = []
        for function_name in function_names:
            for proxy in getattr(self, function_name)():
proxies.append(proxy)
return proxies
class Getter():
def __init__(self):
self.client = MysqlClient()
self.spider = Spider()
def run(self):
for proxy in self.spider.get_proxy(self.spider.__ProxyFunc__):
self.client.insert_ip(proxy)
getter = Getter()
database version ('8.0.15',)
getter.run()
The tester module checks the proxies stored in the database; to speed things up, the checks are performed asynchronously.
# Status codes that count as a successful check
VALID_STATUS_CODES = [200]
# Test URL; https targets are not supported here, so use http
TEST_URL = 'http://www.baidu.com'
# Number of proxies tested per asynchronous batch
BATCH_SIZE = 100
class Tester(object):
def __init__(self):
self.client = MysqlClient()
    # Test a single proxy; marked async so it can run on the event loop
async def test_one_proxy(self, proxy):
conn = aiohttp.TCPConnector(ssl=False)
async with aiohttp.ClientSession(connector=conn) as session:
try:
if isinstance(proxy, bytes):
proxy = proxy.decode('utf-8')
real_proxy = 'http://'+proxy
                print('Testing', proxy)
async with session.get(TEST_URL, proxy=real_proxy, timeout = 100) as response:
                    # Valid status code
                    if response.status in VALID_STATUS_CODES:
                        self.client.change_score(proxy, 'max')
                        print('Proxy usable', proxy)
                    else:
                        self.client.change_score(proxy, 'decrease')
                        print('Bad status', proxy)
except:
                print('Test failed', proxy)
traceback.print_exc()
self.client.change_score(proxy, 'decrease')
    # Test all proxies currently in the database
def test_all_proxy(self):
try:
proxies = self.client.get_all()
            # event loop: the program starts a loop and registers coroutines on it; when the awaited events complete, the corresponding coroutines resume
loop = asyncio.get_event_loop()
for i in range(0, len(proxies), BATCH_SIZE):
test_proxies = proxies[i:i+BATCH_SIZE]
                # Build the tasks for this batch
                # A coroutine object cannot run on its own; run_until_complete wraps each coroutine into a task
                # A task is a subclass of Future that stores the coroutine's state so its result can be retrieved later
tasks = [self.test_one_proxy(proxy) for proxy in test_proxies]
loop.run_until_complete(asyncio.wait(tasks))
time.sleep(3)
except:
            print('Tester error')
traceback.print_exc()
tester = Tester()
database version ('8.0.15',)
tester.test_all_proxy()
Testing 106.14.206.26
Testing 1.197.204.251:9999
Testing 106.14.76.134
Testing 113.117.121.141
Testing 113.117.27.223
Bad status 106.14.206.26
Proxy usable 111.231.239.143
Proxy usable 101.231.104.82
Test failed 111.231.202.91
...
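In a long-running deployment the getter and tester would be run repeatedly so that scores stay current. A minimal scheduling sketch, assuming arbitrary intervals that are not taken from this notebook:
def run_scheduler(getter_interval=600, tester_interval=60):
    # Hypothetical driver loop: refill the pool every getter_interval seconds
    # and re-test the stored proxies every tester_interval seconds
    getter = Getter()
    tester = Tester()
    last_get = 0
    while True:
        now = time.time()
        if now - last_get >= getter_interval:
            getter.run()
            last_get = now
        tester.test_all_proxy()
        time.sleep(tester_interval)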
The display module publishes the current state of the proxy pool as web pages.
class Weber(object):
def __init__(self):
self.client = MysqlClient()
app = Flask(__name__)
        # Home page
@app.route('/')
def index():
            return 'Welcome to Proxy Pool System'
        # Return a random usable proxy
        @app.route('/random')
        def get_proxy():
            return 'Usable proxy: ' + self.client.get_ip()
        # Return the top-N highest-scoring proxies
        @app.route('/count/<number>')
        def get_proxies(number):
results = self.client.get_ip_by_score(int(number))
df = pd.DataFrame({'ip':[x[0] for x in results], 'score':[x[1] for x in results]})
html = df.to_html(classes='data', index = False, bold_rows=False, header=True)
            # Add a title
            title = 'Available Proxies'
html = title + html
            # Add styling
css = ""
html = css + html
return html
app.run()
weber = Weber()
database version ('8.0.15',)
* Serving Flask app "__main__" (lazy loading)
* Environment: production
WARNING: This is a development server. Do not use it in a production deployment.
Use a production WSGI server instead.
* Debug mode: off
* Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [09/Jun/2020 00:02:38] "GET /count/10 HTTP/1.1" 200 -
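With the Flask app running, a crawler can fetch a proxy over HTTP and use it directly. A short consumption sketch (assuming the development server above is still listening on 127.0.0.1:5000):
resp = requests.get('http://127.0.0.1:5000/random')
proxy = resp.text.replace('Usable proxy: ', '')
# Route a request through the proxy we just received
r = requests.get(TEST_URL, proxies={'http': 'http://' + proxy}, timeout=10)
print(r.status_code)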