Building a Proxy Pool from Scratch

Contents

    • Imports
    • Storage module
    • Fetching module
    • Combining fetching and storage
    • Testing module
    • Display module
        • Results

Imports

import pymysql
from random import choice
from warnings import filterwarnings
import traceback  
import requests
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import time
# Work around environments like IPython/Jupyter where an event loop is already running
import nest_asyncio
from flask import Flask, g, render_template
import pandas as pd
nest_asyncio.apply()
# Escalate pymysql warnings to exceptions so they can be caught below
filterwarnings("error", category=pymysql.Warning)

Storage module

This is the core of the whole pool: it receives new proxies, stores them, and serves them out. The table has two columns: ip, and score, an integer from 0 to 100 indicating how usable the proxy is.

# Score bounds for each proxy
MAX_SCORE = 100
MIN_SCORE = 0
INITIAL_SCORE = 10


class MysqlClient(object):
    # Constructor
    def __init__(self,
                 mhost='localhost',
                 muser='root',
                 mpassword='1234',
                 mport=3306,
                 mdb="exercise"):
        # Note: __init__ has two underscores on each side
        # Connect to the database
        self.db = pymysql.connect(host=mhost,
                                  user=muser,
                                  password=mpassword,
                                  port=mport,
                                  db=mdb)
        # Get a cursor for executing statements
        self.cursor = self.db.cursor()
        self.cursor.execute('SELECT VERSION()')
        print('database version', self.cursor.fetchone())
        self.table = 'pools'
        # Note: internal methods must be called through self
        self.create_pools()

    def create_pools(self):
        # Create the table if it does not exist yet
        sql = 'CREATE TABLE IF NOT EXISTS pools (ip VARCHAR(255) NOT NULL, score INT NOT NULL, PRIMARY KEY (ip))'
        try:
            self.cursor.execute(sql)
        except:
            pass

    def insert_ip(self, ip, score=INITIAL_SCORE):
        # Insert a new ip with its score
        data = {'ip': ip, 'score': score}
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'INSERT INTO {table}({keys}) VALUES ({values}) '.format(
            table=self.table, keys=keys, values=values)
        # Writes must be committed, and rolled back on failure
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except:
#             traceback.print_exc()
            self.db.rollback()

    def delete_ip(self, ip):
        # Remove a given ip from the pool
        condition = "ip = '" + ip + "'"
        sql = 'DELETE FROM {table} WHERE {condition}'.format(
            table=self.table, condition=condition)
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            self.db.rollback()

    def get_score(self, ip):
        # Get the score of an ip; note string values need single quotes in SQL
        sql = 'SELECT score FROM {table} WHERE {condition}'.format(
            table=self.table, condition="ip = '" + ip + "'")
        try:
            self.cursor.execute(sql)
            self.db.commit()
            return self.cursor.fetchall()[0][0]
        except:
            traceback.print_exc()
            self.db.rollback()

    def get_ip(self):
        # Return a random usable proxy
        sql = 'SELECT ip FROM {table} WHERE {condition}'.format(
            table=self.table, condition="score = " + str(MAX_SCORE))
        best_results = []
        # Look for full-score proxies first; fall back to high scorers below
        try:
            self.cursor.execute(sql)
            self.db.commit()
            best_results = self.cursor.fetchall()
            best_results = [x[0] for x in best_results]
        except:
            traceback.print_exc()
            self.db.rollback()
        if len(best_results) == 0:
            # Pick randomly from the top 20% of ips by score (at least 10)
            sql = 'SELECT ip from {table} order by score desc limit {num}'.format(
                table=self.table,
                num=str(max(10, int(0.2 * self.get_num()))))
            try:
                self.cursor.execute(sql)
                self.db.commit()
                best_results = self.cursor.fetchall()
                best_results = [x[0] for x in best_results]
            except:
                traceback.print_exc()
                self.db.rollback()
        if len(best_results) > 0:
            return choice(best_results)
        
    def get_ip_by_score(self, num):
        # Get the top-ranked proxies
        sql = 'SELECT * from {table} order by score desc limit {num}'.format(
                table=self.table,
                num=str(num))
        results = []
        try:
            self.cursor.execute(sql)
            self.db.commit()
            results = self.cursor.fetchall()
        except:
            traceback.print_exc()
            self.db.rollback()
        return results

    def change_score(self, ip, action):
        # action is either "decrease" (score minus one) or "max" (set to MAX_SCORE)
        old_score = self.get_score(ip)
        if old_score is None:
            return
        new_score = MAX_SCORE
        if action == "decrease":
            if old_score <= MIN_SCORE + 1:
                self.delete_ip(ip)
                return
            else:
                new_score = old_score - 1
        data = {'ip': ip, 'score': new_score}
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # Assemble the upsert statement
        sql = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE'.format(
            table=self.table, keys=keys, values=values)
        update = ','.join([" {key} = %s".format(key=key) for key in data])
        sql += update
        try:
            self.cursor.execute(sql, tuple(data.values()) * 2)
            self.db.commit()
        except:
            traceback.print_exc()
            self.db.rollback()

    def show_all(self):
        # Print all rows
        sql = 'SELECT * FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
            data = self.cursor.fetchall()
            print('ip\t \t\tscore')
            for d in data:
                print(d[0] + '\t' + str(d[1]))
        except:
            self.db.rollback()

    def get_num(self):
        # Return the total number of rows
        sql = 'SELECT * FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
            data = self.cursor.fetchall()
            return len(data)
        except:
            self.db.rollback()

    def get_all(self):
        # Return all ips in the pool
        sql = 'SELECT * FROM {table}'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
            data = self.cursor.fetchall()
            return [d[0] for d in data]
        except:
            self.db.rollback()
    
    def delete_all(self):
        # Delete all rows
        sql = 'truncate table {table};'.format(table=self.table)
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            self.db.rollback()

    # Destructor
    def __del__(self):
        # Close the cursor
        self.cursor.close()
        # Close the database connection
        self.db.close()
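
As a quick sanity check of the storage module, the snippet below exercises the main methods end to end (a minimal sketch: it assumes the same local MySQL instance configured above, and 127.0.0.1:8080 is a made-up sample address, not a real proxy):

# Minimal sketch: exercise the storage API end to end.
# 127.0.0.1:8080 is a hypothetical sample proxy.
client = MysqlClient()
client.insert_ip('127.0.0.1:8080')            # enters the pool with INITIAL_SCORE
print(client.get_score('127.0.0.1:8080'))     # -> 10
client.change_score('127.0.0.1:8080', 'max')  # promote to MAX_SCORE
print(client.get_ip())                        # -> '127.0.0.1:8080'
client.delete_ip('127.0.0.1:8080')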

Fetching module

Crawls proxies from several public sites and feeds them to the storage module.

  • Define the spider metaclass, which registers the per-site crawling methods on the spider class that follows (a short demo comes after the class)
# Note: a metaclass must inherit from type
class SpiderMetaClass(type):
    # Modify the attributes of the class being created and return a new class
    def __new__(cls, name, bases, attrs):
        # Collected names of the per-site crawling methods
        attrs['__ProxyFunc__'] = []
        for k, v in attrs.items():
            if 'Spider' in k:
                attrs['__ProxyFunc__'].append(k)
        return type.__new__(cls, name, bases, attrs)
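
To see what the metaclass does, here is a minimal sketch (the Demo class and its methods are hypothetical): every method whose name contains 'Spider' is collected into the __ProxyFunc__ list when the class object is created, so crawler methods register themselves just by following the naming convention.

# Hypothetical demo: methods with 'Spider' in the name are collected
# into __ProxyFunc__ at class-creation time.
class Demo(object, metaclass=SpiderMetaClass):
    def Spider_a(self):
        yield '1.1.1.1:80'
    def Spider_b(self):
        yield '2.2.2.2:8080'
    def helper(self):  # no 'Spider' in the name, so not collected
        pass

print(Demo.__ProxyFunc__)  # ['Spider_a', 'Spider_b']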
  • Define the spider class whose crawling methods the metaclass collects
class Spider(object, metaclass=SpiderMetaClass):
    def __init__(self, xila_page = 10, xici_page = 10):
        self.header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        self.xila_page = xila_page
        self.xici_page = xici_page
        
    # Crawl the proxydb site
    def Spider_proxydb(self):
        url = 'http://proxydb.net/?protocol=https&anonlvl=4&country=CN'
        r = requests.get(url, headers=self.header)
        html = r.text
        soup = BeautifulSoup(html, 'html.parser')
        Table = soup.select('div.table-responsive')[0]
        for proxy in Table.select('a'):
            yield proxy.string
    
    # Crawl the first xila_page pages of xiladaili.com
    def Spider_xila(self):
        urls = ['http://www.xiladaili.com/https/']
        if self.xila_page:
            urls = urls + ['http://www.xiladaili.com/https/'+str(i+1)+'/' for i in range(1,self.xila_page)]
        for url in urls:
            r = requests.get(url, headers=self.header)
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')
#             print(soup.select('div.mt-4'))
            Table = soup.select('div.mt-4')[0]
            for proxy in Table.select('tr')[1:]:
                yield proxy.select('td')[0].string
                
    # Crawl the first xici_page pages of xicidaili.com
    def Spider_xici(self):
        urls = ['https://www.xicidaili.com/nn/'+str(i+1) for i in range(self.xici_page)]
        for url in urls:
            r = requests.get(url, headers=self.header)
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')
            Table = soup.select('table#ip_list')[0]
            for proxy in Table.select('tr')[1:]:
                yield proxy.select('td')[1].string
                
    # Aggregate results from all crawling methods; the argument is the list attached by the metaclass
    def get_proxy(self, function_names):
        proxies = []
        for function_name in function_names:
            for proxy in getattr(self, function_name)():
                proxies.append(proxy)
        return proxies
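
Before wiring the spider into the database, it can be exercised on its own (a sketch; the sites above must still be reachable, so limit the page counts while testing):

# Sketch: run all registered crawlers and peek at the results.
spider = Spider(xila_page=1, xici_page=1)
proxies = spider.get_proxy(Spider.__ProxyFunc__)
print(len(proxies), proxies[:5])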

Combining fetching and storage

class Getter():
    def __init__(self):
        self.client = MysqlClient()
        self.spider = Spider()
    def run(self):
        for proxy in self.spider.get_proxy(self.spider.__ProxyFunc__):
            self.client.insert_ip(proxy)
getter = Getter()
database version ('8.0.15',)
getter.run()

Testing module

The tester checks every ip proxy stored in the database; to speed this up, the checks are performed asynchronously.

# Status codes that count as a pass
VALID_STATUS_CODES = [200]
# Test URL; note https is not supported here, so use http
TEST_URL = 'http://www.baidu.com'
# Number of proxies per async test batch
BATCH_SIZE = 100
class Tester(object):
    def __init__(self):
        self.client = MysqlClient()
    
    # Test a single proxy against the URL; asynchronous, so declared with async
    async def test_one_proxy(self, proxy):
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://'+proxy
                print('Testing', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout = 100) as response:
                    # Status code OK
                    if response.status in VALID_STATUS_CODES:
                        self.client.change_score(proxy, 'max')
                        print('Proxy OK', proxy)
                    else:
                        self.client.change_score(proxy, 'decrease')
                        print('Bad status', proxy)
            except:
                print('Test failed', proxy)
                traceback.print_exc()
                self.client.change_score(proxy, 'decrease')
                
    # Test every proxy in the database
    def test_all_proxy(self):
        try:
            proxies = self.client.get_all()
            # Event loop: the program runs an endless loop, coroutine functions are registered on it, and they are invoked when their events fire.
            loop = asyncio.get_event_loop()
            for i in range(0, len(proxies), BATCH_SIZE):
                test_proxies = proxies[i:i+BATCH_SIZE]
                # Build the new tasks
                # A coroutine object cannot run by itself; run_until_complete wraps each coroutine into a task when it is registered on the loop.
                # A task is a subclass of Future: it holds the coroutine's state so its result can be fetched later.
                tasks = [self.test_one_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(3)
        except:
            print('Tester error')
            traceback.print_exc()
tester = Tester()
database version ('8.0.15',)
tester.test_all_proxy()
Testing 106.14.206.26
Testing 1.197.204.251:9999
Testing 106.14.76.134
Testing 113.117.121.141
Testing 113.117.27.223
Bad status 106.14.206.26
Proxy OK 111.231.239.143
Proxy OK 101.231.104.82
Test failed 111.231.202.91
...
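
The event-loop pattern used in test_all_proxy is easier to see in isolation (a minimal sketch, independent of the proxy pool; asyncio.sleep stands in for the network call):

# Sketch of the batch pattern: wrap coroutines into tasks and drive
# them to completion on the event loop.
async def probe(n):
    await asyncio.sleep(0.1)  # stand-in for an HTTP request
    return n

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*(probe(i) for i in range(5))))
print(results)  # [0, 1, 2, 3, 4]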

Display module

The display module reports the current state of the pool through a web page.

class Weber(object):
    def __init__(self):
        self.client = MysqlClient()
        app = Flask(__name__)

        # Home page
        @app.route('/')
        def index():
            return '<h2>Welcome to Proxy Pool System</h2>'

        # Get one random usable proxy
        @app.route('/random')
        def get_proxy():
            return 'Available proxy: ' + self.client.get_ip()

        # Get the top-scored proxies
        @app.route('/count/<number>')
        def get_proxies(number):
            results = self.client.get_ip_by_score(int(number))
            df = pd.DataFrame({'ip': [x[0] for x in results],
                               'score': [x[1] for x in results]})
            html = df.to_html(classes='data', index=False,
                              bold_rows=False, header=True)
            # Add a title
            title = '<h2>Available proxies</h2>'
            html = title + html
            # Add styling (the stylesheet string was not preserved here)
            css = ''
            html = css + html
            return html

        app.run()
weber = Weber()
database version ('8.0.15',)
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [09/Jun/2020 00:02:38] "GET /count/10 HTTP/1.1" 200 -
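
With the server running, any crawler can pull a proxy from the pool over plain HTTP (a sketch; it assumes the service above is listening on port 5000 and strips the 'Available proxy: ' prefix that get_proxy prepends):

# Sketch: fetch a proxy from the running Flask service and use it.
resp = requests.get('http://127.0.0.1:5000/random')
proxy = resp.text.replace('Available proxy: ', '')
page = requests.get('http://www.baidu.com',
                    proxies={'http': 'http://' + proxy}, timeout=10)
print(proxy, page.status_code)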

Results

[Screenshot 1]
[Screenshot 2]
[Screenshot 3]
