Python 实现多进程下载数据

# -*- coding: utf-8 -*-

from elasticsearch import Elasticsearch
import requests
import os
import datetime
from urllib.request import urlopen
from multiprocessing import Process, Queue

def get_es_data(conn, es_index, plat, lang):
    """Return every hit in ``es_index`` matching the platform/language filter.

    The query requires ``languagename.keyword == lang``,
    ``platformid.keyword == plat`` and ``0 <= callsecs <= 20000``.

    The search is opened with a scroll context, and the scroll is followed
    until an empty page comes back, so the result is not capped at the
    first 10000 hits. The scroll context is released before returning so
    it does not linger on the cluster for its full keep-alive time.

    Args:
        conn: Elasticsearch client exposing ``search``, ``scroll`` and
            ``clear_scroll``.
        es_index: Name of the index to query.
        plat: Value matched against ``platformid.keyword``.
        lang: Value matched against ``languagename.keyword``.

    Returns:
        A list of raw hit dicts (the entries of ``hits.hits``) from all
        pages, in the order the cluster returned them.
    """
    scroll_ttl = '10m'
    total_query = {
        "from": 0,
        "size": 10000,
        "query": {
            "bool": {
                "must": [
                    {
                        "term": {
                            "languagename.keyword": {
                                "value": lang
                            }
                        }
                    },
                    {
                        "term": {
                            "platformid.keyword": {
                                "value": plat
                            }
                        }
                    },
                    {
                        "range": {
                            "callsecs": {
                                "gte": 0,
                                "lte": 20000
                            }
                        }
                    }
                ]
            }
        }
    }
    es_res = conn.search(index=es_index, body=total_query, scroll=scroll_ttl)
    page = es_res['hits']['hits']
    all_hits = list(page)
    scroll_id = es_res.get('_scroll_id')
    try:
        # An empty page marks the end of the scroll.
        while scroll_id and page:
            es_res = conn.scroll(scroll_id=scroll_id, scroll=scroll_ttl)
            # The server may hand back a new id; always use the latest one.
            scroll_id = es_res.get('_scroll_id', scroll_id)
            page = es_res['hits']['hits']
            all_hits.extend(page)
    finally:
        if scroll_id:
            # Free the server-side search context instead of waiting for
            # the keep-alive to expire.
            conn.clear_scroll(scroll_id=scroll_id)
    return all_hits

def download_file(url, dstPath, timeout=30):
    """Download ``url`` into ``dstPath`` unless it is reported as tiny.

    The size is probed first from the ``Content-Length`` header. Resources
    reporting fewer than 1024 bytes are skipped, and so are resources with
    no ``Content-Length`` header at all (a missing header counts as -1).
    The file is saved under the last path component of ``url``.

    Args:
        url: HTTP(S) address of the file to fetch.
        dstPath: Directory the file is written into.
        timeout: Seconds to wait on the network for both the size probe
            and the download, so a stalled server cannot block the caller
            forever.

    Returns:
        None.

    Raises:
        urllib.error.URLError: The size probe failed (includes HTTPError
            for error statuses).
        requests.RequestException: The download failed or returned an
            error status.
        OSError: The destination file could not be written.
    """
    min_size = 1 * 1024
    # Use a context manager so the probe's socket is closed immediately.
    with urlopen(url, timeout=timeout) as probe:
        file_size = int(probe.info().get('Content-Length', -1))
    if file_size < min_size:
        return
    name = os.path.basename(url)
    file_path = os.path.join(dstPath, name)
    # Stream to disk so a large file is never held fully in memory.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check the status before opening the file so an error page is
        # never saved as if it were the requested data.
        response.raise_for_status()
        with open(file_path, 'wb') as ff:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                ff.write(chunk)

def worker(task_queue, dstPath, idle_timeout=1.0):
    """Download URLs from ``task_queue`` until the queue stays empty.

    Each URL is taken with a timed ``get`` rather than an ``empty()``
    check followed by a blocking ``get()``. With several consumers,
    another process can take the last item between those two calls, which
    would leave this worker blocked forever.

    A failure on one URL is reported and skipped so the remaining URLs
    are still processed.

    Args:
        task_queue: Queue of URL strings shared between workers.
        dstPath: Directory passed through to ``download_file``.
        idle_timeout: Seconds to wait for a new item before treating the
            queue as drained and returning.

    Returns:
        None.
    """
    # multiprocessing.Queue.get raises queue.Empty on timeout.
    from queue import Empty

    while True:
        try:
            url = task_queue.get(timeout=idle_timeout)
        except Empty:
            return
        try:
            download_file(url, dstPath)
        except Exception as exc:
            # Worker boundary: report the error and keep draining the queue.
            print('download failed: {} ({})'.format(url, exc))

if __name__ == '__main__':
    # The index is named after yesterday's date with dots as separators
    # (the ISO date with '-' replaced by '.').
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    conv_time = yesterday.isoformat().replace('-', '.')

    conn = Elasticsearch([
        '192.168.10.147:9200',
        '192.168.10.148:9200',
        '192.168.10.149:9200',
        '192.168.10.150:9200',
        '192.168.10.141:9200',
    ])
    es_index = 'cr-stat-{}'.format(conv_time)

    # Make sure the download directory exists before any worker writes to it.
    dstPath = r'D:\data\other'
    if not os.path.exists(dstPath):
        os.makedirs(dstPath)

    # Query filter values and the number of worker processes.
    plat = '507'
    lang = '其他'
    p_num = 10

    # Queue every file URL found in the matching documents.
    es_data = get_es_data(conn, es_index, plat, lang)
    task_queue = Queue()
    for hit in es_data:
        for file_url in hit['_source']['files'].values():
            task_queue.put(file_url)

    # Start the workers, then wait for all of them to finish.
    p_lst = [
        Process(target=worker, args=(task_queue, dstPath))
        for _ in range(p_num)
    ]
    for proc in p_lst:
        proc.start()
    for proc in p_lst:
        proc.join()

    print('download successful')

你可能感兴趣的:(jenkins,运维)