用 python 实现一个多线程网页下载器

转自http://blog.csdn.net/lanphaday/archive/2009/04/16/4083852.aspx

学习之

#!/usr/bin/env python
# -*- coding:utf-8 -*-  
import urllib, httplib  
import thread  
import time  
from Queue import Queue, Empty, Full  
HEADERS = {"Content-type": "application/x-www-form-urlencoded",  
                        'Accept-Language':'zh-cn',  
                        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0;Windows NT 5.0)',  
                        "Accept": "text/plain"}  
UNEXPECTED_ERROR = -1  
POST = 'POST'  
GET = 'GET'  
def base_log(msg):  
    print msg  
def base_fail_op(task, status, log):  
    log('fail op. task = %s, status = %d'%(str(task), status))  
def get_remote_data(tasks, results, fail_op = base_fail_op, log = base_log):  
    while True:  
        task = tasks.get()  
        try:  
            tid = task['id']  
            hpt = task['conn_args'] # hpt <= host:port, timeout  
        except KeyError, e:  
            log(str(e))  
            continue  
        log('thread_%s doing task %d'%(thread.get_ident(), tid))  
        #log('hpt = ' + str(hpt))  
        conn = httplib.HTTPConnection(**hpt)  
              
        try:  
            params = task['params']  
        except KeyError, e:  
            params = {}  
        params = urllib.urlencode(params)  
        #log('params = ' + params)  
          
        try:  
            method = task['method']  
        except KeyError:  
            method = 'GET'  
        #log('method = ' + method)  
          
        try:  
            url = task['url']  
        except KeyError:  
            url = '/'  
        #log('url = ' + url)  
          
        headers = HEADERS  
        try:  
            tmp = task['headers']  
        except KeyError, e:  
            tmp = {}  
        headers.update(tmp)  
        #log('headers = ' + str(headers))  
        headers['Content-Length'] = len(params)  
          
        try:  
            if method == POST:  
                conn.request(method, url, params, headers)  
            else:  
                conn.request(method, url + params)  
            response = conn.getresponse()  
        except Exception, e:  
            log('request failed. method = %s, url = %s, params = %s headers = %s'%(  
                        method, url, params, headers))  
            log(str(e))  
            fail_op(task, UNEXPECTED_ERROR, log)  
            continue  
              
        if response.status != httplib.OK:  
            fail_op(task, response.status, log)  
            continue  
              
        data = response.read()  
        results.put((tid, data), True)  
          
class HttpPool(object):  
    def __init__(self, threads_count, fail_op, log):  
        self._tasks = Queue()  
        self._results = Queue()  
          
        for i in xrange(threads_count):  
            thread.start_new_thread(get_remote_data,(self._tasks, self._results, fail_op, log))  
              
    def add_task(self, tid, host, url, params, headers = {}, method = 'GET', timeout = None):  
        task = {  
            'id' : tid,  
            'conn_args' : {'host' : host} if timeout is None else {'host' : host, 'timeout' : timeout},  
            'headers' : headers,  
            'url' : url,  
            'params' : params,  
            'method' : method,  
            }  
        try:  
            self._tasks.put_nowait(task)  
        except Full:  
            return False  
        return True  
          
    def get_results(self):  
        results = []  
        while True:  
            try:  
                res = self._results.get_nowait()  
            except Empty:  
                break  
            results.append(res)  
        return results  
          
def test_google(task_count, threads_count):  
    hp = HttpPool(threads_count, base_fail_op, base_log)  
    for i in xrange(task_count):  
        if hp.add_task(i,  
                'www.google.cn',  
                '/search?',  
                {'q' : 'lai'},  
#               method = 'POST'  
                ):  
            print 'add task successed.'  
              
    while True:  
        results = hp.get_results()  
        if not results:  
            time.sleep(1.0 * random.random())  
        for i in results:  
            print i[0], len(i[1])  
#           print unicode(i[1], 'gb18030')  
              
if __name__ == '__main__':  
    import sys, random  
    task_count, threads_count = int(sys.argv[1]), int(sys.argv[2])  
    test_google(task_count, threads_count) 

有兴趣想尝试运行的朋友,可以把它保存为 xxxx.py,然后执行 python xxxx.py 10 4,其中 10 表示向 google.cn 请求 10 次查询,4 表示由 4 条线程来执行这些任务。

你可能感兴趣的:(多线程,python)