OS: Windows 7, Python 2.7
#coding=utf8
import urllib2,re,os
import threading
import time,datetime

def get_proxy_addr(urls,ports):
    proxylist = []
    p = re.compile('''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')
    for url in urls:
        res = urllib2.urlopen(url)
        pageinfo = res.read()
        #print pageinfo
        ips = p.findall(pageinfo)
        # Build entries in the desired format: [ip, port, agent, address]
        for row in ips:
            ip = row[0]
            # The page obfuscates the port via document.write(); map each
            # letter back to its digit and join them into the port number.
            port = map(lambda x:ports[x],row[1].split('+'))
            port = ''.join(port)
            agent = row[2]
            addr = row[3]
            l = [ip, port, agent, addr]
            proxylist.append(l)
    print u'Parsing finished, returning results --------------------------------------------'
    return proxylist

class ProxyCheck(threading.Thread):
    '''
    Check whether the collected proxies are usable
    and how fast they are on the local network.
    '''
    def __init__(self,proxylist):
        threading.Thread.__init__(self)
        self.proxylist = proxylist
        self.timeout = 10
        self.test_url = "http://www.baidu.com"
        self.test_str = '030173'
        self.checkedPorxyList = []

    def checkPorxy(self):
        # Step 1: enable cookie handling
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxy_server = r'http://%s:%s' %(proxy[0],proxy[1])
            # Step 2: load the proxy
            proxy_hander = urllib2.ProxyHandler({"http":proxy_server})
            # Step 3: build the opener
            try:
                opener = urllib2.build_opener(cookies, proxy_hander)
            except urllib2.URLError:
                print u'bad proxy URL'
                continue
            # Configure the request headers
            opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1')]
            # Send the request through the proxy and time it
            urllib2.install_opener(opener)
            t1 = time.time()
            try:
                req = urllib2.urlopen(self.test_url,timeout=self.timeout)
                result = req.read()
                pos = result.find(self.test_str)
                timeused = time.time() - t1
                if pos > -1:
                    self.checkedPorxyList.append((proxy[0],proxy[1],proxy[2],proxy[3],timeused))
                    print u'proxy OK',proxy[0],timeused
                else:
                    continue
            except Exception,e:
                print proxy[0],'timeout'
                continue

    def sort(self):
        # Sort in place by response time; a bare sorted() call would
        # discard its return value.
        self.checkedPorxyList.sort(key=lambda x:x[4])

    def save(self):
        path = os.getcwd()
        filename = path + '/Proxy-'+datetime.datetime.now().strftime(r'%Y%m%d%H%M%S')+'.txt'
        f = open(filename,'w+')
        for proxy in self.checkedPorxyList:
            f.write('%s %s %s %s %s \r\n'%(proxy[0],proxy[1],proxy[2],proxy[3],proxy[4]))
        f.close()

    def run(self):
        print u'proxy check started --------------------------------------'
        self.checkPorxy()
        self.sort()
        print u'saving results -----'
        self.save()
        print u'data collection finished ---------------------------------------'

if __name__=='__main__':
    urls = (r'http://www.cnproxy.com/proxy1.html',)
    ports = {"z":"3","m":"4","a":"2","l":"9","f":"0","b":"5","i":"7","w":"6","x":"8","c":"1"}
    print u'page scraping started ---------------------------------------------------'
    proxylist = get_proxy_addr(urls,ports)
    print u'proxy testing started ---------------------------------------------------'
    proxychek = ProxyCheck(proxylist)
    proxychek.start()
    proxychek.join()
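
The least obvious part of get_proxy_addr() is the port decoding: the listing page hides each port behind a document.write() call that concatenates single letters, and the ports dict maps those letters back to digits. Below is a minimal sketch of that step in isolation; the fragment 'i+x+z+l' is a made-up example, the real value comes from the second regex group of the scraped page.

#coding=utf8
# Standalone sketch of the port-decoding step used in get_proxy_addr().
ports = {"z":"3","m":"4","a":"2","l":"9","f":"0","b":"5","i":"7","w":"6","x":"8","c":"1"}

def decode_port(fragment, ports):
    # 'i+x+z+l' -> ['i','x','z','l'] -> ['7','8','3','9'] -> '7839'
    return ''.join(ports[ch] for ch in fragment.split('+'))

if __name__ == '__main__':
    print decode_port('i+x+z+l', ports)  # prints 7839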