# 获取网页中网站的标题与描述 — fetch each site's <title> and meta description
# -*- coding: utf-8 -*- """ """ import gzip, cStringIO import HTMLParser import threading import re import urllib2 def check_url(idx): urls = ['http://www.weibo.com', 'http://www.taobao.com', 'http://www.jd.com'] url = urls[idx] if idx == 2: req = urllib2.Request(url); req.add_header('Accept-Encoding', 'gzip, deflate'); f = urllib2.urlopen(req, timeout=30) html = f.read() #gzip解压缩 if html[:6] == '\x1f\x8b\x08\x00\x00\x00': html = gzip.GzipFile(fileobj = cStringIO.StringIO(html)).read() txt = html.decode('gbk') title_pattern = re.compile('<title>.+</title>') title = title_pattern.findall(txt)[0] title = title.replace('<title>','') title = title.replace('</title>','') content_pattern = re.compile('"description" content=.+>') content= content_pattern.findall(txt)[0] content = content.replace('"description" content=','') content = content.replace('/','') content = content.replace('>','') print url print title print content return web = urllib2.urlopen(url) txt = web.read() if idx==1: httpParser = HTMLParser.HTMLParser() txt = httpParser.unescape(txt).encode("utf-8") title_pattern = re.compile('<title>.+</title>') title = title_pattern.findall(txt)[0] title = title.replace('<title>','') title = title.replace('</title>','') content_pattern = re.compile('"description" content=.+>') content= content_pattern.findall(txt)[0] content = content.replace('"description" content=','') content = content.replace('/','') content = content.replace('>','') print url print title print content thrd_list = [] for idx in xrange(3): thrd = threading.Thread(target = check_url, args = [idx]) thrd.start() thrd_list.append(thrd) for thrd in thrd_list: thrd.join()获取网站状态码
import urllib2 import threading urls = ['http://www.weibo.com', 'http://www.taobao.com', 'http://www.jd.com'] def check_response(url): response = None try: response = urllib2.urlopen(url,timeout=5) print url,response.getcode() except urllib2.URLError as e: if hasattr(e, 'code'): print 'Error code:',e.code elif hasattr(e, 'reason'): print 'Reason:',e.reason finally: if response: response.close() thrd_list = [] for idx in xrange(3): thrd = threading.Thread(target = check_response, args = [urls[idx]]) thrd.start() thrd_list.append(thrd) for thrd in thrd_list: thrd.join()