终于实现了登录淘宝,这个验证码机制困扰了我好几天。
代码中提供验证码有两种方式:第一种是通过webbrowser的open直接在浏览器中打开含有验证码的图片,第二种是将其以JPEG格式保存到
C:\\Users\\Administrator\\Desktop\\checkcode.jepg。你可以根据自己主机的用户名更改路径。同时这个代码必须先指定用户名和密码,也
可以实时输入用户名和密码,小小修改一下代码就可以。
先是根据HttpFox分析网页数据,之后再使用正则抠出你想要的数据,将其显示出来。过几天可能会写一个从淘宝上抓取信息的爬虫,现在
还没有使用各种爬虫框架,基本都是使用urllib、urllib2等比较基础的包,过一阶段可能会学习框架。我也只是一个菜鸟,一个想要飞得
更高的菜鸟罢了。代码里我写了比较详细的注释,基本逻辑就是:初始化设置cookie等->发送post数据,从返回信息中抓取验证码->再次发送
携带验证码的post数据->从返回信息中提取登录状态。
# -*- coding: utf-8 -*-
import urllib
import urllib2
import cookielib #设置opener
import re #正则表达式
import webbrowser #打开界面
# Taobao login endpoint: the login page is fetched from it and the login
# form is POSTed to it.
tbLoginUrl = "https://login.taobao.com/member/login.jhtml"

# URL of the captcha image. Stays empty until one is extracted from a
# login response.
checkCodeUrl = ""
#post请求头部
headers = {
'x-requestted-with': 'XMLHttpRequest',
'Accept-Language': 'zh-cn',
'Accept-Encoding': 'gzip, deflate',
'ContentType': 'application/x-www-form-urlencoded; chartset=UTF-8',
'Host': 'login.taobao.com',
'DNT': 1,
'Cache-Control': 'no-cache',
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Referer' : 'https://login.taobao.com/member/login.jhtml?redirectURL=http%3A%2F%2Fwww.taobao.com%2F',
'Connection' : 'Keep-Alive'
}
# Account credentials. Fill these in before running.
username = ""
password = ""
# Alternatively, prompt for them at run time:
#username = raw_input("Please input your username of taobao: ")
#password = raw_input("Please input your password of taobao: ")

# Form fields submitted with the login POST.
postData = {
    # Credentials.
    "TPL_username": username,
    "TPL_password": password,
    # Captcha flag; switched to "true" by getCheckCode() once the user has
    # typed a captcha (which is stored under "TPL_checkcode").
    "need_check_code": "false",
    # Remaining fields are sent as-is.
    # NOTE(review): presumably copied from a captured browser login request;
    # their individual meanings are not established anywhere in this file.
    "loginsite": 0,
    "newlogin": 1,
    "TPL_redirect_url": "",
    "from": "tbTop",
    "fc": "default",
    "style": "default",
    "css_style": "",
    "tid": "",
    "support": "000001",
    "CtrlVersion": "1,0,0,7",
    "loginType": 3,
    "minititle": "",
    "minipara": "",
    "umto": "NAN",
    "pstrong": 2,
    "llnick": "",
    "sign": "",
    "need_sign": "",
    "isIgnore": "",
    "full_redirect": "",
    "popid": "",
    "callback": "1",
    "guf": "",
    "not_duplite_str": "",
    "need_user_id": "",
    "poy": "",
    "gvfdcname": 10,
    "from_encoding": "",
    "sub": "",
    "allp": "",
    "action": "Authenticator",
    "event_submit_do_login": "anything",
    "longLogin": 0
}
# Main login routine.
def loginToTaobao(proxy=None):
    """Log in to Taobao, asking the user for a captcha when one is offered.

    Steps:
      1. Install a global urllib2 opener that keeps cookies between requests.
      2. GET the login page.
      3. POST the login form; sendPostData() records any captcha image URL
         found in the response in the module-level ``checkCodeUrl``.
      4. If a captcha URL was recorded: open it in the browser, let
         getCheckCode() save the image and read the user's answer into
         ``postData``, then POST the form again.

    Args:
        proxy: Optional proxy URL used for plain ``http`` requests, e.g.
            ``'http://host:port'``. ``None`` (the default) installs no
            explicit proxy handler. The previous code built a ProxyHandler
            for ``http://120.193.146.97:843`` but discarded it without
            passing it to the opener, so no proxy was ever in effect; the
            default keeps that behaviour.
    """
    handlers = []
    if proxy:
        handlers.append(urllib2.ProxyHandler({'http': proxy}))
    # LWPCookieJar stores the cookies received from each response so that
    # later requests through the same opener send them back.
    cookiejar = cookielib.LWPCookieJar()
    handlers.append(urllib2.HTTPCookieProcessor(cookiejar))
    handlers.append(urllib2.HTTPHandler)
    urllib2.install_opener(urllib2.build_opener(*handlers))

    # Fetch the login page first. The body itself is not used; the request
    # goes through the cookie-aware opener (presumably to pick up session
    # cookies before the POST).
    loginPage = urllib2.urlopen(tbLoginUrl)
    try:
        loginPage.read()
    finally:
        loginPage.close()

    # First attempt, without a captcha.
    sendPostData(tbLoginUrl, postData, headers)

    # Only go through the captcha round-trip when a URL was actually found.
    # A plain truthiness test also covers a False left in the global by a
    # failed extraction, which `!= ""` used to let through.
    if not checkCodeUrl:
        return

    # Show the captcha in the browser and also save it locally / prompt for it.
    webbrowser.open_new_tab(checkCodeUrl)
    getCheckCode(checkCodeUrl)
    # Second attempt, now carrying the captcha fields set by getCheckCode().
    sendPostData(tbLoginUrl, postData, headers)
# POST the login form to the login URL.
def sendPostData(url, data, header):
    """POST *data* to *url* and print the outcome of the login attempt.

    Side effects:
      * Prints the HTTP status, the response headers and the decoded body.
      * While the module-level ``checkCodeUrl`` is still empty, tries to
        extract a captcha image URL from the response and stores it there
        ("" when none is found).
      * Prints the parsed result and a success / failure line.

    Args:
        url: Address to POST to.
        data: Mapping of form fields; URL-encoded before sending.
        header: Mapping of request headers.
    """
    # Must come before the first use of the name in this function: a
    # `global` statement after the name has been read is a SyntaxWarning on
    # Python 2 and a SyntaxError on Python 3.
    global checkCodeUrl

    print("+" * 20 + "sendPostData" + "+" * 20)
    request = urllib2.Request(url, urllib.urlencode(data), header)
    response = urllib2.urlopen(request)
    try:
        text = response.read().decode("gbk")
        info = response.info()
        status = response.getcode()
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
    print(status)
    print(info)
    print("Response: %s" % text)

    # No captcha URL known yet: look for one in this response.
    if not checkCodeUrl:
        # getIdenCode() may signal "not found" with a falsy non-string;
        # normalise to "" so the global keeps its documented sentinel.
        checkCodeUrl = getIdenCode(text) or ""
        print(checkCodeUrl)

    result = handleResponseText(text)
    print(result)
    if result["state"]:
        print("successfully login in!")
    else:
        print("failed to login in, error message:  %s" % result["message"])
# Extract the captcha image URL from a login response.
def getIdenCode(page):
    """Return the captcha image URL embedded in *page*.

    Searches for the first ``ccurl":"<url>"`` fragment and returns the text
    between the quotes.

    Args:
        page: Decoded text of a login response.

    Returns:
        The captured URL, or ``""`` when the fragment is missing or its value
        is empty. ``""`` (rather than the previous ``False``) matches the
        "no captcha URL" sentinel used for the module-level ``checkCodeUrl``,
        so comparisons against ``""`` elsewhere in the file behave correctly.
    """
    found = re.search(r'ccurl":"(.*?)"', page, re.S)
    url = found.group(1) if found else ""
    if url:
        print(url)
        return url
    # Nothing usable in the response.
    print(u"没有找到验证码内容")
    return ""
# Extract the login status from the response text.
def handleResponseText(text):
    """Parse a login response into a small result dict.

    The fields are pulled out with regular expressions that tolerate any
    delimiter after the value. Previously every comma was first replaced by
    a space and the state / code patterns required that space, so a field
    directly followed by ``}`` (e.g. ``{"state":true}``) was missed, and a
    message containing a comma or space could not be captured.

    Args:
        text: Decoded body of the login response.

    Returns:
        A dict with three keys:
          "state":   True only when the response's "state" field is the
                     literal ``true``; otherwise False.
          "message": UTF-8 encoded text of the "message" field, or "" when
                     the login succeeded or no message was found.
          "code":    Text of the last "code" field in the response, or ""
                     when the login succeeded or no code was found.
    """
    print("+" * 20 + "handleResponseText" + "+" * 20)
    responseData = {"state": False, "message": "", "code": ""}

    stateMatch = re.search(r'"state"\s*:\s*(\w+)', text)
    if stateMatch is not None and stateMatch.group(1) == "true":
        # Successful login: message and code are left empty.
        responseData["state"] = True
        return responseData

    # Failure (or unrecognised response): collect whatever details exist.
    # The message pattern accepts any characters up to the closing quote,
    # including backslash-escaped quotes.
    messageMatch = re.search(r'"message"\s*:\s*"((?:[^"\\]|\\.)*)"', text)
    if messageMatch is not None:
        responseData["message"] = messageMatch.group(1).encode("utf-8")
    else:
        print("failed to get the error message")

    # The last occurrence wins, as with the earlier greedy `.+"code":` match.
    codes = re.findall(r'"code"\s*:\s*(\w+)', text)
    if codes:
        responseData["code"] = codes[-1]
    else:
        print("failed to get the error code")
    return responseData
# Save the captcha image locally and ask the user to type it in.
def getCheckCode(url, path="C:\\Users\\Administrator\\Desktop\\checkcode.jepg"):
    """Download the captcha image, prompt for its text and update postData.

    On HTTP 200 the image is written to disk, the user is asked to type the
    captcha, and the module-level ``postData`` gets ``TPL_checkcode`` set to
    the answer and ``need_check_code`` set to ``"true"``. On any other status
    only a failure line is printed and ``postData`` is left untouched.

    Args:
        url: Address of the captcha image.
        path: Where to save the image. Defaults to the location this script
            has always used; if that file's directory does not exist on this
            machine, the image is saved under the same file name in the
            system temporary directory instead of failing.
    """
    import os
    import tempfile

    print("+" * 20 + "getCheckCode" + "+" * 20)
    response = urllib2.urlopen(url)
    try:
        status = response.getcode()
        picData = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    if status != 200:
        print("failed to get Check Code, status:  %s" % status)
        return

    # The default path only exists on the original author's machine; fall
    # back to the temp directory rather than crashing in open().
    targetDir = os.path.dirname(path)
    if targetDir and not os.path.isdir(targetDir):
        path = os.path.join(tempfile.gettempdir(), os.path.basename(path))

    # `with` closes the file even if the write raises.
    with open(path, "wb") as localPic:
        localPic.write(picData)

    print("请到%s,打开验证码图片" % path)
    # Strip stray whitespace so it is not submitted as part of the captcha.
    checkCode = raw_input("请输入验证码:").strip()
    print("%s %s" % (checkCode, type(checkCode)))
    postData["TPL_checkcode"] = checkCode
    postData["need_check_code"] = "true"
# Script entry point: print a banner, then run the login flow.
if __name__ == "__main__":
    border = "-" * 54
    plusRun = "+" * 20
    print(border)
    print("|" + plusRun + "京东放养的爬虫" + plusRun + "|")
    print(border + "\n\n")
    loginToTaobao()