import requests
# Probe request: send one POST with the form fields below and print the raw
# response body so its structure can be inspected by hand.
post_data = {'first': 'true', 'kd': 'python', 'pn': '1'}
# A timeout keeps the script from blocking forever if the server never answers.
r = requests.post(
    "http://www.lagou.com/jobs/positionAjax.json?px=default",
    data=post_data,
    timeout=30
)
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(r.text)
从返回的 JSON 数据分析可以得出我们想要的字段:
- positionName
- companyShortName
- city
- workYear
- positionAdvantage
- salary
- education
- financeStage
用于分隔每条职位记录(即每个公司的一条招聘信息)的关键字是:positionId
# spider.py
#-*-coding:utf-8-*-
import tools
import requests
import sys
# Python 2 idiom: re-expose sys.setdefaultencoding and make UTF-8 the implicit
# codec used when unicode text is converted to a byte string.
# NOTE(review): neither reload() nor sys.setdefaultencoding exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

SEARCH_URL = "http://www.lagou.com/jobs/positionAjax.json?px=default"

# Build the request for every result page and scrape it (30 pages in total).
try:
    for i in range(1, 31):
        post_data = {'first': 'true', 'kd': 'python', 'pn': i}
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.post(SEARCH_URL, data=post_data, timeout=30)
        html = r.text
        tools.fetch_content(html)
finally:
    # tools opens its output file at import time and never closes it; close it
    # here so buffered records reach disk even if a request fails part-way.
    tools.f.close()
# tools.py
#-*-coding:utf-8-*-
import time,os,cookielib,urllib2,urllib
import StringIO,gzip
# Output file shared by every write() call. It is opened in binary mode, so
# write() encodes text to bytes itself before writing.
f = open('data.txt', 'wb')


def write(positionName, companyShortName, city, workYear, positionAdvantage,
          salary, education, financeStage):
    """Append one record to the module-level output file ``f``.

    Each of the eight values is written on its own line terminated by
    ``\\r\\n``, in parameter order, followed by one extra ``\\r\\n`` that
    leaves a blank line between records.

    Values that are not already byte strings are encoded as UTF-8 explicitly,
    so the function does not depend on the interpreter's default encoding.

    :param positionName: first line of the record.
    :param companyShortName: second line of the record.
    :param city: third line of the record.
    :param workYear: fourth line of the record.
    :param positionAdvantage: fifth line of the record.
    :param salary: sixth line of the record.
    :param education: seventh line of the record.
    :param financeStage: eighth line of the record.
    """
    fields = (positionName, companyShortName, city, workYear,
              positionAdvantage, salary, education, financeStage)
    encoded = []
    for value in fields:
        # The file is binary: text must become bytes before it is written.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        encoded.append(value)
    # One write per record: every field ends with CRLF, plus a blank line.
    f.write(b'\r\n'.join(encoded) + b'\r\n\r\n')
def fj_function(url_content, beg_str, end_str, lengths):
    """Extract the text that follows ``beg_str`` up to ``end_str``.

    :param url_content: text to search.
    :param beg_str: marker that precedes the wanted value.
    :param end_str: marker that terminates the value; when it is the empty
        string no value is extracted and only the text after ``beg_str`` is
        returned as the remainder.
    :param lengths: upper index bound used for the searches and slices.
    :returns: a ``(content, obj)`` tuple. ``obj`` is the extracted value
        (``''`` when ``beg_str`` is not found or ``end_str`` is empty).
        ``content`` is the text left for further searching: it starts at the
        terminator itself, or is ``url_content`` unchanged when ``beg_str``
        is not found.
    """
    obj = ''
    start = url_content.find(beg_str, 0, lengths)
    if start < 0:
        # Marker absent: hand the input back untouched so later lookups
        # can still search all of it.
        return url_content, obj

    content = url_content[start + len(beg_str):lengths]
    if end_str != '':
        end = content.find(end_str, 0, lengths)
        if end < 0:
            # No terminator: the value runs to the end of the text. Using
            # the raw -1 as a slice bound would silently drop the last
            # character of the value.
            end = len(content)
        obj = content[0:end]
        content = content[end:lengths]
    return content, obj
def fetch_content(url_content):
    """Split a response body into per-position records and save each one.

    The text is scanned for the literal marker ``"positionId"``. The text
    between one marker and the next (or the end of the input, for the last
    marker) is treated as one record. Eight fields are pulled out of each
    record with ``fj_function`` and handed to ``write``.

    Fields are looked up in a fixed order and each lookup continues from
    where the previous match ended, so a key located before an already
    matched key in the record may come back as an empty string.

    :param url_content: response body text to scan.
    """
    marker = '"positionId"'
    marker_len = len(marker)
    # Key prefixes to extract, in the positional order write() expects.
    field_prefixes = (
        '"positionName":"',
        'companyShortName":"',
        '"city":"',
        'workYear":"',
        'positionAdvantage":"',
        'salary":"',
        'education":"',
        'financeStage":"',
    )
    # Length of the full input; used as a generous upper bound for every
    # search and slice on the progressively shorter remainders.
    lengths = len(url_content)

    while 1:
        start = url_content.find(marker, 0, lengths)
        if start < 0:
            break
        # Drop everything up to and including the marker opening this record.
        url_content = url_content[start + marker_len:lengths]
        end = url_content.find(marker, 0, lengths)
        if end < 0:
            # Last record: there is no following marker, so it runs to the
            # end of the text. Slicing with -1 would cut off its final
            # character.
            end = len(url_content)
        obj_content = url_content[:end]

        # Pick out the individual fields.
        values = []
        for prefix in field_prefixes:
            obj_content, value = fj_function(obj_content, prefix, '"', lengths)
            values.append(value)

        # Write the record to the output file.
        write(*values)