蘑菇租房数据爬取

主要步骤:
1、分析蘑菇租房的请求url,观察到总共是28页数据,数据以JSON格式返回,还比较简单,不涉及到解析html
2、通过规律,构建不同的请求参数,循环通过requests调用url进行爬取数据
3、把爬来的数据存储到本地Excel中

#coding:utf-8
import requests
import json
import xlsxwriter
import time
import random


# Build the POST payload for every result page.
def url_data():
    """Return the list of request payloads, one dict per result page.

    There are 28 pages in total (see the notes at the top of the file),
    so pages 1..28 are generated. Each payload carries:
      - currentPage: 1-based page index
      - cityId: 289 (presumably Shanghai, matching the output
        filename 上海租房数据.xlsx — verify against the site)
      - showCount: 18 listings per page
    """
    print("开始进行爬虫程序")
    dataList = []
    for num in range(1, 29):
        dataList.append({
            'currentPage': num,
            'cityId': 289,
            'showCount': 18
        })
        # num doubles as the 1-based progress counter.
        print("成功构建第" + str(num) + "条请求参数")
    print("成功构建所有请求参数,一共构建了" + str(len(dataList)) + '条请求参数')
    return dataList

# Fields copied from each listing in the JSON response into an output row.
_ROOM_FIELDS = ('title', 'subTitle', 'showPrice', 'location', 'lat',
                'lng', 'detailDesc', 'cityId', 'communityId')


# Fetch the listing data for every payload and collect it into a list.
def data_parser(dataList, root_url, headers):
    """POST each payload in *dataList* to *root_url* and return the rooms.

    Args:
        dataList: list of payload dicts (see url_data()).
        root_url: listing API endpoint.
        headers: HTTP headers to send with every request.

    Returns:
        A list of dicts, one per listing, keyed by _ROOM_FIELDS.

    The response is expected to be JSON with the listings under
    content -> list (established by the indexing below); any network or
    decoding error propagates to the caller.
    """
    roomList = []
    count = 1
    for data in dataList:
        # Random pause (0-8 s) before EVERY request so the API is not
        # hammered — the original slept only once, before the first
        # request, which defeated the throttling.
        time.sleep(random.random() * 8)
        response = requests.post(root_url, headers=headers, data=data)
        dataJson = json.loads(response.text)
        for resInfo in dataJson['content']['list']:
            roomList.append({field: resInfo[field] for field in _ROOM_FIELDS})
            print('成功把第' + str(count) + '条房子信息存储到列表')
            count += 1
    print("房子信息获取成功,一共获取到" + str(count - 1) + "条数据")
    return roomList


# Column order for the spreadsheet: used both for the header row and as
# the key order when writing each listing, so the two can never drift.
_EXCEL_COLUMNS = ('title', 'subTitle', 'showPrice', 'location', 'lat',
                  'lng', 'detailDesc', 'cityId', 'communityId')


# Store the scraped rows in an Excel workbook.
def sort_excel(roomList):
    """Write *roomList* to 上海租房数据.xlsx in the working directory.

    Args:
        roomList: list of listing dicts keyed by _EXCEL_COLUMNS
            (as produced by data_parser()).

    Row 0 is the header; each listing occupies one subsequent row.
    """
    print("开始存储数据到Excel")
    book = xlsxwriter.Workbook('上海租房数据.xlsx')  # create the workbook
    sheet = book.add_worksheet()  # default first worksheet
    # Header row — (0, col) is equivalent to the 'A1'..'I1' cells the
    # original wrote one by one.
    for col, name in enumerate(_EXCEL_COLUMNS):
        sheet.write(0, col, name)
    # Data rows start at row 1, just below the header.
    for row, roomInfo in enumerate(roomList, start=1):
        for col, name in enumerate(_EXCEL_COLUMNS):
            sheet.write(row, col, roomInfo[name])
        # row doubles as the 1-based progress counter.
        print("成功存储第" + str(row) + "条房子数据到excel中")
    print("存储完毕,一共存储了" + str(len(roomList)) + "条房子数据")
    book.close()
    print("爬虫程序结束")


if __name__ == '__main__':
    # Listing API endpoint; the origin/referer/User-Agent headers mimic a
    # browser visit to the public listing pages so the request is accepted.
    api_url = 'https://api.mgzf.com/room-find-web/find/list'
    request_headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'origin': 'http://www.mgzf.com',
        'referer': "http://www.mgzf.com/list/pg28/",
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Pipeline: build the per-page payloads, fetch every page, then dump
    # the collected rows to the Excel workbook.
    payloads = url_data()
    rooms = data_parser(payloads, api_url, request_headers)
    sort_excel(rooms)

你可能感兴趣的:(蘑菇租房数据爬取)