爬取房天下二手房房价数据,进行线性回归预测

本文爬取房天下网站上烟台市的二手房房价数据,并使用线性回归对房价进行简单预测。
具体代码如下:

1.从房天下爬取烟台二手房数据信息

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import csv
import time
from sqlalchemy import create_engine

# Shared Selenium session; getHouseInfo3 and pageFun both drive this browser.
browser=webdriver.Chrome()   # start a Chrome WebDriver instance
browser.maximize_window()    # maximize the browser window
# Explicit-wait helper bound to the browser with a 20-second timeout.
# NOTE(review): `wait` is not referenced anywhere else in the visible code.
wait=WebDriverWait(browser,20)

def getHouseInfo3(url):
    """Open one listing's detail page and collect its attributes in a dict.

    Each ``.tab-cont-right .trl-item1`` element contributes one entry: the
    first line of its text is stored as the value and the (normalized) last
    line as the key.  The community name is stored under "小区名字" and the
    text of the first ``.tab-cont-right .trl-item`` element under "总价".

    Raises IndexError when the community-name or total-price element is
    missing from the page.
    """
    browser.get(url)
    # page_source is the rendered HTML; it is handed to BeautifulSoup for parsing.
    page = BeautifulSoup(browser.page_source, "html.parser")

    details = {}
    # Layout, floor area, unit price, orientation, floor and decoration fields.
    for item in page.select(".tab-cont-right .trl-item1"):
        lines = item.text.strip().split('\n')
        label = lines[-1].strip()
        # Normalize label variants so the same attribute always gets the same key.
        if "朝向" in label:
            label = label.strip("进门")
        if "楼层" in label:
            label = label[:2]
        if "地上层数" in label:
            label = "楼层"
        if "装修程度" in label:
            label = "装修"
        details[label] = lines[0].strip()

    # Community name: first line of the first ".rcont .blue" element.
    community_text = page.select(".rcont .blue")[0].text
    details["小区名字"] = community_text.strip().split('\n')[0]

    # Total price comes from a descendant (combined) selector.
    price_nodes = page.select(".tab-cont-right .trl-item")
    details["总价"] = price_nodes[0].text
    return details

# The code that scrapes a single listing page lives in pageFun.
# `domain` and `city` are the URL pieces pageFun concatenates into the listing-page
# URL; `domain` is also prefixed to each detail-page link.
domain = "http://yt.esf.fang.com/"
city = "house/"
def pageFun(i):
    """Scrape one listing page and return its houses as a DataFrame.

    Args:
        i: Page number (str or int); it is inserted into the listing URL
            as ``.../house/i3<i>/``.

    Returns:
        pandas.DataFrame with one row per successfully scraped listing.
        The frame is empty when the page shows no listings or every
        detail page failed to scrape.
    """
    page_url = domain + city + "i3" + str(i) + "/"
    print(page_url)
    # Pause before loading the listing page to throttle requests.
    time.sleep(5)
    browser.get(page_url)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    houses = soup.select(".shop_list dl")

    page_info_list = []
    for house in houses:
        # A failure on one listing is reported and skipped (best effort),
        # so the rest of the page is still collected.
        try:
            url = domain + house.select(".floatl a")[0]['href']
            print(url)
            info = getHouseInfo3(url)
            page_info_list.append(info)
            # Pause between detail pages.
            time.sleep(2)
        except Exception as e:
            print("---------->", e)

    # Build the frame once, after the loop: the name is then always bound
    # (even when `houses` is empty) and the frame is not rebuilt per listing.
    return pd.DataFrame(page_info_list)


# Database connection: SQLAlchemy engine for the local MySQL schema `pythondatabase`
# (pymysql driver, user `root` with an empty password, utf8 charset).
yconnect = create_engine('mysql+pymysql://root:@localhost:3306/pythondatabase?charset=utf8')

def intoDataBase(start_page=1, end_page=100):
    """Scrape listing pages and append each page's rows to MySQL.

    Args:
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).  The defaults
            reproduce the original fixed range of pages 1..100.

    Every page is handled independently: an error while scraping or
    writing one page is printed and the loop moves on to the next page.
    """
    for i in range(start_page, end_page + 1):
        try:
            time.sleep(5)
            df_a = pageFun(str(i))
            # Skip pages that produced no rows: writing an empty frame adds
            # nothing, and if the table does not exist yet it would be
            # created from a frame that has no data columns.
            if df_a.empty:
                print("Skipping empty page:", i)
                continue
            # Append the page to table yt_house_price_2 via the public pandas API.
            df_a.to_sql('yt_house_price_2', yconnect, schema='pythondatabase', if_exists='append')
        except Exception as e:
            print("Exception :", e)
    
# Script entry point: run the scrape only when this code is executed directly.
if __name__ == '__main__':
    intoDataBase()

2.一元线性回归模型

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

def connectDataBase():
    """Read the whole ``yt_house_price`` table from MySQL into a DataFrame.

    NOTE(review): the scraper in this file appends to ``yt_house_price_2``
    while this query reads ``yt_house_price`` -- confirm which table is the
    intended source.
    """
    engine = create_engine('mysql+pymysql://root:@localhost:3306/pythondatabase?charset=utf8')
    query = 'select * from yt_house_price'
    return pd.read_sql(sql=query, con=engine)

def dataPreprocessing(df):
    """Clean the raw scraped listings and one-hot encode the categoricals.

    Steps, in order:
      1. drop the ``index`` column if it is present;
      2. split "户型" (e.g. "3室2厅1卫") into float columns 室 / 厅 / 卫;
      3. drop every row that contains a missing value;
      4. strip the unit suffixes from 建筑面积 / 总价 / 单价 and cast to float;
      5. replace 朝向 / 装修 / 楼层 with dummy columns and drop 小区名字 / 户型;
      6. drop the dummy columns 东西, 南北, 北, 中层, 简装修 and 暂无 when
         they exist (presumably reference or uninformative levels -- TODO confirm).

    Args:
        df: Raw listings with the string columns 户型, 建筑面积, 总价, 单价,
            朝向, 装修, 楼层 and 小区名字.  The frame is NOT modified.

    Returns:
        A new DataFrame holding the numeric columns followed by the
        decoration, orientation and floor dummy columns.

    Raises:
        KeyError: if one of the required raw columns is missing.
    """
    # drop() returns a new frame, so the caller's data stays untouched and
    # the function can be called again on the same input.
    df = df.drop(columns=["index"], errors="ignore")

    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    rooms = df["户型"].str.extract(r"(\d+)室(\d+)厅(\d+)卫").astype(float)
    df["室"] = rooms[0]
    df["厅"] = rooms[1]
    df["卫"] = rooms[2]

    # Rows whose layout did not match the pattern (or that miss any other
    # field) are removed here, before the numeric conversions below.
    df = df.dropna(how="any").copy()

    df["建筑面积"] = df["建筑面积"].str.replace("平米", "", regex=False).astype(float)
    df["总价"] = df["总价"].str.replace("万", "", regex=False).astype(float)
    df["单价"] = df["单价"].str.replace("元/平米", "", regex=False).astype(float)

    df_direction = pd.get_dummies(df["朝向"])
    df_decoration = pd.get_dummies(df["装修"])
    df_floor = pd.get_dummies(df["楼层"])

    df = df.drop(columns=["小区名字", "户型", "朝向", "楼层", "装修"])
    df = pd.concat([df, df_decoration, df_direction, df_floor], axis=1)

    # Not every category shows up in every data set, so a missing dummy
    # column is ignored rather than raising KeyError.
    unused_dummies = ["东西", "南北", "北", "中层", "简装修", "暂无"]
    return df.drop(columns=unused_dummies, errors="ignore")

def scatterPlot(df):
    """Preprocess the listings and draw a floor-area vs. total-price scatter plot.

    The raw frame is passed through ``dataPreprocessing`` and only listings
    with 建筑面积 <= 400 are kept for plotting and modelling.

    Args:
        df: Raw listings as returned by ``connectDataBase``.

    Returns:
        Tuple ``(area, price, df1)``: ``area`` and ``price`` are the
        single-column DataFrames 建筑面积 and 总价 of the filtered data, and
        ``df1`` is the full filtered, preprocessed frame.
    """
    df = dataPreprocessing(df)

    # The IPython magic "%matplotlib inline" was removed from this function:
    # it is not Python syntax and made the script a SyntaxError outside a
    # notebook.  In Jupyter, run it once at the top of a cell instead.

    # Keep only listings of at most 400 square metres.
    df1 = df[df["建筑面积"] <= 400]
    area = df1[["建筑面积"]]
    price = df1[["总价"]]
    plt.scatter(area, price)

    # SimHei is requested so the Chinese axis labels can be rendered.
    plt.xlabel('建筑面积/平米', fontproperties='SimHei', fontsize=14)
    plt.ylabel('总价/万', fontproperties='SimHei', fontsize=14)
    plt.show()
    return area, price, df1

def modelTraining(area,price):
    """Fit a linear regression of ``price`` on ``area`` and return the fitted model.

    The fitted intercept and coefficients are printed before returning.
    """
    fitted = LinearRegression().fit(area, price)
    print(fitted.intercept_, fitted.coef_)
    return fitted

def scatterPlotFinall(area, model,df1):
    """Overlay the model's predictions on the observed area/price scatter plot.

    Predictions are computed from the ``area`` argument; the scatter points
    and the x-values of the red prediction line are taken from ``df1``.
    """
    predicted = model.predict(area)

    plt.figure(figsize=(15, 10))
    observed_area = df1[["建筑面积"]]
    observed_price = df1[["总价"]]
    # Observed data as points, predictions as a red line.
    plt.scatter(observed_area, observed_price)
    plt.plot(observed_area, predicted, color="red")

    label_style = {"fontproperties": 'SimHei', "fontsize": 14}
    plt.xlabel('建筑面积/平米', **label_style)
    plt.ylabel('总价/万', **label_style)
    plt.show()
    
def main():
    """Run the simple-regression pipeline: load, plot, fit, plot the fit."""
    raw = connectDataBase()
    area, price, filtered = scatterPlot(raw)
    fitted = modelTraining(area, price)
    scatterPlotFinall(area, fitted, filtered)
    
# Script entry point for the simple (one-feature) regression step.
if __name__ == '__main__':
    main()

3.多元线性回归模型

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
def features(df1):
    """Select the predictors and the target for multiple linear regression.

    Returns ``(X, Y)``: ``X`` holds the 17 feature columns listed below and
    ``Y`` is the 总价 column of ``df1``.
    """
    # The earlier step used a single feature; here several are combined.
    numeric = ["建筑面积", "室", "厅", "卫"]
    decoration = ["中装修", "毛坯", "精装修", "豪华装修"]
    direction = ["东", "东北", "东南", "南", "西", "西北", "西南"]
    floor = ["低层", "高层"]
    selected = numeric + decoration + direction + floor
    return df1[selected], df1["总价"]

def main(X,Y): 
    """Fit a multiple linear regression on an 80/20 split and print its metrics.

    Prints the fitted intercept, the coefficients and the value returned by
    ``model.score`` on the held-out test set.
    """
    # Training set : test set = 4 : 1, with a fixed seed for a repeatable split.
    split = train_test_split(X, Y, test_size=0.2, random_state=123)
    x_train, x_test, y_train, y_test = split

    # Multiple linear regression.
    model = LinearRegression().fit(x_train, y_train)

    # Intercept and regression coefficients, each followed by a blank separator.
    report = (
        ("model.intercept_为:\n", model.intercept_),
        ("model.coef_为:\n", model.coef_),
    )
    for heading, value in report:
        print(heading, value)
        print('\n')

    # Predictions on the test set (computed but not used further).
    predict_result = model.predict(x_test)
    score = model.score(x_test, y_test)
    print('R-scores:\n', score)

# Script entry point for the multiple-regression step.
if __name__ == '__main__':
    df = connectDataBase()
    # scatterPlot preprocesses and filters the data (and draws the scatter plot);
    # only df1 is used below, area and price are not.
    area,price,df1 = scatterPlot(df)
    X,Y = features(df1)
    main(X,Y)

你可能感兴趣的:(python,Mysql)