本文爬取房天下网站上的烟台二手房房价数据,并使用线性回归对房价进行简单预测。
具体代码如下:
1.从房天下爬取烟台二手房数据信息
import csv
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from sqlalchemy import create_engine
# Shared Selenium session: getHouseInfo3 and pageFun both drive this browser.
browser=webdriver.Chrome() # start the Chrome browser
browser.maximize_window() # maximize the browser window
# NOTE(review): `wait` is not referenced anywhere in the visible code.
wait=WebDriverWait(browser,20) # explicit-wait helper bound to the browser (timeout argument 20)
def getHouseInfo3(url):
    """Scrape one listing's detail page and return its attributes as a dict.

    Loads ``url`` in the shared module-level ``browser``, parses the rendered
    HTML and stores every ``.tab-cont-right .trl-item1`` entry as a
    label -> value pair, then adds the community name ("小区名字") and the
    total price ("总价").

    Raises:
        IndexError: if the community-name or total-price element is not
            found on the page.
    """
    browser.get(url)
    # page_source is the rendered HTML; any parser (regex, css, xpath, bs4)
    # could be used on it.
    page = BeautifulSoup(browser.page_source, "html.parser")

    details = {}
    # Layout, floor area, unit price, orientation, floor and decoration.
    for item in page.select(".tab-cont-right .trl-item1"):
        lines = item.text.strip().split('\n')
        # First line of the entry is the value, last line is its label.
        value = lines[0].strip()
        label = lines[-1].strip()
        # Collapse the label variants onto one key per attribute.
        if "朝向" in label:
            label = label.strip("进门")
        if "楼层" in label:
            label = label[:2]
        if "地上层数" in label:
            label = "楼层"
        if "装修程度" in label:
            label = "装修"
        details[label] = value

    # Community name, then total price (descendant selectors).
    community = page.select(".rcont .blue")[0].text.strip().split('\n')[0]
    details["小区名字"] = community
    price_nodes = page.select(".tab-cont-right .trl-item")
    details["总价"] = price_nodes[0].text
    return details
# The code that scrapes one result page lives in a function (pageFun);
# these two constants are the pieces it concatenates into the page URL.
domain = "http://yt.esf.fang.com/"
city = "house/"
def pageFun(i):
    """Scrape one page of the listing index into a DataFrame.

    Args:
        i: Page number. Existing callers pass a string; any value whose
            ``str()`` belongs in the URL is accepted.

    Returns:
        pandas.DataFrame with one row per listing whose detail page was
        scraped successfully (empty if none were).
    """
    page_url = domain + city + "i3" + str(i) + "/"
    print(page_url)
    # Pause before each index-page request.
    time.sleep(5)
    browser.get(page_url)
    soup = BeautifulSoup(browser.page_source, "html.parser")

    page_info_list = []
    for house in soup.select(".shop_list dl"):
        # Best effort: a listing that fails to parse is reported and skipped
        # so the remaining listings on the page are still collected.
        try:
            href = house.select(".floatl a")[0]['href']
            # urljoin handles root-relative hrefs without producing "//" and
            # leaves already-absolute hrefs intact.
            url = urljoin(domain, href)
            print(url)
            page_info_list.append(getHouseInfo3(url))
            # Pause between detail-page requests.
            time.sleep(2)
        except Exception as e:
            print("---------->", e)

    # One row per scraped listing.
    return pd.DataFrame(page_info_list)
# Connect to the database
# NOTE(review): the connection URL (user root, empty password) is hard-coded
# here and repeated inside connectDataBase — consider a single configured value.
yconnect = create_engine('mysql+pymysql://root:@localhost:3306/pythondatabase?charset=utf8')
def intoDataBase(first_page=1, last_page=100):
    """Scrape listing pages and append each page's rows to the database.

    Args:
        first_page: First page number to scrape (inclusive, default 1).
        last_page: Last page number to scrape (inclusive, default 100).

    A failure on one page is printed and does not stop the remaining pages.
    """
    for page_no in range(first_page, last_page + 1):
        try:
            time.sleep(5)
            df_a = pageFun(str(page_no))
            # Nothing scraped: skip instead of attempting to write a frame
            # that has no columns.
            if df_a.empty:
                print("No listings scraped from page", page_no)
                continue
            # Write through pandas; the frame's index is stored as well
            # (pandas default), which dataPreprocessing later removes.
            # NOTE(review): this appends to 'yt_house_price_2' while
            # connectDataBase reads 'yt_house_price' — confirm which is intended.
            df_a.to_sql('yt_house_price_2', con=yconnect,
                        schema='pythondatabase', if_exists='append')
        except Exception as e:
            print("Exception :", e)
if __name__ == '__main__':
    # Shut the browser down even when the crawl aborts, so no Chrome/driver
    # process is left running after the script ends.
    try:
        intoDataBase()
    finally:
        browser.quit()
2.一元线性回归模型
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Read the data from the database
def connectDataBase():
    """Return every row of the ``yt_house_price`` table as a DataFrame."""
    engine = create_engine('mysql+pymysql://root:@localhost:3306/pythondatabase?charset=utf8')
    return pd.read_sql(sql='select * from yt_house_price', con=engine)
# Data preprocessing
def dataPreprocessing(df):
    """Convert the raw scraped listings into a numeric feature table.

    Steps, in order:
      1. drop the stored ``index`` column if present;
      2. split "户型" into the room counts "室", "厅", "卫" (floats);
      3. drop every row that has a missing value in any column;
      4. strip the unit suffixes from "建筑面积", "总价", "单价" and cast
         them to float;
      5. one-hot encode "装修", "朝向" and "楼层";
      6. drop the text columns and a fixed set of indicator columns
         ("东西", "南北", "北", "中层", "简装修", "暂无") when present.

    Args:
        df: Raw listings as read from the database. It is not modified.

    Returns:
        A new DataFrame holding the numeric columns followed by the
        decoration, orientation and floor indicator columns.

    Raises:
        KeyError: if one of the required source columns is missing.
    """
    # Work on a private copy so the caller's frame is never mutated and the
    # function can be called repeatedly on the same input.
    df = df.drop(columns=["index"], errors="ignore").copy()

    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    rooms = df["户型"].str.extract(r"(\d+)室(\d+)厅(\d+)卫").astype(float)
    for position, column in enumerate(["室", "厅", "卫"]):
        df[column] = rooms[position]

    # Rows whose layout did not match the pattern (or that lack any other
    # value) are discarded here.
    df = df.dropna(how="any").copy()

    unit_suffixes = {"建筑面积": "平米", "总价": "万", "单价": "元/平米"}
    for column, suffix in unit_suffixes.items():
        df[column] = df[column].str.replace(suffix, "", regex=False).astype(float)

    # Same concatenation order as before: decoration, orientation, floor.
    indicators = [pd.get_dummies(df[column]) for column in ("装修", "朝向", "楼层")]
    df = df.drop(columns=["小区名字", "户型", "朝向", "楼层", "装修"])
    df = pd.concat([df] + indicators, axis=1)

    # These indicator columns are removed when present; a category that does
    # not occur in the data no longer raises KeyError.
    dropped_indicators = ["东西", "南北", "北", "中层", "简装修", "暂无"]
    return df.drop(columns=dropped_indicators, errors="ignore")
# Scatter plot of the data
def scatterPlot(df, max_area=400):
    """Preprocess the listings and plot total price against floor area.

    Args:
        df: Raw listings; passed through dataPreprocessing first.
        max_area: Listings whose "建筑面积" exceeds this value are left out
            of the plot and of the returned frames (default 400, the value
            that used to be hard-coded).

    Returns:
        Tuple ``(area, price, df1)``: the single-column "建筑面积" frame,
        the single-column "总价" frame, and the filtered feature table they
        were taken from.
    """
    df = dataPreprocessing(df)
    # NOTE: in a Jupyter notebook run ``%matplotlib inline`` in a cell of its
    # own before calling this; the magic is not valid Python inside a
    # function of a .py file.
    df1 = df[df["建筑面积"] <= max_area]
    area = df1[["建筑面积"]]
    price = df1[["总价"]]
    plt.scatter(area, price)
    plt.xlabel('建筑面积/平米', fontproperties='SimHei', fontsize=14)
    plt.ylabel('总价/万', fontproperties='SimHei', fontsize=14)
    plt.show()
    return area, price, df1
# Fit with linear regression
def modelTraining(area, price):
    """Fit a LinearRegression of ``price`` on ``area``.

    Prints the fitted intercept and coefficients, then returns the fitted
    model.
    """
    # fit() hands back the estimator itself, so the calls can be chained.
    model = LinearRegression().fit(area, price)
    print(model.intercept_, model.coef_)
    return model
# Draw the predicted prices and the original data points in one figure
# (scatter of floor area against total price plus the fitted line)
def scatterPlotFinall(area, model, df1):
    """Overlay the model's predictions on the area/price scatter plot.

    Args:
        area: Feature frame handed to ``model.predict``.
        model: Fitted regressor exposing ``predict``.
        df1: Frame whose "建筑面积" and "总价" columns are plotted.
    """
    predicted = model.predict(area)
    plt.figure(figsize=(15, 10))
    # The plotted points are read from df1, not from the ``area`` argument.
    xs = df1[["建筑面积"]]
    ys = df1[["总价"]]
    plt.scatter(xs, ys)
    plt.plot(xs, predicted, color="red")
    plt.xlabel('建筑面积/平米', fontproperties='SimHei', fontsize=14)
    plt.ylabel('总价/万', fontproperties='SimHei', fontsize=14)
    plt.show()
def main():
    """Run the single-variable pipeline: load, plot, fit, plot the fit."""
    raw = connectDataBase()
    area, price, df1 = scatterPlot(raw)
    fitted = modelTraining(area, price)
    scatterPlotFinall(area, fitted, df1)


if __name__ == "__main__":
    main()
3.多元线性回归模型
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
# Feature selection
def features(df1):
    """Select the regressors and the target for multiple linear regression.

    Args:
        df1: Preprocessed feature table. It is not modified.

    Returns:
        Tuple ``(X, Y)``: ``X`` holds the feature columns in a fixed order,
        ``Y`` is the "总价" column.

    Raises:
        KeyError: if "总价" or one of the numeric columns ("建筑面积", "室",
            "厅", "卫") is missing.
    """
    # The single-variable model used only the floor area; this one adds room
    # counts and the decoration / orientation / floor indicators.
    numeric_cols = ["建筑面积", "室", "厅", "卫"]
    indicator_cols = ["中装修", "毛坯", "精装修", "豪华装修", "东", "东北", "东南", "南", "西", "西北", "西南", "低层", "高层"]
    cols = numeric_cols + indicator_cols

    # An indicator column only exists if its category occurred in the data;
    # an absent category means "0 for every row", so fill it in instead of
    # failing with KeyError.
    absent = [name for name in indicator_cols if name not in df1.columns]
    X = df1.assign(**dict.fromkeys(absent, 0))[cols]
    Y = df1["总价"]
    return X, Y
def main(X, Y):
    """Train a multiple linear regression and report its test-set score.

    The data is split 4:1 into training and test sets with a fixed random
    seed. The fitted intercept, the coefficients and the score on the test
    set are printed.

    Args:
        X: Feature table.
        Y: Target values.

    Returns:
        The value of ``model.score`` on the held-out test set.
    """
    # Training set : test set = 4 : 1
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

    # Multiple linear regression: fit, then show intercept and coefficients.
    model = LinearRegression().fit(x_train, y_train)
    print("model.intercept_为:\n", model.intercept_)
    print('\n')
    print("model.coef_为:\n", model.coef_)
    print('\n')

    score = model.score(x_test, y_test)
    print('R-scores:\n', score)
    return score
if __name__ == "__main__":
    # Multi-variable pipeline: load, preprocess and plot, select features,
    # then train and score the model.
    df = connectDataBase()
    area, price, df1 = scatterPlot(df)
    X, Y = features(df1)
    main(X, Y)