pip install scikit-learn pandas matplotlib seaborn numpy
想象你是一个房地产经纪人,客户问你:“我的房子能卖多少钱?”
传统方式:你凭经验估算
线性回归方式:
用数学公式自动计算:
房价 = 基础价格 + 面积×面积系数 + 房间数×房间系数 + 地段评分×地段系数
这就是线性回归:找到输入特征和输出结果之间的线性关系。
如果只有一个特征(比如面积),线性回归就是找一条直线,让所有数据点到这条直线的竖直距离(即预测误差)的平方和最小。
y = w₀ + w₁x₁ + w₂x₂ + ... + wₙxₙ
寻找最佳的权重,使得预测值和真实值的差距平方和最小。
就像射箭,目标是让所有箭都尽可能接近靶心。
| 维度 | 线性回归 | 分类(如第9讲) |
|---|---|---|
| 输出 | 连续数值 | 离散类别 |
| 例子 | 房价、股价、温度 | 品种、类型、是否 |
| 评估 | MAE、RMSE、R² | 准确率、精确率 |
今天我们来构建一个房价预测模型。经典的波士顿房价数据集已被 sklearn 移除,因此我们改用加利福尼亚房价数据集作为替代。
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
# Apply the seaborn style FIRST: set_style() rewrites font.sans-serif, so a
# CJK font configured before it would be discarded and every Chinese label
# would render as an empty box.
sns.set_style("whitegrid")
# SimHei is normally only present on Windows; the remaining entries are
# fallbacks for macOS / Linux. Matplotlib uses the first family it can find.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
# Render the minus sign as ASCII '-' so it is not lost with CJK fonts.
plt.rcParams['axes.unicode_minus'] = False
print(" 线性回归实战:房价预测模型")
print("=" * 50)
# Load the California housing data set.
housing = fetch_california_housing()

# Wrap the feature matrix in a DataFrame and attach the target as 'price'.
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['price'] = housing.target

print(" 数据基本信息:")
print(f"数据形状: {df.shape}")
print(f"特征数量: {len(housing.feature_names)}")
print(f"样本数量: {len(df)}")

print("\n 特征说明:")
# Human-readable (Chinese) label for every feature column; reused by the
# plotting and reporting code further down.
feature_descriptions = {
    'MedInc': '社区收入中位数',
    'HouseAge': '房屋年龄中位数',
    'AveRooms': '平均房间数',
    'AveBedrms': '平均卧室数',
    'Population': '社区人口',
    'AveOccup': '平均居住人数',
    'Latitude': '纬度',
    'Longitude': '经度',
}
for feature, desc in feature_descriptions.items():
    print(f" {feature}: {desc}")
print("\n 目标变量: price (房价,单位:10万美元)")

# Summary statistics for every column.
print("\n 数据统计摘要:")
print(df.describe())

# Missing-value check: count once, print per column, then the verdict.
print("\n 缺失值检查:")
missing_per_column = df.isnull().sum()
print(missing_per_column)
if missing_per_column.sum() == 0:
    print("✅ 数据完整,无缺失值")
# Exploratory figure laid out on a 3x3 grid (slot 9 is left empty).
plt.figure(figsize=(15, 12))

# Slot 1: histogram of the target variable.
plt.subplot(3, 3, 1)
plt.hist(df['price'], bins=50, alpha=0.7, color='skyblue')
plt.xlabel('房价 (10万美元)')
plt.ylabel('频次')
plt.title('房价分布')

# Slots 2-5: scatter of each selected feature against price.
important_features = ['MedInc', 'HouseAge', 'AveRooms', 'Population']
for slot, feature in enumerate(important_features, start=2):
    label = feature_descriptions[feature]
    plt.subplot(3, 3, slot)
    plt.scatter(df[feature], df['price'], alpha=0.3, s=1)
    plt.xlabel(label)
    plt.ylabel('房价 (10万美元)')
    plt.title(f'{label} vs 房价')

# Slot 6: correlation heat map over all columns (features + price).
plt.subplot(3, 3, 6)
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            fmt='.2f', square=True)
plt.title('特征相关性矩阵')

# Slot 7: longitude/latitude scatter coloured by price.
plt.subplot(3, 3, 7)
scatter = plt.scatter(df['Longitude'], df['Latitude'],
                      c=df['price'], cmap='viridis', alpha=0.6, s=1)
plt.colorbar(scatter, label='房价')
plt.xlabel('经度')
plt.ylabel('纬度')
plt.title('房价地理分布')

# Slot 8: income vs price with a degree-1 least-squares trend line.
plt.subplot(3, 3, 8)
plt.scatter(df['MedInc'], df['price'], alpha=0.3, s=1, color='red')
plt.xlabel('社区收入中位数')
plt.ylabel('房价 (10万美元)')
plt.title('收入 vs 房价 (关键关系)')
z = np.polyfit(df['MedInc'], df['price'], 1)
p = np.poly1d(z)
plt.plot(df['MedInc'], p(df['MedInc']), "r--", alpha=0.8, linewidth=2)

plt.tight_layout()
plt.show()
# Correlation of every feature with the target, strongest positive first.
print("\n 特征与房价的相关性:")
correlations = df.corr()['price'].drop('price').sort_values(ascending=False)
for feature, corr in correlations.items():
    print(f" {feature_descriptions.get(feature, feature)}: {corr:.3f}")

# The findings are derived from the computed values rather than hard-coded,
# so the stated sign and magnitude always agree with the table above.
print("\n 发现:")
for rank, feature in enumerate(['MedInc', 'Latitude', 'HouseAge'], start=1):
    corr = correlations[feature]
    strength = "强" if abs(corr) >= 0.5 else "有一定"
    direction = "正相关" if corr > 0 else "负相关"
    label = feature_descriptions.get(feature, feature)
    print(f" {rank}. {label}与房价{strength}{direction} ({corr:.3f})")
# Separate the feature matrix from the target column.
X = df.drop('price', axis=1)
y = df['price']
print(" 数据预处理...")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
print("✅ 数据分割完成:")
print(f" 训练集: {len(X_train)} 个样本")
print(f" 测试集: {len(X_test)} 个样本")

# Standardize the features because their numeric ranges differ widely.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n 特征标准化:")
# The two lines below show the first three features of the first training
# sample, so the label says exactly that.
print("标准化前后对比 (训练集第1个样本的前3个特征):")
print("标准化前:", X_train.iloc[0].values[:3])
print("标准化后:", X_train_scaled[0][:3])
print("\n 标准化的作用:")
print(" 1. 避免大数值特征主导模型")
print(" 2. 加速模型收敛")
print(" 3. 提高数值稳定性")
print("\n 开始训练线性回归模型...")
# Ordinary least-squares fit on the standardized training features.
model = LinearRegression()
model.fit(X_train_scaled, y_train)
print("✅ 模型训练完成!")

# Intercept and per-feature weights, largest absolute weight first.
print("\n 模型参数:")
print(f" 截距 (w₀): {model.intercept_:.4f}")
print("\n 各特征的权重系数:")
feature_weights = pd.DataFrame({
    'feature': housing.feature_names,
    'weight': model.coef_,
    'abs_weight': np.abs(model.coef_),
}).sort_values('abs_weight', ascending=False)
for row in feature_weights.itertuples(index=False):
    direction = "正向" if row.weight > 0 else "负向"
    print(f" {feature_descriptions[row.feature]}: {row.weight:.4f} ({direction})")

print("\n 权重解读:")
print(" 正权重:特征增加时,房价上升")
print(" 负权重:特征增加时,房价下降")
print(" 权重绝对值越大,影响越大")

# Print the fitted equation. The model was trained on standardized inputs,
# so each term is written as z(feature) to make clear the coefficients do
# not apply to the raw feature values.
print("\n 线性回归方程:")
equation = f"房价 = {model.intercept_:.2f}" + "".join(
    f" {weight:+.2f}×z({feature})"
    for feature, weight in zip(housing.feature_names, model.coef_)
)
print(f" {equation}")
print(" 其中 z(x) = (x - 训练集均值) / 训练集标准差")

print("\n 评估模型性能...")
# Predictions on both splits, used by the evaluation code that follows.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
# 计算各种评估指标
def evaluate_model(y_true, y_pred, dataset_name):
    """Print and return the regression metrics for one data split.

    Args:
        y_true: Ground-truth target values.
        y_pred: Model predictions aligned with ``y_true``.
        dataset_name: Label used in the printed header.

    Returns:
        Tuple ``(mae, rmse, r2)``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"\n {dataset_name}性能指标:")
    print(f" 平均绝对误差 (MAE): {mae:.4f}")
    print(f" 均方根误差 (RMSE): {rmse:.4f}")
    print(f" 决定系数 (R²): {r2:.4f}")
    return mae, rmse, r2
# Score both splits with the shared metric helper.
train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred, "训练集")
test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred, "测试集")

print("\n 指标解释:")
print(" MAE: 平均预测偏差,越小越好")
print(" RMSE: 预测误差的标准差,越小越好")
print(" R²: 模型解释数据变异的比例,越接近1越好")

print("\n 模型表现总结:")
print(f" 模型解释了{test_r2:.1%}的房价变异")
# The target is expressed in units of 10万美元 (USD 100k), so the MAE must be
# multiplied by 10 before it can be reported in 万美元.
print(f" 平均预测误差约为{test_mae * 10:.2f}万美元")

# Overfitting check: a training R² clearly above the test R² is the warning
# sign; the 0.05 tolerance is the tutorial's own threshold.
print("\n 过拟合检查:")
print(f" 训练集R²: {train_r2:.4f}")
print(f" 测试集R²: {test_r2:.4f}")
if train_r2 - test_r2 < 0.05:
    print(" ✅ 模型表现良好,无明显过拟合")
else:
    print(" ⚠️ 可能存在过拟合")
# Diagnostic figure for the fitted model on a 2x3 grid.
plt.figure(figsize=(15, 10))

# Panel 1: predicted vs. true price; the dashed diagonal marks a perfect fit.
plt.subplot(2, 3, 1)
plt.scatter(y_test, y_test_pred, alpha=0.5, s=1)
price_range = [y_test.min(), y_test.max()]
plt.plot(price_range, price_range, 'r--', lw=2)
plt.xlabel('真实房价')
plt.ylabel('预测房价')
plt.title('预测值 vs 真实值')
plt.text(0.05, 0.95, f'R² = {test_r2:.3f}', transform=plt.gca().transAxes,
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Panel 2: residuals (true - predicted) against the prediction.
plt.subplot(2, 3, 2)
residuals = y_test - y_test_pred
plt.scatter(y_test_pred, residuals, alpha=0.5, s=1)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('预测房价')
plt.ylabel('残差 (真实值 - 预测值)')
plt.title('残差分析')

# Panel 3: residual histogram.
plt.subplot(2, 3, 3)
plt.hist(residuals, bins=50, alpha=0.7)
plt.xlabel('残差')
plt.ylabel('频次')
plt.title('残差分布')

# Panel 4: absolute coefficient per feature, in feature order.
plt.subplot(2, 3, 4)
feature_importance = np.abs(model.coef_)
features = [feature_descriptions[f] for f in housing.feature_names]
y_pos = np.arange(len(features))
plt.barh(y_pos, feature_importance)
plt.yticks(y_pos, features)
plt.xlabel('权重绝对值')
plt.title('特征重要性')

# Panel 5: histogram of absolute prediction errors.
plt.subplot(2, 3, 5)
errors = np.abs(residuals)
plt.hist(errors, bins=50, alpha=0.7, color='orange')
plt.xlabel('预测误差绝对值')
plt.ylabel('频次')
plt.title('预测误差分布')

# Panel 6: predicted price on the map for a random subset of the test set.
plt.subplot(2, 3, 6)
# Draw at most 1000 random test rows; capping at len(X_test) keeps
# np.random.choice(replace=False) from raising on a smaller test set.
n_samples = min(1000, len(X_test))
idx = np.random.choice(len(X_test), n_samples, replace=False)
X_test_sample = X_test.iloc[idx]
y_pred_sample = y_test_pred[idx]
scatter = plt.scatter(X_test_sample['Longitude'], X_test_sample['Latitude'],
                      c=y_pred_sample, cmap='viridis', alpha=0.6, s=1)
plt.colorbar(scatter, label='预测房价')
plt.xlabel('经度')
plt.ylabel('纬度')
plt.title('预测房价地理分布')

plt.tight_layout()
plt.show()
def predict_house_price(model, scaler, feature_descriptions):
    """Predict the price of one hard-coded sample house and explain it.

    Args:
        model: Fitted linear model exposing ``predict``, ``coef_`` and
            ``intercept_``.
        scaler: Fitted scaler exposing ``transform``. When it carries
            ``feature_names_in_`` the sample is passed as a DataFrame in
            exactly that column order.
        feature_descriptions: Mapping from feature name to display label;
            missing names fall back to the raw feature name.

    Returns:
        The predicted price in the model's target units.
    """
    print("\n 房价预测演示")
    print("=" * 30)

    # Example house used for the demonstration.
    sample_house = {
        'MedInc': 5.0,         # median income of the neighbourhood
        'HouseAge': 10.0,      # house age
        'AveRooms': 6.5,       # average number of rooms
        'AveBedrms': 1.2,      # average number of bedrooms
        'Population': 3000,    # neighbourhood population
        'AveOccup': 3.5,       # average occupancy
        'Latitude': 34.0,      # latitude
        'Longitude': -118.0,   # longitude
    }

    print(" 房屋信息:")
    for feature, value in sample_house.items():
        desc = feature_descriptions.get(feature, feature)
        print(f" {desc}: {value}")

    # Build the model input in the column order the scaler was fitted with
    # instead of trusting the dict's insertion order. A named DataFrame is
    # only used when the scaler itself was fitted with feature names.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is None:
        feature_order = list(sample_house)
        sample_input = np.array(
            [[sample_house[name] for name in feature_order]], dtype=float
        )
    else:
        feature_order = list(fitted_names)
        sample_input = pd.DataFrame([sample_house], columns=feature_order)
    sample_scaled = np.asarray(scaler.transform(sample_input))

    predicted_price = model.predict(sample_scaled)[0]
    print("\n 预测结果:")
    print(f" 预测房价: {predicted_price:.2f} × 10万美元 = ${predicted_price*100000:.0f}")

    # Per-feature contribution = coefficient × standardized feature value.
    print("\n 各特征对房价的贡献:")
    contributions = model.coef_ * sample_scaled[0]
    for feature, contrib in zip(feature_order, contributions):
        desc = feature_descriptions.get(feature, feature)
        print(f" {desc}: {contrib:+.3f}")
    print(f" 基础价格 (截距): {model.intercept_:.3f}")
    print(f" 总和: {model.intercept_ + contributions.sum():.3f}")
    return predicted_price
# Demo: run the sample prediction with the trained model and fitted scaler.
predicted_price = predict_house_price(model, scaler, feature_descriptions)
# 第10讲:线性回归 - 房价预测模型
# 完整的线性回归项目,从数据加载到模型部署
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
class HousePricePredictor:
    """Linear-regression workflow for the California housing data.

    The instance keeps the fitted ``model`` and ``scaler`` plus the feature
    order (``feature_names``) so that single-house predictions can reuse the
    exact preprocessing applied during training.
    """

    def __init__(self):
        # Populated by train_model() / prepare_data() / load_data().
        self.model = None
        self.scaler = None
        self.feature_names = None
        # Display label (Chinese) for every feature column.
        self.feature_descriptions = {
            'MedInc': '社区收入中位数',
            'HouseAge': '房屋年龄中位数',
            'AveRooms': '平均房间数',
            'AveBedrms': '平均卧室数',
            'Population': '社区人口',
            'AveOccup': '平均居住人数',
            'Latitude': '纬度',
            'Longitude': '经度',
        }

    @staticmethod
    def _configure_plot_fonts():
        """Select a CJK-capable font so the Chinese plot labels render."""
        # Matplotlib uses the first family in the list that is installed.
        plt.rcParams['font.sans-serif'] = [
            'SimHei',
            'Microsoft YaHei',
            'PingFang SC',
            'Heiti TC',
            'Noto Sans CJK SC',
            'WenQuanYi Micro Hei',
            'Arial Unicode MS',
        ]
        plt.rcParams['axes.unicode_minus'] = False

    def load_data(self):
        """Load the data set and print a short overview.

        Returns:
            Tuple ``(df, housing)``: the DataFrame with a ``price`` column
            and the raw object returned by ``fetch_california_housing``.
        """
        print(" 加载加利福尼亚房价数据...")
        housing = fetch_california_housing()
        self.feature_names = housing.feature_names

        df = pd.DataFrame(housing.data, columns=housing.feature_names)
        df['price'] = housing.target
        print(f"✅ 数据加载完成: {df.shape[0]} 个样本, {df.shape[1]-1} 个特征")

        # The printed prices scale the target by 100000 to show dollars.
        print(" 数据概览:")
        print(f" 价格范围: ${housing.target.min()*100000:.0f} - ${housing.target.max()*100000:.0f}")
        print(f" 平均价格: ${housing.target.mean()*100000:.0f}")
        print(f" 缺失值: {df.isnull().sum().sum()}")
        return df, housing

    def explore_data(self, df):
        """Draw the 3x4 exploration figure and print the correlation ranking.

        Args:
            df: DataFrame holding the feature columns and ``price``.
        """
        print("\n 生成数据探索图表...")
        self._configure_plot_fonts()
        plt.figure(figsize=(16, 12))

        # Slot 1: price histogram.
        plt.subplot(3, 4, 1)
        plt.hist(df['price'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
        plt.xlabel('房价 (10万美元)')
        plt.ylabel('频次')
        plt.title('房价分布')

        # Slots 2-5: key features vs. price, each with a linear trend line.
        key_features = ['MedInc', 'HouseAge', 'AveRooms', 'Population']
        for slot, feature in enumerate(key_features, start=2):
            label = self.feature_descriptions[feature]
            plt.subplot(3, 4, slot)
            plt.scatter(df[feature], df['price'], alpha=0.3, s=0.5)
            plt.xlabel(label)
            plt.ylabel('房价')
            plt.title(f'{label} vs 房价')
            trend = np.poly1d(np.polyfit(df[feature], df['price'], 1))
            plt.plot(df[feature], trend(df[feature]), "r--", alpha=0.8)

        # Slot 6: correlation heat map; the upper triangle is masked because
        # the matrix is symmetric.
        plt.subplot(3, 4, 6)
        corr_matrix = df.corr()
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm',
                    center=0, fmt='.2f', square=True)
        plt.title('特征相关性')

        # Slot 7: map view coloured by price.
        plt.subplot(3, 4, 7)
        scatter = plt.scatter(df['Longitude'], df['Latitude'],
                              c=df['price'], cmap='viridis', alpha=0.6, s=0.5)
        plt.colorbar(scatter, label='房价')
        plt.xlabel('经度')
        plt.ylabel('纬度')
        plt.title('房价地理分布')

        # Slot 8: describe() statistics rendered as text.
        plt.subplot(3, 4, 8)
        price_stats = df['price'].describe()
        plt.text(0.1, 0.9, "房价统计信息:", transform=plt.gca().transAxes,
                 fontsize=12, fontweight='bold')
        for row, (stat, value) in enumerate(price_stats.items()):
            plt.text(0.1, 0.8-row*0.08, f"{stat}: {value:.2f}",
                     transform=plt.gca().transAxes, fontsize=10)
        plt.axis('off')

        # Slots 9-12: histograms of selected features.
        for slot, feature in enumerate(['MedInc', 'HouseAge', 'AveRooms', 'Latitude'], start=9):
            label = self.feature_descriptions[feature]
            plt.subplot(3, 4, slot)
            plt.hist(df[feature], bins=30, alpha=0.7, edgecolor='black')
            plt.xlabel(label)
            plt.ylabel('频次')
            plt.title(f'{label}分布')

        plt.tight_layout()
        plt.show()

        # Correlation with price, ranked by absolute value.
        print("\n 特征与房价相关性排名:")
        correlations = df.corr()['price'].drop('price').sort_values(key=abs, ascending=False)
        for feature, corr in correlations.items():
            print(f" {self.feature_descriptions[feature]}: {corr:.3f}")

    def prepare_data(self, df):
        """Split into train/test sets and standardize the features.

        The scaler is fitted on the training split only and stored on
        ``self.scaler``.

        Args:
            df: DataFrame holding the feature columns and ``price``.

        Returns:
            ``(X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test)``.
        """
        print("\n 准备训练数据...")
        X = df.drop('price', axis=1)
        y = df['price']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        print("✅ 数据准备完成:")
        print(f" 训练集: {len(X_train)} 个样本")
        print(f" 测试集: {len(X_test)} 个样本")
        print(" 特征已标准化")
        return X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test

    def train_model(self, X_train_scaled, y_train):
        """Fit the linear model and print its parameters.

        Args:
            X_train_scaled: Standardized training features.
            y_train: Training targets.
        """
        print("\n 训练线性回归模型...")
        self.model = LinearRegression()
        self.model.fit(X_train_scaled, y_train)
        print("✅ 模型训练完成!")

        print("\n 模型参数:")
        print(f" 截距: {self.model.intercept_:.4f}")
        print("\n 特征权重:")
        feature_weights = []
        for feature, weight in zip(self.feature_names, self.model.coef_):
            direction = "↗️" if weight > 0 else "↘️"
            feature_weights.append((feature, weight, abs(weight)))
            print(f" {self.feature_descriptions[feature]}: {weight:.4f} {direction}")

        # Rank by absolute weight and show the top three.
        feature_weights.sort(key=lambda item: item[2], reverse=True)
        print("\n 最重要的3个特征:")
        for rank, (feature, weight, _) in enumerate(feature_weights[:3], start=1):
            print(f" {rank}. {self.feature_descriptions[feature]} (权重: {weight:.4f})")

    def evaluate_model(self, X_train_scaled, X_test_scaled, y_train, y_test):
        """Score the model on both splits and print a diagnosis.

        Returns:
            ``(y_test_pred, test_mae, test_rmse, test_r2)``.
        """
        print("\n 评估模型性能...")
        y_train_pred = self.model.predict(X_train_scaled)
        y_test_pred = self.model.predict(X_test_scaled)

        def calc_metrics(y_true, y_pred, name):
            """Print and return (mae, rmse, r2) for one split."""
            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            r2 = r2_score(y_true, y_pred)
            print(f"\n {name}性能:")
            print(f" 平均绝对误差 (MAE): {mae:.4f} (约${mae*100000:.0f})")
            print(f" 均方根误差 (RMSE): {rmse:.4f} (约${rmse*100000:.0f})")
            print(f" 决定系数 (R²): {r2:.4f} ({r2*100:.1f}%)")
            return mae, rmse, r2

        train_mae, train_rmse, train_r2 = calc_metrics(y_train, y_train_pred, "训练集")
        test_mae, test_rmse, test_r2 = calc_metrics(y_test, y_test_pred, "测试集")

        # Only a training score clearly ABOVE the test score indicates
        # overfitting; taking abs() of the gap would also flag the harmless
        # case where the test split happens to score better.
        print("\n 模型诊断:")
        if train_r2 - test_r2 < 0.05:
            print(" ✅ 模型泛化良好,无明显过拟合")
        else:
            print(" ⚠️ 可能存在过拟合,考虑正则化")
        return y_test_pred, test_mae, test_rmse, test_r2

    def visualize_results(self, y_test, y_test_pred, X_test):
        """Draw the 2x4 diagnostic figure for the fitted model.

        Args:
            y_test: True test targets.
            y_test_pred: Predictions for the test split.
            X_test: Unscaled test features; must contain ``Longitude`` and
                ``Latitude`` columns for the map panel.
        """
        print("\n 生成结果可视化...")
        self._configure_plot_fonts()
        plt.figure(figsize=(16, 10))

        # Panel 1: predicted vs. true with the perfect-fit diagonal.
        plt.subplot(2, 4, 1)
        plt.scatter(y_test, y_test_pred, alpha=0.5, s=1)
        price_range = [y_test.min(), y_test.max()]
        plt.plot(price_range, price_range, 'r--', lw=2)
        plt.xlabel('真实房价')
        plt.ylabel('预测房价')
        plt.title('预测 vs 真实')
        r2 = r2_score(y_test, y_test_pred)
        plt.text(0.05, 0.95, f'R² = {r2:.3f}', transform=plt.gca().transAxes,
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

        # Panel 2: residuals against the prediction.
        plt.subplot(2, 4, 2)
        residuals = y_test - y_test_pred
        plt.scatter(y_test_pred, residuals, alpha=0.5, s=1)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('预测房价')
        plt.ylabel('残差')
        plt.title('残差分析')

        # Panel 3: residual histogram.
        plt.subplot(2, 4, 3)
        plt.hist(residuals, bins=50, alpha=0.7, edgecolor='black')
        plt.xlabel('残差')
        plt.ylabel('频次')
        plt.title('残差分布')

        # Panel 4: absolute coefficients, sorted ascending for barh.
        plt.subplot(2, 4, 4)
        importances = np.abs(self.model.coef_)
        feature_names = [self.feature_descriptions[f] for f in self.feature_names]
        sorted_idx = np.argsort(importances)
        pos = np.arange(sorted_idx.shape[0]) + 0.5
        plt.barh(pos, importances[sorted_idx], align='center')
        plt.yticks(pos, [feature_names[i] for i in sorted_idx])
        plt.xlabel('权重绝对值')
        plt.title('特征重要性')

        # Panel 5: histogram of absolute errors.
        plt.subplot(2, 4, 5)
        errors = np.abs(residuals)
        plt.hist(errors, bins=50, alpha=0.7, color='orange', edgecolor='black')
        plt.xlabel('预测误差绝对值')
        plt.ylabel('频次')
        plt.title('预测误差分布')

        # Panel 6: predicted price on the map for up to 1000 random rows.
        plt.subplot(2, 4, 6)
        idx = np.random.choice(len(X_test), min(1000, len(X_test)), replace=False)
        X_test_sample = X_test.iloc[idx]
        if hasattr(y_test_pred, 'iloc'):
            y_pred_sample = y_test_pred.iloc[idx]
        else:
            y_pred_sample = y_test_pred[idx]
        scatter = plt.scatter(X_test_sample['Longitude'], X_test_sample['Latitude'],
                              c=y_pred_sample, cmap='viridis', alpha=0.6, s=2)
        plt.colorbar(scatter, label='预测房价')
        plt.xlabel('经度')
        plt.ylabel('纬度')
        plt.title('预测房价地理分布')

        # Panel 7: text summary of the metrics.
        plt.subplot(2, 4, 7)
        mae = mean_absolute_error(y_test, y_test_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        metrics_text = (
            "模型性能摘要\n"
            f"R² Score: {r2:.3f}\n"
            f"MAE: ${mae*100000:.0f}\n"
            f"RMSE: ${rmse*100000:.0f}\n"
            f"解释能力: {r2*100:.1f}%\n"
            f"平均误差: ${mae*100000:.0f}\n"
        )
        plt.text(0.1, 0.9, metrics_text, transform=plt.gca().transAxes,
                 fontsize=11, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
        plt.axis('off')

        # Panel 8: fitted equation. The coefficients apply to standardized
        # features, which the heading states explicitly.
        plt.subplot(2, 4, 8)
        equation_text = "线性回归方程 (标准化特征):\n\n"
        equation_text += f"房价 = {self.model.intercept_:.2f}\n"
        for feature, coef in zip(self.feature_names, self.model.coef_):
            equation_text += f" {coef:+.3f} × {feature}\n"
        plt.text(0.05, 0.95, equation_text, transform=plt.gca().transAxes,
                 fontsize=9, verticalalignment='top', fontfamily='monospace',
                 bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))
        plt.axis('off')
        plt.title('模型方程')

        plt.tight_layout()
        plt.show()

    def predict_single_house(self, house_features):
        """Predict the price of one house.

        Args:
            house_features: Either a dict keyed by feature name, or a
                sequence of values in ``self.feature_names`` order.

        Returns:
            Tuple ``(prediction, contributions)`` where ``contributions`` is
            coefficient × standardized value for every feature.

        Raises:
            ValueError: If the model/scaler are not fitted, or a sequence
                input has the wrong number of values.
            KeyError: If a dict input lacks one of the features.
        """
        if self.model is None or self.scaler is None:
            raise ValueError("模型尚未训练,请先运行 train_model()")

        names = list(self.feature_names)
        if isinstance(house_features, dict):
            values = np.array([house_features[name] for name in names], dtype=float)
        else:
            values = np.asarray(house_features, dtype=float).ravel()
            if len(values) != len(names):
                raise ValueError(
                    f"需要 {len(names)} 个特征,实际收到 {len(values)} 个"
                )

        # A scaler fitted on a DataFrame remembers its column names; feeding
        # it a matching DataFrame keeps the names consistent instead of
        # relying on globally silenced warnings.
        if hasattr(self.scaler, "feature_names_in_"):
            model_input = pd.DataFrame([values], columns=names)
        else:
            model_input = values.reshape(1, -1)
        features_scaled = np.asarray(self.scaler.transform(model_input))

        prediction = self.model.predict(features_scaled)[0]
        contributions = self.model.coef_ * features_scaled[0]
        return prediction, contributions

    def run_complete_analysis(self):
        """Run load → explore → prepare → train → evaluate → plot → demo.

        Any exception is reported on stdout (with its traceback) rather than
        propagated.
        """
        print(" 房价预测模型 - 完整分析流程")
        print("=" * 60)
        try:
            # 1. Load
            df, housing = self.load_data()
            # 2. Explore
            self.explore_data(df)
            # 3. Prepare
            X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test = self.prepare_data(df)
            # 4. Train
            self.train_model(X_train_scaled, y_train)
            # 5. Evaluate
            y_test_pred, test_mae, test_rmse, test_r2 = self.evaluate_model(
                X_train_scaled, X_test_scaled, y_train, y_test
            )
            # 6. Visualize
            self.visualize_results(y_test, y_test_pred, X_test)

            # 7. Demo predictions for two example houses.
            print("\n 演示房价预测:")
            sample_houses = [
                {
                    'MedInc': 8.0, 'HouseAge': 5.0, 'AveRooms': 7.0, 'AveBedrms': 1.1,
                    'Population': 2500, 'AveOccup': 3.0, 'Latitude': 34.2, 'Longitude': -118.3,
                },
                {
                    'MedInc': 3.0, 'HouseAge': 25.0, 'AveRooms': 4.5, 'AveBedrms': 1.3,
                    'Population': 5000, 'AveOccup': 4.0, 'Latitude': 36.8, 'Longitude': -121.3,
                },
            ]
            for number, house in enumerate(sample_houses, start=1):
                print(f"\n 示例房屋 {number}:")
                for feature, value in house.items():
                    print(f" {self.feature_descriptions[feature]}: {value}")
                prediction, contributions = self.predict_single_house(house)
                print(f"\n 预测房价: ${prediction*100000:.0f}")
                print(" 主要影响因素:")
                contrib_df = pd.DataFrame({
                    'feature': [self.feature_descriptions[f] for f in self.feature_names],
                    'contribution': contributions,
                }).sort_values('contribution', key=abs, ascending=False)
                for _, row in contrib_df.head(3).iterrows():
                    direction = "提升" if row['contribution'] > 0 else "降低"
                    print(f" {row['feature']}: {direction} ${abs(row['contribution'])*100000:.0f}")

            # 8. Summary
            print("\n" + "=" * 60)
            print(" 分析完成!模型性能总结:")
            print(f" 决定系数 (R²): {test_r2:.3f} - 模型解释了{test_r2*100:.1f}%的房价变异")
            print(f" 平均预测误差: ${test_mae*100000:.0f}")
            print(f" 均方根误差: ${test_rmse*100000:.0f}")
            if test_r2 > 0.6:
                print("✅ 模型表现良好,可用于房价预测")
            elif test_r2 > 0.4:
                print("⚠️ 模型表现中等,建议进一步优化")
            else:
                print("❌ 模型表现较差,需要重新设计")

            print("\n 关键发现:")
            feature_importance = np.abs(self.model.coef_)
            top_features = np.argsort(feature_importance)[-3:][::-1]
            for rank, idx in enumerate(top_features, start=1):
                feature_name = self.feature_descriptions[self.feature_names[idx]]
                print(f" {rank}. {feature_name} 是房价的重要影响因素")
            print("=" * 60)
        except Exception as e:
            # Top-level boundary of the demo: report instead of crashing, but
            # keep the traceback so the real cause stays visible.
            import traceback

            print(f"❌ 运行出错: {e}")
            traceback.print_exc()
            print(" 请检查:")
            print(" 1. 是否安装了所需库: sklearn, pandas, matplotlib, seaborn")
            print(" 2. Python版本是否 >= 3.8")
def main():
    """Create a predictor and run the complete analysis pipeline."""
    predictor = HousePricePredictor()
    predictor.run_complete_analysis()


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
房价预测模型 - 完整分析流程
============================================================
加载加利福尼亚房价数据...
✅ 数据加载完成: 20640 个样本, 8 个特征
数据概览:
价格范围: $14999 - $500001
平均价格: $206856
缺失值: 0
准备训练数据...
✅ 数据准备完成:
训练集: 16512 个样本
测试集: 4128 个样本
特征已标准化
训练线性回归模型...
✅ 模型训练完成!
模型参数:
截距: 2.0686
特征权重:
社区收入中位数: 0.8296 ↗️
房屋年龄中位数: 0.1165 ↗️
平均房间数: -0.2654 ↘️
平均卧室数: 0.3057 ↗️
社区人口: -0.0420 ↘️
平均居住人数: -0.0398 ↘️
纬度: -0.8700 ↘️
经度: -0.8638 ↘️
最重要的3个特征:
1. 纬度 (权重: -0.8700)
2. 经度 (权重: -0.8638)
3. 社区收入中位数 (权重: 0.8296)
评估模型性能...
训练集性能:
平均绝对误差 (MAE): 0.5331 (约$53310)
均方根误差 (RMSE): 0.7463 (约$74630)
决定系数 (R²): 0.6020 (60.2%)
测试集性能:
平均绝对误差 (MAE): 0.5339 (约$53390)
均方根误差 (RMSE): 0.7490 (约$74900)
决定系数 (R²): 0.5988 (59.9%)
模型诊断:
✅ 模型泛化良好,无明显过拟合
分析完成!模型性能总结:
决定系数 (R²): 0.599 - 模型解释了59.9%的房价变异
平均预测误差: $53390
均方根误差: $74900
⚠️ 模型表现中等,建议进一步优化
回答:
回答:
回答:
这反映了地理位置的影响:
回答:
尝试特征工程:
# 创建新特征
df['rooms_per_person'] = df['AveRooms'] / df['AveOccup']
df['bedrooms_ratio'] = df['AveBedrms'] / df['AveRooms']
对比不同模型:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
# 尝试随机森林
rf_model = RandomForestRegressor()
rf_model.fit(X_train_scaled, y_train)
预测自己的房子:
数据可视化练习:
下节预告:第11讲我们将学习逻辑回归,从预测连续值转向分类问题,构建邮件垃圾邮件分类器!