机器学习本科课程 实验1 线性模型

第三章 线性模型

  • 3.1 一元线性回归

  • 3.2 多元线性回归

  • 3.3 对数几率回归,线性判别分析(二选一)

  • 3.4 类别不均衡

3.1一元线性回归——Kaggle房价预测

使用Kaggle房价预测数据集:

  1. 打乱数据顺序,取前70%的数据作为训练集,后30%的数据作为测试集
  2. 分别以LotArea, BsmtUnfSF, GarageArea三种特征作为模型的输入,SalePrice作为模型的输出
  3. 在训练集上,使用最小二乘法求解模型参数(需自己实现,不允许第三方库完成)
  4. 计算三个模型在测试集上的MAE和RMSE这两种指标的大小(需自己实现,不允许第三方库完成)
  5. 分别绘制三个模型在训练集和测试集上的预测曲线
  6. 选做:尝试去除训练集中的异常值或离群值后再次训练模型,绘制模型的预测曲线,观察模型在测试集上预测能力的变化
import numpy as np
import pandas as pd

# Load the Kaggle house-price training table.
data = pd.read_csv('./train.csv')

# Drop every feature (column) that contains at least one missing value.
data = data.dropna(axis = 1)

# Keep only the columns whose dtype is int64.
data = data[[col for col in data.columns if data[col].dtype == 'int64']]
data.head()

# The three single-feature inputs and the regression target.
features = ['LotArea', 'BsmtUnfSF', 'GarageArea']
target = 'SalePrice'
data = data[[*features, target]]

from sklearn.utils import shuffle
data = shuffle(data, random_state = 32)  # do not change this seed
# Show the first five rows after shuffling.
data.head()

# Total number of samples.
num_of_samples = len(data)
print(num_of_samples)

# Size of the training split: the first 70% of the shuffled rows.
split_line = int(num_of_samples * 0.7)
print(split_line)
train_data, test_data = data.iloc[:split_line], data.iloc[split_line:]
def get_w(x, y):
    """Return the least-squares slope ``w`` of the line ``y = w * x + b``.

    Uses the closed-form solution

        w = sum(y_i * (x_i - mean(x))) / sum((x_i - mean(x)) ** 2)

    implemented with plain Python arithmetic (no third-party solver).

    Args:
        x: 1-D sequence of feature values (pandas Series, NumPy array,
            list, ...). Values are paired with ``y`` by position.
        y: 1-D sequence of target values, same length as ``x``.

    Returns:
        The slope as a float.

    Raises:
        ValueError: if the inputs are empty, differ in length, or ``x`` is
            constant (the slope is undefined in that case).
    """
    xs = [float(v) for v in x]
    ys = [float(v) for v in y]
    n = len(xs)
    if n == 0 or n != len(ys):
        raise ValueError("x and y must be non-empty and of equal length")

    mean_x = sum(xs) / n
    numerator = sum(yi * (xi - mean_x) for xi, yi in zip(xs, ys))
    # Centered sum of squares; algebraically equal to
    # sum(x_i ** 2) - (sum(x_i)) ** 2 / n, but avoids subtracting two huge numbers.
    denominator = sum((xi - mean_x) ** 2 for xi in xs)
    if denominator == 0:
        raise ValueError("x has zero variance; the slope is undefined")
    return numerator / denominator
def get_b(x, y, w):
    """Return the least-squares intercept ``b`` of ``y = w * x + b``.

    For a given slope ``w`` the optimal intercept is the mean residual:

        b = mean(y_i - w * x_i)

    Args:
        x: 1-D sequence of feature values (pandas Series, NumPy array,
            list, ...). Values are paired with ``y`` by position.
        y: 1-D sequence of target values, same length as ``x``.
        w: Slope of the fitted line.

    Returns:
        The intercept as a float.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    xs = [float(v) for v in x]
    ys = [float(v) for v in y]
    n = len(xs)
    if n == 0 or n != len(ys):
        raise ValueError("x and y must be non-empty and of equal length")
    return sum(yi - w * xi for xi, yi in zip(xs, ys)) / n
class myLinearRegression:
    """Single-feature linear regression ``y = w * x + b`` fitted by least squares."""

    def __init__(self):
        # Both parameters stay None until fit() has been called.
        self.w = None
        self.b = None

    def fit(self, x, y):
        """Estimate the slope and intercept from training data.

        Args:
            x: 1-D sequence of feature values.
            y: 1-D sequence of target values, paired with ``x`` by position.
        """
        self.w = get_w(x, y)
        self.b = get_b(x, y, self.w)

    def predict(self, x):
        """Return ``w * x + b`` for a scalar or array-like ``x``.

        If the model has not been fitted, a message is printed and None is
        returned.
        """
        # Identity comparison: `== None` would broadcast element-wise (and then
        # fail in a boolean context) if a parameter were ever a NumPy array.
        if self.w is None or self.b is None:
            print("模型还未训练,请先调用fit方法训练")
            return
        return self.w * x + self.b
# Create a model instance.
model1 = myLinearRegression()

# Train on the training split: LotArea is the input, SalePrice is the label.
model1.fit(train_data['LotArea'], train_data['SalePrice'])

# Predict on the test split and store the result in `prediction1`.
prediction1 = model1.predict(test_data['LotArea'])
def MAE(y_hat, y):
    """Return the mean absolute error between predictions and targets.

    Args:
        y_hat: 1-D sequence of predicted values (pandas Series, NumPy array,
            list, ...). Values are paired with ``y`` by position.
        y: 1-D sequence of true values, same length as ``y_hat``.

    Returns:
        mean(|y_hat_i - y_i|)

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    predicted = list(y_hat)
    actual = list(y)
    if not actual or len(predicted) != len(actual):
        raise ValueError("y_hat and y must be non-empty and of equal length")
    return sum(abs(p - a) for p, a in zip(predicted, actual)) / len(actual)

import math
def RMSE(y_hat, y):
    """Return the root mean squared error between predictions and targets.

    Args:
        y_hat: 1-D sequence of predicted values (pandas Series, NumPy array,
            list, ...). Values are paired with ``y`` by position.
        y: 1-D sequence of true values, same length as ``y_hat``.

    Returns:
        sqrt(mean((y_hat_i - y_i) ** 2))

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    predicted = list(y_hat)
    actual = list(y)
    if not actual or len(predicted) != len(actual):
        raise ValueError("y_hat and y must be non-empty and of equal length")
    mean_squared = sum((p - a) ** 2 for p, a in zip(predicted, actual)) / len(actual)
    return math.sqrt(mean_squared)

mae1 = MAE(prediction1, test_data['SalePrice'])
rmse1 = RMSE(prediction1, test_data['SalePrice'])
print("模型1,特征:LotArea")
print("MAE:", mae1)
print("RMSE:", rmse1)

import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize = (16, 6))

plt.subplot(121)
plt.plot(train_data['LotArea'].values, train_data['SalePrice'].values, '.', label = 'training data')
plt.plot(train_data['LotArea'].values, model1.predict(train_data['LotArea']), '-', label = 'prediction')
plt.xlabel("LotArea")
plt.ylabel('SalePrice')
plt.title("training set")
plt.legend()

使用BsmtUnfSF作为特征,完成模型的训练,指标计算,可视化

# Create the model instance.
model2 = myLinearRegression()

# Train on the training split: BsmtUnfSF is the input, SalePrice is the label.
model2.fit(train_data['BsmtUnfSF'], train_data['SalePrice'])

# Predict on the test split.
prediction = model2.predict(test_data['BsmtUnfSF'])

# Metrics for model 2 get their own names so they are not overwritten by model 3.
mae2 = MAE(prediction, test_data['SalePrice'])
rmse2 = RMSE(prediction, test_data['SalePrice'])
print("模型2,特征:BsmtUnfSF")
print("MAE:", mae2)
print("RMSE:", rmse2)

plt.figure(figsize = (16, 6))

# Left panel: training points and the fitted line.
plt.subplot(121)
plt.plot(train_data['BsmtUnfSF'].values, train_data['SalePrice'].values, '.', label = 'training data')
plt.plot(train_data['BsmtUnfSF'].values, model2.predict(train_data['BsmtUnfSF']), '-', label = 'prediction')
plt.xlabel('BsmtUnfSF')
plt.ylabel('SalePrice')
plt.title("training set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))

# Right panel: test points and the same fitted line.
plt.subplot(122)
plt.plot(test_data['BsmtUnfSF'].values, test_data['SalePrice'].values, '.', label='testing data')
plt.plot(test_data['BsmtUnfSF'].values, model2.predict(test_data['BsmtUnfSF']), '-', label='prediction')
plt.xlabel('BsmtUnfSF')
plt.ylabel('SalePrice')
plt.title("testing set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))
plt.show()

使用GarageArea作为特征,完成模型的训练,指标计算,可视化

# Create the model instance.
model3 = myLinearRegression()

# Train on the training split: GarageArea is the input, SalePrice is the label.
model3.fit(train_data['GarageArea'], train_data['SalePrice'])

# Predict on the test split.
prediction = model3.predict(test_data['GarageArea'])

mae3 = MAE(prediction, test_data['SalePrice'])
rmse3 = RMSE(prediction, test_data['SalePrice'])
print("模型3,特征:GarageArea")
print("MAE:", mae3)
print("RMSE:", rmse3)

plt.figure(figsize = (16, 6))

# Left panel: training points and the fitted line.
plt.subplot(121)
plt.plot(train_data['GarageArea'].values, train_data['SalePrice'].values, '.', label = 'training data')
plt.plot(train_data['GarageArea'].values, model3.predict(train_data['GarageArea']), '-', label = 'prediction')
plt.xlabel('GarageArea')
plt.ylabel('SalePrice')
plt.title("training set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))

# Right panel: test points and the same fitted line.
plt.subplot(122)
plt.plot(test_data['GarageArea'].values, test_data['SalePrice'].values, '.', label='testing data')
plt.plot(test_data['GarageArea'].values, model3.predict(test_data['GarageArea']), '-', label='prediction')
plt.xlabel('GarageArea')
plt.ylabel('SalePrice')
plt.title("testing set")
plt.legend()
plt.yticks(np.arange(0, 800000, 100000))
plt.show()


选做:剔除训练集中的离群值(outlier),然后重新训练模型,观察模型预测性能的变化


# Remove outliers from the TRAINING split only: keep rows with
# 0 < LotArea < 60000 and SalePrice < 500000.
t = train_data[(train_data['LotArea'] < 60000) & (train_data['LotArea'] > 0)]
t = t[t['SalePrice'] < 500000]

# Plot the cleaned training data.
plt.figure(figsize = (8, 7))
plt.plot(t['LotArea'], t['SalePrice'], '.')
plt.show()

# Retrain on the cleaned training data ...
model = myLinearRegression()
model.fit(t['LotArea'], t['SalePrice'])

# ... and evaluate on the original, untouched test split, so that MAE/RMSE are
# directly comparable with the LotArea model trained on the unfiltered data.
prediction = model.predict(test_data['LotArea'])

mae = MAE(prediction, test_data['SalePrice'])
rmse = RMSE(prediction, test_data['SalePrice'])
print("特征:LotArea")
print("MAE:", mae)
print("RMSE:", rmse)

3.2 多元线性回归——Kaggle房价预测

使用多个特征作为输入完成房价预测问题,计算模型在十折交叉验证上MAE和RMSE的值,比较不同的特征组合在模型预测能力上的影响。
模型可使用sklearn.linear_model.LinearRegression

选做:多项式回归(一元线性回归的扩展),尝试对部分特征进行变换,如将其二次幂,三次幂作为特征输入模型,观察模型在预测能力上的变化。

这部分的内容是要求大家完成多元线性回归,我们会先带着大家使用sklearn做一元线性回归的十折交叉验证,多元线性回归大家可以仿照着完成

1. 读取数据

同3.1

2. 引入模型


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

3. 使用sklearn完成一元线性回归的十折交叉验证


# Reload the full table, prepared the same way as in section 3.1 (drop columns
# with missing values, keep int64 columns). Section 3.1 narrowed `data` to
# LotArea/BsmtUnfSF/GarageArea/SalePrice, so without this reload the
# multi-feature selections further down would raise KeyError.
data = pd.read_csv('./train.csv')
data = data.dropna(axis = 1)
data = data[[col for col in data.columns if data[col].dtype == 'int64']]

# Ten-fold cross-validated predictions for a one-feature linear regression.
model = LinearRegression()
features = ['LotArea']
x = data[features]
y = data['SalePrice']
prediction = cross_val_predict(model, x, y, cv = 10)
prediction.shape

4. 计算评价指标

# Report both metrics explicitly; as bare expressions the MAE value was never
# shown. Arguments follow the scikit-learn order (y_true, y_pred); both metrics
# are symmetric, so the values are unchanged.
print("MAE:", mean_absolute_error(data['SalePrice'], prediction))
print("RMSE:", mean_squared_error(data['SalePrice'], prediction) ** 0.5)

5. 请你选择多种特征进行组合,完成多元线性回归,并对比不同的特征组合,它们训练出的模型在十折交叉验证上MAE与RMSE的差别,至少完成3组

# Feature group 1: lot area, dwelling class, total basement area.
MULmodel1 = LinearRegression()
features1 = ['LotArea', 'MSSubClass', 'TotalBsmtSF']
x = data[features1]
y = data['SalePrice']
prediction = cross_val_predict(MULmodel1, x, y, cv = 10)
# Print the scores like the other feature groups do, so the three groups can
# be compared side by side.
print("MAE:", mean_absolute_error(y, prediction))
print("RMSE:", mean_squared_error(y, prediction) ** 0.5)


# Feature group 2: floor areas, dwelling class, total basement area.
MULmodel2 = LinearRegression()
features2 = ['1stFlrSF', '2ndFlrSF', 'MSSubClass', 'TotalBsmtSF']
x = data[features2]
y = data['SalePrice']
prediction = cross_val_predict(MULmodel2, x, y, cv=10)
# Each metric is computed once; the second label was misspelled "RMAE".
print("MAE:", mean_absolute_error(y, prediction))
print("RMSE:", mean_squared_error(y, prediction) ** 0.5)

# Feature group 3: floor areas, year sold, garage area, total basement area.
MULmodel3 = LinearRegression()
features3 = ['1stFlrSF', '2ndFlrSF', 'YrSold', 'GarageArea', 'TotalBsmtSF']
x = data[features3]
y = data['SalePrice']
prediction = cross_val_predict(MULmodel3, x, y, cv=10)
# Each metric is computed once; the second label was misspelled "RMAE".
print("MAE:", mean_absolute_error(y, prediction))
print("RMSE:", mean_squared_error(y, prediction) ** 0.5)

3.3对数几率回归——肿瘤分类(二选一)

  1. 使用对数几率回归完成Breast_Cancer_Wisconsin结果预测问题
  2. 计算十折交叉验证下的精度(accuracy),查准率(precision),查全率(recall),F1值。
  3. 模型可使用sklearn.linear_model.LogisticRegression
  4. 选做:尝试对特征进行筛选、标准化、正则化后,训练模型并计算十折交叉验证后的四项指标

1. 读取数据


import numpy as np
import pandas as pd
# Load the Breast Cancer Wisconsin table; this rebinds `data`, replacing the
# house-price frame used in the previous sections.
data = pd.read_csv('./breast-cancer.csv')
# Preview the first rows (displayed only when run as a notebook cell).
data.head()

2. 导入模型

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict
from sklearn import preprocessing

3. 数据处理

# Label column: diagnosis. Inputs: the 30 numeric measurement columns below.

# Positional 70/30 hold-out split (the rows are not shuffled first).
num_of_samples = len(data)
split_line = int(num_of_samples * 0.7)
train_data, test_data = data.iloc[:split_line], data.iloc[split_line:]

# Every measurement appears with three statistics; build the names as
# <measurement>_<statistic>, grouped by statistic (mean, se, worst).
features = [
    measurement + '_' + statistic
    for statistic in ('mean', 'se', 'worst')
    for measurement in (
        'radius', 'texture', 'perimeter', 'area', 'smoothness',
        'compactness', 'concavity', 'concave points', 'symmetry',
        'fractal_dimension',
    )
]

data_x, test_x = train_data[features], test_data[features]
data_y, test_y = train_data['diagnosis'], test_data['diagnosis']

4. 训练并预测

# Fit a logistic-regression classifier on the training split (fit() returns
# the estimator itself), then label the held-out rows.
model = LogisticRegression(max_iter = 10000).fit(data_x, data_y)
prediction = model.predict(test_x)

5. 评价指标的计算

def evalute(prediction, test_y, pos_label='B'):
    """Return (accuracy, precision, recall, F1) for a set of predictions.

    Args:
        prediction: Predicted class labels.
        test_y: True class labels, aligned with ``prediction``.
        pos_label: Label treated as the positive class for precision, recall
            and F1. Defaults to 'B', the value this file has always used.

    Returns:
        A 4-tuple ``(acc, pre, recall, f1)``.
    """
    acc = accuracy_score(test_y, prediction)
    pre = precision_score(test_y, prediction, pos_label=pos_label)
    recall = recall_score(test_y, prediction, pos_label=pos_label)
    # F1 must come from f1_score; calling recall_score here just repeated recall.
    f1 = f1_score(test_y, prediction, pos_label=pos_label)
    return acc, pre, recall, f1
# Score the hold-out predictions; the resulting tuple is displayed only when
# this is run as the last expression of a notebook cell.
evalute(prediction, test_y)

3.4 类别不均衡

类别不均衡

  1. 使用imblearn实现SMOTE过采样,RandomUnderSampler降采样。
  2. 在样本不均衡(处理过后)的癌症数据集(Breast_Cancer_Wisconsin)上利用SMOTE,对比使用过采样算法前后各评价指标的差异。

使用imblearn实现SMOTE过采样,RandomUnderSampler降采样

import pandas as pd
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')


def _show_scatter(title, points, labels):
    """Scatter-plot a two-feature dataset colored by class label and show it."""
    plt.title(title)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.scatter(points[:, 0], points[:, 1], marker='o', c=labels,
                s=25, edgecolor='k', cmap=plt.cm.coolwarm)
    plt.show()


# Synthetic two-class dataset with two informative features.
train_X, train_y = make_classification(
    n_samples=700, n_features=2, n_redundant=0,
    n_informative=2, n_clusters_per_class=1,
    class_sep=1.0, flip_y=0.06, random_state=100,
)
_show_scatter("Balanced dataset", train_X, train_y)

# Down-sample to 340 samples of class 0 and 10 samples of class 1.
train_X1, train_y1 = make_imbalance(
    train_X, train_y, sampling_strategy={0: 340, 1: 10}, random_state=100,
)
_show_scatter("Imbalanced dataset", train_X1, train_y1)

# Over-sample the imbalanced set with SMOTE.
sm_model = SMOTE(sampling_strategy='auto', k_neighbors=8, random_state=100)
train_X2, train_y2 = sm_model.fit_resample(train_X1, train_y1)
_show_scatter('SMOTE', train_X2, train_y2)

# Under-sample the imbalanced set with RandomUnderSampler.
rus_model = RandomUnderSampler(random_state=100)
train_X3, train_y3 = rus_model.fit_resample(train_X1, train_y1)
_show_scatter('RUS', train_X3, train_y3)

在样本不均衡的癌症数据集上利用SMOTE,对比使用过采样算法前后各评价指标的差异

# Load the breast-cancer table and report how many rows each diagnosis has.
raw_data = pd.read_csv('./breast-cancer.csv')
raw_data
print((raw_data['diagnosis'] == 'M').sum())
print((raw_data['diagnosis'] == 'B').sum())
# Make the cancer dataset more imbalanced by discarding 112 randomly chosen
# malignant ('M') rows (e.g. 100:357). The draw is seeded so the experiment is
# repeatable, like the other random steps in this file.
names = raw_data[raw_data['diagnosis'] == 'M'].sample(n=112, random_state=100).index
# drop(..., inplace=True) returns None; take the returned copy instead so that
# `data` is a DataFrame.
data = raw_data.drop(names)
num_of_samples = data.shape[0]
split_line = int(num_of_samples * 0.7)
train_data = data.iloc[:split_line]
test_data = data.iloc[split_line:]

# The 30 numeric measurement columns. Two entries previously fused two column
# names with a tab character, which made the column lookup fail.
features = ['radius_mean', 'texture_mean', 'perimeter_mean',
            'area_mean', 'smoothness_mean', 'compactness_mean',
            'concavity_mean', 'concave points_mean', 'symmetry_mean',
            'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se',
            'area_se', 'smoothness_se', 'compactness_se', 'concavity_se',
            'concave points_se', 'symmetry_se', 'fractal_dimension_se',
            'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
            'smoothness_worst', 'compactness_worst', 'concavity_worst',
            'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']

data_x = train_data[features]
test_x = test_data[features]

data_y = train_data['diagnosis']
test_y = test_data['diagnosis']
def evalute(prediction, test_y, pos_label='B'):
    """Return (accuracy, precision, recall, F1) for a set of predictions.

    Args:
        prediction: Predicted class labels.
        test_y: True class labels, aligned with ``prediction``.
        pos_label: Label treated as the positive class for precision, recall
            and F1. Defaults to 'B', the value this file has always used.

    Returns:
        A 4-tuple ``(acc, pre, recall, f1)``.
    """
    acc = accuracy_score(test_y, prediction)
    pre = precision_score(test_y, prediction, pos_label=pos_label)
    recall = recall_score(test_y, prediction, pos_label=pos_label)
    # F1 must come from f1_score; calling recall_score here just repeated recall.
    f1 = f1_score(test_y, prediction, pos_label=pos_label)
    return acc, pre, recall, f1


# Baseline: train on the imbalanced data as-is, then score the hold-out split.
# The helper is defined above so this call uses the corrected metric set.
model1 = LogisticRegression(max_iter=10000)
model1.fit(data_x, data_y)
prediction1 = model1.predict(test_x)

print("1 过采样处理前:", evalute(prediction1, test_y))

# Over-sample the training split with SMOTE; the test split is left untouched.
sm_model = SMOTE(sampling_strategy='auto', k_neighbors=8, random_state=100)
data_X2, data_y2 = sm_model.fit_resample(data_x, data_y)

# Retrain on the resampled data (fit() returns the estimator itself) and score
# the same hold-out rows as before.
model2 = LogisticRegression(max_iter=10000).fit(data_X2, data_y2)
prediction2 = model2.predict(test_x)
print("2 过采样处理后:", evalute(prediction2, test_y))

你可能感兴趣的:(机器学习,本科课程,机器学习,人工智能)