# 1. Import libraries
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.manifold import TSNE
# r2_score was used throughout the modeling sections but never imported.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import ShuffleSplit, cross_val_score, train_test_split
from sklearn.svm import SVR
from xgboost import XGBRegressor
sns.set_style('darkgrid')
plt.rcParams['font.sans-serif'] = ['SimHei']
# 2. Load data
data1 = pd.read_excel('./附件一.xlsx')
data_285 = pd.read_excel('./285_313.xlsx',sheet_name='Sheet2')
data_313 = pd.read_excel('./313.xlsx')
data4= pd.read_excel('./附件四:354个操作变量信息_改.xlsx')
# 3. Dimensionality reduction
# 3.1 Use Attachment 4 to find data points outside the allowed ranges
standard = []
for each in data4['取值范围']:
v = each.split('_')
standard.append(v)
standard_val = []
for row in standard:
a = []
for j in row:
a.append(eval(j))
standard_val.append(a)
standard_dic = {
}
for i,each in enumerate(data4['位号']):
standard_dic[each] = standard_val[i]
standard_result = {
}
for k,v in standard_dic.items():
col_val = data1[k]
thre1 = v[0]
thre2 = v[1]
index = []
for inx,j in enumerate(col_val):
if j>thre2 or j<thre1:
index.append(inx)
standard_result[k]=index
bad_features = {
}
for k,v in standard_result.items():
if len(v) !=0:
bad_features[k]=v
bad_feature_names = list(bad_features.keys())
bad_feature_vals = []
for i in bad_features.values():
n = len(i)
bad_feature_vals.append(n)
print(bad_feature_vals)
bad_features_total = pd.DataFrame({
'Bad Features':bad_feature_names,'Number of bad features':bad_feature_vals})
bad_features_total_sort = bad_features_total.sort_values(by='Number of bad features',ascending=False)
plt.figure(figsize=(10,8),dpi=100)
sns.barplot('Bad Features','Number of bad features',data=bad_features_total_sort)
plt.xticks(rotation=90)
plt.xlabel('特征名',fontsize=14,fontweight='bold')
plt.ylabel('异常值数',fontsize=14,fontweight='bold')
plt.savefig('./异常值占比.jpg')
plt.show()
X = data1_copy.drop('RON损失\n(不是变量)',axis=1)
# 3.2 Find features containing many zero values
def missing_data(data):
"""将原始数据集中为0的值全部转为nan
Input:
data:原始数据
return:
data_:缺失值转化后的数据集
"""
columns = list(data.columns)
index_list={
}
for each in columns:
index=[]
col = data[each]
for inx,v in enumerate(col):
if v == 0:
index.append(inx)
index_list[each]=index
final_index = {
}
for key in index_list.keys():
if len(index_list[key]) != 0:
final_index[key] = index_list[key]
data_ = data
for each in final_index.keys():
data_[each].iloc[final_index[each]] = np.nan
return data_
if __name__ == "__main__":
Data = missing_data(data1)
print('Data Size:{}'.format(Data.shape))
print('----------------------------------------------------------------------------------------------')
print('Missing proportion:\n',Data.isnull().mean().sort_values(ascending=False).head(33))
Missing_proportion = pd.DataFrame({
'Proportion':Data.isnull().mean().sort_values(ascending=False).head(32)})
plt.figure(figsize=(12,8),dpi=100)
plt.rcParams['font.sans-serif'] = ['SimHei']
sns.barplot(Missing_proportion.index,Missing_proportion.Proportion)
plt.xlabel('特征名',fontsize=12,fontweight='bold')
plt.ylabel('比例',fontsize=12,fontweight='bold')
plt.xticks(rotation=90,fontsize=8)
plt.savefig('./Missing proportion.jpg')
plt.show()
# 3.3 Drop features with more than 10% zero values or more than 5 outliers
bad_feature_names = list(bad_features.keys())
bad_feature_vals = []
for i in bad_features.values():
n = len(i)
bad_feature_vals.append(n)
bad_features_total = pd.DataFrame({
'Bad Features':bad_feature_names,'Number of bad features':bad_feature_vals})
bad_features_total_sort = bad_features_total.sort_values(by='Number of bad features',ascending=False)
missing_name = list(Missing_proportion.index)
missing_standard = []
for each in missing_name:
thre = standard_dic[each]
thre1 = thre[0]
thre2 = thre[1]
if (thre1<0 and thre2>=0) or (thre1==0):
missing_standard.append(each)
miss_list = list(Missing_proportion.loc[Missing_proportion['Proportion']>0.1].index)
outlinear = list(bad_features_total_sort.loc[bad_features_total_sort['Number of bad features']>5]['Bad Features'])
delete_feature = set([])
for m in miss_list:
delete_feature.add(m)
for o in outlinear:
delete_feature.add(o)
X_drop = X.drop(delete_feature,axis=1)
# 3.4 Select 35 features with a random-forest regressor + RFE
X_drop_norm = (X_drop-X_drop.mean())/X_drop.std()
X_train,X_test,y_train,y_test = train_test_split(X_drop_norm,Y,test_size=0.3)
RF = RandomForestRegressor(random_state=0,n_jobs=-1)
rfe = RFE(RF,n_features_to_select=35,step=1)
rfe.fit(X_train,y_train)
features_35 = list(X_train.columns[rfe.support_])
X_norm_35 = X_drop_norm[features_35]
# 3.5 Use Pearson correlation to drop features highly correlated with too many others
corr = {
}
X_index = list(X_norm_35_corr.index)
for inx,i in enumerate(list(X_norm_35_corr.columns)):
corr_list = []
val = X_norm_35_corr[i]
feature = X_index[inx]
for j,each in enumerate(val):
if inx!=j and each>0.5:
corr_list.append(list(X_norm_35_corr.columns)[j])
corr[i] = corr_list
final_corr = {
}
for k,v in corr.items():
if len(v)>0:
final_corr[k]=v
num_dict = {
}
for k,v in final_corr.items():
n = len(v)
num_dict[k] = n
sorted_num_dict = sorted(num_dict.items(),key=lambda x:x[1],reverse=True)
delete_features = ['辛烷值RON.1','S-ZORB.AT-0009.DACA.PV','S-ZORB.PT_7107.DACA','S-ZORB.PT_7103.DACA','S-ZORB.TC_2801.PV','S-ZORB.FC_2801.PV']
X_norm_final = X_norm_35.drop(delete_features,axis=1)
plt.figure(figsize=(24,24))
sns.heatmap(X_norm_final.corr(),square=True,linewidth=4,linecolor='black',annot_kws={
'size':12})
plt.savefig('./最后的特征相关性.jpg')
plt.show()
# 4. Modeling for Question 3 (RON loss)
X_train,X_test,y_train,y_test = train_test_split(X_norm_final,Y,test_size=0.2,shuffle=True)
mae_score = {
}
mse_score = {
}
R2_score = {
}
lr = LinearRegression()
lr.fit(X_train,y_train)
lr_pred = lr.predict(X_test)
mae_score['LR'] = mean_absolute_error(y_test,lr_pred)
mse_score['LR'] = mean_squared_error(y_test,lr_pred)
R2_score['LR'] = r2_score(y_test,lr_pred)
rf = RandomForestRegressor(random_state=5)
rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)
mae_score['RF'] = mean_absolute_error(y_test,rf_pred)
mse_score['RF'] = mean_squared_error(y_test,rf_pred)
R2_score['RF'] = r2_score(y_test,rf_pred)
LS = Lasso(alpha=0.0005,random_state=5)
LS.fit(X_train,y_train)
LS_pred = LS.predict(X_test)
mae_score['LS'] = mean_absolute_error(y_test,LS_pred)
mse_score['LS'] = mean_squared_error(y_test,LS_pred)
R2_score['LS'] = r2_score(y_test,LS_pred)
svr = SVR()
svr.fit(X_train,y_train)
svr_pred = svr.predict(X_test)
mae_score['SVR'] = mean_absolute_error(y_test,svr_pred)
mse_score['SVR'] = mean_squared_error(y_test,svr_pred)
R2_score['SVR'] = r2_score(y_test,svr_pred)
ridge =Ridge(alpha=0.002,random_state=5)
ridge.fit(X_train,y_train)
ridge_pred = ridge.predict(X_test)
mae_score['Ridge'] = mean_absolute_error(y_test,ridge_pred)
mse_score['Ridge'] = mean_squared_error(y_test,ridge_pred)
R2_score['Ridge'] = r2_score(y_test,ridge_pred)
GBR =GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
max_depth=4, random_state=5)
GBR.fit(X_train,y_train)
GBR_pred = GBR.predict(X_test)
mae_score['GBR'] = mean_absolute_error(y_test,GBR_pred)
mse_score['GBR'] = mean_squared_error(y_test,GBR_pred)
R2_score['GBR'] = r2_score(y_test,GBR_pred)
print('MAE:-------------------------')
print(mae_score)
print('MSE:-------------------------')
print(mse_score)
print('R2:--------------------------')
print(R2_score)
# 4.1 10-fold cross-validation
n_folds = 10
cross_score = {
}
scores = cross_val_score(lr, X_norm_final, Y, scoring='neg_mean_squared_error',
cv=n_folds)
lr_mae_scores = np.sqrt(-scores)
cross_score['LinearRegression'] =lr_mae_scores.mean().round(decimals=3)
print('For LR model:')
print('Mean RMSE = ' + str(lr_mae_scores.mean().round(decimals=3)))
print('Error std deviation = ' +str(lr_mae_scores.std().round(decimals=3)))
scores = cross_val_score(rf, X_norm_final, Y, scoring='neg_mean_squared_error',
cv=n_folds)
rf_mae_scores = np.sqrt(-scores)
cross_score['RandomForest'] =rf_mae_scores.mean().round(decimals=3)
print('For RF model:')
print('Mean RMSE = ' + str(rf_mae_scores.mean().round(decimals=3)))
print('Error std deviation = ' +str(rf_mae_scores.std().round(decimals=3)))
scores = cross_val_score(LS, X_norm_final, Y , scoring='neg_mean_squared_error',
cv=n_folds)
ls_mae_scores = np.sqrt(-scores)
cross_score['Lasso'] =ls_mae_scores.mean().round(decimals=3)
print('For LS model:')
print('Mean RMSE = ' + str(ls_mae_scores.mean().round(decimals=3)))
print('Error std deviation = ' +str(ls_mae_scores.std().round(decimals=3)))
scores = cross_val_score(svr,X_norm_final, Y , scoring='neg_mean_squared_error',
cv=n_folds)
svr_mae_scores = np.sqrt(-scores)
cross_score['SVR'] =svr_mae_scores.mean().round(decimals=3)
print('For svr model:')
print('Mean RMSE = ' + str(svr_mae_scores.mean().round(decimals=3)))
print('Error std deviation = ' +str(svr_mae_scores.std().round(decimals=3)))
scores = cross_val_score(ridge,X_norm_final, Y , scoring='neg_mean_squared_error',
cv=n_folds)
ridge_mae_scores = np.sqrt(-scores)
cross_score['Ridge'] =ridge_mae_scores.mean().round(decimals=3)
print('For ridge model:')
print('Mean RMSE = ' + str(ridge_mae_scores.mean().round(decimals=3)))
print('Error std deviation = ' +str(ridge_mae_scores.std().round(decimals=3)))
scores = cross_val_score(GBR, X_norm_final, Y , scoring='neg_mean_squared_error',
cv=n_folds)
gbr_mae_scores = np.sqrt(-scores)
cross_score['Gradient Boosting Regression'] =gbr_mae_scores.mean().round(decimals=3)
print('For GBR model:')
print('Mean RMSE = ' + str(gbr_mae_scores.mean().round(decimals=3)))
print('Error std deviation = ' +str(gbr_mae_scores.std().round(decimals=3)))
model_names = list(cross_score.keys())
model_RMSE = list(cross_score.values())
plt.figure(figsize=(12,10))
sns.barplot(model_names,model_RMSE)
plt.xlabel('模型名称',fontsize=14,fontweight='bold')
plt.ylabel('RMES',fontsize=14,fontweight='bold')
plt.savefig('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/数模题/图/交叉验证中各个模型的平均误差.jpg')
plt.show()
# 4.2 Hyperparameter tuning
rmse_list = []
for i in range(100,1000,50):
final_rf = RandomForestRegressor(n_estimators=i,oob_score=True,random_state=5)
final_rf.fit(X_train,y_train)
scores = cross_val_score(final_rf, X_norm_final, Y, scoring='neg_mean_squared_error',
cv=n_folds)
final_rf_mae_scores = np.sqrt(-scores)
rmse = final_rf_mae_scores.mean().round(decimals=3)
rmse_list.append(rmse)
plt.figure(figsize=(14,12))
sns.relplot(np.arange(100,1000,50),rmse_list,kind='line')
plt.xlabel('决策树个数',fontsize=14,fontweight='bold')
plt.ylabel('RMSE',fontsize=14,fontweight='bold')
plt.savefig('./不同数量树情况下的RMSE.jpg')
plt.show()
# 4.3 Final model for Question 3
final_rf = RandomForestRegressor(n_estimators=550,oob_score=True,random_state=5)
final_rf.fit(X_train,y_train)
# 5.1 Modeling for Question 4 (sulfur content)
Y_2 = data1['硫含量,μg/g.1']
X_train_2,X_test_2,y_train_2,y_test_2 = train_test_split(X_norm_final,Y_2,test_size=0.2,shuffle=True)
mae_score_2 = {
}
mse_score_2 = {
}
R2_score_2 = {
}
lr_2 = LinearRegression()
lr_2.fit(X_train_2,y_train_2)
lr_pred_2 = lr_2.predict(X_test_2)
mae_score_2['LR'] = mean_absolute_error(y_test_2,lr_pred_2)
mse_score_2['LR'] = mean_squared_error(y_test_2,lr_pred_2)
R2_score_2['LR'] = r2_score(y_test_2,lr_pred_2)
rf_2 = RandomForestRegressor(random_state=5)
rf_2.fit(X_train_2,y_train_2)
rf_pred_2 = rf_2.predict(X_test_2)
mae_score_2['RF'] = mean_absolute_error(y_test_2,rf_pred_2)
mse_score_2['RF'] = mean_squared_error(y_test_2,rf_pred_2)
R2_score_2['RF'] = r2_score(y_test_2,rf_pred_2)
LS_2 = Lasso(alpha=0.0005,random_state=5)
LS_2.fit(X_train_2,y_train_2)
LS_pred_2 = LS_2.predict(X_test_2)
mae_score_2['LS'] = mean_absolute_error(y_test_2,LS_pred_2)
mse_score_2['LS'] = mean_squared_error(y_test_2,LS_pred_2)
R2_score_2['LS'] = r2_score(y_test_2,LS_pred_2)
svr_2 = SVR()
svr_2.fit(X_train_2,y_train_2)
svr_pred_2 = svr_2.predict(X_test_2)
mae_score_2['SVR'] = mean_absolute_error(y_test_2,svr_pred_2)
mse_score_2['SVR'] = mean_squared_error(y_test_2,svr_pred_2)
R2_score_2['SVR'] = r2_score(y_test_2,svr_pred_2)
ridge_2 =Ridge(alpha=0.002,random_state=5)
ridge_2.fit(X_train_2,y_train_2)
ridge_pred_2 = ridge_2.predict(X_test_2)
mae_score_2['Ridge'] = mean_absolute_error(y_test_2,ridge_pred_2)
mse_score_2['Ridge'] = mean_squared_error(y_test_2,ridge_pred_2)
R2_score_2['Ridge'] = r2_score(y_test_2,ridge_pred_2)
GBR_2 =GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
max_depth=4, random_state=5)
GBR_2.fit(X_train_2,y_train_2)
GBR_pred_2 = GBR_2.predict(X_test_2)
mae_score_2['GBR'] = mean_absolute_error(y_test_2,GBR_pred_2)
mse_score_2['GBR'] = mean_squared_error(y_test_2,GBR_pred_2)
R2_score_2['GBR'] = r2_score(y_test_2,GBR_pred_2)
print('MAE:-------------------------')
print(mae_score_2)
print('MSE:-------------------------')
print(mse_score_2)
print('R2:--------------------------')
print(R2_score_2)
# 5.2 Hyperparameter tuning for the Question-4 model
C = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2]
mse_list = []
for c in C:
svr = SVR(kernel='rbf',C=c)
svr.fit(X_train_2,y_train_2)
pred = svr.predict(X_test_2)
mse_list.append(mean_squared_error(y_test_2,pred))
plt.plot(C,mse_list)
plt.show()
# 5.3 Final model for Question 4
final_svr = SVR(kernel='rbf',C=1.75)
final_svr.fit(X_train_2,y_train_2)
# 5.4 Optimization modeling for Question 4
# 5.4.1 Determine candidate values of the main operating variables from Attachment 4
step_list = []
object_standard = {
}
for inx,each in enumerate(data4['位号']):
if each in list(X_norm_final.columns)[2:]:
object_standard[each] = standard_dic[each]
step = data4['Δ值'][inx]
step_list.append(step)
object_region = {
}
count = 0
for each,v in list(object_standard.items()):
b = []
if step_list[count]>0:
step = int(step_list[count]*100)
thre1 = int(v[0]*100)
thre2 = int(v[1]*100)
b.append(thre1-step)
a = [i for i in range(thre1,thre2,step)]
a.append(thre2+step)
for j in a:
b.append(j)
object_region[each] = np.array(b)/100
count+=1
elif step_list[count]<0:
b = [-0.6,-0.5,-0.4,-0.3,-0.2,-0.15,-0.05]
object_region[each] = np.array(b)
count+=1
# 5.4.2 Fully random optimization
def cal_dp(x,y,model):
pred = model.predict(x)
drop = (y - pred)/y
return drop
def cal_sul(x,model):
return model.predict(x)
def get_random(object_val):
r = []
for k,v in object_val.items():
d = random.choice(v)
r.append(d)
return r
def get_random_v(object_val,index,inx):
v= list(object_val.values())[index]
return v[inx]
def random_optimize(Y,index,model1,model2,original_data,object_val):
x = original_data[index].reshape(1,-1)
y = Y[index]
original_norm = (original_data - original_data.mean())/original_data.std()
x_norm = original_norm[index].reshape(1,-1)
d_p = cal_dp(x_norm,y,model1)
sul = cal_sul(x_norm,model2)
while True:
if d_p<0.3 or sul>5:
r = get_random(object_val)
original_data[index][2:] = np.array(r)
x = original_data[index].reshape(1,-1)
original_norm = (original_data - original_data.mean())/original_data.std()
x_norm = original_norm[index].reshape(1,-1)
d_p = cal_dp(x_norm,y,model1)
sul = cal_sul(x_norm,model2)
else:
print('got!')
return d_p,sul,x
break
if __name__ == '__main__':
def cal_dp(x,y,model):
pred = model.predict(x)
drop = (y - pred)/y
return drop
def cal_sul(x,model):
return model.predict(x)
def get_random(object_val):
r = []
for k,v in object_val.items():
d = random.choice(v)
r.append(d)
return r
def get_random_v(object_val,index,inx):
v= list(object_val.values())[index]
return v[inx]
def random_optimize(Y,index,model1,model2,original_data,object_val):
x = original_data[index].reshape(1,-1)
y = Y[index]
original_norm = (original_data - original_data.mean())/original_data.std()
x_norm = original_norm[index].reshape(1,-1)
d_p = cal_dp(x_norm,y,model1)
sul = cal_sul(x_norm,model2)
while True:
if d_p<0.2 or sul>5:
r = get_random(object_val)
original_data[index][2:] = np.array(r)
x = original_data[index].reshape(1,-1)
original_norm = (original_data - original_data.mean())/original_data.std()
x_norm = original_norm[index].reshape(1,-1)
d_p = cal_dp(x_norm,y,model1)
sul = cal_sul(x_norm,model2)
else:
print('got!')
return d_p,sul,x
break
if __name__ == '__main__':
d_p_list = []
sul_list = []
x_list = []
for i in range(len(original_data)):
d_p,sul,x = random_optimize(np.array(Y2),i,final_rf,final_svr,np.array(original_data),object_region)
d_p_list.append(d_p)
sul_list.append(sul)
x_list.append(x)