天池大赛之工业蒸汽处理(改进版 ---- 0.1235)

导包

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet,RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#支持向量机
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler,PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

数据聚合

# Read the train / test sets (tab-separated) and tag every row with its
# origin so the two frames can be pooled now and split apart again later.
train_data = pd.read_csv('./zhengqi_train.txt', sep='\t')
test_data = pd.read_csv('./zhengqi_test.txt', sep='\t')

train_data["origin"] = "train"
test_data["origin"] = "test"

# Stack train on top of test into one frame with a fresh 0..n-1 index.
data_all = pd.concat([train_data, test_data], axis=0, ignore_index=True)
# View data
data_all
V0 V1 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V2 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V5 V6 V7 V8 V9 origin target
0 0.566 0.016 -0.940 -0.307 -0.073 0.550 -0.484 0.000 -1.707 -1.162 -0.573 -0.991 -0.143 0.610 -0.400 -0.063 0.356 0.800 -0.223 0.796 0.168 -0.450 0.136 0.407 0.109 -0.615 0.327 -4.627 -4.789 -5.101 -2.608 -3.508 0.452 -0.901 -1.812 -2.360 -0.436 -2.114 train 0.175
1 0.968 0.437 0.188 -0.455 -0.134 1.109 -0.488 0.000 -0.977 -1.162 -0.571 -0.836 0.066 0.588 -0.802 -0.063 0.357 0.801 -0.144 1.057 0.338 0.671 -0.128 0.566 0.124 0.032 0.600 -0.843 0.160 0.364 -0.335 -0.730 0.194 -0.893 -1.566 -2.360 0.332 -2.114 train 0.676
2 1.013 0.568 0.874 -0.051 -0.072 0.767 -0.493 -0.212 -0.618 -0.897 -0.564 -0.558 0.235 0.576 -0.477 -0.063 0.355 0.961 -0.067 0.915 0.326 1.287 -0.009 0.370 0.361 0.277 -0.116 -0.843 0.160 0.364 0.765 -0.589 0.112 -0.797 -1.367 -2.360 0.396 -2.114 train 0.633
3 0.733 0.368 0.011 0.102 -0.014 0.769 -0.371 -0.162 -0.429 -0.897 -0.574 -0.564 0.283 0.272 -0.491 -0.063 0.352 1.435 0.113 0.898 0.277 1.298 0.015 0.165 0.417 0.279 0.603 -0.843 -0.065 0.364 0.333 -0.112 0.599 -0.679 -1.200 -2.086 0.403 -2.114 train 0.206
4 0.684 0.638 -0.251 0.570 0.199 -0.349 -0.342 -0.138 -0.391 -0.897 -0.572 -0.394 0.260 0.106 0.309 -0.259 0.352 0.881 0.221 0.386 0.332 1.289 0.183 0.209 1.078 0.328 0.418 -0.843 -0.215 0.364 -0.280 -0.028 0.337 -0.454 -1.073 -2.086 0.314 -2.114 train 0.384
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4808 -1.362 -1.553 -2.551 0.518 0.396 0.928 1.452 0.867 -5.143 1.227 -3.573 0.107 -3.096 -0.088 0.227 2.953 -1.538 -0.630 -3.072 -1.120 -1.674 0.525 0.171 -0.444 -4.488 -5.793 -4.050 -1.187 -0.852 -2.131 -2.564 0.597 0.381 1.375 -4.854 -5.331 -4.074 -3.838 test NaN
4809 -2.698 -3.452 -2.525 0.311 -1.786 1.871 1.885 1.135 -5.774 1.227 -0.965 0.193 -3.620 -0.506 -0.574 3.149 -1.479 -0.204 -3.432 -2.101 -1.773 -0.446 1.297 -1.066 -0.613 -7.698 -0.674 -1.187 -0.852 -2.131 -2.564 1.215 -1.385 1.378 -4.927 -5.103 -4.393 -1.683 test NaN
4810 -2.615 -3.564 -2.529 -0.029 -1.151 1.976 2.337 0.504 -4.752 1.492 -1.568 0.301 -3.402 0.109 -0.541 3.511 -1.085 1.057 -2.409 0.477 -1.585 -0.447 0.552 -0.422 0.125 -6.111 0.275 -1.851 -1.548 -1.537 -2.544 1.612 -1.272 1.121 -4.223 -4.315 -5.196 -3.407 test NaN
4811 -2.661 -3.646 -2.560 -0.028 -1.512 1.520 2.243 0.206 -4.200 1.492 -1.282 -0.036 -3.271 -1.015 -0.203 3.511 -1.084 0.800 -2.339 0.050 -1.410 -0.447 0.318 -0.699 1.086 -5.268 0.683 -1.645 -1.471 -1.537 -2.549 1.431 -1.270 1.116 -3.716 -3.809 -4.735 -2.976 test NaN
4812 -2.321 -3.037 0.056 0.306 -1.154 0.847 2.221 0.206 -3.960 1.492 -1.213 0.592 -3.214 -1.502 0.153 3.609 -1.088 0.799 -2.339 -0.077 -1.242 -0.442 0.323 -1.594 -0.774 -5.211 1.618 -1.703 -1.471 -1.537 -1.123 1.988 -0.910 1.259 -3.616 -3.747 -4.368 -2.976 test NaN

4813 rows × 40 columns

特征探索

# Inspect the numeric feature columns — everything except the last two
# columns ('origin' and 'target').
data_all.columns[:-2]
Index(['V0', 'V1', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17',
       'V18', 'V19', 'V2', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26',
       'V27', 'V28', 'V29', 'V3', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35',
       'V36', 'V37', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9'],
      dtype='object')
#38个特征将一些不重要的删除
#特征分布情况,训练和测试数据特征分布不均匀,删除
# For every feature, overlay the train (red) and test (blue) kernel
# density estimates; features whose densities diverge badly get dropped.
for column in data_all.columns[0:-2]:
    train_vals = data_all[column][(data_all["origin"] == "train")]
    test_vals = data_all[column][(data_all["origin"] == "test")]
    ax = sns.kdeplot(train_vals, color="Red", shade=True)
    ax = sns.kdeplot(test_vals, ax=ax, color="Blue", shade=True)
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    ax.legend(["train", "test"])
    plt.show()

天池大赛之工业蒸汽处理(改进版 ---- 0.1235)_第1张图片

# One FacetGrid per feature: side-by-side train/test histograms.
fig = plt.figure(figsize=(10, 10))
for feature in data_all.columns[:-2]:
    grid = sns.FacetGrid(data_all, col='origin')
    grid.map(sns.distplot, feature)

天池大赛之工业蒸汽处理(改进版 ---- 0.1235)_第2张图片

# The kde plots show 'V11', 'V17', 'V22' and 'V5' distributed very
# differently between train and test, so remove them from the pool.
unstable_features = ['V11', 'V17', 'V22', 'V5']
data_all.drop(columns=unstable_features, inplace=True)

相关性系数corr

# Correlation heat-map of the training data: mcorr[i][j] is the Pearson
# correlation between any two columns (features plus 'target').
plt.figure(figsize=(20, 16))  # set figure width and height
# Exclude the non-numeric 'origin' tag before corr(): pandas >= 2.0 no
# longer silently drops non-numeric columns and would raise here.
mcorr = train_data.select_dtypes('number').corr()
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin bool is the correct dtype for the mask.
mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix, same shape as mcorr
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # matplotlib colormap
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # pairwise similarity
# BUG FIX: plt.show was referenced without parentheses, so it never ran.
plt.show()

在这里插入图片描述

# Use the correlation matrix to find the 7 features that barely correlate
# with the target (|corr| < 0.1) — candidates for removal.
cond = mcorr.loc['target'].abs()<0.1
drop_labels = mcorr.loc['target'][cond].index
# ['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34']

# After eyeballing their distributions, only 'V14' and 'V21' of the
# weakly correlated candidates look bad enough to actually drop.
weak_features = ['V14', 'V21']
data_all.drop(weak_features, axis=1, inplace=True)
# Six columns removed in total (4 unstable + 2 weakly correlated).
data_all.shape
(4813, 34)

对数据进行归一化

# Scale every feature column (all but 'origin' and 'target') into [0, 1].
feature_frame = data_all.iloc[:, :-2]
scaler = MinMaxScaler()
data = scaler.fit_transform(feature_frame)
data
array([[0.77577505, 0.723449  , 0.22174265, ..., 0.43285165, 0.66410771,
        0.73528007],
       [0.83374189, 0.77878549, 0.37388724, ..., 0.43285165, 0.7548128 ,
        0.73528007],
       [0.84023071, 0.79600421, 0.46641489, ..., 0.43285165, 0.76237156,
        0.73528007],
       ...,
       [0.31708724, 0.25289169, 0.0074184 , ..., 0.17367095, 0.10192512,
        0.64706284],
       [0.31045422, 0.24211356, 0.00323712, ..., 0.24075302, 0.1563718 ,
        0.67646858],
       [0.35948089, 0.32216088, 0.35608309, ..., 0.24897256, 0.19971655,
        0.67646858]])
# Wrap the scaled ndarray back into a DataFrame carrying the feature names.
feature_names = data_all.columns[:-2]
data_all_norm = pd.DataFrame(data, columns=feature_names)
data_all_norm
V0 V1 V10 V12 V13 V15 V16 V18 V19 V2 V20 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V6 V7 V8 V9
0 0.775775 0.723449 0.221743 0.570828 0.694786 0.402245 0.487950 0.375125 0.380238 0.582197 0.537946 0.792169 0.569153 0.375250 0.730736 0.902936 0.279341 0.406834 0.665193 0.603714 0.729379 0.679479 0.000000 0.000000 0.242424 0.000000 0.018343 0.571839 0.508616 0.432852 0.664108 0.735280
1 0.833742 0.778785 0.373887 0.564418 0.778544 0.402245 0.569779 0.375374 0.401962 0.611588 0.534996 0.792304 0.569419 0.381824 0.762915 0.924734 0.437095 0.371596 0.689434 0.605676 0.796005 0.721792 0.374950 0.499949 0.800020 0.289702 0.436025 0.544381 0.541225 0.432852 0.754813 0.735280
2 0.840231 0.796004 0.466415 0.570933 0.727300 0.372870 0.610021 0.376246 0.440925 0.635354 0.533387 0.792035 0.611893 0.388232 0.745407 0.923195 0.523783 0.387480 0.659552 0.636673 0.821234 0.610818 0.374950 0.499949 0.800020 0.429901 0.457224 0.535653 0.567603 0.432852 0.762372 0.735280
3 0.799856 0.769716 0.350013 0.577028 0.727600 0.379798 0.631207 0.375000 0.440084 0.642104 0.492625 0.791633 0.737722 0.403212 0.743312 0.916912 0.525331 0.390683 0.628297 0.643997 0.821440 0.722257 0.374950 0.477220 0.800020 0.374841 0.528943 0.587484 0.589740 0.469177 0.763198 0.735280
4 0.792790 0.805205 0.314675 0.599412 0.560084 0.383123 0.635467 0.375249 0.463910 0.638869 0.470367 0.791633 0.590656 0.412200 0.680187 0.923965 0.524064 0.413107 0.635005 0.730447 0.826485 0.693583 0.374950 0.462067 0.800020 0.296712 0.541573 0.559600 0.606575 0.469177 0.752687 0.735280
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4808 0.497765 0.517219 0.004451 0.620113 0.751423 0.522378 0.102791 0.001246 0.534128 0.166924 0.444355 0.538214 0.189541 0.138149 0.494514 0.666752 0.416549 0.411506 0.535447 0.002485 0.196169 0.001085 0.340864 0.397717 0.545455 0.005608 0.635544 0.564283 0.105382 0.038977 0.234440 0.617657
4809 0.305119 0.267613 0.007958 0.390815 0.892718 0.559512 0.032059 0.326271 0.546181 0.093236 0.388308 0.546125 0.302628 0.108189 0.373567 0.654058 0.279904 0.561799 0.440616 0.509286 0.000000 0.524334 0.340864 0.397717 0.545455 0.005608 0.728462 0.376330 0.095705 0.069203 0.196764 0.764686
4810 0.317087 0.252892 0.007418 0.457545 0.908451 0.472080 0.146620 0.251122 0.561317 0.123893 0.470770 0.598954 0.637377 0.193326 0.691407 0.678164 0.279764 0.462360 0.538802 0.605807 0.163423 0.671420 0.275069 0.327407 0.606061 0.008157 0.788152 0.388357 0.189024 0.173671 0.101925 0.647063
4811 0.310454 0.242114 0.003237 0.419609 0.840126 0.430788 0.208497 0.286765 0.514085 0.142315 0.320059 0.599088 0.569153 0.199151 0.638762 0.700603 0.279764 0.431127 0.496570 0.731494 0.250232 0.734656 0.295482 0.335185 0.606061 0.007520 0.760938 0.388570 0.256230 0.240753 0.156372 0.676469
4812 0.359481 0.322161 0.356083 0.457230 0.739287 0.430788 0.235400 0.295364 0.602102 0.150330 0.254760 0.598552 0.568888 0.199151 0.623104 0.722144 0.280467 0.431794 0.360116 0.488229 0.256101 0.879572 0.289734 0.335185 0.606061 0.189268 0.844685 0.426884 0.269486 0.248973 0.199717 0.676469

4813 rows × 32 columns

# Re-attach the 'origin' and 'target' columns, joining on the row index.
tags = data_all.iloc[:, -2:]
data_all_norm = pd.merge(data_all_norm, tags,
                         left_index=True, right_index=True)
data_all_norm.describe()
V0 V1 V10 V12 V13 V15 V16 V18 V19 V2 V20 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V6 V7 V8 V9 target
count 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 4813.000000 2888.000000
mean 0.694172 0.721357 0.348518 0.578507 0.612372 0.402251 0.679294 0.446542 0.519158 0.602300 0.456147 0.744438 0.356712 0.393796 0.632582 0.881401 0.342653 0.388683 0.603139 0.589459 0.792709 0.628824 0.458493 0.483790 0.762873 0.332385 0.545795 0.523743 0.748823 0.745740 0.715607 0.879536 0.126353
std 0.144198 0.131443 0.134882 0.105088 0.149835 0.138561 0.112095 0.124627 0.140166 0.140628 0.134083 0.134085 0.265512 0.083226 0.123294 0.128221 0.140731 0.133475 0.152462 0.130786 0.102976 0.155003 0.099095 0.101020 0.102037 0.127456 0.150356 0.106430 0.132560 0.132577 0.118105 0.068244 0.983966
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -3.044000
25% 0.626676 0.679416 0.284327 0.532892 0.519928 0.299016 0.629414 0.399302 0.414436 0.514414 0.370475 0.719362 0.040616 0.347870 0.566515 0.888575 0.278778 0.292445 0.503888 0.550092 0.761816 0.562461 0.409037 0.454490 0.727273 0.270584 0.445647 0.478182 0.683324 0.696938 0.664934 0.852903 -0.350250
50% 0.729488 0.752497 0.366469 0.591635 0.627809 0.391437 0.700258 0.456256 0.540294 0.617072 0.447305 0.788817 0.381736 0.388815 0.641228 0.916015 0.279904 0.375734 0.614270 0.594428 0.815055 0.643056 0.454518 0.499949 0.800020 0.347056 0.539317 0.535866 0.774125 0.771974 0.742884 0.882377 0.313000
75% 0.790195 0.799553 0.432965 0.641971 0.719958 0.489954 0.753279 0.501745 0.623125 0.700464 0.522660 0.792706 0.574728 0.427597 0.713599 0.932555 0.413031 0.471837 0.710474 0.650798 0.852229 0.719777 0.500000 0.511365 0.800020 0.414861 0.643061 0.585036 0.842259 0.836405 0.790835 0.941189 0.793250
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 2.538000
def scale_minmax(data):
    """Min-max normalize *data* into [0, 1] (column-wise for a DataFrame)."""
    lo = data.min()
    span = data.max() - lo
    return (data - lo) / span
# Use Box-Cox to make each continuous feature smoother / closer to a
# normal distribution, and visualise before vs. after (6 plots per feature).
from scipy import stats
fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4*fcols,4*frows))
i = 0

for col in data_all_norm.columns[:10]:
    dat = data_all_norm[[col, 'target']].dropna()

#     Plot 1: original distribution (dist = distribution) with a fitted
#     normal curve overlaid.
    i+=1
    plt.subplot(frows,fcols,i)
    sns.distplot(dat[col],fit = stats.norm);
    plt.title(col+' Original')
    plt.xlabel('')

#     Plot 2: Q-Q probability plot; skewness measures how far the
#     distribution deviates from normality.
    i+=1
    plt.subplot(frows,fcols,i)
    _=stats.probplot(dat[col], plot=plt)# probability plot / skewness
    plt.title('skew='+'{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')

#     Plot 3: scatter of the raw feature against the target.
    i+=1
    plt.subplot(frows,fcols,i)
#     plt.plot(dat[var], dat['target'],'.',alpha=0.5)
    plt.scatter(dat[col],dat['target'],alpha=0.5)
    plt.title('corr='+'{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))

#     !!! The data is transformed from here on !!!
#     Plot 4: distribution after Box-Cox (the +1 shift keeps the input
#     strictly positive, as Box-Cox requires).
    i+=1
    plt.subplot(frows,fcols,i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna()+1)
    trans_var = scale_minmax(trans_var)      
    sns.distplot(trans_var , fit=stats.norm);
    plt.title(col+' Tramsformed')
    plt.xlabel('')

#     Plot 5: skewness after the transform.
    i+=1
    plt.subplot(frows,fcols,i)
    _=stats.probplot(trans_var, plot=plt)
    plt.title('skew='+'{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

#     Plot 6: scatter against the target after the transform.
    i+=1
    plt.subplot(frows,fcols,i)
    plt.plot(trans_var, dat['target'],'.',alpha=0.5)
    plt.title('corr='+'{:.2f}'.format(np.corrcoef(trans_var,dat['target'])[0][1]))

天池大赛之工业蒸汽处理(改进版 ---- 0.1235)_第3张图片

# Apply the Box-Cox transform in place to every feature column — a
# standard statistical-modelling transform that makes the data more
# normal/standardized — then rescale each column back into [0, 1].
# (+1 keeps all values strictly positive, as Box-Cox requires.)
for col in data_all_norm.columns[:-2]:
    transformed, _best_lambda = stats.boxcox(data_all_norm[col] + 1)
    data_all_norm[col] = scale_minmax(transformed)
data_all_norm
V0 V1 V10 V12 V13 V15 V16 V18 V19 V2 V20 V23 V24 V25 V26 V27 V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V4 V6 V7 V8 V9 origin target
0 0.507483 0.357070 0.134959 0.303471 0.561751 0.539735 0.136013 0.239798 0.272914 0.442658 0.694629 0.425929 0.592470 0.626176 0.552721 0.394651 0.377657 0.559002 0.581357 0.323667 0.267157 0.440715 0.000000 0.000000 0.026476 0.000000 0.020896 0.353680 0.165759 0.094056 0.304061 0.253539 train 0.175
1 0.610419 0.445015 0.253597 0.297055 0.668704 0.539735 0.197424 0.240004 0.292263 0.474668 0.692110 0.426178 0.592730 0.632878 0.597390 0.488267 0.547131 0.522334 0.608782 0.325784 0.376810 0.496931 0.355631 0.432280 0.467466 0.175457 0.466089 0.325746 0.190998 0.094056 0.430326 0.253539 train 0.676
2 0.622895 0.475812 0.336900 0.303577 0.602125 0.509062 0.234823 0.240729 0.328031 0.501306 0.690732 0.425680 0.634207 0.639320 0.572815 0.481023 0.630446 0.539067 0.575040 0.360554 0.427728 0.359039 0.355631 0.432280 0.467466 0.290367 0.487365 0.317154 0.213520 0.094056 0.442528 0.253539 train 0.633
3 0.548433 0.429476 0.233498 0.309766 0.602504 0.516390 0.256694 0.239694 0.327245 0.508997 0.654865 0.424935 0.755301 0.654039 0.569916 0.452488 0.631879 0.542400 0.540475 0.369141 0.428167 0.497575 0.355631 0.409624 0.467466 0.242724 0.558547 0.370220 0.233980 0.113283 0.443879 0.253539 train 0.206
4 0.536158 0.492987 0.204775 0.333233 0.409379 0.519887 0.261284 0.239901 0.349778 0.505305 0.634468 0.424935 0.613508 0.662648 0.486902 0.484632 0.630707 0.565360 0.547831 0.482005 0.439062 0.458936 0.355631 0.394675 0.467466 0.180709 0.570959 0.341058 0.250548 0.113283 0.426944 0.253539 train 0.384
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4808 0.184452 0.144638 0.002294 0.356001 0.633015 0.655194 0.009837 0.000579 0.419213 0.092622 0.609861 0.127612 0.205965 0.302314 0.286901 0.033158 0.526439 0.563741 0.442151 0.000452 0.007696 0.000245 0.322227 0.332586 0.152596 0.002675 0.662217 0.345855 0.012300 0.002622 0.035277 0.128271 test NaN
4809 0.074357 0.036169 0.004113 0.155465 0.830440 0.687976 0.002518 0.200542 0.431591 0.048667 0.553814 0.132960 0.324364 0.246144 0.187745 0.028743 0.378309 0.702706 0.348424 0.232703 0.000000 0.271468 0.322227 0.332586 0.152596 0.002675 0.750654 0.182800 0.010893 0.005091 0.026747 0.298339 test NaN
4810 0.079352 0.032715 0.003833 0.203011 0.854218 0.608691 0.015871 0.145112 0.447329 0.066348 0.634842 0.173746 0.658944 0.395001 0.501062 0.037668 0.378146 0.613591 0.445590 0.325925 0.005566 0.430530 0.258323 0.267339 0.203097 0.003899 0.806584 0.191546 0.027408 0.017464 0.010723 0.152833 test NaN
4811 0.076558 0.030332 0.001667 0.175014 0.753601 0.568571 0.026862 0.170668 0.398930 0.077385 0.479359 0.173862 0.592470 0.404054 0.436769 0.048282 0.378146 0.583346 0.402911 0.483508 0.012412 0.514964 0.278067 0.274424 0.203097 0.003592 0.781167 0.191703 0.044201 0.029609 0.019058 0.181492 test NaN
4812 0.098781 0.051178 0.238555 0.202767 0.617374 0.568571 0.032709 0.177030 0.490804 0.082286 0.400802 0.173400 0.592210 0.404054 0.418681 0.061087 0.378960 0.584005 0.274131 0.215143 0.013031 0.751303 0.272500 0.274424 0.203097 0.105645 0.858952 0.221029 0.048101 0.031384 0.027365 0.181492 test NaN

4813 rows × 34 columns

过滤异常值

# Fit a cross-validated Ridge model on the training rows; samples whose
# prediction error is larger than one standard deviation of the target
# are flagged as outliers.
ridge = RidgeCV(alphas=[0.0001,0.001,0.01,0.1,0.2,0.5,1,2,3,4,5,10,20,30,50])

is_train = data_all_norm['origin'] == 'train'

X_train = data_all_norm[is_train].iloc[:, :-2]
# Ground-truth target values.
y_train = data_all_norm[is_train]['target']

# No model fits the data 100% — residuals always exist.
ridge.fit(X_train, y_train)
# Predictions deviate from the truth; very large deviations are outliers.
y_ = ridge.predict(X_train)

cond = abs(y_ - y_train) > y_train.std()
print(cond.sum())
# Visualise the flagged outliers (red) against the rest of the data.
plt.figure(figsize=(12, 6))

# Left: predicted vs. true target values.
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)

# Middle: residuals vs. true target values.
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')

# Right: histogram of the residuals with the outliers overlaid in red.
axes = plt.subplot(1, 3, 3)
residuals = y_train - y_
residuals.plot.hist(bins=50, ax=axes)
residuals.loc[cond].plot.hist(bins=50, ax=axes, color='r')
40






天池大赛之工业蒸汽处理(改进版 ---- 0.1235)_第4张图片

# Remove the outlier rows found above, then rebuild the final
# train / test feature matrices and the training target.
outlier_index = cond[cond].index
data_all_norm.drop(outlier_index, axis=0, inplace=True)

cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']

cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:, :-2]

使用不同算法进行计算,最后求取平均值!!

# Train several different regressors and use the equal-weight mean of
# their test-set predictions as the final answer.
estimators = {}
estimators['forest'] = RandomForestRegressor(n_estimators=300)
estimators['gbdt'] = GradientBoostingRegressor(n_estimators=300)
estimators['ada'] = AdaBoostRegressor(n_estimators=300)
estimators['extreme'] = ExtraTreesRegressor(n_estimators=300)
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor(n_estimators=300)
estimators['xgb'] = XGBRegressor(n_estimators=300)

# Collect every model's prediction, then average element-wise.
result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    result.append(y_)
y_ = np.mean(result, axis=0)

# BUG FIX: modern pandas writes a header row for Series.to_csv by
# default; the submission file must contain only the prediction values,
# so force header=False (the behavior older pandas gave implicitly).
pd.Series(y_).to_csv('./norm.txt', index=False, header=False)
[19:51:26] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

你可能感兴趣的:(天池大赛之工业蒸汽处理(改进版 ---- 0.1235))