import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Replace a fare of zero with the median non-zero fare of the same passenger class
for i in range(3):
    train_data.loc[(train_data.Fare == 0) & (train_data.Pclass == i + 1), 'Fare'] = \
        train_data[(train_data.Fare != 0) & (train_data.Pclass == i + 1)].Fare.median()
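For reference, the same fill can be written without the explicit loop; this is only a sketch and assumes the same train_data frame:
fares = train_data['Fare'].replace(0, np.nan)                                                # treat zero fares as missing
train_data['Fare'] = fares.fillna(fares.groupby(train_data['Pclass']).transform('median'))  # per-class median of the non-zero fares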
146.06308221585957
# Age distribution by passenger class and by sex, and its relation to survival
fig, ax = plt.subplots(1,2, figsize=(18,8))
sns.violinplot('Pclass','Age',hue='Survived', data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(1,110,10))
sns.violinplot('Sex','Age', hue='Survived', data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(1,110,10))
plt.show()
age = combined_data[['Age','Sex','Embarked','Pclass','Pclass_Fare_Category','Title','Name_len','Fare','Fare_id','Family_size','Family_size_category']]
age_train = age[age['Age'].notnull()]
age_test = age[age['Age'].isnull()]
# Build several models to predict the missing ages
from sklearn import ensemble
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
def fill_age(age_train, age_test):
    age_train_x = age_train.drop(['Age'], axis=1)
    age_train_y = age_train['Age']
    age_test_x = age_test.drop(['Age'], axis=1)

    # model 1: GBM
    print('==========Model GBM==========')
    gbm_reg = GradientBoostingRegressor(random_state=41)
    gbm_param = {'n_estimators': [2000], 'max_depth': [4], 'learning_rate': [0.01], 'max_features': [3]}
    gbm_grid = model_selection.GridSearchCV(gbm_reg, gbm_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    gbm_grid.fit(age_train_x, age_train_y)
    print('GBM best features for age params'+str(gbm_grid.best_params_))
    print('GBM best features for age scores'+str(gbm_grid.best_score_))
    print('GBM train error for age feature regressor'+str(gbm_grid.score(age_train_x, age_train_y)))
    print(age_train_x.shape, age_test_x.shape)
    age_test.loc[:, 'Age_gbm'] = gbm_grid.predict(age_test_x)
    print(age_test['Age_gbm'][:4])

    # model 2: RF
    print('==========Model RF==========')
    rf_reg = RandomForestRegressor()
    rf_param = {'n_estimators': [200], 'max_depth': [5], 'random_state': [0]}
    rf_grid = model_selection.GridSearchCV(rf_reg, rf_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    rf_grid.fit(age_train_x, age_train_y)
    print('RF best features for age params'+str(rf_grid.best_params_))
    print('RF best features for age score'+str(rf_grid.best_score_))
    print('RF train error for age feature regressor'+str(rf_grid.score(age_train_x, age_train_y)))
    age_test.loc[:, 'Age_rf'] = rf_grid.predict(age_test_x)
    print(age_test['Age_rf'][:4])

    # merge the two models
    print('==========Merge Model==========')
    print('shape', age_test['Age'].shape, age_test[['Age_gbm', 'Age_rf']].mode(axis=1).shape)
    # axis=0 averages the two predictions row by row (one value per passenger)
    age_test.loc[:, 'Age'] = np.mean([age_test['Age_gbm'], age_test['Age_rf']], axis=0)
    print('merge age:\n', age_test['Age'][:4])
    age_test.drop(['Age_gbm', 'Age_rf'], axis=1, inplace=True)
    return age_test
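A minimal sketch of the call behind the log below and of writing the predictions back; the notebook's actual call is outside this excerpt, and the write-back relies on the index alignment of the age frames built above:
age_filled = fill_age(age_train, age_test)                                      # produces the GBM/RF/merge log below
combined_data.loc[combined_data['Age'].isnull(), 'Age'] = age_filled['Age']     # index-aligned write-back of the imputed ages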
==========Model GBM==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=25)]: Done 5 out of 10 | elapsed: 35.2s remaining: 35.2s
[Parallel(n_jobs=25)]: Done 10 out of 10 | elapsed: 47.2s finished
GBM best features for age params{'learning_rate': 0.01, 'max_depth': 4, 'max_features': 3, 'n_estimators': 2000}
GBM best features for age scores-130.07827342623582
GBM train error for age feature regressor-63.488346095073126
(1046, 10) (263, 10)
5 36.266057
17 29.768299
19 37.534189
26 27.857612
Name: Age_gbm, dtype: float64
==========Model RF==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=25)]: Done 5 out of 10 | elapsed: 37.7s remaining: 37.7s
[Parallel(n_jobs=25)]: Done 10 out of 10 | elapsed: 47.6s finished
RF best features for age params{'max_depth': 5, 'n_estimators': 200, 'random_state': 0}
RF best features for age score-120.22123963994939
RF train error for age feature regressor-96.82435399344224
5 32.667672
17 31.516429
19 31.493906
26 27.854183
Name: Age_rf, dtype: float64
==========Merge Model==========
shape (263,) (263, 2)
merge age:
5 29.841298
17 29.841298
19 29.841298
26 29.841298
Name: Age, dtype: float64
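Note that the merged ages printed above are identical for every passenger: the original run averaged the two columns into a single scalar. Taking the mean with axis=0, as in the function above, instead gives each passenger the average of its own GBM and RF predictions.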
# 1. Use several models to rank the features and keep the most important ones
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
def get_top_n_features(train_x, train_y, get_feature_num):
    # for Extra Trees, n_jobs=25 raises an error here; n_jobs=1 works fine
    print('========== Extra Trees ==========')
    et_est = ExtraTreesClassifier(random_state=0)
    et_param = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param, n_jobs=1, cv=10, verbose=1)
    et_grid.fit(train_x, train_y)
    print('best ET params for top n features'+str(et_grid.best_params_))
    print('best ET score for top n features'+str(et_grid.best_score_))
    print('best ET training score for top n features'+str(et_grid.score(train_x, train_y)))
    top_n_features_et_sorted = pd.DataFrame({'feature': list(train_x), 'importance': et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_et = top_n_features_et_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Extra Trees')
    print(str(top_n_features_et[:10]))

    print('========== Gradient Boosting ==========')
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param = {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(train_x, train_y)
    print('best GB params for top n features'+str(gb_grid.best_params_))
    print('best GB score for top n features'+str(gb_grid.best_score_))
    print('best GB training score for top n features'+str(gb_grid.score(train_x, train_y)))
    top_n_features_gb_sorted = pd.DataFrame({'feature': list(train_x), 'importance': gb_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_gb = top_n_features_gb_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Gradient Boosting')
    print(str(top_n_features_gb[:10]))

    print('========== Decision Tree ==========')
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(train_x, train_y)
    print('best DT params for top n features'+str(dt_grid.best_params_))
    print('best DT score for top n features'+str(dt_grid.best_score_))
    print('best DT training score for top n features'+str(dt_grid.score(train_x, train_y)))
    top_n_features_dt_sorted = pd.DataFrame({'feature': list(train_x), 'importance': dt_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_dt = top_n_features_dt_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Decision Tree')
    print(str(top_n_features_dt[:10]))

    print('========== Random Forest ==========')
    rf_est = RandomForestClassifier(random_state=0)
    rf_param = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(train_x, train_y)
    print('best RF params for top n features'+str(rf_grid.best_params_))
    print('best RF score for top n features'+str(rf_grid.best_score_))
    print('best RF training score for top n features'+str(rf_grid.score(train_x, train_y)))
    top_n_features_rf_sorted = pd.DataFrame({'feature': list(train_x), 'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_rf = top_n_features_rf_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from Random Forest')
    print(str(top_n_features_rf[:10]))

    print('========== AdaBoost ==========')
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(train_x, train_y)
    print('best Ada params for top n features'+str(ada_grid.best_params_))
    print('best Ada score for top n features'+str(ada_grid.best_score_))
    print('best Ada training score for top n features'+str(ada_grid.score(train_x, train_y)))
    top_n_features_ada_sorted = pd.DataFrame({'feature': list(train_x), 'importance': ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_ada = top_n_features_ada_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from AdaBoost')
    print(str(top_n_features_ada[:10]))

    # merge the five models: union of their top features, plus the stacked importance table
    top_n_feature = pd.concat([top_n_features_rf, top_n_features_ada, top_n_features_et, top_n_features_gb, top_n_features_dt], ignore_index=True).drop_duplicates()
    features_importance = pd.concat([top_n_features_rf_sorted, top_n_features_ada_sorted, top_n_features_et_sorted, top_n_features_gb_sorted, top_n_features_dt_sorted], ignore_index=True)
    return top_n_feature, features_importance
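A hypothetical sketch of the call behind the log below; train_x and train_y are assumed to be the dummified training features and the Survived labels, and get_feature_num=30 is only an illustrative choice:
feature_top_n, feature_importance = get_top_n_features(train_x, train_y, get_feature_num=30)
train_x_selected = pd.DataFrame(train_x, columns=feature_top_n)   # keep only the selected feature columns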
========== Extra Trees ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Done 20 out of 20 | elapsed: 16.7s finished
best ET params for top n features{'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 500}
best ET score for top n features0.8271604938271605
best ET training score for top n features0.9652076318742986
Sample 10 features from Extra Trees
14 Title0
12 Sex0
13 Sex1
2 Name_len
0 Age
1 Fare
11 Pclass2
16 Title2
15 Title1
4 Cabin0
Name: feature, dtype: object
========== Gradient Boosting ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.3min remaining: 42.1s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.6min finished
best GB params for top n features{'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 500}
best GB score for top n features0.7654320987654321
best GB training score for top n features0.9977553310886644
Sample 10 features from Gradient Boosting
0 Age
2 Name_len
1 Fare
14 Title0
10 Pclass1
16 Title2
13 Sex1
3 Family_size
28 Pclass_Fare_Category3
12 Sex0
Name: feature, dtype: object
========== Decision Tree ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.1min remaining: 36.3s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.5min finished
best DT params for top n features{'max_depth': 20, 'min_samples_split': 4}
best DT score for top n features0.7643097643097643
best DT training score for top n features0.9618406285072951
Sample 10 features from Decision Tree
14 Title0
1 Fare
0 Age
2 Name_len
3 Family_size
19 Title5
29 Pclass_Fare_Category4
10 Pclass1
22 Fare_id2
4 Cabin0
Name: feature, dtype: object
========== Random Forest ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.3min remaining: 41.9s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.8min finished
best RF params for top n features{'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
best RF score for top n features0.8294051627384961
best RF training score for top n features0.9809203142536476
Sample 10 Features from Random Forest
2 Name_len
0 Age
1 Fare
13 Sex1
14 Title0
12 Sex0
3 Family_size
11 Pclass2
16 Title2
15 Title1
Name: feature, dtype: object
========== AdaBoost ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.4min remaining: 43.8s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.7min finished
best Ada params for top n features{'learning_rate': 0.01, 'n_estimators': 500}
best Ada score for top n features0.8148148148148148
best Ada training score for top n features0.8170594837261503
Sample 10 Features from AdaBoost
14 Title0
1 Fare
33 Family_size_category2
3 Family_size
11 Pclass2
12 Sex0
13 Sex1
4 Cabin0
5 Cabin1
2 Name_len
Name: feature, dtype: object
from sklearn.model_selection import learning_curve   # lived in sklearn.learning_curve in older scikit-learn releases

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(0.1, 1.0, 5), verbose=0):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see the
        sklearn.model_selection module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')
    return plt
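A hypothetical usage sketch of the helper, assuming train_x_selected and train_y from the feature-selection step; the estimator and its parameters here are only an example:
rf_clf = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=0)
plot_learning_curve(rf_clf, 'RandomForest learning curve', train_x_selected, train_y, cv=10, n_jobs=1)
plt.show()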