import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Replace a fare of zero with the median non-zero fare of the same passenger class
for i in range(3):
    train_data.loc[(train_data.Fare == 0) & (train_data.Pclass == i + 1), 'Fare'] = \
        train_data[(train_data.Fare != 0) & (train_data.Pclass == i + 1)].Fare.median()
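For reference, the same fill can be written without the explicit loop; this is only a sketch and assumes the same train_data frame:
fares = train_data['Fare'].replace(0, np.nan)                                                # treat zero fares as missing
train_data['Fare'] = fares.fillna(fares.groupby(train_data['Pclass']).transform('median'))  # per-class median of the non-zero fares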
146.06308221585957
# Age distribution by passenger class and by sex, and its relation to survival
fig, ax = plt.subplots(1,2, figsize=(18,8))
sns.violinplot('Pclass','Age',hue='Survived', data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(1,110,10))
sns.violinplot('Sex','Age', hue='Survived', data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(1,110,10))
plt.show()
age = combined_data[['Age','Sex','Embarked','Pclass','Pclass_Fare_Category','Title','Name_len','Fare','Fare_id','Family_size','Family_size_category']]
age_train = age[age['Age'].notnull()]
age_test = age[age['Age'].isnull()]
# Build several models to predict the missing ages
from sklearn import ensemble
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
def fill_age(age_train, age_test):
    age_train_x = age_train.drop(['Age'], axis=1)
    age_train_y = age_train['Age']
    age_test_x = age_test.drop(['Age'], axis=1)

    # model 1: GBM
    print('==========Model GBM==========')
    gbm_reg = GradientBoostingRegressor(random_state=41)
    gbm_param = {'n_estimators': [2000], 'max_depth': [4], 'learning_rate': [0.01], 'max_features': [3]}
    gbm_grid = model_selection.GridSearchCV(gbm_reg, gbm_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    gbm_grid.fit(age_train_x, age_train_y)
    print('GBM best features for age params'+str(gbm_grid.best_params_))
    print('GBM best features for age scores'+str(gbm_grid.best_score_))
    print('GBM train error for age feature regressor'+str(gbm_grid.score(age_train_x, age_train_y)))
    print(age_train_x.shape, age_test_x.shape)
    age_test.loc[:, 'Age_gbm'] = gbm_grid.predict(age_test_x)
    print(age_test['Age_gbm'][:4])

    # model 2: RF
    print('==========Model RF==========')
    rf_reg = RandomForestRegressor()
    rf_param = {'n_estimators': [200], 'max_depth': [5], 'random_state': [0]}
    rf_grid = model_selection.GridSearchCV(rf_reg, rf_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    rf_grid.fit(age_train_x, age_train_y)
    print('RF best features for age params'+str(rf_grid.best_params_))
    print('RF best features for age score'+str(rf_grid.best_score_))
    print('RF train error for age feature regressor'+str(rf_grid.score(age_train_x, age_train_y)))
    age_test.loc[:, 'Age_rf'] = rf_grid.predict(age_test_x)
    print(age_test['Age_rf'][:4])

    # merge the two models
    print('==========Merge Model==========')
    print('shape', age_test['Age'].shape, age_test[['Age_gbm', 'Age_rf']].mode(axis=1).shape)
    # axis=0 averages the two predictions row by row (one value per passenger)
    age_test.loc[:, 'Age'] = np.mean([age_test['Age_gbm'], age_test['Age_rf']], axis=0)
    print('merge age:\n', age_test['Age'][:4])
    age_test.drop(['Age_gbm', 'Age_rf'], axis=1, inplace=True)
    return age_test
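A minimal sketch of the call behind the log below and of writing the predictions back; the notebook's actual call is outside this excerpt, and the write-back relies on the index alignment of the age frames built above:
age_filled = fill_age(age_train, age_test)                                      # produces the GBM/RF/merge log below
combined_data.loc[combined_data['Age'].isnull(), 'Age'] = age_filled['Age']     # index-aligned write-back of the imputed ages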
==========Model GBM==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=25)]: Done 5 out of 10 | elapsed: 35.2s remaining: 35.2s
[Parallel(n_jobs=25)]: Done 10 out of 10 | elapsed: 47.2s finished
GBM best features for age params{'learning_rate': 0.01, 'max_depth': 4, 'max_features': 3, 'n_estimators': 2000}
GBM best features for age scores-130.07827342623582
GBM train error for age feature regressor-63.488346095073126
(1046, 10) (263, 10)
5 36.266057
17 29.768299
19 37.534189
26 27.857612
Name: Age_gbm, dtype: float64
==========Model RF==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=25)]: Done 5 out of 10 | elapsed: 37.7s remaining: 37.7s
[Parallel(n_jobs=25)]: Done 10 out of 10 | elapsed: 47.6s finished
RF best features for age params{'max_depth': 5, 'n_estimators': 200, 'random_state': 0}
RF best features for age score-120.22123963994939
RF train error for age feature regressor-96.82435399344224
5 32.667672
17 31.516429
19 31.493906
26 27.854183
Name: Age_rf, dtype: float64
==========Merge Model==========
shape (263,) (263, 2)
merge age:
5 29.841298
17 29.841298
19 29.841298
26 29.841298
Name: Age, dtype: float64
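Note that the merged ages printed above are identical for every passenger: the original run averaged the two columns into a single scalar. Taking the mean with axis=0, as in the function above, instead gives each passenger the average of its own GBM and RF predictions.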
# 1. Use several models to rank the features and keep the most important ones
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
def get_top_n_features(train_x, train_y, get_feature_num):
    # for Extra Trees, n_jobs=25 raises an error here; n_jobs=1 works fine
    print('========== Extra Trees ==========')
    et_est = ExtraTreesClassifier(random_state=0)
    et_param = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param, n_jobs=1, cv=10, verbose=1)
    et_grid.fit(train_x, train_y)
    print('best ET params for top n features'+str(et_grid.best_params_))
    print('best ET score for top n features'+str(et_grid.best_score_))
    print('best ET training score for top n features'+str(et_grid.score(train_x, train_y)))
    top_n_features_et_sorted = pd.DataFrame({'feature': list(train_x), 'importance': et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_et = top_n_features_et_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Extra Trees')
    print(str(top_n_features_et[:10]))

    print('========== Gradient Boosting ==========')
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param = {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(train_x, train_y)
    print('best GB params for top n features'+str(gb_grid.best_params_))
    print('best GB score for top n features'+str(gb_grid.best_score_))
    print('best GB training score for top n features'+str(gb_grid.score(train_x, train_y)))
    top_n_features_gb_sorted = pd.DataFrame({'feature': list(train_x), 'importance': gb_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_gb = top_n_features_gb_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Gradient Boosting')
    print(str(top_n_features_gb[:10]))

    print('========== Decision Tree ==========')
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(train_x, train_y)
    print('best DT params for top n features'+str(dt_grid.best_params_))
    print('best DT score for top n features'+str(dt_grid.best_score_))
    print('best DT training score for top n features'+str(dt_grid.score(train_x, train_y)))
    top_n_features_dt_sorted = pd.DataFrame({'feature': list(train_x), 'importance': dt_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_dt = top_n_features_dt_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Decision Tree')
    print(str(top_n_features_dt[:10]))

    print('========== Random Forest ==========')
    rf_est = RandomForestClassifier(random_state=0)
    rf_param = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(train_x, train_y)
    print('best RF params for top n features'+str(rf_grid.best_params_))
    print('best RF score for top n features'+str(rf_grid.best_score_))
    print('best RF training score for top n features'+str(rf_grid.score(train_x, train_y)))
    top_n_features_rf_sorted = pd.DataFrame({'feature': list(train_x), 'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_rf = top_n_features_rf_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from Random Forest')
    print(str(top_n_features_rf[:10]))

    print('========== AdaBoost ==========')
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(train_x, train_y)
    print('best Ada params for top n features'+str(ada_grid.best_params_))
    print('best Ada score for top n features'+str(ada_grid.best_score_))
    print('best Ada training score for top n features'+str(ada_grid.score(train_x, train_y)))
    top_n_features_ada_sorted = pd.DataFrame({'feature': list(train_x), 'importance': ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_ada = top_n_features_ada_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from AdaBoost')
    print(str(top_n_features_ada[:10]))

    # merge the five models: union of their top features, plus the stacked importance table
    top_n_feature = pd.concat([top_n_features_rf, top_n_features_ada, top_n_features_et, top_n_features_gb, top_n_features_dt], ignore_index=True).drop_duplicates()
    features_importance = pd.concat([top_n_features_rf_sorted, top_n_features_ada_sorted, top_n_features_et_sorted, top_n_features_gb_sorted, top_n_features_dt_sorted], ignore_index=True)
    return top_n_feature, features_importance
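A hypothetical sketch of the call behind the log below; train_x and train_y are assumed to be the dummified training features and the Survived labels, and get_feature_num=30 is only an illustrative choice:
feature_top_n, feature_importance = get_top_n_features(train_x, train_y, get_feature_num=30)
train_x_selected = pd.DataFrame(train_x, columns=feature_top_n)   # keep only the selected feature columns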
========== Extra Trees ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Done 20 out of 20 | elapsed: 16.7s finished
best ET params for top n features{'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 500}
best ET score for top n features0.8271604938271605
best ET training score for top n features0.9652076318742986
Sample 10 features from Extra Trees
14 Title0
12 Sex0
13 Sex1
2 Name_len
0 Age
1 Fare
11 Pclass2
16 Title2
15 Title1
4 Cabin0
Name: feature, dtype: object
========== Gradient Boosting ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.3min remaining: 42.1s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.6min finished
best GB params for top n features{'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 500}
best GB score for top n features0.7654320987654321
best GB training score for top n features0.9977553310886644
Sample 10 features from Gradient Boosting
0 Age
2 Name_len
1 Fare
14 Title0
10 Pclass1
16 Title2
13 Sex1
3 Family_size
28 Pclass_Fare_Category3
12 Sex0
Name: feature, dtype: object
========== Decision Tree ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.1min remaining: 36.3s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.5min finished
best DT params for top n features{'max_depth': 20, 'min_samples_split': 4}
best DT score for top n features0.7643097643097643
best DT training score for top n features0.9618406285072951
Sample 10 features from Decision Tree
14 Title0
1 Fare
0 Age
2 Name_len
3 Family_size
19 Title5
29 Pclass_Fare_Category4
10 Pclass1
22 Fare_id2
4 Cabin0
Name: feature, dtype: object
========== Random Forest ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.3min remaining: 41.9s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.8min finished
best RF params for top n features{'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
best RF score for top n features0.8294051627384961
best RF training score for top n features0.9809203142536476
Sample 10 Features from Random Forest
2 Name_len
0 Age
1 Fare
13 Sex1
14 Title0
12 Sex0
3 Family_size
11 Pclass2
16 Title2
15 Title1
Name: feature, dtype: object
========== AdaBoost ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.4min remaining: 43.8s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.7min finished
best Ada params for top n features{'learning_rate': 0.01, 'n_estimators': 500}
best Ada score for top n features0.8148148148148148
best Ada training score for top n features0.8170594837261503
Sample 10 Features from AdaBoost
14 Title0
1 Fare
33 Family_size_category2
3 Family_size
11 Pclass2
12 Sex0
13 Sex1
4 Cabin0
5 Cabin1
2 Name_len
Name: feature, dtype: object
from sklearn.model_selection import learning_curve   # lived in sklearn.learning_curve in older scikit-learn releases

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(0.1, 1.0, 5), verbose=0):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see the
        sklearn.model_selection module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')
    return plt
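A hypothetical usage sketch of the helper, assuming train_x_selected and train_y from the feature-selection step; the estimator and its parameters here are only an example:
rf_clf = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=0)
plot_learning_curve(rf_clf, 'RandomForest learning curve', train_x_selected, train_y, cv=10, n_jobs=1)
plt.show()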