先导入基本的模块
#导入模块 import numpy as np import pandas as pd from scipy import stats import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline plt.rc("font",family="SimHei",size="12") #解决中文无法显示的问题
导入数据
#导入CSV数据 train=pd.read_csv('F:python/titanic_data.csv')
字段含义:
PassengerId:乘客编号
Survived:乘客是否存活
Pclass:乘客所在的船舱等级
Name:乘客姓名
Sex:乘客性别
Age:乘客年龄
SibSp:乘客的兄弟姐妹和配偶数量
Parch:乘客的父母与子女数量
Ticket:票的编号
Fare:票价
Cabin:座位号
Embarked:乘客登船码头
查查数据分布
#使用info,describe train.info() train.describe()
查看缺失值
#null train.isnull().sum()
区别类别变量和连续变量
#连续变量和类别变量分开 num_features=train.select_dtypes(include=[np.number]) categ_features=train.select_dtypes(include=[np.object]) num_features=num_features.columns categ_features=categ_features.columns
类别变量分析(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])主要分析性别和乘客登船码头
#类别变量的每个类别频数可视化 def count_plot(x, **kwargs): sns.countplot(x=x) x=plt.xticks(rotation=90) f = pd.melt(train, value_vars=['Sex','Embarked']) g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5) g = g.map(count_plot, "value")
查看这二者和存活的关系如何
#画图 pd.crosstab(train["Sex"],train["Survived"]).plot(kind="bar") pd.crosstab(train['Embarked'],train["Survived"]).plot(kind="bar")
处理连续变量(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])
#每个数字特征得分布可视化 f = pd.melt(train, value_vars=num_features) g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False) g = g.map(sns.distplot, "value")
存活画图
#使用pandas的画图 train["Survived"].value_counts().plot(kind="bar")
乘客所在的船舱等级
#乘客所在的船舱等级画图 train['Pclass'].value_counts().plot(kind="bar") #和存活做二维表画图 pd.crosstab(train['Pclass'],train["Survived"]).plot(kind="bar")
年龄处理
#画条形图 train["Age"].hist() #按照五岁一个区间去分 cut=[i*5 for i in range(1,20)] train.age=pd.cut(train.Age,cut) a=pd.crosstab(train.age,train.Survived) a.plot(kind="bar") a['rate']=a[1]/(a[1]+a[0]) a['rate'].plot(kind="bar")
效果并不明显,再分一次
#重新分区间 cut=[0,5,15,20,30,40,50,60,100] train.age=pd.cut(train.Age,cut) a=pd.crosstab(train.age,train.Survived) a.plot(kind="bar") a['rate']=a[1]/(a[1]+a[0]) a['rate'].plot(kind="bar")
SibSp:乘客的兄弟姐妹和配偶数量
Parch:乘客的父母与子女数量
pd.crosstab(train["SibSp"],train["Survived"]).plot(kind="bar") pd.crosstab(train["Parch"],train["Survived"]).plot(kind="bar")
票价
#使用箱型图查看二者分布如何 fig,ay = plt.subplots() Fare1 = train.Fare[train.Survived == 1] Fare0 = train.Fare[train.Survived == 0] plt.boxplot((Fare1,Fare0),labels=('Survived','Not Survived')) ay.set_ylim([-10,150]) ay.set_title("Boxplot of Fare")
train["Fare"].hist()
变量之间相关性分析
#corr corr=train.corr() corr["Survived"].sort_values()
将类别变量全部扩展
dummy_Pclass = pd.get_dummies(train.Pclass, prefix='Pclass') dummy_Sex = pd.get_dummies(train.Sex, prefix='Sex') dummy_Embarked = pd.get_dummies(train.Embarked, prefix='Embarked') dummy_Parch = pd.get_dummies(train.Parch, prefix='Parch') dummy_SibSp = pd.get_dummies(train.SibSp, prefix='SibSp') dummy_Age = pd.get_dummies(train.age, prefix='Age') dummy_Cabin = pd.get_dummies(train.Cabin, prefix='Cabin')