# -*- coding: utf-8 -*-
# @Author : 陈浩骏, 2017326603075
# Python Version == 3.8.5
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pylab as plot
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
from IPython.core.display import HTML
# Centre rendered PNG outputs in the notebook by injecting a CSS rule.
# The original statement ended with ';'.  In IPython a trailing semicolon
# suppresses the display of the expression's value, so the <style> element
# was built but never sent to the front end.  display() emits it explicitly.
from IPython.display import display

display(HTML("""
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
"""))
# Read the training split, report its (rows, columns) shape and preview the
# first five rows.
trainData = pd.read_csv(filepath_or_buffer='./titanic/train.csv')
print(trainData.shape)
trainData.head()  # head() defaults to n=5
如无特殊说明, 图例中绿色代表存活 (`Survived`), 红色代表不幸罹难 (`Perished`).
# Summary statistics of the training frame; the per-column `count` row is
# what reveals columns with missing values.
trainData.describe()
注意到 `PassengerId.count == 891`, 而 `Age.count == 714`, 即年龄缺失 177 个数据.
采用中位数或随机森林预测对缺失的年龄数据进行补充.
# Impute the missing ages with a random-forest regressor fitted on the rows
# whose age is known.  The simpler alternative the notebook left disabled is a
# median fill:
#     trainData['Age'].fillna(trainData['Age'].median())
ageMissing = trainData['Age'].isnull()
age_df = trainData[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_notnull = age_df[~ageMissing]
age_df_isnull = age_df[ageMissing]

# Column 0 of the value matrix is the target (Age); the rest are predictors.
known = age_df_notnull.values
X, Y = known[:, 1:], known[:, 0]

RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1).fit(X, Y)
predictAges = RFR.predict(age_df_isnull.values[:, 1:])
trainData.loc[ageMissing, ['Age']] = predictAges
trainData['Age'].count()  # equals the row count (891) once every age is filled
# New column: Perished is the complement of Survived (1 = died, 0 = survived).
trainData['Perished'] = 1 - trainData['Survived']

# Select the two outcome columns BEFORE aggregating.  Aggregating the whole
# frame and slicing afterwards also tries to sum/average the string columns
# (Name, Ticket, ...), which is wasted work and raises a TypeError for 'mean'
# on pandas >= 2.0, where non-numeric columns are no longer dropped silently.
outcomeBySex = trainData.groupby('Sex')[['Survived', 'Perished']]
outcomeBySex.sum()
outcomeBySex.mean()

# Survival / death counts by sex
outcomeBySex.sum() \
    .plot(kind='bar', stacked=True, color=['g', 'r'], title='Survival Count Based on Sex', figsize=(16, 12))
# Survival / death rates by sex
outcomeBySex.mean() \
    .plot(kind='bar', stacked=True, color=['g', 'r'], title='Survival Rate/Percentage Based on Sex', figsize=(16, 12))
不难看出, 在数据集中, `Sex == female` 即女性的死亡率较低. 因此加入年龄作为参考因素, 绘制小提琴图 (violin plot).
fig = plt.figure(figsize=(24, 12))
# Age distribution of survivors vs. casualties, drawn as split violins per sex.
# `title` is not a violinplot() parameter: depending on the seaborn version the
# stray keyword is silently dropped or rejected, so the title never appeared.
# violinplot() returns the matplotlib Axes; set the title there instead.
sns.violinplot(x='Sex', y='Age', hue='Survived', data=trainData,
               split=True, palette={0: "r", 1: "g"}) \
    .set_title('Violin Plot on Survival Rate and Death Rate Based on Sex')
得到以下特征
# --- Pclass (passenger class) ---
# The notebook heading was fused onto the first code line (")¶trainData...");
# it is restored here as a comment so the statement parses.
#
# Outcome columns are selected before aggregating so that string columns are
# never summed/averaged (a TypeError for 'mean' on pandas >= 2.0).
outcomeByClass = trainData.groupby('Pclass')[['Survived', 'Perished']]
outcomeByClass.sum()
outcomeByClass.mean()
# Survival / death counts by passenger class
outcomeByClass.sum() \
    .plot(kind='bar', stacked=True, color=['g', 'r'], title='Survival Count Based on Pclass', figsize=(16, 12))
# --- Fare ---
# Check whether passenger class 1 is the expensive or the cheap class by
# plotting the mean ticket fare of each class.  (The heading and this
# explanation were fused onto the code line in the export.)
# 'Fare' is selected before .mean() so only that column is averaged; averaging
# the whole frame first fails on string columns with pandas >= 2.0.
trainData.groupby('Pclass')['Fare'].mean() \
    .plot(kind='bar', color='y', figsize=(16, 12), title='Fare for each Pclass')
验证上述猜想, 1号Pclass等级的客舱售价最高, 约80+美元, 而2, 3等级的客舱售价较低
# Age (x) against ticket fare (y), one scatter series per outcome.
# Marker area is also driven by the fare, so expensive tickets draw larger dots.
plt.figure(figsize=(24, 12))
plt.xlabel('Age')
plt.ylabel('Ticket Fare')
for outcome, colour in ((1, 'green'), (0, 'red')):
    group = trainData[trainData['Survived'] == outcome]
    plt.scatter(group['Age'], group['Fare'], c=colour, s=group['Fare'])
上述图的散点大小代表船票费用 (`Fare`), x 轴代表年龄 (`Age`), y 轴亦代表船票费用.
作以下说明
聚类点1
聚类点2
聚类点3
聚类点4
聚类点1的出现, 表明票价最高的存活率亦最高.
聚类点2的出现, 表明票价最低的中年乘客存活率亦最低, 红点极其密集.
聚类点3的出现, 表明票价适中部分的中年乘客存活率相当可观.
聚类点4的出现, 是最有趣的, 他们属于拥有较低求生技能的一批乘客, 主要为婴幼儿与儿童, 但是存活率亦高.
可以判断婴幼儿与儿童相较于其他乘客, 获得更好的求生/救助资源. 该结论反射的观点也的确是明显受社会认可的(妇女儿童优先).
# Integer age (astype(int) truncates the fractional part).
trainData["AgeInt"] = trainData["Age"].astype(int)
# Survival rate for every individual integer age
avgAge = trainData[["AgeInt", "Survived"]].groupby(['AgeInt'], as_index=False).mean()
sns.barplot(x='AgeInt', y='Survived', data=avgAge)

# Age brackets.  pd.cut() bins are left-open by default, so the first bin
# (0, 6] excluded AgeInt == 0: any passenger younger than one year was given
# a NaN bracket and silently dropped from the statistics below.
# include_lowest=True closes the first bin on the left; explicit labels keep
# the bracket names readable on the plots.
separationPoint = [0, 6, 18, 40, 60, 100]
batchLabels = ['[0, 6]', '(6, 18]', '(18, 40]', '(40, 60]', '(60, 100]']
trainData['AgeBatch'] = pd.cut(trainData['AgeInt'], separationPoint,
                               include_lowest=True, labels=batchLabels)
# observed=False keeps empty brackets in the result (made explicit because the
# default for categorical groupers differs between pandas versions).
batches = trainData.groupby('AgeBatch', observed=False)['Survived'].mean()
# Survival rate per age bracket
batches
batches.plot(kind='bar', color='g', figsize=(16, 12), title='Survival Rate on Age Batches')

# Head-counts per bracket, split by outcome.  sort=False keeps the brackets in
# age order instead of ordering each series by its own frequencies.
survivedtmp = trainData[trainData['Survived']==1]['AgeBatch'].value_counts(sort=False)
perishedtmp = trainData[trainData['Survived']==0]['AgeBatch'].value_counts(sort=False)
dftmp = pd.DataFrame([survivedtmp, perishedtmp])
dftmp.index = ['Survived','Perished']
dftmp.plot(kind='bar', stacked=True, figsize=(16, 12))
上一柱图颜色仅为区分年龄段
上述年龄-存活率分布图更是验证了上面的说法, `(0, 6]` 的年龄段可以获得65%的存活率.
婴幼儿/儿童对应的某些年龄段, 获得了甚至接近100%的存活率.
老人对应的年龄段, 考虑到他们的身体条件, 该存活率表现也足以表明社会救助的确有偏向性.
# --- SibSp ---
# Split the passengers by whether SibSp is non-zero.  In the Kaggle data
# dictionary SibSp counts siblings/spouses aboard (the original comment said
# "children", which does not match the variable names or axis labels below).
# OB-> On board, NOB-> NOT on board
siblOB = trainData[trainData['SibSp'] != 0]
siblNOB = trainData[trainData['SibSp'] == 0]
plt.figure(figsize=(24, 12))

# value_counts() orders its result by frequency, so the hard-coded
# labels/colours were only right while "perished" was the larger group.
# reindex([0, 1]) pins the slice order to Perished (0), Survived (1).
plt.subplot(121)
siblOB['Survived'].value_counts().reindex([0, 1], fill_value=0) \
    .plot(kind='pie', labels=['Perished', 'Survived'], autopct='%.3f%%', colors=['r', 'g'])
plt.xlabel('Sibling onboard')
plt.ylabel('Survival Rate')

plt.subplot(122)
siblNOB['Survived'].value_counts().reindex([0, 1], fill_value=0) \
    .plot(kind='pie', labels=['Perished', 'Survived'], autopct='%.3f%%', colors=['r', 'g'])
plt.xlabel('Sibling NOT onboard')
plt.ylabel('Survival Rate')
# --- Parch ---
# Split the passengers by whether Parch is non-zero (Parch counts
# parents/children aboard in the Kaggle data dictionary).
# OB-> On board, NOB-> NOT on board
parentOB = trainData[trainData['Parch'] != 0]
parentNOB = trainData[trainData['Parch'] == 0]
plt.figure(figsize=(24, 12))
# plt.title('Survival Rate Based on Parents Onboard/Not Onboard')

# Bug fix: both pies previously plotted siblOB / siblNOB (copied from the
# SibSp section), so this figure merely repeated the sibling charts under
# "Parent" labels.  They now use the Parch-based subsets defined above.
# reindex([0, 1]) pins the slice order to Perished (0), Survived (1);
# value_counts() alone orders by frequency, which would swap the hard-coded
# labels/colours whenever survivors outnumber casualties in a subset.
plt.subplot(121)
parentOB['Survived'].value_counts().reindex([0, 1], fill_value=0) \
    .plot(kind='pie', labels=['Perished', 'Survived'], autopct='%.3f%%', colors=['r', 'g'])
plt.xlabel('Parent(s) onboard')
plt.ylabel('Survival Rate')

plt.subplot(122)
parentNOB['Survived'].value_counts().reindex([0, 1], fill_value=0) \
    .plot(kind='pie', labels=['Perished', 'Survived'], autopct='%.3f%%', colors=['r', 'g'])
plt.xlabel('Parent NOT onboard')
plt.ylabel('Survival Rate')
明显可以看出: 有兄弟姐妹/配偶 (`SibSp`) 或父母/子女 (`Parch`) 同船的乘客, 存活率都较对照组 (无相应亲属同船) 高.
将 `trainData` 中数据复制一份至 `heatMapData`, 并去除相关系数较低的和上面新增的无用的字段, 如 `PassengerId` 类, 并将需要列化的数据进行 ONE-HOT 或 BINARY 编码.
对某些数据做 Scaling, 以增大其敏感度.
并且将兄弟姐妹/配偶数量 `SibSp` 与父母/子女数量 `Parch` 归为一个字段 F(amily)M(embers)Count -> "家庭成员数":
家庭成员数 = 兄弟姐妹/配偶数 + 父母/子女数 + 自己
FamilyMembersCount = SibSp + Parch + 1
# Independent working copy used for the correlation heat map and as the model
# feature frame; trainData itself is left untouched.
heatMapData = trainData.copy(deep=True)
# Family size: Parch + SibSp + the passenger themself.
heatMapData['FMCount'] = heatMapData['Parch'] + heatMapData['SibSp'] + 1
# `axis` is passed by keyword: the positional form `drop(labels, 1, ...)` was
# deprecated in pandas 1.3 and is rejected by pandas 2.0.
heatMapData.drop(['Name','Ticket','Cabin','PassengerId','AgeBatch', 'AgeInt', 'Perished', 'SibSp', 'Parch'],
                 axis=1, inplace=True)
# Encode the categorical columns and assign the result back to the frame.
# The original called replace(..., inplace=True) / fillna(..., inplace=True)
# on `heatMapData.Sex` / `heatMapData.Embarked`, i.e. on a column object
# obtained by attribute access; whether that writes through to the frame
# depends on the pandas version (it does not under copy-on-write).
heatMapData['Sex'] = heatMapData['Sex'].map({'male': 0, 'female': 1})
# Rows with no embarkation port pass through map() as NaN and are then given
# code 1 (the code used for 'S'), as in the original; the author notes two
# such rows and considers the impact negligible.
heatMapData['Embarked'] = heatMapData['Embarked'].map({'S': 1, 'C': 2, 'Q': 3}).fillna(1)
heatMapData.head()

# Pairwise correlation of the remaining (now fully numeric) columns.
plt.figure(figsize=(16, 16))
sns.heatmap(heatMapData.astype(float).corr(),linewidths=.4,
            square=True, linecolor='r', annot=True, cmap="RdPu")

# Features / target for the classifier.
xTrain = heatMapData.drop('Survived', axis=1)
yTrain = heatMapData['Survived']
# Read the test split and print both schemas so that column and null-count
# differences between the training features and the raw test frame are visible.
testData = pd.read_csv(filepath_or_buffer='./titanic/test.csv')
xTrain.info()
testData.info()
testData.head()  # head() defaults to n=5
可以观察到, 测试数据并不是训练数据的子集, 测试数据来源有别于训练数据中的891位乘客, 而是另外418位乘客
因为训练数据与测试数据有明显的字段差异(因在上文中, 对年龄的空缺值做了随机森林回归, 以及去除了无用字段).
为保证训练能正常进行, `xTrain` 要与 `testData` (即 `xTest`) 进行同样的处理.
# Repeat the training-set preprocessing on the test split so that both frames
# end up with the same columns and no missing values.

# Missing fares are replaced by the mean fare.  The result is assigned back to
# the frame: the original `testData.Fare.fillna(..., inplace=True)` acted on a
# column object obtained by attribute access, which is not guaranteed to write
# through to the frame (it does not under pandas copy-on-write).
testData['Fare'] = testData['Fare'].fillna(testData['Fare'].mean())

# Impute missing ages with a random-forest regressor.
# NOTE(review): as in the original, the regressor is re-fitted on the test
# rows only rather than reusing the one fitted on the training data.
ageMissing = testData['Age'].isnull()
age_df = testData[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_notnull = age_df.loc[~ageMissing]
age_df_isnull = age_df.loc[ageMissing]
# Column 0 is the target (Age); the remaining columns are the predictors.
X = age_df_notnull.values[:,1:]
Y = age_df_notnull.values[:,0]
RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
RFR.fit(X,Y)
predictAges = RFR.predict(age_df_isnull.values[:,1:])
testData.loc[ageMissing, ['Age']] = predictAges

# Family size: Parch + SibSp + the passenger themself.
testData['FMCount'] = testData['Parch'] + testData['SibSp'] + 1
# `axis` by keyword: positional `drop(labels, 1, ...)` is rejected by pandas 2.0.
testData.drop(['Name','Ticket','Cabin','PassengerId', 'SibSp', 'Parch'], axis=1, inplace=True)
# Same categorical encodings as the training features, assigned back explicitly.
testData['Sex'] = testData['Sex'].map({'male': 0, 'female': 1})
# Missing embarkation ports become NaN in map() and are then given code 1 ('S').
testData['Embarked'] = testData['Embarked'].map({'S': 1, 'C': 2, 'Q': 3}).fillna(1)

xTest = testData.copy()
xTrain.info()
xTest.info()
可以看到训练数据与测试数据字段已经一致, 并且无空值.
引入 `RandomForestClassifier` 进行数据拟合.
即根据前891名乘客的存活情况来预测余下418位乘客的存活情况
# Preview of the training features
print('Training Data Head 5')
xTrain.head(5)
# Preview of the test features
print('Testing Data Head 5')
xTest.head(5)

# oob_score=True makes the forest record its out-of-bag accuracy: each training
# row is scored only by the trees whose bootstrap sample did not contain it.
random_forest = RandomForestClassifier(n_estimators=300, oob_score=True)
random_forest.fit(xTrain, yTrain)
yPredict = random_forest.predict(xTest)

# The original reported random_forest.score(xTrain, yTrain), i.e. accuracy on
# the very rows the forest was fitted on.  That measures memorisation, not
# predictive accuracy, and the test split has no labels to score against.
# The out-of-bag estimate is used instead as the accuracy figure.
predPercentage = random_forest.oob_score_
round(predPercentage*100, 4)
以上为模型预测准确值(%)