以下练习使用的数据均来自 seaborn 库提供的示例数据集,默认通过网络获取;如果下载缓慢或失败,可以在 GitHub 上搜索 seaborn-data,下载到本地后通过 data_home 参数传入所在目录即可。
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
泰坦尼克号海难幸存状况分析
# Load the Titanic example dataset into a DataFrame. data_home is the local
# directory holding a copy of seaborn-data; adjust it to your own path.
data = sns.load_dataset("titanic", data_home='/Volumes/Code/notebooks/seaborn-data')
# Preview the first rows (displayed when run as a notebook cell).
data.head()
不同舱位等级中幸存和遇难的乘客比例
# Stacked bar chart of victims (red) and survivors (green) per cabin class,
# with every segment annotated by its share of that class.
pclasses = []
surviveds = [[], []]  # surviveds[0]: victims per class, surviveds[1]: survivors
# Group by a scalar key rather than a one-element list so the group keys are
# plain labels; newer pandas yields 1-tuples when iterating a list-keyed
# groupby. observed=False spells out the historical default for the
# categorical 'class' column.
for pclass, items in data.groupby('class', observed=False):
    pclasses.append(pclass)
    surviveds[0].append(int((items['survived'] == 0).sum()))
    surviveds[1].append(int((items['survived'] == 1).sum()))
# Draw the victims first, then stack the survivors on top of them.
plt.bar(pclasses, surviveds[0], color='r', width=0.3)
plt.bar(pclasses, surviveds[1], bottom=surviveds[0], color='g', width=0.3)
# Place each percentage label at the vertical centre of its segment.
for pclass, dead, alive in zip(pclasses, surviveds[0], surviveds[1]):
    total = dead + alive
    if total == 0:
        continue  # an empty class has no share to annotate
    plt.text(pclass, dead // 2, '%.2f%%' % ((dead / total) * 100), ha='center')
    plt.text(pclass, dead + alive // 2, '%.2f%%' % ((alive / total) * 100), ha='center')
plt.xticks(pclasses, pclasses)
plt.ylim([0, 600])
plt.legend(['die', 'survive'], loc='upper right')
plt.grid(axis='y', color='gray', linestyle=':', linewidth=2)
plt.show()
从图中可以看出,低等舱的人死亡最多
不同性别的幸存比例
换一种统计方式
# Stacked bar chart of victims (red) and survivors (green) per sex, with every
# segment annotated by its share of that sex.
sexs = ['female', 'male']
# Rows: sex; columns: survived (0 = died, 1 = survived); values: passenger
# counts. size() counts rows directly, and reindex pins the row/column order
# while filling any missing combination with 0.
counts = data.groupby(['sex', 'survived']).size().unstack(fill_value=0)
counts = counts.reindex(index=sexs, columns=[0, 1], fill_value=0)
dead = counts[0].tolist()
alive = counts[1].tolist()
# Draw the victims first, then stack the survivors on top of them.
plt.bar(sexs, dead, color='r', width=0.3)
plt.bar(sexs, alive, bottom=dead, color='g', width=0.3)
# Place each percentage label at the vertical centre of its segment.
for sex, n_dead, n_alive in zip(sexs, dead, alive):
    total = n_dead + n_alive
    if total == 0:
        continue  # an empty group has no share to annotate
    plt.text(sex, n_dead // 2, '%.2f%%' % ((n_dead / total) * 100), ha='center')
    plt.text(sex, n_dead + n_alive // 2, '%.2f%%' % ((n_alive / total) * 100),
             ha='center')
plt.xticks(sexs, sexs)
plt.ylim([0, 600])
plt.legend(['die', 'survive'], loc='upper left')
plt.grid(axis='y', color='gray', linestyle=':', linewidth=2)
plt.show()
从图中可以看出,女性幸存者比例还是很大的,果然妇女儿童先行
幸存和遇难乘客的票价分布
# Box plot of ticket fares, one box for victims and one for survivors.
surviveds = []
fares = []
# Group by a scalar key: with a one-element list, newer pandas yields 1-tuple
# keys such as (0,), which are always truthy and would label both boxes
# 'survived'.
for survived, items in data.groupby('survived'):
    surviveds.append(survived)
    fares.append(items['fare'])
str_sur = ['survived' if flag else 'die' for flag in surviveds]
plt.boxplot(x=fares, patch_artist=True, showmeans=True,
            medianprops={'linestyle': '--', 'color': 'orange'})
# Boxes sit at positions 1..N by default. Labelling them through xticks avoids
# boxplot's `labels` argument, which newer Matplotlib renames to `tick_labels`.
plt.xticks(range(1, len(str_sur) + 1), str_sur)
plt.show()
整体来讲,存活的人票价相对都高一些,这也解释了上面低等舱的人死亡过多的情况
幸存和遇难乘客的年龄分布
# Box plot of passenger ages, one box for victims and one for survivors.
surviveds = []
ages = []
# Drop the rows whose age is NaN before plotting.
temp = data.dropna(subset=['age'], how='any')
# Group by a scalar key: with a one-element list, newer pandas yields 1-tuple
# keys such as (0,), which are always truthy and would label both boxes
# 'survived'.
for survived, items in temp.groupby('survived'):
    surviveds.append(survived)
    ages.append(items['age'])
str_sur = ['survived' if flag else 'die' for flag in surviveds]
plt.boxplot(x=ages, patch_artist=True, showmeans=True,
            medianprops={'linestyle': '--', 'color': 'orange'})
# Boxes sit at positions 1..N by default. Labelling them through xticks avoids
# boxplot's `labels` argument, which newer Matplotlib renames to `tick_labels`.
plt.xticks(range(1, len(str_sur) + 1), str_sur)
plt.show()
幸存者和死亡者平均年龄基本相同
不同上船港口的乘客舱位等级分布
# Grouped bar chart: for every embarkation port, the number of passengers in
# each cabin class, with the count written above each bar.
embarkeds = []
pclasses = []  # one [first, second, third] count list per port
# Scalar group keys keep the port labels plain values instead of the 1-tuples
# newer pandas yields for one-element list keys.
for embarked, items in data.groupby('embarked'):
    embarkeds.append(embarked)
    # observed=False keeps every category of the categorical 'class' column,
    # so each port always gets three counts in category order (0 if absent).
    pclasses.append(items.groupby('class', observed=False).size().tolist())
width = 0.25
x = np.arange(len(embarkeds))
# Three bars per port, shifted left / centred / shifted right around x.
plt.bar(x - width, [item[0] for item in pclasses], width=0.2, color='r')
plt.bar(x, [item[1] for item in pclasses], width=0.2, color='g')
plt.bar(x + width, [item[2] for item in pclasses], width=0.2, color='b')
for val, item in zip(x, pclasses):
    print(item)
    # Write each count 5 units above the top of its bar.
    plt.text(val - width, item[0] + 5, '%s' % item[0], ha='center')
    plt.text(val, item[1] + 5, '%s' % item[1], ha='center')
    plt.text(val + width, item[2] + 5, '%s' % item[2], ha='center')
plt.xticks(x, embarkeds)
plt.legend(['First', 'Second', 'Third'])
plt.grid(axis='y', color='gray', linestyle=':', linewidth=2)
plt.xlabel('embarked')
plt.ylabel('pclass')
plt.show()
幸存和遇难乘客同船兄弟姐妹及配偶的数量分布
# Box plot of the 'sibsp' column, one box per value of 'survived'.
sibsps = []
survived = []
# Scalar group keys keep the labels as plain 0 / 1 instead of the 1-tuples
# newer pandas yields for one-element list keys.
for survive, items in data.groupby('survived'):
    survived.append(survive)
    sibsps.append(items['sibsp'])
plt.boxplot(x=sibsps, patch_artist=True, showmeans=True,
            medianprops={'linestyle': '--', 'color': 'orange'})
# Boxes sit at positions 1..N by default. Labelling them through xticks avoids
# boxplot's `labels` argument, which newer Matplotlib renames to `tick_labels`.
plt.xticks(range(1, len(survived) + 1), survived)
plt.xlabel('survived')
plt.ylabel('sibsp')
plt.show()
幸存和遇难乘客父母子女的数量分布
# Box plot of the 'parch' column, one box per value of 'survived'.
parchs = []
survived = []
# Scalar group keys keep the labels as plain 0 / 1 instead of the 1-tuples
# newer pandas yields for one-element list keys.
for survive, items in data.groupby('survived'):
    survived.append(survive)
    parchs.append(items['parch'])
plt.boxplot(x=parchs, patch_artist=True, showmeans=True,
            medianprops={'linestyle': '--', 'color': 'orange'})
# Boxes sit at positions 1..N by default. Labelling them through xticks avoids
# boxplot's `labels` argument, which newer Matplotlib renames to `tick_labels`.
plt.xticks(range(1, len(survived) + 1), survived)
plt.xlabel('alive')
plt.ylabel('parch')
plt.show()
单独乘船与否和幸存之间有没有联系
# Side-by-side bars of victims (red) and survivors (green) for passengers
# travelling with company (x = 0) and travelling alone (x = 1).
x = np.array([0, 1])
# Rows: alone (False, True); columns: survived (0 = died, 1 = survived);
# values: passenger counts. reindex pins the order and fills gaps with 0.
counts = data.groupby(['alone', 'survived']).size().unstack(fill_value=0)
counts = counts.reindex(index=[False, True], columns=[0, 1], fill_value=0)
width = 0.1
# Victims go to the left of each tick, survivors to the right.
plt.bar(x - width, counts[0].tolist(), color='r', width=width * 2)
plt.bar(x + width, counts[1].tolist(), color='g', width=width * 2)
plt.xlabel('Alone')
plt.ylabel('Alive')
plt.xticks(x, ['No Alone', 'Alone'])
plt.legend(['die', 'survive'])
plt.show()
从图中可以看出,非单独乘船的死亡率和存活率相当,单独乘船的死亡率要高于存活率
是否成年男性和幸存之间有没有联系
# Stacked bar chart of victims (red) and survivors (green) for passengers who
# are not adult males ('Not Adult') versus adult males ('Adult'), with every
# segment annotated by its share of that group.
groups = ['Not Adult', 'Adult']
# Rows: adult_male (False, True); columns: alive ('no', 'yes'); values:
# passenger counts. reindex pins the order and fills gaps with 0.
counts = data.groupby(['adult_male', 'alive']).size().unstack(fill_value=0)
counts = counts.reindex(index=[False, True], columns=['no', 'yes'], fill_value=0)
dead = counts['no'].tolist()
alive = counts['yes'].tolist()
# Draw the victims first, then stack the survivors on top of them.
plt.bar(groups, dead, color='r', width=0.3)
plt.bar(groups, alive, bottom=dead, color='g', width=0.3)
# Place each percentage label at the vertical centre of its segment.
for group, n_dead, n_alive in zip(groups, dead, alive):
    total = n_dead + n_alive
    if total == 0:
        continue  # an empty group has no share to annotate
    plt.text(group, n_dead // 2, '%.2f%%' % ((n_dead / total) * 100), ha='center')
    plt.text(group, n_dead + n_alive // 2,
             '%.2f%%' % ((n_alive / total) * 100),
             ha='center')
plt.xticks(groups, groups)
plt.ylim([0, 600])
plt.legend(['die', 'survive'], loc='upper left')
plt.grid(axis='y', color='gray', linestyle=':', linewidth=2)
plt.show()
从图中可以看出,非成年男性存活率明显高于成年男性,再一次印证了妇女儿童先行的策略