关注公众号:『AI学习星球
』
回复:奥运会数据分析
即可获取数据下载。
算法学习
、4对1辅导
、论文辅导
或核心期刊
可以通过公众号
或加v:codebiubiubiu
滴滴我
本项目是对120年来的奥运会数据集(夏季奥运会)的简单分析。
主要探讨的是以下三个方面:
奥运会里的男性与女性运动员
奥运会历年来的Top
中国的奥运会历史
该数据集包含两个文件:
athlete_events.csv:参赛运动员的基本个人信息(性别、年龄、身高、体重等)和奖牌结果
noc_regions.csv:国家奥委会3个字母的代码与对应国家信息
文件athlete_events.csv中包含15个字段,具体信息如下:
字段名称 | 字段含义 |
---|---|
ID | 给每个运动员的唯一ID |
Name | 运动员名字 |
Sex | 性别 |
Age | 年龄 |
Height | 身高 |
Weight | 体重 |
Team | 所代表的国家队 |
NOC | 国家奥委会3个字母的代码 |
Games | 年份与季节 |
Year | 比赛年份 |
Season | 比赛季节 |
City | 举办城市 |
Sport | 运动类别 |
Event | 比赛项目 |
Medal | 奖牌 |
文件noc_regions.csv中包含3个字段,具体信息如下:
字段名称 | 字段含义 |
---|---|
NOC | 国家奥委会3个字母的代码 |
Region | 国家 |
Notes | 地区 |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotly.graph_objs import *
import plotly.graph_objs as go
import colorlover as cl
from plotly.offline import init_notebook_mode, iplot
# Load the per-athlete results table; the path is relative to the working
# directory.
f_p = 'athlete_events.csv'
athlete_events = pd.read_csv(f_p)

# Quick inspection of the frame. The bare expressions below only display
# something when each is evaluated as the last line of a notebook cell.
athlete_events.head()
athlete_events.shape
# Notebook output recorded for the line above: (271116, 15)
athlete_events.isnull().sum()   # missing values per column
athlete_events.info()
athlete_events.describe()

# Number of distinct sports, followed by the distinct sport names.
print(' Total of',athlete_events['Sport'].nunique(),'unique sports were played. \n \n Following is the list:\n \n', athlete_events['Sport'].unique())
from wordcloud import WordCloud, STOPWORDS

# Words from the wordcloud package's stop list are left out of the cloud.
stopwords = set(STOPWORDS)


def show_wordcloud(data, title=None):
    """Draw a word cloud built from *data* using matplotlib.

    Args:
        data: Source of the text. A pandas ``Series`` contributes every
            non-null value (joined with spaces); any other object is
            converted with ``str()``.
        title: Optional figure title. When falsy, no title is drawn.
    """
    if isinstance(data, pd.Series):
        # str(Series) is the display repr: for a long Series pandas prints
        # only a few head/tail rows plus a "Name: ..., Length: ..., dtype: ..."
        # footer, so the cloud would be built from that snippet instead of
        # the real values. Join the actual values instead.
        text = ' '.join(data.dropna().astype(str))
    else:
        text = str(data)

    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1,  # fixed seed so repeated runs give the same layout
    ).generate(text)

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()


show_wordcloud(athlete_events['Sport'])
# Participant rows per sex code, most frequent code first.
sex_counts = athlete_events['Sex'].value_counts()

# Build each label from the code it belongs to, so labels and values cannot
# get out of step if the frequency order of the codes ever changes.
# NOTE(review): assumes the column uses the codes 'M' and 'F' -- confirm
# against the CSV. Any other code is shown unchanged.
sex_names = {'M': 'Male', 'F': 'Female'}
sex_labels = [sex_names.get(code, code) for code in sex_counts.index]

# Donut chart (pie with a hole) of the share of each sex among participants.
fig = {
    "data": [
        {
            "values": sex_counts,
            "labels": sex_labels,
            'marker': {'colors': ['rgb(175, 49, 35)',
                                  'rgb(177, 180, 34)']},
            "name": "Sex Ratio of Participants",
            "hoverinfo": "label+percent+name",
            "hole": .4,
            "type": "pie"
        }],
    "layout": {
        "title": "Sex Ratio of Participants"
    }
}
iplot(fig, filename='donut')
# Rows in which the medal won is gold.
is_gold = athlete_events['Medal'] == 'Gold'
df_medals = athlete_events.loc[is_gold]

# Gold-medal rows per team, keeping the 20 teams with the most rows.
cnt_srs = df_medals['Team'].value_counts().head(20)

# One bar per team, all drawn in the same colour.
bar_marker = {'color': "blue", 'reversescale': True}
trace = go.Bar(x=cnt_srs.index, y=cnt_srs.values, marker=bar_marker)
layout = go.Layout(title='Top 20 countries with Maximum Gold Medals')

data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="medal")
# Number of athlete-event rows recorded for each sport, largest first.
cnt_srs = athlete_events['Sport'].value_counts()

# Bars are coloured by their own height on the reversed 'Picnic' scale.
bar_marker = {
    'color': cnt_srs.values,
    'colorscale': 'Picnic',
    'reversescale': True,
}
trace = go.Bar(x=cnt_srs.index, y=cnt_srs.values, marker=bar_marker)
layout = go.Layout(title='Most Popular Sport')

data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="sport")
# Rows whose team is the United States.
df_usa = athlete_events.loc[(athlete_events['Team'] == 'United States')]

# Gold-medal rows only. The explicit copy makes this an independent frame, so
# the column assignment below does not write into a slice of another frame
# (which triggers pandas' SettingWithCopyWarning).
df_usa_medal = df_usa.loc[df_usa['Medal'] == 'Gold'].copy()

# Turn the 'Gold' label into the number 1 so the column can be summed.
medal_map = {'Gold': 1}
df_usa_medal['Medal'] = df_usa_medal['Medal'].map(medal_map)

# Gold-medal rows per sport, then the ten sports with the highest totals.
df_usa_sport = df_usa_medal.groupby(['Sport'], as_index=False)['Medal'].agg('sum')
df_usa_sport = df_usa_sport.sort_values(['Medal'], ascending=False)
df_usa_sport = df_usa_sport.head(10)
# Fill colour for each funnel section, top to bottom.
colors = ['#91BBF4', '#91F4F4', '#F79981', '#F7E781', '#C0F781','rgb(32,155,160)', 'rgb(253,93,124)', 'rgb(28,119,139)', 'rgb(182,231,235)', 'rgb(35,154,160)']

# One funnel section per sport row.
n_phase = len(df_usa_sport['Sport'])

# Width given to the widest section.
plot_width = 200

# Height of one section and the vertical gap between two sections.
section_h = 100
section_d = 10

# Plot units per medal, chosen so the largest count spans plot_width.
unit_width = plot_width / max(df_usa_sport['Medal'])

# Width of every section, truncated to whole units.
phase_w = [int(value * unit_width) for value in df_usa_sport['Medal']]

# Y coordinate of the top edge of the first section; lowered after each
# section is emitted.
height = section_h * n_phase + section_d * (n_phase - 1)

# Plot shapes (one SVG-style path per section) and the Y position of each
# section's text.
shapes = []
label_y = []

for idx, top_w in enumerate(phase_w):
    # A section narrows to the width of the one below it; the last section
    # has nothing below, so its bottom edge keeps its own width.
    is_last = idx == n_phase - 1
    bottom_w = top_w if is_last else phase_w[idx + 1]

    # Symmetric quadrilateral around x = 0: top-right, bottom-right,
    # bottom-left, top-left.
    path = 'M {0} {1} L {2} {3} L -{2} {3} L -{0} {1} Z'.format(
        top_w / 2, height, bottom_w / 2, height - section_h
    )

    section_color = colors[idx]
    shapes.append({
        'type': 'path',
        'path': path,
        'fillcolor': section_color,
        'line': {
            'width': 1,
            'color': section_color
        }
    })

    # Text sits at the vertical middle of the section.
    label_y.append(height - (section_h / 2))

    # Move down by one section plus one gap.
    height -= section_h + section_d
# Text-only trace with the sport names, one per funnel section, at x = -200.
label_trace = go.Scatter(
    x=[-200] * n_phase,
    y=label_y,
    mode='text',
    text=df_usa_sport['Sport'],
    textfont={'color': 'rgb(200,200,200)', 'size': 15},
)

# Text-only trace with the medal totals, at x = -350.
value_trace = go.Scatter(
    x=[-350] * n_phase,
    y=label_y,
    mode='text',
    text=df_usa_sport['Medal'],
    textfont={'color': 'rgb(200,200,200)', 'size': 12},
)

data = [label_trace, value_trace]

# Same colour for the paper and the plotting area.
background = 'rgba(44,58,71,1)'
# Both axes are drawn without tick labels and without a zero line.
bare_axis = {'showticklabels': False, 'zeroline': False}

# NOTE(review): `titlefont` looks like the legacy spelling of the title font
# setting -- confirm it is still accepted by the installed plotly version.
layout = go.Layout(
    title="<b>Top 10 Sports in which USA is best</b>",
    titlefont={'size': 12, 'color': 'rgb(203,203,203)'},
    shapes=shapes,
    height=600,
    width=800,
    showlegend=False,
    paper_bgcolor=background,
    plot_bgcolor=background,
    xaxis=dict(bare_axis),
    yaxis=dict(bare_axis),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)
关注公众号:『AI学习星球
』
回复:奥运会数据分析
即可获取数据下载。
算法学习
、4对1辅导
、论文辅导
或核心期刊
可以通过公众号
或加v:codebiubiubiu
滴滴我