120 Years of Olympic History
Data Exploration of a Kaggle Dataset
- Data Source
- Dataframe Sample
- Number of Athletes by Year and Season
- Count of Athletes by Gender
- Histogram of Age Counts by Gender
- Percentage of Olympic Games by Season
- Medal Counts by Individuals
- Import Olympic data (city, country, continent) from Wikipedia
Note: Notebook Created by David Rusho
- Github Blog | Github | Tableau | Linkedin
Data Source
Kaggle datset: 120-years-of-olympic-history-athletes-and-results
# Import csv from kaggle
# df_a == 'dataframe athletics'
fn_a = "athlete_events.csv"
df_a = pd.read_csv(fn_a)
# df_a == 'dataframe locations'
fn_l = "noc_regions.csv"
df_l = pd.read_csv(fn_l)
# Combine Dataframes on 'NOC'
df = df_a.merge(df_l, on="NOC")
df.head(3)
df_a.head(3)
fig = px.line(ath_count, x="Year", y="Count of Athletes", color="Season", markers=True)
# update bar markers
# fig.update_traces(textposition="outside", marker_color="rgb(47,138,196)") # blue color
# update plot details
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": f"Number of Athletes by Year and Season",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
# xaxis_title="", # remove axis titles
# yaxis_title="", # remove axis titles
)
fig.add_annotation(x=2000, y=13821,
text="13,821 Athletes",
showarrow=True,
arrowhead=4)
fig.add_annotation(x=2014, y=4891,
text="4,891 Athletes",
showarrow=True,
arrowhead=4)
fig.show()
Count of Athletes by Gender
The Winter Olympics have the closest athlete counts between Men and Women, while Summer Olympics have a large gap. The year 1992 had the most significant number of male athletes (11,235), while in the same year, there were only 5,178 women (less than half the competitors were female). The year 1994 began a trend of shrinking the gap between the number of male vs. female athletes.
fig = px.line(gender_count, x="Year", y="Count of Athletes", color="Sex", markers=True)
# update bar markers
# fig.update_traces(textposition="outside", marker_color="rgb(47,138,196)") # blue color
# update plot details
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": f"Count of Athletes by Gender",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
# xaxis_title="", # remove axis titles
# yaxis_title="", # remove axis titles
)
fig.add_annotation(x=1992, y=11235,
text="11,235 Male Athletes",
showarrow=True,
arrowhead=6)
fig.add_annotation(x=1992, y=5178,
text="5,178 Female",
showarrow=True,
arrowhead=6)
fig.show()
# global marker color for plotly plots
m_color = "rgb(47,138,196)"
# Age Counts in Games
df_age = df.copy()
df_age['Age'] = df_age['Age'].dropna().astype('float')
# df_age.sort_values(by="Age",inplace=True)
fig = px.histogram(df_age, x="Age",color="Sex",barmode="overlay",
color_discrete_map={"F": "rgb(237,100,90)", "M": m_color})
# update bar markers
# fig.update_traces(marker_color=m_color) # blue color
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": "Histogram of Age Counts by Gender",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
}
)
fig.add_annotation(x=24, y=16420,
text="16,421 Men are Age 24",
showarrow=True,
arrowhead=7)
fig.add_annotation(x=21, y=5492,
text="5,492 Women are Age 21",
showarrow=True,
arrowhead=7)
# fig.add_annotation(x=16, y=3014,
# text="More Women than Men",
# showarrow=True,
# arrowhead=7)
fig.show()
# Number of Olympic Games by Season
df_season_city = (
df.groupby(["Season"])["Year"]
.nunique()
.to_frame()
.reset_index()
.rename(columns={"Year": "Count"})
)
# highlight season with highest count
fig = px.pie(
df_season_city,
values="Count",
names="Season",
color="Season",
title="Percentage of Olympic Games by Season",
color_discrete_map={"Summer": "rgb(237,100,90)", "Winter": m_color}
)
# update plot details
fig.update_layout(
title={
# "y": 0.100,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# Name list of top 10 names with most medals count
top_medal_indiv = df_cntry_mds3.Name.to_list()
# df of top top 10 names with most medals count
df_top_medal_indiv = df[
(df["Name"].str.contains("|".join(top_medal_indiv)))
& (df["Medal"].str.contains("|".join(["Gold", "Silver", "Bronze"])))
]
# groupby Name and Medal counts
ml_ind_gp = (
df_top_medal_indiv.groupby(by=["Name", "Medal"]).size().reset_index(name="counts")
)
# create df of medal names
df_mapping = pd.DataFrame(
{
"size": ["Bronze", "Silver", "Gold"],
}
)
# create index ordered by medal rankings
sort_mapping = df_mapping.reset_index().set_index("size")
# map medal rankings to medal rankings by individuals
ml_ind_gp["medal_rank"] = ml_ind_gp["Medal"].map(sort_mapping["index"])
# sort df by medal_rank
ml_ind_gp.sort_values(by="medal_rank", inplace=True)
# create bar chart
fig = px.bar(
data_frame=ml_ind_gp,
y="Name",
x="counts",
barmode="stack",
color="Medal",
text="counts",
color_discrete_map={
"Bronze": "rgb(175, 100, 88)",
"Silver": "rgb(179,179,179)",
"Gold": "gold",
},
orientation="h",
)
# update layout, white background, remove axis titles, order y-axis
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
yaxis={"categoryorder": "total ascending"},
title={
"text": "Medal Counts by Individuals",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
xaxis_title="", # remove axis titles
yaxis_title="", # remove axis titles
)
fig.update_xaxes(showticklabels=False)
fig.show()
# Import olympic data (city, country, continent) from wikipedia
olympic_cc = pd.read_html(
"https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities"
)
# Count of Olympics Hosted by Cities
oc_city = olympic_cc[2].groupby("City.1")["Country"].size().reset_index(name="Counts")
oc_city = (
(oc_city.sort_values(by="Counts", ascending=False))
.reset_index()
.rename(columns={"City.1": "City"})
.head(10)
)
# create and show figure (bar chart)
bar_chart(oc_city, "City", "Counts", "City")
# Count of Olympics Hosted by Country
oc_cntry = olympic_cc[5][["Country", "Total"]].fillna("").head(15)
# create and show figure (bar chart)
bar_chart(oc_cntry, "Country", "Total", "Country")
# Count of Olympic Games Hosted by Continent
oc_cont = olympic_cc[2].groupby("Continent")["City.1"].size().reset_index(name="Counts")
# reorder count values
oc_cont = oc_cont.sort_values(by="Counts", ascending=False).reset_index()
# create and show figure (bar chart)
bar_chart(oc_cont, "Continent", "Counts", "Continent")