Open In Colab

Note: Notebook Created by David Rusho

Dataframe Sample

This dataset contains data covering the last 120 years of the Olympics, including athlete names, genders, countries, locations, and the number of medals.

# Import csv from kaggle
# df_a == 'dataframe athletics'
fn_a = "athlete_events.csv"
df_a = pd.read_csv(fn_a)

# df_a == 'dataframe locations'
fn_l = "noc_regions.csv"
df_l = pd.read_csv(fn_l)

# Combine Dataframes on 'NOC'
df = df_a.merge(df_l, on="NOC")
df.head(3)

df_a.head(3)
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal
0 1 A Dijiang M 24.0 180.0 80.0 China CHN 1992 Summer 1992 Summer Barcelona Basketball Basketball Men's Basketball NaN
1 2 A Lamusi M 23.0 170.0 60.0 China CHN 2012 Summer 2012 Summer London Judo Judo Men's Extra-Lightweight NaN
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN 1920 Summer 1920 Summer Antwerpen Football Football Men's Football NaN

Number of Athletes by Year and Season

The year 2000 had the highest number of athletes (13,821) for any olympic game. Summer Olympic games show a trend of having a much larger count of athletes than Winter games.

fig = px.line(ath_count, x="Year", y="Count of Athletes", color="Season", markers=True)

# update bar markers
# fig.update_traces(textposition="outside", marker_color="rgb(47,138,196)")  # blue color

# update plot details
fig.update_layout(
    {"plot_bgcolor": "rgba(255,255,255, 0.9)"},  # white background
    title={
        "text": f"Number of Athletes by Year and Season",
        "y": 0.98,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
    # xaxis_title="",  # remove axis titles
    # yaxis_title="",  # remove axis titles
)

fig.add_annotation(x=2000, y=13821,
            text="13,821 Athletes",
            showarrow=True,
            arrowhead=4)

fig.add_annotation(x=2014, y=4891,
            text="4,891 Athletes",
            showarrow=True,
            arrowhead=4)


fig.show()

Count of Athletes by Gender

The Winter Olympics have the closest athlete counts between Men and Women, while Summer Olympics have a large gap. The year 1992 had the most significant number of male athletes (11,235), while in the same year, there were only 5,178 women (less than half the competitors were female). The year 1994 began a trend of shrinking the gap between the number of male vs. female athletes.

fig = px.line(gender_count, x="Year", y="Count of Athletes", color="Sex", markers=True)

# update bar markers
# fig.update_traces(textposition="outside", marker_color="rgb(47,138,196)")  # blue color

# update plot details
fig.update_layout(
    {"plot_bgcolor": "rgba(255,255,255, 0.9)"},  # white background
    title={
        "text": f"Count of Athletes by Gender",
        "y": 0.98,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
    # xaxis_title="",  # remove axis titles
    # yaxis_title="",  # remove axis titles
)

fig.add_annotation(x=1992, y=11235,
            text="11,235 Male Athletes",
            showarrow=True,
            arrowhead=6)

fig.add_annotation(x=1992, y=5178,
            text="5,178 Female",
            showarrow=True,
            arrowhead=6)


fig.show()

Histogram of Age Counts by Gender

The number of male athletes far outnumbers the number of female competitors in all age ranges, except for athletes below 18. The majority of female athletes are around age 21, while with men, this is increased to 24.

# global marker color for plotly plots
m_color = "rgb(47,138,196)"

# Age Counts in Games
df_age = df.copy()
df_age['Age'] = df_age['Age'].dropna().astype('float')

# df_age.sort_values(by="Age",inplace=True)

fig = px.histogram(df_age, x="Age",color="Sex",barmode="overlay", 
                   color_discrete_map={"F": "rgb(237,100,90)", "M": m_color})
# update bar markers
# fig.update_traces(marker_color=m_color)  # blue color

fig.update_layout(
    {"plot_bgcolor": "rgba(255,255,255, 0.9)"},  # white background
    title={
        "text": "Histogram of Age Counts by Gender",
        "y": 0.98,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
)

fig.add_annotation(x=24, y=16420,
            text="16,421 Men are Age 24",
            showarrow=True,
            arrowhead=7)

fig.add_annotation(x=21, y=5492,
            text="5,492 Women are Age 21",
            showarrow=True,
            arrowhead=7)

# fig.add_annotation(x=16, y=3014,
#             text="More Women than Men",
#             showarrow=True,
#             arrowhead=7)

fig.show()

Percentage of Olympic Games by Season

# Number of Olympic Games by Season
df_season_city = (
    df.groupby(["Season"])["Year"]
    .nunique()
    .to_frame()
    .reset_index()
    .rename(columns={"Year": "Count"})
)

# highlight season with highest count
fig = px.pie(
    df_season_city,
    values="Count",
    names="Season",
    color="Season",
    title="Percentage of Olympic Games by Season",
    color_discrete_map={"Summer": "rgb(237,100,90)", "Winter": m_color}
)

# update plot details
fig.update_layout(
    title={
        #     "y": 0.100,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
)

fig.update_traces(textposition='inside', textinfo='percent+label')


fig.show()

Medal Counts by Individuals

# Name list of top 10 names with most medals count
top_medal_indiv = df_cntry_mds3.Name.to_list()

# df of top top 10 names with most medals count
df_top_medal_indiv = df[
    (df["Name"].str.contains("|".join(top_medal_indiv)))
    & (df["Medal"].str.contains("|".join(["Gold", "Silver", "Bronze"])))
]

# groupby Name and Medal counts
ml_ind_gp = (
    df_top_medal_indiv.groupby(by=["Name", "Medal"]).size().reset_index(name="counts")
)


# create df of medal names
df_mapping = pd.DataFrame(
    {
        "size": ["Bronze", "Silver", "Gold"],
    }
)

# create index ordered by medal rankings
sort_mapping = df_mapping.reset_index().set_index("size")

# map medal rankings to medal rankings by individuals
ml_ind_gp["medal_rank"] = ml_ind_gp["Medal"].map(sort_mapping["index"])

# sort df by medal_rank
ml_ind_gp.sort_values(by="medal_rank", inplace=True)

# create bar chart
fig = px.bar(
    data_frame=ml_ind_gp,
    y="Name",
    x="counts",
    barmode="stack",
    color="Medal",
    text="counts",
    color_discrete_map={
        "Bronze": "rgb(175, 100, 88)",
        "Silver": "rgb(179,179,179)",
        "Gold": "gold",
    },
    orientation="h",
)

# update layout, white background, remove axis titles, order y-axis
fig.update_layout(
    {"plot_bgcolor": "rgba(255,255,255, 0.9)"},  # white background
    yaxis={"categoryorder": "total ascending"},
    title={
        "text": "Medal Counts by Individuals",
        "y": 0.98,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
    xaxis_title="",  # remove axis titles
    yaxis_title="",  # remove axis titles
)

fig.update_xaxes(showticklabels=False)

fig.show()

Import Olympic data (city, country, continent) from Wikipedia

Descending order by number of games hosted

Data pertaining to Cities, Countries, and Continents was not available in the Olympic dataset. This information could be useful in creating a better understanding of the data.

# Import olympic data (city, country, continent) from wikipedia
olympic_cc = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities"
)

Count of Games Hosted by Cities

# Count of Olympics Hosted by Cities
oc_city = olympic_cc[2].groupby("City.1")["Country"].size().reset_index(name="Counts")

oc_city = (
    (oc_city.sort_values(by="Counts", ascending=False))
    .reset_index()
    .rename(columns={"City.1": "City"})
    .head(10)
)

# create and show figure (bar chart)
bar_chart(oc_city, "City", "Counts", "City")

Count of Games Hosted by Country

# Count of Olympics Hosted by Country
oc_cntry = olympic_cc[5][["Country", "Total"]].fillna("").head(15)

# create and show figure (bar chart)
bar_chart(oc_cntry, "Country", "Total", "Country")

Count of Games Hosted by Continent

# Count of Olympic Games Hosted by Continent

oc_cont = olympic_cc[2].groupby("Continent")["City.1"].size().reset_index(name="Counts")

# reorder count values
oc_cont = oc_cont.sort_values(by="Counts", ascending=False).reset_index()

# create and show figure (bar chart)
bar_chart(oc_cont, "Continent", "Counts", "Continent")