-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy patholympicdash-py-2.qmd
134 lines (109 loc) · 3.9 KB
/
olympicdash-py-2.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
---
title: "Olympic Games"
format:
dashboard:
orientation: columns
nav-buttons: [github]
github: https://github.com/posit-conf-2024/olympicdash
logo: images/olympics-logo.svg
logo-alt: "Olympics logo with multicolored circles."
---
```{python}
#| label: load-packages
from plotnine import *
import great_tables as gt
import pandas as pd
```
```{python}
#| label: load-data
olympics_full = pd.read_csv("data/olympics.csv", low_memory = False)
```
```{python}
#| label: prep-data
# Filter for Summer season and non-NA medals
olympics = olympics_full[(olympics_full["season"] == "Summer") & (olympics_full["medal"].notna())]
# Split the team column at "-" into two columns
split_data = olympics["team"].str.split("-", n = 1, expand = True)
olympics.loc[:, "team"] = split_data[0]
# Reorder the medal column categories
olympics["medal"] = pd.Categorical(olympics["medal"], categories = ["Bronze", "Silver", "Gold"])
```
## Column - Medals by sport and year
### Row - Medals by sport {height=60%}
```{python}
#| label: medals-by-sport
#| title: Medals by sport
# Lump the sport column to top 15 categories, grouping others as Other
top_15_sports = olympics["sport"].value_counts().nlargest(15).index
olympics["sport"] = olympics["sport"].apply(lambda x: x if x in top_15_sports else "Other")
# Convert the sport column to a categorical type with order based on frequency, and reverse the order
olympics["sport"] = pd.Categorical(olympics["sport"], categories = olympics["sport"].value_counts().index[::-1])
# Move the Other category of the sport column to the beginning
new_order = ["Other"] + [cat for cat in olympics["sport"].cat.categories if cat != "Other"]
olympics["sport"] = olympics["sport"].cat.reorder_categories(new_order)
# Plot
(
ggplot(olympics, aes(x = "sport", fill = "medal")) +
geom_bar() +
coord_flip() +
guides(fill = guide_legend(reverse = True)) +
labs(
x = "",
y = "",
fill = "Medal"
) +
theme_minimal() +
theme(
legend_position = "inside",
legend_position_inside = (0.9, 0.2),
legend_direction = "horizontal",
legend_background = element_rect(fill = "white", color = "gray"),
figure_size = (10, 6.18)
)
)
```
### Row - Medals by year {height=40%}
```{python}
#| label: medals-by-year
#| title: Medals by year
# Count the occurrences of each medal per year
olympics_count = olympics.groupby(["year", "medal"], observed=True).size().reset_index(name = "n")
# Plot
(
ggplot(olympics_count, aes(x = "year", y = "n", color = "medal")) +
geom_point(size = 0.5) +
geom_line() +
guides(color = guide_legend(reverse = True)) +
scale_x_continuous(breaks = range(1896, 2020, 8)) +
labs(x = "Year", y = "", color = "Medal") +
theme_minimal() +
theme(
legend_position = "inside",
legend_position_inside = (0.9, 0.2),
legend_direction = "horizontal",
legend_background = element_rect(fill = "white", color = "gray"),
figure_size = (10, 3)
)
)
```
## Column - Medals by country
```{python}
#| label: medals-by-country
#| title: Medals by country
# Count the occurrences of each medal per team
olympics_count = olympics.groupby(["team", "medal"]).size().reset_index(name="n")
# Pivot olympics_count to get medals as columns
olympics_pivot = olympics_count.pivot_table(index = "team", columns = "medal", values = "n", fill_value = 0)
# Calculate the total number of medals
olympics_pivot["Total"] = olympics_pivot[["Bronze", "Gold", "Silver"]].sum(axis=1)
# Reset the index and rearrange columns
olympics_pivot = olympics_pivot.reset_index()
olympics_pivot = olympics_pivot[["team", "Gold", "Silver", "Bronze", "Total"]]
# Sort by Total medals, then team
olympics_pivot = olympics_pivot.sort_values(by=["Total", "team"], ascending=[False, True])
# Rename the team column to Team
olympics_pivot.rename(columns={"team": "Team"}, inplace=True)
(
gt.GT(olympics_pivot)
)
```