forked from x-datascience-datacamp/2024-assignment-pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pandas_questions.py
124 lines (99 loc) · 4.14 KB
/
pandas_questions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""Plotting referendum results in pandas.
In short, we want to make a beautiful map to report the results of a referendum. In
some way, we would like to depict results with something similar to the maps
that you can find here:
https://github.com/x-datascience-datacamp/datacamp-assignment-pandas/blob/main/example_map.png
To do that, you will load the data as pandas.DataFrame, merge the info and
aggregate them by regions and finally plot them on a map using `geopandas`.
"""
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
def load_data():
    """Read the referendum, regions and departments CSV files.

    The three files are read from the relative ``data/`` directory. The
    referendum file is semicolon-separated; the two others use the default
    comma separator. Malformed lines are skipped by the parser instead of
    raising an error.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(referendum, regions, departments)``, in that order.
    """
    # Same tolerant parsing policy for the three files.
    tolerant = {'on_bad_lines': 'skip'}

    referendum = pd.read_csv('data/referendum.csv', sep=';', **tolerant)
    regions = pd.read_csv('data/regions.csv', **tolerant)
    departments = pd.read_csv('data/departments.csv', **tolerant)

    return referendum, regions, departments
def merge_regions_and_departments(regions, departments):
    """Join every department with the region it belongs to.

    Departments are matched to regions through ``departments.region_code``
    and ``regions.code``. Columns present in both tables receive the
    ``_dep`` (department side) and ``_reg`` (region side) suffixes, which is
    where the output column names come from.

    Returns
    -------
    pandas.DataFrame
        A table restricted to the columns
        ``['code_reg', 'name_reg', 'code_dep', 'name_dep']``.
    """
    wanted_columns = ['code_reg', 'name_reg', 'code_dep', 'name_dep']

    # Departments are the left table, hence the '_dep' suffix comes first.
    joined = departments.merge(
        regions,
        left_on='region_code',
        right_on='code',
        suffixes=('_dep', '_reg'),
    )
    return joined[wanted_columns]
def merge_referendum_and_areas(referendum, regions_and_departments):
    """Merge referendum and regions_and_departments in one DataFrame.

    Department codes of the referendum table are converted to strings and
    left-padded with zeros to two characters (``1`` becomes ``'01'``) so
    that they can be compared with ``code_dep``. Areas whose ``code_reg`` is
    ``'COM'`` are removed, and the inner join then drops every referendum
    row whose department code matches no remaining department. This is how
    the lines relative to DOM-TOM-COM departments and to the French living
    abroad can be discarded.

    Neither input DataFrame is modified.

    Parameters
    ----------
    referendum : pandas.DataFrame
        Referendum results with a ``'Department code'`` column.
    regions_and_departments : pandas.DataFrame
        Table with at least the ``'code_reg'`` and ``'code_dep'`` columns.

    Returns
    -------
    pandas.DataFrame
        The referendum rows joined with the columns of their area.
    """
    # Normalise the codes on a new frame: assigning into `referendum`
    # directly would silently change the caller's DataFrame.
    padded_codes = referendum['Department code'].map(
        lambda code: str(code).zfill(2)
    )
    referendum = referendum.assign(**{'Department code': padded_codes})

    is_com = regions_and_departments['code_reg'].isin(['COM'])
    kept_areas = regions_and_departments[~is_com]

    return pd.merge(
        referendum,
        kept_areas,
        left_on='Department code',
        right_on='code_dep',
    )
def compute_referendum_result_by_regions(referendum_and_areas):
    """Return a table with the absolute count for each region.

    The rows of ``referendum_and_areas`` are grouped by ``code_reg`` and the
    count columns are summed within each group. The region name is carried
    along by taking the first non-missing ``name_reg`` of each group.

    Parameters
    ----------
    referendum_and_areas : pandas.DataFrame
        Table with the columns ``'code_reg'``, ``'name_reg'``,
        ``'Registered'``, ``'Abstentions'``, ``'Null'``, ``'Choice A'`` and
        ``'Choice B'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``code_reg``, with the columns
        ``['name_reg', 'Registered', 'Abstentions', 'Null', 'Choice A',
        'Choice B']``.
    """
    count_columns = ['Registered', 'Abstentions', 'Null',
                     'Choice A', 'Choice B']

    # One aggregation per output column; the dict order fixes the column
    # order of the result. Aggregating inside a single groupby keeps
    # `code_reg` as the index, as the documented contract requires.
    aggregations = {'name_reg': 'first'}
    aggregations.update({column: 'sum' for column in count_columns})

    return referendum_and_areas.groupby('code_reg').agg(aggregations)
def plot_referendum_map(referendum_result_by_regions):
    """Draw the regional map of the 'Choice A' rate and return the data.

    * The region geometries are read with geopandas from
      ``data/regions.geojson``.
    * They are joined with `referendum_result_by_regions` on the region
      name (``nom`` on the geographic side, ``name_reg`` on the results
      side).
    * A ``'ratio'`` column is added: 'Choice A' divided by the expressed
      ballots, i.e. 'Choice A' + 'Choice B'.
    * The map is drawn with the ``plot`` method of the merged table,
      coloured by ``'ratio'``.

    Returns the merged table, documented as a gpd.GeoDataFrame, including
    the ``'ratio'`` column.
    """
    region_shapes = gpd.read_file("data/regions.geojson")

    geo_results = pd.merge(
        region_shapes,
        referendum_result_by_regions,
        left_on='nom',
        right_on='name_reg',
    )

    # Expressed ballots are the ones cast for either of the two choices.
    votes_a = geo_results['Choice A']
    expressed = votes_a + geo_results['Choice B']
    geo_results['ratio'] = votes_a / expressed

    axes = geo_results.plot(
        column='ratio',
        cmap='coolwarm',
        figsize=(10, 10),
        legend=True,
    )
    axes.set_title("Referendum Results: Choice A Rate by Region")

    return geo_results
if __name__ == "__main__":
    # Script entry point: run the whole pipeline, from the raw CSV files to
    # the displayed map.
    votes, region_table, department_table = load_data()

    areas = merge_regions_and_departments(region_table, department_table)
    votes_with_areas = merge_referendum_and_areas(votes, areas)
    regional_totals = compute_referendum_result_by_regions(votes_with_areas)

    print(regional_totals)

    plot_referendum_map(regional_totals)
    plt.show()