-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
131 lines (95 loc) · 4.97 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pickle
import re

import pandas as pd
import plotly.graph_objs as go
def get_command(x, command_key):
    """Look up ``x`` in ``command_key``, returning NaN when the lookup fails.

    Args:
        x: Key to look up (a command abbreviation).
        command_key: Any object supporting ``command_key[x]``, typically a dict.

    Returns:
        ``command_key[x]`` when the lookup succeeds, otherwise ``float('nan')``.
    """
    try:
        return command_key[x]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures (missing key, unhashable key, non-subscriptable
        # mapping) map to NaN; KeyboardInterrupt/SystemExit must propagate.
        return float('nan')
def get_command_key():
    """Build the command abbreviation -> description lookup.

    Reads the 'Tab3_Command Key' sheet of the CCRB file-spec workbook,
    removes spaces from and lower-cases every 'Command Abrev.' value, and
    returns a dict mapping those normalized abbreviations to the matching
    'Command Desc.' values.
    """
    workbook = 'NYPD-Misconduct-Complaint-Database-Updated/CCRB Filespecs 04.20.2021.xlsx'
    key_df = pd.read_excel(workbook, sheet_name = 'Tab3_Command Key')

    def _normalize(abbrev):
        # Same normalization as before: drop spaces, then lower-case.
        return abbrev.replace(' ', '').lower()

    key_df['Command Abrev.'] = key_df['Command Abrev.'].apply(_normalize)
    descriptions = key_df.set_index(key_df['Command Abrev.'])['Command Desc.']
    return descriptions.to_dict()
def get_rank_key():
    """Build the rank abbreviation -> rank name lookup.

    Reads the 'Rank Abbrevs' sheet of 'data/CCRB Data Layout Table.xlsx' and
    returns a dict mapping each 'Abbreviation' value to its 'Rank' value.
    """
    rank_df = pd.read_excel('data/CCRB Data Layout Table.xlsx', sheet_name = 'Rank Abbrevs')
    ranks = rank_df.set_index('Abbreviation')['Rank']
    return ranks.to_dict()
def get_sustained_list(outcomes):
    """Return the shortened labels of all 'Substantiated' dispositions.

    Rows whose 'Disposition' contains the text 'Substantiated' are selected;
    for each one the parentheses are removed and the first space-separated
    word is dropped, so 'Substantiated (Charges)' becomes 'Charges'.

    Args:
        outcomes: DataFrame with a string 'Disposition' column.

    Returns:
        Series of shortened labels, indexed like the selected rows.
    """
    dispositions = outcomes['Disposition']
    substantiated = outcomes[dispositions.str.contains('Substantiated')]['Disposition']

    def _drop_first_word(text):
        words = text.replace('(', '').replace(')', '').split(' ')
        return ' '.join(words[1:])

    return substantiated.apply(_drop_first_word)
def get_unsustained_list(outcomes, sustained_list):
    """Return the dispositions that contain none of the sustained labels.

    Args:
        outcomes: DataFrame with a 'Disposition' column.
        sustained_list: Iterable of label strings. Each label is matched as a
            literal substring of 'Disposition' (regex metacharacters are
            escaped).

    Returns:
        Series with the 'Disposition' values of the rows that match no label,
        keeping their original index. With an empty ``sustained_list`` every
        row is returned.
    """
    patterns = [re.escape(label) for label in sustained_list]
    dispositions = outcomes['Disposition']
    if patterns:
        # na = False: a missing disposition cannot match a sustained label.
        is_sustained = dispositions.str.contains('|'.join(patterns), na = False)
    else:
        # An empty alternation would be the pattern '' and match every row.
        is_sustained = pd.Series(False, index = dispositions.index)
    return outcomes[~is_sustained]['Disposition']
def get_sustained_count(outcomes_df, sustained_list):
    """Sum the 'count' column over rows whose disposition is sustained.

    Args:
        outcomes_df: DataFrame with 'Disposition' and 'count' columns.
        sustained_list: Iterable of label strings. Each label is matched as a
            literal substring of 'Disposition' (regex metacharacters are
            escaped).

    Returns:
        Sum of 'count' over the rows matching at least one label; 0 when
        ``sustained_list`` is empty.
    """
    patterns = [re.escape(label) for label in sustained_list]
    dispositions = outcomes_df['Disposition']
    if patterns:
        # na = False: a missing disposition cannot match a sustained label.
        is_sustained = dispositions.str.contains('|'.join(patterns), na = False)
    else:
        # An empty alternation would be the pattern '' and match every row.
        is_sustained = pd.Series(False, index = dispositions.index)
    return outcomes_df[is_sustained]['count'].sum()
def get_unsustained_count(outcomes_df, sustained_list):
    """Sum the 'count' column over rows whose disposition is not sustained.

    Args:
        outcomes_df: DataFrame with 'Disposition' and 'count' columns.
        sustained_list: Iterable of label strings. Each label is matched as a
            literal substring of 'Disposition' (regex metacharacters are
            escaped).

    Returns:
        Sum of 'count' over the rows matching none of the labels; the sum of
        the whole column when ``sustained_list`` is empty.
    """
    patterns = [re.escape(label) for label in sustained_list]
    dispositions = outcomes_df['Disposition']
    if patterns:
        # na = False: a missing disposition cannot match a sustained label.
        is_sustained = dispositions.str.contains('|'.join(patterns), na = False)
    else:
        # An empty alternation would be the pattern '' and match every row.
        is_sustained = pd.Series(False, index = dispositions.index)
    return outcomes_df[~is_sustained]['count'].sum()
def add_newlines(outcomes_df):
    """Insert an HTML line break into three long disposition labels.

    Values of 'Disposition' that are exactly 'Complainant Uncooperative',
    'Complaint Withdrawn' or 'Complainant Unavailable' get ' <br> ' placed
    between their two words; every other value is left untouched.

    Args:
        outcomes_df: DataFrame with a 'Disposition' column. It is modified
            in place.

    Returns:
        The same ``outcomes_df`` object.
    """
    wrapped_labels = (
        ('Complainant Uncooperative', 'Complainant <br> Uncooperative'),
        ('Complaint Withdrawn', 'Complaint <br> Withdrawn'),
        ('Complainant Unavailable', 'Complainant <br> Unavailable'),
    )

    def _wrap(label):
        for plain, wrapped in wrapped_labels:
            if label == plain:
                return wrapped
        return label

    outcomes_df['Disposition'] = outcomes_df['Disposition'].apply(_wrap)
    return outcomes_df
def open_pickle(file):
    """Load and return the object pickled in the file at path ``file``.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files from a trusted source.
    """
    with open(file, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
def save_pickle(file, variable):
    """Pickle ``variable`` into the file at path ``file``, overwriting it.

    Args:
        file: Destination path; opened in binary write mode.
        variable: Object to serialize.
    """
    with open(file, 'wb') as handle:
        pickle.dump(variable, handle)
def get_timeseries_plot(df, date_col, count_col, freq = "M", return_trace = False, filename = None,
                        type_col = 'FADO Type', after_year = 1985):
    """Plot per-period counts over time, in total and per category.

    Args:
        df: DataFrame holding ``date_col``, ``count_col`` and ``type_col``.
        date_col: Name of the datetime column used as the time axis.
        count_col: Column whose non-null entries are counted in each period.
        freq: Frequency passed to ``pd.Grouper``.
        return_trace: When true, return only the total trace and build no
            figure.
        filename: When given, the figure is written there as HTML (plotly.js
            loaded from the CDN); otherwise the figure is shown.
        type_col: Column whose distinct values each get their own trace.
        after_year: Only periods whose year is strictly greater than this
            value are plotted.

    Returns:
        The "Total allegations" ``go.Scatter`` trace when ``return_trace`` is
        true, otherwise ``None``.
    """
    hover = '%{x}: %{y}<extra></extra>'

    def _count_trace(frame, name):
        # One scatter trace of per-period counts for the given rows.
        counts = frame.set_index(date_col).groupby(pd.Grouper(freq = freq)).count()[count_col]
        counts = counts[counts.index.year > after_year]
        return go.Scatter(x = counts.index, y = counts, hovertemplate = hover, name = name)

    total_trace = _count_trace(df, "Total allegations")
    if return_trace:
        return total_trace

    fig = go.Figure(data = total_trace)
    # Sorted so trace order (and therefore legend order and colours) is the
    # same on every run; missing categories are skipped because they can
    # never equal a row value and would only add an empty trace.
    categories = sorted(df[type_col].dropna().unique().tolist(), key = str)
    for typ in categories:
        fig.add_trace(_count_trace(df[df[type_col] == typ], typ))
    fig.update_layout(template = 'plotly_white',
                      margin = dict(t = 1, b = 0, r = 0, l = 0))
    if filename is not None:
        fig.write_html(filename, include_plotlyjs = 'cdn')
    else:
        fig.show()
def get_pie_counts(df, group_col, count_col, hole = None, return_trace = False, filename = None):
    """Draw a pie chart of how many rows fall into each group.

    Args:
        df: DataFrame holding ``group_col`` and ``count_col``.
        group_col: Column whose values define the pie slices.
        count_col: Column whose non-null entries are counted per group.
        hole: Passed through to ``go.Pie`` as ``hole``.
        return_trace: When true, return only the pie trace and build no
            figure.
        filename: When given, the figure is written there as HTML (plotly.js
            loaded from the CDN); otherwise the figure is shown.

    Returns:
        The ``go.Pie`` trace when ``return_trace`` is true, otherwise ``None``.
    """
    counts = df.groupby(group_col).count()[count_col]
    trace = go.Pie(labels = counts.index, values = counts, hole = hole)
    # Return before building the figure so no Figure is created just to be
    # thrown away.
    if return_trace:
        return trace
    fig = go.Figure(data = [trace])
    if filename is not None:
        fig.write_html(filename, include_plotlyjs = 'cdn')
    else:
        fig.show()
def get_hbar_plot(df, group_col, count_col, desc_key = None, top_n = 5, return_trace = False, filename = None):
    """Draw a horizontal bar chart of the ``top_n`` largest groups.

    Args:
        df: DataFrame holding ``group_col`` and ``count_col``.
        group_col: Column whose values define the bars.
        count_col: Column whose non-null entries are counted per group.
        desc_key: Optional mapping from group value to a display label.
            Values missing from the mapping keep their original label.
        top_n: Number of largest groups to show; 0 shows none.
        return_trace: When true, return only the bar trace and build no
            figure.
        filename: When given, the figure is written there as HTML (plotly.js
            loaded from the CDN); otherwise the figure is shown.

    Returns:
        The ``go.Bar`` trace when ``return_trace`` is true, otherwise ``None``.
    """
    counts = df.groupby(group_col).count()[count_col].reset_index()
    if desc_key is not None:
        counts[group_col] = counts[group_col].apply(
            lambda code: desc_key[code] if code in desc_key else code)
    # Re-aggregate: several group values may map to the same display label.
    counts = counts.groupby(group_col).sum()[count_col]
    # tail() rather than iloc[-top_n:], because iloc[-0:] selects every row.
    top = counts.sort_values().tail(top_n)
    trace = go.Bar(x = top, y = top.index, orientation = 'h', showlegend = False,
                   hovertemplate = '%{x}<extra></extra>', marker_color='rgb(55, 83, 109)')
    # Return before building the figure so no Figure is created just to be
    # thrown away.
    if return_trace:
        return trace
    fig = go.Figure(trace)
    if filename is not None:
        fig.write_html(filename, include_plotlyjs = 'cdn')
    else:
        fig.show()
def get_suburst_plot(labels, parents, values, return_trace = False, filename = None):
    """Draw a sunburst chart from parallel label/parent/value sequences.

    Args:
        labels: Passed to ``go.Sunburst`` as ``labels``.
        parents: Passed to ``go.Sunburst`` as ``parents``.
        values: Passed to ``go.Sunburst`` as ``values``; interpreted with
            ``branchvalues = "total"``.
        return_trace: When true, return only the sunburst trace and build no
            figure.
        filename: When given, the figure is written there as HTML (plotly.js
            loaded from the CDN); otherwise the figure is shown.

    Returns:
        The ``go.Sunburst`` trace when ``return_trace`` is true, otherwise
        ``None``.
    """
    sunburst = go.Sunburst(
        labels = labels,
        parents = parents,
        values = values,
        branchvalues = "total",
        marker = dict(colorscale='Emrld'),
    )
    if return_trace:
        return sunburst

    figure = go.Figure(sunburst)
    figure.update_layout(margin = dict(t = 0, b = 0, r = 0, l = 0))
    if filename is None:
        figure.show()
    else:
        figure.write_html(filename, include_plotlyjs = 'cdn')