-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
346 lines (248 loc) · 14.1 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
import pandas as pd
import numpy as np
chunk = pd.read_csv('NYPD_Complaint_Data_Historic.csv',chunksize=10000000)
df_NY = pd.concat(chunk)
df_NY = df_NY.dropna(subset=['CMPLNT_FR_DT'])
df_NY = df_NY.dropna(subset=['RPT_DT'])
df_NY['CMPLNT_FR_DT'] = df_NY['CMPLNT_FR_DT'].str.replace('/','')
df_NY['CMPLNT_FR_DT'] = df_NY['CMPLNT_FR_DT'].astype(str).astype(int)
df_NY['RPT_DT'] = df_NY['RPT_DT'].str.replace('/','')
#df_NY.RPT_DT = df_NY.RPT_DT.[0,3]
df_NY['RPT_DT'] = df_NY.RPT_DT.str.slice(0, -6, 1)
df_NY['RPT_DT'] = df_NY['RPT_DT'].astype(str).astype(int)
df_NY.RPT_DT = df_NY.RPT_DT % 10000
df_NY.CMPLNT_FR_DT = df_NY.CMPLNT_FR_DT % 10000
#CMPLNT_NUM
df_NY.drop('CMPLNT_NUM', axis=1, inplace=True)
#CMPLNT_TO_DT
df_NY.drop('CMPLNT_TO_DT', axis=1, inplace=True)
#CMPLNT_TO_TM
df_NY.drop('CMPLNT_TO_TM', axis=1, inplace=True)
#ADDR_PCT_CD
df_NY.drop('ADDR_PCT_CD', axis=1, inplace=True)
#RPT_DT (check other cities)
#df_NY.drop('Open', axis=1, inplace=True)
#PD_CD
df_NY.drop('PD_CD', axis=1, inplace=True)
#PD_DESC
df_NY.drop('PD_DESC', axis=1, inplace=True)
#BORO_NM
df_NY.drop('BORO_NM', axis=1, inplace=True)
#LOC_OF_OCCUR_DESC
df_NY.drop('LOC_OF_OCCUR_DESC', axis=1, inplace=True)
#PREM_TYP_DESC
df_NY.drop('PREM_TYP_DESC', axis=1, inplace=True)
#JURIS_DESC
df_NY.drop('JURIS_DESC', axis=1, inplace=True)
#JURISDICTION_CODE
df_NY.drop('JURISDICTION_CODE', axis=1, inplace=True)
#PARKS_NM
df_NY.drop('PARKS_NM', axis=1, inplace=True)
#HADEVELOPT
df_NY.drop('HADEVELOPT', axis=1, inplace=True)
#HOUSING_PSA
df_NY.drop('HOUSING_PSA', axis=1, inplace=True)
#X_COORD_CD
df_NY.drop('X_COORD_CD', axis=1, inplace=True)
#Y_COORD_CD
df_NY.drop('Y_COORD_CD', axis=1, inplace=True)
#TRANSIT_DISTRICT
df_NY.drop('TRANSIT_DISTRICT', axis=1, inplace=True)
#Latitude
df_NY.drop('Latitude', axis=1, inplace=True)
#Longitude
df_NY.drop('Longitude', axis=1, inplace=True)
#Lat_Lon
df_NY.drop('Lat_Lon', axis=1, inplace=True)
#PATROL_BORO
df_NY.drop('PATROL_BORO', axis=1, inplace=True)
#STATION_NAME
df_NY.drop('STATION_NAME', axis=1, inplace=True)
#get rid of unnamed
df_NY.drop('Unnamed: 0', axis=1, inplace=True)
#get rid of KY_CD
df_NY.drop('KY_CD', axis=1, inplace=True)
df_NY.drop('CMPLNT_FR_TM', axis=1, inplace=True)
#grab ones after 2015 or delete ones before
df_NY = df_NY[df_NY['CMPLNT_FR_DT'] >= 2015]
#This is NY put into the same format as the LA dataframe
df_NY = df_NY[['CMPLNT_FR_DT', 'RPT_DT', 'OFNS_DESC', 'VIC_AGE_GROUP', 'VIC_SEX', 'VIC_RACE']].copy()
#LA's Dataframe section
chunk = pd.read_csv('LA Crime_Cleared.csv', chunksize=10000000)
df_LA = pd.concat(chunk)
df2 = df_LA[['DATE OCC', 'Date Rptd', 'Crm Cd Desc', 'Vict Age', 'Vict Sex', 'Vict Descent']].copy()
df2.rename(columns={'DATE OCC': 'DATE_OCC'}, inplace = True)
df2.rename(columns={'Date Rptd': 'Date_Rptd'}, inplace = True)
df2['DATE_OCC'] = df2['DATE_OCC'].str.replace('-','')
df2['DATE_OCC'] = df2['DATE_OCC'].str.replace(':','')
df2['DATE_OCC'] = df2['DATE_OCC'].str.replace(' ','')
df2['DATE_OCC'] = df2.DATE_OCC.str.slice(0, -10, 1)
df2['DATE_OCC'] = df2['DATE_OCC'].astype(str).astype(int)
df2['Date_Rptd'] = df2['Date_Rptd'].str.replace('-','')
df2['Date_Rptd'] = df2['Date_Rptd'].str.replace(':','')
df2['Date_Rptd'] = df2['Date_Rptd'].str.replace(' ','')
df2['Date_Rptd'] = df2.Date_Rptd.str.slice(4, -8, 1)
df2['Date_Rptd'] = df2['Date_Rptd'].astype(str).astype(int)
df2 = df2[df2['DATE_OCC'] >= 2015]
#Chicago's DataFrame section
chunk = pd.read_csv('Crimes_-_2001_to_present.csv', chunksize=10000000)
df_CHI = pd.concat(chunk)
df3 = df_CHI[['Year', 'Date', 'Primary Type', 'Description', 'Updated On']].copy()
df3['Year'] = df3['Year'].astype(str).astype(int)
df3['Date'] = df3['Date'].str.replace('/','')
df3['Date'] = df3['Date'].str.replace(':','')
df3['Date'] = df3['Date'].str.replace(' ','')
df3['Date'] = df3.Date.str.slice(0, -14, 1)
df3['Date'] = df3['Date'].astype(str).astype(int)
df3 = df3[df3['Year'] >= 2015]
df_NY.to_csv('NY UPDATED.csv', encoding='utf-8', index=False)
df2.to_csv('LA UPDATED.csv', encoding='utf-8', index=False)
df3.to_csv('CHICAGO UPDATED.csv', encoding='utf-8', index=False)
#From this point, read csv from the three files saved and make three new dataframes
chunk = pd.read_csv('NY UPDATED.csv',chunksize = 10000000)
df_NY = pd.concat(chunk)
chunk = pd.read_csv('LA UPDATED.csv',chunksize = 10000000)
df_LA = pd.concat(chunk)
chunk = pd.read_csv('CHICAGO UPDATED.csv',chunksize = 10000000)
df_CHI = pd.concat(chunk)
#rename NY
df_NY.rename(columns={'CMPLNT_FR_DT': 'YEAR'}, inplace = True)
df_NY.rename(columns={'RPT_DT': 'MONTH'}, inplace = True)
df_NY.rename(columns={'OFNS_DESC': 'CRIME_DESCRIPTION'}, inplace = True)
#Get rid of harrassment 2
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace('HARRASSMENT 2','HARRASSMENT')
#rename LA
df_LA.rename(columns={'DATE_OCC': 'YEAR'}, inplace = True)
df_LA.rename(columns={'Date_Rptd': 'MONTH'}, inplace = True)
df_LA.rename(columns={'Crm Cd Desc': 'CRIME_DESCRIPTION'}, inplace = True)
#drop NAs from crime in NY
df_NY = df_NY.dropna(subset=['CRIME_DESCRIPTION'])
#rename CHICAGO
df_CHI.rename(columns={'Year': 'YEAR'}, inplace = True)
df_CHI.rename(columns={'Date': 'MONTH'}, inplace = True)
df_CHI.rename(columns={'Description': 'CRIME_DESCRIPTION'}, inplace = True)
#drop Updated On
df_CHI.drop('Updated On', axis=1, inplace=True)
#for CHICAGO, choose primary type or crime description as main crime descriptor
#Trim each down OPTIONAL
df_NY = df_NY[['YEAR', 'MONTH', 'CRIME_DESCRIPTION']].copy()
df_LA = df_LA[['YEAR', 'MONTH', 'CRIME_DESCRIPTION']].copy()
df_CHI = df_CHI[['YEAR', 'MONTH', 'Primary Type','CRIME_DESCRIPTION']].copy()
#Change the rows in crime description based on what they are in primary type
df_CHI.loc[df_CHI['Primary Type'] == "BURGLARY", 'CRIME_DESCRIPTION'] = "BURGLARY"
df_CHI.loc[df_CHI['Primary Type'] == "ROBBERY", 'CRIME_DESCRIPTION'] = "ROBBERY"
df_CHI.loc[df_CHI['Primary Type'] == "SEX OFFENSE", 'CRIME_DESCRIPTION'] = "SEXUAL ASSAULT"
df_CHI.loc[df_CHI['Primary Type'] == "CRIM SEXUAL ASSAULT", 'CRIME_DESCRIPTION'] = "SEXUAL ASSAULT"
df_CHI.loc[df_CHI['Primary Type'] == "NARCOTICS", 'CRIME_DESCRIPTION'] = "NARCOTICS"
#TRYING TO SEPARATE OUT 38 DRUGS FROM LA
df_LA.loc[df_LA['CRIME_DESCRIPTION'].str.contains('DRUG'), 'CRIME_DESCRIPTION'] = 'NARCOTICS'
#TRYING TO SEPARATE OUT ARSON FROM CHIRAQ
df_CHI.loc[df_CHI['CRIME_DESCRIPTION'].str.contains('ARSON'), 'CRIME_DESCRIPTION'] = 'ARSON'
#Chicago add hands fist battery to battery
df_CHI.loc[df_CHI['CRIME_DESCRIPTION'].str.contains('BATTERY'), 'CRIME_DESCRIPTION'] = 'BATTERY'
#Separate out HOMICIDE from NY
df_NY.loc[df_NY['CRIME_DESCRIPTION'].str.contains('HOMICIDE'), 'CRIME_DESCRIPTION'] = 'HOMICIDE'
#changed FELONY ASSAULT to BATTERY in NY
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("FELONY ASSAULT",'BATTERY')
#changed BURGLAR'S TOOLS to BURGLARY in NY
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("BURGLAR'S TOOLS",'BURGLARY')
#changed PETIT LARCENY OF MOTOR VEHICLE to PETIT LARCENY IN NY
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("PETIT LARCENY OF MOTOR VEHICLE",'PETIT LARCENY')
#Changed FELONY SEX CRIMES TO SEXUAL ASSAULT
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("FELONY SEX CRIMES",'SEXUAL ASSAULT')
#Changed SEX CRIMES TO SEXUAL ASSAULT
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("SEX CRIMES",'SEXUAL ASSAULT')
#Changed DANGEROUS DRUGS TO NARCOTICS
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("DANGEROUS DRUGS",'NARCOTICS')
#Changed MURDER & NON-NEGL. MANSLAUGHTER TO HOMICIDE
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("MURDER & NON-NEGL. MANSLAUGHTER",'HOMICIDE', regex = False)
#Changed RAPE TO SEXUAL ASSAULT
df_NY['CRIME_DESCRIPTION'] = df_NY['CRIME_DESCRIPTION'].str.replace("RAPE",'SEXUAL ASSAULT')
#changed BATTERY - SIMPLE ASSAULT to BATTERY in LA
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("BATTERY - SIMPLE ASSAULT",'BATTERY')
#Changed BATTERY POLICE (SIMPLE) to BATTERY in LA
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("BATTERY POLICE (SIMPLE)","BATTERY",regex = False)
#Changed BATTERY WITH SEXUAL CONTACT to BATTERY in LA
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("BATTERY WITH SEXUAL CONTACT","BATTERY",regex = False)
#changed BURGLARY FROM VEHICLE to BURGLARY in LA
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("BURGLARY FROM VEHICLE",'BURGLARY')
#changed THEFT PLAIN - PETTY ($950 & UNDER) to PETIT LARCENY IN LA
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("THEFT PLAIN - PETTY ($950 & UNDER)","PETIT LARCENY",regex = False)
#changed SODOMY/SEXUAL CONTACT B/W PENIS OF ONE PERS TO ANUS OTH to SEXUAL ASSAULT
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("SODOMY/SEXUAL CONTACT B/W PENIS OF ONE PERS TO ANUS OTH","SEXUAL ASSAULT",regex = False)
#changed SEX OFFENDER REGISTRANT OUT OF COMPLIANCE to SEXUAL ASSAULT
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("SEX OFFENDER REGISTRANT OUT OF COMPLIANCE","SEXUAL ASSAULT",regex = False)
#changed SEXUAL PENETRATION W/FOREIGN OBJECT to SEXUAL ASSAULT
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("SEXUAL PENETRATION W/FOREIGN OBJECT","SEXUAL ASSAULT",regex = False)
#changed SEX,UNLAWFUL(INC MUTUAL CONSENT, PENETRATION W/ FRGN OBJ to SEXUAL ASSAULT
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("SEX,UNLAWFUL(INC MUTUAL CONSENT, PENETRATION W/ FRGN OBJ","SEXUAL ASSAULT",regex = False)
#changed ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT to DANGEROUS WEAPONS
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT","DANGEROUS WEAPONS",regex = False)
#changed BRANDISH WEAPON to DANGEROUS WEAPONS
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("BRANDISH WEAPON","DANGEROUS WEAPONS",regex = False)
#changed RAPE, FORCIBLE to SEXUAL ASSAULT
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("RAPE, FORCIBLE","SEXUAL ASSAULT",regex = False)
#changed INDECENT EXPOSURE to SEXUAL ASSAULT
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("INDECENT EXPOSURE","SEXUAL ASSAULT",regex = False)
#changed CRIMINAL HOMICIDE to HOMICIDE
df_LA['CRIME_DESCRIPTION'] = df_LA['CRIME_DESCRIPTION'].str.replace("CRIMINAL HOMICIDE","HOMICIDE",regex = False)
#changed $500 AND UNDER to PETIT LARCENY IN CHICAGO
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("$500 AND UNDER","PETIT LARCENY",regex = False)
#changed OVER $500 to PETIT LARCENY IN CHICAGO
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("OVER $500","PETIT LARCENY",regex = False)
#Changed DOMESTIC BATTERY SIMPLE to BATTERY in CHICAGO
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("DOMESTIC BATTERY SIMPLE","BATTERY",regex = False)
#Changed AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON to BATTERY in CHICAGO
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON","BATTERY",regex = False)
#Changed AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST to BATTERY in CHICAGO
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST","BATTERY",regex = False)
#Change AGGRAVATED: OTHER DANG WEAPON to DANGEROUS WEAPONS in Chicago
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("AGGRAVATED: OTHER DANG WEAPON","DANGEROUS WEAPONS",regex = False)
#Change UNLAWFUL POSS OF HANDGUN to DANGEROUS WEAPONS in Chicago
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("UNLAWFUL POSS OF HANDGUN","DANGEROUS WEAPONS",regex = False)
#Change FIRST DEGREE MURDER to HOMICIDE in Chicago
df_CHI['CRIME_DESCRIPTION'] = df_CHI['CRIME_DESCRIPTION'].str.replace("FIRST DEGREE MURDER","HOMICIDE",regex = False)
#WE NEED TO PICK A SUBSET OF CRIMES THAT HAVE THE MOST OCCURENCES (PICK A NUMBER TO FOCUS ON (maybe 10))
#WRITE CODE TO PUT A NUMBER ON EACH CRIME IN THE LIST
#DO THIS FOR ALL THREE CITIES AND PICK THE TOP CRIMES
#ALSO STANDARDIZE THEIR NAMES (FROM THE TOP CRIMES COMBINE ANY THAT ARE CLOSELY RELATED)
#Counts of NY crimes
#print(df_NY['CRIME_DESCRIPTION'].value_counts().head(60))
#print(len(df_NY[df_NY['CRIME_DESCRIPTION'].str.contains('ARSON')]))
#Counts of LA crimes
#print(df_LA['CRIME_DESCRIPTION'].value_counts().head(60))
#print(len(df_LA[df_LA['CRIME_DESCRIPTION'].str.contains('ARSON')]))
#Counts of CHICAGO crimes
#print(df_CHI['CRIME_DESCRIPTION'].value_counts().head(60))
#print(df_CHI['Primary Type'].value_counts().head(60))
#print(len(df_CHI[df_CHI['CRIME_DESCRIPTION'].str.contains('ARSON')]))
#EXAMPLE (WE WOULD RENAME THE CRIMES SO THEY'RE STANDARDIZED FIRST!)
#crimes = ['ASSAULT', 'RAPE']
#PETIT LARCENY IS THE TOP NYC CRIME
#crimes = ['PETIT LARCENY', 'HARRASSMENT']
# selecting rows based on condition
#df_NY = df_NY.loc[df_NY['CRIME_DESCRIPTION'].isin(crimes)]
#LIST OF CRIMES WE WANT TO WORK WITH
#BATTERY FOR SURE (most in LA second most in chicago and NY)
#burglary and robbery
#petit larceny is 1000 in NY, categories for under 500 and over 500 should be combined THEFT PLAIN IN LA 950 UNDER
df_CHI = df_CHI[['YEAR', 'MONTH','CRIME_DESCRIPTION']].copy()
#EIGHT
#PETIT LARCENY DONE DON'T TOUCH
#BATTERY DONE DON'T TOUCH
#BURGLARY DONE DON'T TOUCH
#ROBBERY DONE DON'T TOUCH
#SEXUAL ASSAULT DONE DON'T TOUCH
#NARCOTICS DONE DON'T TOUCH
#ARSON DONE DON'T TOUCH
#DANGEROUS WEAPONS DONE DON'T TOUCH
#HOMICIDE DONE DON'T TOUCH
#EXAMPLE (WE WOULD RENAME THE CRIMES SO THEY'RE STANDARDIZED FIRST!)
crimes = ['PETIT LARCENY', 'BATTERY', 'BURGLARY', 'ROBBERY', 'SEXUAL ASSAULT', 'NARCOTICS', 'ARSON', 'DANGEROUS WEAPONS', 'HOMICIDE']
# selecting rows based on condition
df_NY = df_NY.loc[df_NY['CRIME_DESCRIPTION'].isin(crimes)]
df_LA = df_LA.loc[df_LA['CRIME_DESCRIPTION'].isin(crimes)]
df_CHI = df_CHI.loc[df_CHI['CRIME_DESCRIPTION'].isin(crimes)]
df_NY.to_csv('NY DATAFRAME FINAL.csv', encoding='utf-8', index=False)
df_LA.to_csv('LA DATAFRAME FINAL.csv', encoding='utf-8', index=False)
df_CHI.to_csv('CHICAGO DATAFRAME FINAL.csv', encoding='utf-8', index=False)