-
Notifications
You must be signed in to change notification settings - Fork 91
/
DataPreparation.py
357 lines (211 loc) Β· 9.35 KB
/
DataPreparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# coding: utf-8
# # Data Wrangling, Cleaning of Data, Exploration of Data to make it consistent for Analysis
# In[1]:
#get_ipython().magic('matplotlib inline')
# In[2]:
# importing required libraries
import os
import subprocess
import stat
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
sns.set(style="white")
# In[19]:
# getting absolute path till the raw data file
abs_path = os.getcwd()[:-15]
raw_data_path = abs_path + "/RawData/autos.csv"
# reading csv into raw dataframe
df = pd.read_csv(raw_data_path,encoding="latin-1")
# In[4]:
# checking the column vehicleType for null values
df["vehicleType"].isnull().values.sum()
# In[5]:
# changing the vehicleType from NaN to Others
df["vehicleType"].fillna("Other", inplace=True)
# In[6]:
# checking if there are any null values in the column brand
df["brand"].isnull().value_counts()
# In[7]:
# checking if there are any null values in the column seller
df["seller"].isnull().value_counts()
# In[8]:
# checking if there are any null values in the offerType
df["offerType"].isnull().value_counts()
# In[9]:
# checking the different types of values in the column offerType
df["offerType"].unique()
# In[10]:
# changing values of offerType Gesuch to Request and Angebot to Offer
df["offerType"] = df["offerType"].map({'Gesuch':"Request",'Angebot':'Offer'})
# In[11]:
# checking if there are any null values in the yearOfRegistration
df["yearOfRegistration"].isnull().value_counts()
# In[12]:
# Distribution of vehicles based on year of registration
fig, ax = plt.subplots(figsize=(8,6))
ax = sns.distplot(df["yearOfRegistration"], color="g", ax=ax)
ax.set_title('Distribution of vehicles based on Year of Registration', fontsize= 15)
plt.ylabel("Density (KDE)", fontsize= 15)
plt.xlabel("Year Of Registration", fontsize= 15)
bbox_props = dict(boxstyle="larrow,pad=0.3", fc="white", ec="b", lw=2)
ax.text(2500,0.015,"Maximum No Of Vehicles",ha="left", va="center", rotation=0,size=12,bbox=bbox_props)
ax.annotate('', xy=(25, 0.0009), xycoords='data',
xytext=(1500, 0.0009), textcoords='data',
arrowprops=dict(facecolor='red', shrink=0.05, ec="r"),
horizontalalignment='left', verticalalignment='left',
)
ax.annotate('', xy=(10000, 0.0009), xycoords='data',
xytext=(2050, 0.0009), textcoords='data',
arrowprops=dict(facecolor='red', shrink=0.05, ec="r"),
horizontalalignment='right', verticalalignment='right',
)
ax.text(5000,0.002,"Invalid Registration Years",ha="center", va="center")
plt.show()
# In[13]:
fig.savefig(abs_path + "/DataPreparation/Plots/vehicle-distribution.png")
# In[14]:
# dropping rows that are inconsistent with the dataset based on year of registration
df = df[(df["yearOfRegistration"] >= 1890) & (df["yearOfRegistration"] <= 2016)]
# In[17]:
# Boxplot to see the distribution after outliers has been removed
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x="vehicleType", y="price", data=df)
ax.text(4,1.2*(10**9),"Many Outliers as Boxplot has been suppressed",fontsize=18,color="r",ha="center", va="center")
plt.show()
# In[18]:
fig.savefig(abs_path + "/DataPreparation/Plots/price-vehicleType-boxplot.png")
# In[209]:
# Mean of the prices of all the vehicle types
_median = df.groupby("vehicleType")["price"].median()
# 75th percentile of the prices of all the vehicles types
_quantile75 = df.groupby("vehicleType")["price"].quantile(0.75)
# 25th percentile of the prices of all the vehicles types
_quantile25 = df.groupby("vehicleType")["price"].quantile(0.25)
# Calculating the value of the prices of each vehicle type above which all the values are outliers
iqr = (_quantile75 - _quantile25)*1.5 + _median
iqr
# In[212]:
# Removing the outliers as per the logic above
df = df[((df["vehicleType"] == "andere") & (df["price"] <= 8429)) |
((df["vehicleType"] == "Other") & (df["price"] <= 3708)) |
((df["vehicleType"] == "suv") & (df["price"] <= 28800)) |
((df["vehicleType"] == "kombi") & (df["price"] <= 13076)) |
((df["vehicleType"] == "bus") & (df["price"] <= 13948)) |
((df["vehicleType"] == "cabrio") & (df["price"] <= 21400)) |
((df["vehicleType"] == "limousine") & (df["price"] <= 12801)) |
((df["vehicleType"] == "coupe") & (df["price"] <= 24300)) |
((df["vehicleType"] == "kleinwagen") & (df["price"] <= 5775))]
# In[215]:
# checking if the gearbox column has null values
df["gearbox"].isnull().value_counts()
# setting the NaN gearbox types to Unspecified
df["gearbox"].fillna("Unspecified", inplace=True)
# In[217]:
# checking if the fuelType has null values
df["fuelType"].isnull().value_counts()
# In[218]:
# setting the NaN fuelType types to other
df["fuelType"].fillna("other",inplace=True)
# In[219]:
# checking how many unique types of fuelTypes are present
df["fuelType"].unique()
# In[220]:
# Changing german names to english readable format
df["fuelType"] = df["fuelType"].map({'benzin':'Gasoline','diesel':'Diesel','other':'Other','lpg':'Lpg','hybrid':'Hybrid','cng':'Cng','elektro':'Electric'})
# In[221]:
# deleting the column noOfPictures since all of them are Zero
del df["nrOfPictures"]
# In[225]:
# splitting dateCreated by year
df["yearOfCreation"] = df['dateCreated'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date().strftime('%Y'))
# splitting dateCrawled by year
df["yearCrawled"] = df['dateCrawled'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date().strftime('%Y'))
# splitting dateCreated by month
df["monthOfCreation"] = df['dateCreated'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date().strftime('%B'))
# splitting dateCrawled by month
df["monthCrawled"] = df['dateCrawled'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date().strftime('%B'))
# no of days seen online
from_date = df['dateCreated'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date())
to_date = df['lastSeen'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').date())
no_days = abs((to_date-from_date))
no_days = (no_days / np.timedelta64(1, 'D')).astype(int)
df["NoOfDaysOnline"] = no_days
# no of hrs seen online
hrs_from = df['dateCreated'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').time().strftime('%H'))
hrs_to = df['lastSeen'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d %H:%M:%S').time().strftime('%H'))
total_hrs = (hrs_to.astype(int) - hrs_from.astype(int))
df["NoOfHrsOnline"] = total_hrs
# In[228]:
# checking for null values in powerPS column
df["powerPS"].isnull().value_counts()
# In[231]:
# checking for unique values in notRepairedDamage
df["notRepairedDamage"].unique()
# In[230]:
# setting nan in notRepairedDamage to other
df["notRepairedDamage"].fillna("other",inplace=True)
# In[232]:
# Changing german names to english readable format
df["notRepairedDamage"] = df["notRepairedDamage"].map({'other':'Other','ja':'Yes','nein':'No'})
# In[238]:
# checking for unique values in model column
df["model"].isnull().value_counts()
# In[239]:
# setting nan in model column to Other
df["model"].fillna("Other",inplace=True)
# In[247]:
# checking for null values in abtest column
df["abtest"].isnull().value_counts()
# In[271]:
# mappig the values on monthOfRegistration column from 1-12
df["monthOfRegistration"].replace([0,12],[1,11],inplace=True)
# In[274]:
# calculating no of years the vehicle is old
df["yearsOld"] = 2016 - df["yearOfRegistration"]
# calculating no of months the vehicle is old
df["monthsOld"] = 12 - df["monthOfRegistration"]
# In[276]:
# mappig the values on monthOfRegistration column from 1-12 to Jan to Dec
months = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov"]
df["monthOfRegistration"].replace([1,2,3,4,5,6,7,8,9,10,11],months,inplace=True)
# In[281]:
# checking if postal code values are null
df["postalCode"].isnull().value_counts()
# In[ ]:
# In[302]:
# creating folder structure as per brand of vehicle
fileName = abs_path + "/ShellScripts/CreateFolder.sh"
file = open(fileName, "w+")
baseString = "mkdir -p " + abs_path + "/CleanData/DataForAnalysis/"
for b in list(t["brand"].unique()):
newString = baseString + b + "\n"
file.write(newString)
file.write("mkdir -p " + abs_path + "/CleanData/CleanedDataSet")
file.close()
st = os.stat(fileName)
os.chmod(fileName, st.st_mode | stat.S_IEXEC)
subprocess.call(fileName,shell=True)
# In[303]:
# saving the data file as csv to current directory location
path_to_file = abs_path + "/CleanData/CleanedDataSet/cleaned_autos.csv"
df.to_csv(path_to_file,index=False)
# In[304]:
# splitting the the records based on brand and vehicleType
fileName = abs_path + "/ShellScripts/CreateFiles.sh"
file = open(fileName, "w+")
baseString_before = '''awk 'BEGIN{FS=OFS=","} FNR == 1 {print} '''
baseString_after = " {print}' " + abs_path + "/CleanData/CleanedDataSet/cleaned_autos.csv" + " >> " + abs_path + "/CleanData/DataForAnalysis/"
for b in list(df["brand"].unique()):
for typ in list(df[df["brand"] == b]["vehicleType"].unique()):
newString = baseString_before + '$15 == ' + '"' + b + '"' + ' && $7 == ' + '"' + typ + '"' + baseString_after + b + "/" + b + "_" + typ + ".csv" + "\n"
file.write(newString)
file.close()
st = os.stat(fileName)
os.chmod(fileName, st.st_mode | stat.S_IEXEC)
subprocess.call(fileName,shell=True)
# In[ ]:
# In[ ]:
# In[ ]: