-
Notifications
You must be signed in to change notification settings - Fork 0
/
sql_data_cleaning_project.sql
284 lines (240 loc) · 7.4 KB
/
sql_data_cleaning_project.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
/*
Cleaning Data with SQL Queries
*/
------------------------------------------------
--Get to Know the Data
--view the data
select * from nashville_housing_data
limit 1000;
--see number of rows
select count(*) from nashville_housing_data;
------------------------------------------------
--Populate Property Address Data
--find total null values in propertyaddress column
select count(*)
from nashville_housing_data
where propertyaddress is null;
--create a view to copy original data
create view housing_data_b as
select * from nashville_housing_data;
--create a view where missing property addresses are replaced by appropriate address
create view new_values as
with cte as
(
select row_number() over(partition by a.uniqueid order by a.uniqueid) as rn ,
a.uniqueid as unique_a,
a.parcelid as parcel_a,
a.propertyaddress as property_a,
b.uniqueid as unique_b,
b.parcelid as parcel_b,
b.propertyaddress as property_b,
coalesce(a.propertyaddress, b.propertyaddress) as replacement
from nashville_housing_data a
left join housing_data_b b
on a.parcelid = b.parcelid
and a.uniqueid <> b.uniqueid
where a.propertyaddress is null
)
select unique_a, parcel_a, property_a, unique_b, parcel_b, property_b, replacement
from cte
where rn = 1;
-- another way to look at the data
with cte as
(
select row_number() over(partition by a.uniqueid order by a.uniqueid) as rn ,
a.uniqueid as unique_a,
a.parcelid as parcel_a,
a.propertyaddress as property_a,
b.uniqueid as unique_b,
b.parcelid as parcel_b,
b.propertyaddress as property_b,
coalesce(a.propertyaddress, b.propertyaddress) as replacement
from nashville_housing_data a
left join housing_data_b b
on a.parcelid = b.parcelid
and a.uniqueid <> b.uniqueid
where a.propertyaddress is null
)
select b.unique_a, b.parcel_a, b.property_a, b.unique_b, b.parcel_b, b.property_b, b.replacement, a.propertyaddress
from nashville_housing_data a
inner join cte b on a.uniqueid = b.unique_a
where rn = 1;
--replace the values with correct data using the view we created earlier
update nashville_housing_data a
set a.propertyaddress = b.replacement
from new_values b
where propertyaddress is null and a.uniqueid = b.unique_a;
------------------------------------------------
--Split both address columns into individual columns (Address, City, State)
--view the data
select propertyaddress, owneraddress
from nashville_housing_data;
select count(*)
from nashville_housing_data
where owneraddress is null;
--split propertyaddress data and add split data into two new columns
select
trim(substr(propertyaddress, 1, locate(',',propertyaddress)-1)) as address,
trim(substr(propertyaddress, locate(',',propertyaddress)+1, length(propertyaddress))) as city
from nashville_housing_data;
alter table nashville_housing_data
add column short_address varchar(255)
add column property_city varchar(255);
update nashville_housing_data
set short_address = trim(substr(propertyaddress, 1, locate(',',propertyaddress)-1));
update nashville_housing_data
set property_city = trim(substr(propertyaddress, locate(',',propertyaddress)+1, length(propertyaddress)));
--check your work
select propertyaddress, short_address, property_city
from nashville_housing_data;
--split owneraddress data and add split data into two new columns
select
owneraddress,
trim(substr(owneraddress, 1, locate(',',owneraddress)-1)) as address,
trim(substr(owneraddress, locate(',',owneraddress)+1, length(owneraddress)-locate(',',owneraddress)-4)) as city,
trim(right(owneraddress,2)) as state
from nashville_housing_data;
alter table nashville_housing_data
add column owner_address_short varchar(255)
add column owner_city varchar(255)
add column owner_state varchar(255);
update nashville_housing_data
set owner_address_short = trim(substr(owneraddress, 1, locate(',',owneraddress)-1));
update nashville_housing_data
set owner_city = trim(substr(owneraddress, locate(',',owneraddress)+1, length(owneraddress)-locate(',',owneraddress)-4));
update nashville_housing_data
set owner_state = trim(right(owneraddress,2));
--check you work
select owneraddress, owner_address_short, owner_city, owner_state
from nashville_housing_data;
------------------------------------------------
--Change Y and N to Yes and No in 'soldasvacant' field
--view the distinct values in the column
select distinct soldasvacant, count(*) as count
from nashville_housing_data
group by soldasvacant
order by count;
-- update Y and N strings accordingly.
select soldasvacant,
case
when soldasvacant = 'Y' then 'Yes'
when soldasvacant = 'N' then 'No'
else soldasvacant
end as yes_no
from nashville_housing_data;
update nashville_housing_data
set soldasvacant = case
when soldasvacant = 'Y' then 'Yes'
when soldasvacant = 'N' then 'No'
else soldasvacant
end;
--check you work
select distinct soldasvacant, count(*) as count
from nashville_housing_data
group by soldasvacant
order by count;
------------------------------------------------
--Check if saleprice column has null values
select saleprice
from nashville_housing_data
where saleprice is null; --found no null values
------------------------------------------------
--Remove duplicates and drop unnecessary columns
--check to see how many duplicate values there are
with cte as
(
select *,
row_number() over(
partition by
parcelid,
landuse,
propertyaddress,
saledate,
saleprice,
legalreference,
soldasvacant,
ownername,
owneraddress,
acreage,
taxdistrict,
landvalue,
buildingvalue,
totalvalue,
yearbuilt,
bedrooms,
fullbath,
halfbath
order by uniqueid
) rn
from nashville_housing_data
)
select *
from cte
where rn > 1;
--remove duplicate rows and remove unnecessary columns using a view
create or replace view clean_nashville_data as
with cte as
(
select *,
row_number() over(
partition by
parcelid,
landuse,
propertyaddress,
saledate,
saleprice,
legalreference,
soldasvacant,
ownername,
owneraddress,
acreage,
taxdistrict,
landvalue,
buildingvalue,
totalvalue,
yearbuilt,
bedrooms,
fullbath,
halfbath
order by uniqueid
) rn
from nashville_housing_data
)
select uniqueid, parcelid,landuse,propertyaddress,saledate, saleprice, legalreference,soldasvacant,ownername,acreage,
landvalue,buildingvalue,totalvalue,yearbuilt,bedrooms,fullbath,halfbath, short_address as property_address_short,
property_city, owner_address_short, owner_city, owner_state
from cte
where rn = 1;
--check your work
with cte as
(
select *,
row_number() over(
partition by
parcelid,
landuse,
propertyaddress,
saledate,
saleprice,
legalreference,
soldasvacant,
ownername,
acreage,
landvalue,
buildingvalue,
totalvalue,
yearbuilt,
bedrooms,
fullbath,
halfbath
) row_number
from clean_nashville_data
)
select *
from cte
where row_number > 1;
------------------------------------------------
--View clean dataset
select * from clean_nashville_data;
------------------------------------------------
------------------------------------------------