-
Notifications
You must be signed in to change notification settings - Fork 0
/
ch9_dataframes.R
247 lines (178 loc) · 5.68 KB
/
ch9_dataframes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# ch9: Dataframes
# file name: mybankcustomers.csv (created during our python session)
# read a CSV file
path = "F:/work/mybankcustomers.csv"
# read the file into a dataframe
cust=read.csv(path, header=T)
print(cust)
# to view the data in an XL format
View(cust)
# basic EDA
head(cust) # returns the first 6 records
head(cust,3)
tail(cust,3) # last 3 records
# dimension
dim(cust)
# names of columns
colnames(cust)
# row names
rownames(cust)
# get the row and column count
nrow(cust)
ncol(cust)
# structure of the dataset
str(cust)
# some columns are showing the wrong datatype. Need to convert them into the right data type
# gender,city,privcust,acctype,srcit = string->factor
cust$gender = as.factor(cust$gender)
cust$city = as.factor(cust$city)
cust$privcust = as.factor(cust$privcust)
cust$acctype = as.factor(cust$acctype)
cust$sr_citz = as.factor(cust$sr_citz)
# verify the change
str(cust)
# summarise the dataset
summary(cust)
summary(cust$age) # summary of a single column
# rename a column
colnames(cust)
# rename 'cards' to 'totalcards'
colnames(cust)[colnames(cust)=="cards"] = "totalcards"
# retrieve data from dataframe
# select * from customer
print(cust)
# select custid,name,age,gender,phone from cust
# method 1
cust[,c('custid','name','age','gender','phone')]
# method 2
cols=c('custid','name','age','gender','phone')
cust[,cols]
# method 3
cust[cols]
# accessing a single column - using the $ sign after df
cust$acctype
# filter data from dataframe
# select * from customer where custid = 12752
cust[cust$custid==12752,]
# select personal info of customer id 12752
cust[cols][cust$custid == 12752,]
cust[cust$custid==12752,cols]
# IN operator
# select customers whose ID's are (12752,10801,16664)
colnames(cust)
cid=c(12752,10801,16664)
cols=c('custid','acctype','balance','crscore','privcust')
cust[cust$custid %in% cid,cols]
# select all female customers having a credit score more than 600
cols=c('name','gender','crscore')
cust[(cust$gender=="F") & (cust$crscore>=600),cols]
# select all customers having a credit score more than 700 OR bank balance more than 750000
cols=c('name', 'crscore', 'balance','gender')
cust[(cust$crscore>700) | (cust$balance>750000), cols]
# get a count, proportion of customers based on the account type
# count
table(cust$acctype)
# proportion of customers
round(prop.table(table(cust$acctype))*100,1)
# get all customers whose acctype is not INR
# select * from cust where acctype <> INR
cols=c('custid','name','acctype')
cust[cust$acctype!="INR", cols]
# get customers whose email contains 'gmail'
cols=c('name','email')
cust[grep("gmail", cust$email),cols]
# this is case-sensitive search
# case-insensitive search
cust[grep("GMaiL", cust$email, ignore.case = T),cols]
# sorting the dataset
# select * from cust ORDER BY crscore
# default order: Ascending
cust[with(cust, order(crscore)),]
# descending order
cols=c('name','acctype','crscore')
cust[with(cust,order(crscore,decreasing=T)),cols]
# accessing dataframe by row numbers
cust[c(1,2,3),]
# creating and removing columns from dataframe
# create
cust$lastname = ''
cust$lastdeposit = 0
# drop features from dataframe
cust$lastname = NULL
cust$lastdeposit = NULL
# update the dataframe
# update the name and email ID of customer ID 12752 with the right values
cust$name[cust$custid == 12752] = "sriraman"
cust$email[cust$custid == 12752] = "sriraman@crscompany.in"
# verify the change
cols=c('custid','name','email')
cust[cust$custid==12752,cols]
# check for 0 and Nulls
# create functions
checkZero=function(x) return(any(x)<=0)
checkNull=function(x) return(any(is.na(x)))
# apply this function on the dataframe
# check for NULLs
# return cols that have Nulls
names(cust)[apply(cust,2,checkNull)]
# method 2
colSums(is.na(cust))
# check for 0
names(cust)[apply(cust,2,checkZero)]
# 0 check is applicable only on the numeric columns
# method 2
colSums(cust==0)
# get the numeric and factor columns
# numeric cols
nc=names(cust)[sapply(cust,is.numeric)]
# factor cols
fc=names(cust)[sapply(cust,is.factor)]
fc
# to identify the levels of all factor columns
for(f in fc)
{
print(paste("factor column = ",f))
print(levels(factor(cust[[f]])))
print(" ... ")
}
# method 2 - get all values with their counts
for(f in fc)
print(table(cust[f]))
# .................................... #
# Visualisation
# --------------------
library(ggplot2)
# install.packages("ggplot2", dependencies=T)
# histogram
hist(cust$crscore,col='yellow',breaks=10,main='Histogram for Credit Score',xlab='Credit Score')
# boxplot
boxplot(cust$balance,col='blue',horizontal = T,
main='Boxplot')
# scatter plot
plot(cust$age, cust$crscore,col='green')
# bar plot
barplot(cust$crscore,main='Distribution of Customers')
# plot count of customers based on acct type
ggplot(cust,aes(x=acctype,fill=acctype)) +
geom_bar() +
ggtitle('Customer Distribution by Account type')
# Pie Chart - distribution of customers by acc type
ggplot(cust, aes(x=factor(''),fill=acctype)) +
geom_bar() +
coord_polar(theta = "y")
# histogram - by Gender and Age
ggplot(cust,aes(x=age,fill=gender)) +
geom_histogram(binwidth = 10) +
ggtitle('Histogram')
# split the above graph
ggplot(cust,aes(x=age,fill=gender)) +
geom_histogram(binwidth = 10) +
ggtitle('Histogram') +
facet_wrap(~gender)
# density plot
ggplot(cust,aes(x=crscore)) + geom_density()
# density plot for Male/Female
ggplot(cust,aes(x=crscore,fill=gender)) +
geom_density(adjust=0.1)
# line chart
ggplot(cust,aes(x=age,y=balance)) + geom_line()