-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmissing_data.R
47 lines (46 loc) · 1.27 KB
/
missing_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
## Install DMwR2 only when it is not already available, so that re-running
## the script does not reinstall the package (or need network access)
## every time.
if (!requireNamespace("DMwR2", quietly = TRUE)) {
  install.packages("DMwR2")
}
library(DMwR2)

## Load the algae data set and take a first look at its structure and
## per-column summaries (summary() also reports the NA count per column).
data(algae)
str(algae)
summary(algae)

## mean() propagates NA: the first call returns NA when Cl contains any
## missing value; na.rm = TRUE averages over the observed values only.
mean(algae$Cl)
mean(algae$Cl, na.rm = TRUE)
## Missing values
## 1. Remove the instances with missing values.
##    Appropriate when the proportion of instances with missing values is
##    small with respect to the size of the data set.

## Per-row logical flags: TRUE for rows without any NA, then the negation
## (TRUE for rows with at least one NA), then the count of incomplete rows.
complete.cases(algae)
!complete.cases(algae)
sum(!complete.cases(algae))

## Copy of the data with every incomplete row dropped.
algae2 <- na.omit(algae)
summary(algae2)

## Number of NAs in each row. rowSums(is.na()) gives the per-row counts
## directly, without apply() first coercing the data frame to a matrix.
rowSums(is.na(algae))

## Indices of the rows with too many NAs.
## NOTE(review): per the DMwR2 documentation a threshold below 1 is read as
## a fraction of the columns and a value >= 1 as an absolute NA count --
## confirm against ?manyNAs.
manyNAs(algae, 0.2)
manyNAs(algae, 1)
manyNAs(algae, 2)

## Keep the rows that manyNAs() does not flag. Selecting the rows to keep
## (instead of algae[-manyNAs(algae), ]) is safe when nothing is flagged:
## a negative index built from an empty vector would drop every row.
algae <- algae[setdiff(seq_len(nrow(algae)), manyNAs(algae)), ]
## Fill in the missing values with a central value of the column
## (mean or median for the numeric columns handled below).

## mxPH: show the row(s) where it is missing, inspect the distribution, then
## fill with the mean. Rows are selected with is.na() rather than by a
## hard-coded row number, so the code keeps targeting the right rows if the
## data or the filtering above changes.
algae[is.na(algae$mxPH), ]
summary(algae$mxPH)
hist(algae$mxPH)
algae[is.na(algae$mxPH), "mxPH"] <- mean(algae$mxPH, na.rm = TRUE)

## Chla: same procedure, but filling with the median, which is less
## sensitive to extreme values than the mean.
summary(algae$Chla)
hist(algae$Chla)
algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = TRUE)

## Same idea for all columns at once: reload the original data (discarding
## the manual fills above), drop the rows flagged by manyNAs(), and let
## centralImputation() fill the remaining NAs.
## NOTE(review): per the DMwR2 documentation centralImputation() uses the
## median for numeric columns and the mode for nominal ones -- confirm.
## The keep-index form avoids dropping every row when manyNAs() flags none.
data(algae)
algae <- algae[setdiff(seq_len(nrow(algae)), manyNAs(algae)), ]
algae <- centralImputation(algae)
summary(algae)
## Fill in the missing values by exploring the correlations between
## attributes.

## Correlation matrix of columns 4 to 18 on the freshly reloaded data.
## With the default `use`, any pair involving a column that has NAs yields
## NA; use = "complete.obs" computes it on the rows without any NA.
data(algae)
cor(algae[, 4:18])
cor(algae[, 4:18], use = "complete.obs")

## Install corrplot only when it is missing, so re-running the script does
## not reinstall it every time.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)

## Draw the correlation matrix: default glyphs in the upper triangle with
## the variable names on the diagonal, then overlay the numeric values in
## the lower triangle (no diagonal, no labels, no colour legend).
cm <- cor(algae[, 4:18], use = "complete.obs")
corrplot(cm, type = "upper", tl.pos = "d")
corrplot(cm, add = TRUE, type = "lower", method = "number",
         diag = FALSE, tl.pos = "n", cl.pos = "n")

## Reload the data and drop the rows flagged by manyNAs(). The keep-index
## form avoids dropping every row when manyNAs() flags none.
data(algae)
algae <- algae[setdiff(seq_len(nrow(algae)), manyNAs(algae)), ]

## Linear model of PO4 as a function of oPO4; print it to inspect the
## fitted coefficients.
po4_fit <- lm(PO4 ~ oPO4, data = algae)
po4_fit

## Fill each missing PO4 that has a known oPO4 with the model prediction.
## Using the fitted model (instead of coefficients copied by hand from the
## printed output) avoids rounding error and stale numbers, and the is.na()
## mask replaces the hard-coded row number.
po4_missing <- is.na(algae$PO4) & !is.na(algae$oPO4)
if (any(po4_missing)) {
  algae[po4_missing, "PO4"] <- predict(po4_fit,
                                       newdata = algae[po4_missing, ])
}