-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_import_cleaning.R
64 lines (53 loc) · 1.74 KB
/
data_import_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Library
# Each dependency is installed only when it is missing; an unconditional
# install.packages() call re-downloads the package on every run and fails
# when no network or CRAN mirror is available.
### Count
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)
### Json reader
if (!requireNamespace("jsonlite", quietly = TRUE)) {
  install.packages("jsonlite")
}
library(jsonlite)
### HTTP request
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
# Utils function
### Split function
#' Randomly split a data set into train and test partitions
#'
#' Rows are shuffled with a fixed seed; the first `floor(n * p)` shuffled rows
#' form the training partition and every remaining row forms the test
#' partition, so no row is dropped and none appears in both.
#'
#' @param data A two-dimensional object (e.g. a data frame) whose rows are
#'   the units to split.
#' @param p Proportion of rows assigned to the training partition, in [0, 1].
#' @param s Seed passed to `set.seed()` before shuffling.
#' @return A list with elements `train` and `test`.
split.data <- function(data, p = 0.7, s = 1) {
  stopifnot(is.numeric(p), length(p) == 1L, !is.na(p), p >= 0, p <= 1)
  # Seeds the global RNG so the same `s` always produces the same shuffle.
  set.seed(s)
  n_rows <- dim(data)[1]
  index <- sample(seq_len(n_rows))
  n_train <- floor(n_rows * p)
  # seq_len() yields an empty index for a count of 0, whereas `1:0` would
  # count backwards and select rows that do not belong to the partition.
  train_idx <- index[seq_len(n_train)]
  test_idx <- index[n_train + seq_len(n_rows - n_train)]
  # drop = FALSE keeps the two-dimensional shape even for a single column.
  list(
    train = data[train_idx, , drop = FALSE],
    test = data[test_idx, , drop = FALSE]
  )
}
# Data Import
# Local CSV copy of the dataset; the same name is used for the entry inside
# the downloaded archive.
destination_file <- "waterQuality1.csv"
if (!file.exists(destination_file)) {
  # No local copy yet: download the archive from the Kaggle API using the
  # credentials stored in kaggle.json.
  if (!file.exists("kaggle.json")) {
    stop(
      "kaggle.json not found; cannot download ", destination_file,
      call. = FALSE
    )
  }
  kaggle_credentials <- jsonlite::fromJSON("kaggle.json")
  username <- kaggle_credentials$username
  key <- kaggle_credentials$key
  Sys.setenv(KAGGLE_USERNAME = username)
  Sys.setenv(KAGGLE_KEY = key)
  download_url <- "https://www.kaggle.com/api/v1/datasets/download/mssmartypants/water-quality?datasetVersionNumber=3"
  temp <- tempfile(fileext = ".zip")
  tryCatch(
    {
      # Stream the response body straight to disk (binary-safe) instead of
      # fetching the archive a second time with download.file().
      response <- httr::GET(
        download_url,
        httr::authenticate(username, key, type = "basic"),
        httr::write_disk(temp, overwrite = TRUE)
      )
      # Fail loudly on HTTP errors (e.g. rejected credentials).
      httr::stop_for_status(response)
      dataset <- read.csv(
        unz(temp, destination_file),
        na.strings = c("#NUM!")
      )
    },
    # The temporary archive is removed whether or not the download succeeded.
    finally = unlink(temp)
  )
  # Missing values are written back with the same marker that is used when
  # reading, so the cached copy parses exactly like the downloaded one.
  write.csv(dataset, destination_file, row.names = FALSE, na = "#NUM!")
} else {
  dataset <- read.csv(destination_file, na.strings = c("#NUM!"))
}
# Data Cleaning
## Casting
# A factor has to go through character first: as.numeric() applied to a
# factor returns its integer level codes, not the values it displays.
if (is.factor(dataset$ammonia)) {
  dataset$ammonia <- as.character(dataset$ammonia)
}
dataset$ammonia <- as.numeric(dataset$ammonia)
dataset$is_safe <- factor(dataset$is_safe)
## Removing rows with null values
# Frequency of each missing-value pattern across the columns; printed
# explicitly so the summary does not depend on top-level auto-printing.
print(plyr::count(is.na(dataset)))
dataset <- dataset[complete.cases(dataset), ]
## Delete duplicate rows
dataset <- unique(dataset)
## Remove rows with a negative ammonia value
negative_rows <- which(dataset$ammonia < 0)
# Report how many rows are about to be discarded.
print(length(negative_rows))
dataset <- subset(dataset, ammonia >= 0)
# Train / test partitioning of the cleaned dataset (p = 0.7 is the
# proportion handed to split.data for the training partition)
allset <- split.data(data = dataset, p = 0.7)
trainset <- allset[["train"]]
testset <- allset[["test"]]