-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data_preparing_1.R
117 lines (77 loc) · 3.63 KB
/
Data_preparing_1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
##### ARRANGE DATA #######
# Show the details of the R installation that runs this script.
R.Version()

# Install the required packages only when they are not available yet, so that
# re-running the script does not download and reinstall everything each time.
# local() keeps the helper variables out of the global environment.
local({
  cran_pkgs <- c("devtools", "usethis", "zoo", "naniar")
  is_installed <- vapply(
    cran_pkgs, requireNamespace, logical(1), quietly = TRUE
  )
  if (!all(is_installed)) {
    install.packages(cran_pkgs[!is_installed])
  }
  if (!requireNamespace("tidyverse", quietly = TRUE)) {
    install.packages("tidyverse", repos = "https://cloud.r-project.org")
  }
  # ohicore is installed from GitHub through devtools, so devtools has to be
  # available first (handled just above).
  if (!requireNamespace("ohicore", quietly = TRUE)) {
    devtools::install_github("ohi-science/ohicore")
  }
})

# Attach the packages in the same order as before, so masking is unchanged.
library(ohicore)
library("usethis")
library("tidyverse")
library("devtools")
library("zoo")
library("naniar")
# Load the datasets. IMPORTANT: na.strings lists every marker that has to be
# read as missing data: an empty field, ".." and "NA".
IGE_tourism <- read.csv(
  file = "C:/Users/Antía/Documents/R/ATLA_1/DATA/IGE_Tourist_26072023.csv",
  na.strings = c("", "..", "NA")
)
try_IGE_tourism_2 <- read.csv(
  file = "C:/Users/Antía/Documents/R/ATLA_1/DATA/IGE_Tourist_26072023_M_R.csv",
  na.strings = c("", "..", "NA")
)
# This file is semicolon-separated, unlike the two above.
try_IGE_tourism <- read.csv(
  file = "C:/Users/Antía/Documents/R/ATLA_1/DATA/try_IGE_tourism.csv",
  na.strings = c("", "..", "NA"),
  sep = ";"
)
# Rename the columns of try_IGE_tourism_2 and drop its X.2 column.
# NOTE(review): the first rename maps "Ano" to "Ano", so it has no effect as
# written -- confirm whether a different source name was intended.
names(try_IGE_tourism_2)[names(try_IGE_tourism_2) == "Ano"] <- "Ano"
names(try_IGE_tourism_2)[names(try_IGE_tourism_2) == "Tipo.aloxamento"] <- "Tipo_Aloxamento"
names(try_IGE_tourism_2)[names(try_IGE_tourism_2) == "X"] <- "Area"
names(try_IGE_tourism_2)[names(try_IGE_tourism_2) == "X.1"] <- "Codigo"
names(try_IGE_tourism_2)[names(try_IGE_tourism_2) == "Establecementos"] <- "Numero_Establecementos"
try_IGE_tourism_2<-subset(try_IGE_tourism_2,select = - X.2)
# Rename the columns of try_IGE_tourism. The renames are applied one after
# another, so their order matters: "X" first becomes "Codigo", and the last
# statement then renames every column called "Codigo" to "Area".
# Net effect on the names present before this point:
#   Ano -> Tipo_Aloxamento, Numero_Establecementos -> Ano, X -> Area,
#   Area -> Numero_Establecementos, Codigo -> Area.
# NOTE(review): presumably this compensates for a one-column shift of the
# header in the re-read file -- confirm against the actual CSV.
names(try_IGE_tourism)[names(try_IGE_tourism) == "Ano"] <- "Tipo_Aloxamento"
names(try_IGE_tourism)[names(try_IGE_tourism) == "Numero_Establecementos"] <- "Ano"
names(try_IGE_tourism)[names(try_IGE_tourism) == "X"] <- "Codigo"
names(try_IGE_tourism)[names(try_IGE_tourism) == "Area"] <- "Numero_Establecementos"
names(try_IGE_tourism)[names(try_IGE_tourism) == "Codigo"] <- "Area"
# Drop the Tipo_Aloxamento column (the one renamed from "Ano" above).
# NOTE(review): try_IGE_tourism$Tipo_Aloxamento is read again further down,
# where fillTheBlanks() is applied -- confirm that dropping it here is intended.
try_IGE_tourism<-subset(try_IGE_tourism,select = - Tipo_Aloxamento)
# Fill in empty spaces
# Turn every NA cell of the data frame into an empty string ("").
try_IGE_tourism <- replace(try_IGE_tourism, is.na(try_IGE_tourism), "")
# METHOD 1 - define a fill-down function and use it (USED)
# Author's note (translated): this worked in some attempts without it being
# clear why. The reason is that the column must hold "" for its blanks:
# the input is coerced with as.character() and only entries equal to
# `missing` are filled, so NA entries are left untouched.
#
# fillTheBlanks: replace each blank entry with the nearest non-blank value
# that precedes it.
#   try_IGE_tourism  vector to fill (coerced with as.character()).
#   missing          string that marks a blank entry; defaults to "".
# Returns a character vector of the same length as the input. Blank entries
# at the very start have no preceding value and are returned unchanged.
fillTheBlanks <- function(try_IGE_tourism, missing = "") {
  # Collapse the vector into runs of equal consecutive values.
  runs <- rle(as.character(try_IGE_tourism))
  blank_runs <- which(runs$values == missing)
  # A blank run in first position has nothing before it. Leaving it in would
  # make the replacement one element short, so R would recycle it (with a
  # warning) and fill the later blanks with the wrong values.
  blank_runs <- blank_runs[blank_runs > 1L]
  # Consecutive blanks form a single run, so the run just before a blank run
  # is always non-blank.
  runs$values[blank_runs] <- runs$values[blank_runs - 1L]
  inverse.rle(runs)
}
# Fill the blank cells of each descriptive column with the value above them.
# NOTE(review): Tipo_Aloxamento is removed from try_IGE_tourism by the earlier
# subset() call -- confirm the column exists at this point.
try_IGE_tourism$Tipo_Aloxamento <- fillTheBlanks(
  try_IGE_tourism$Tipo_Aloxamento
)
try_IGE_tourism$Area <- fillTheBlanks(try_IGE_tourism$Area)
# Codigo is converted to character explicitly before being filled.
try_IGE_tourism$Codigo <- fillTheBlanks(as.character(try_IGE_tourism$Codigo))
# Save the result as a semicolon-separated file in the working directory,
# with the row names written out as well.
write.table(
  x = try_IGE_tourism,
  file = "try_IGE_tourism.csv",
  sep = ";",
  row.names = TRUE
)
### FILTER OUT MIXED ROWS
# METHOD 1: dplyr
library(dplyr)
# Keep the rows whose Ano is one of the years 2003 to 2022.
# NOTE(review): the filtered result is not assigned to anything, so
# try_IGE_tourism itself is left unchanged -- confirm this is intended.
filter(
  try_IGE_tourism,
  try_IGE_tourism$Ano %in% as.character(2003:2022)
)
# METHOD 2: remove the rows in which every cell is blank
# (the earlier loop-based version was marked by the author as not working)
# `vec` holds the row numbers whose cells are all equal to "". NA cells never
# count as blank (na.rm = TRUE), matching the isTRUE() test of the loop that
# this replaces.
vec <- unname(which(
  rowSums(try_IGE_tourism == "", na.rm = TRUE) == ncol(try_IGE_tourism)
))
summary(try_IGE_tourism)
# Delete the blank rows by index. With an empty `vec`, try_IGE_tourism[-vec, ]
# would select zero rows and wipe the whole table, so in that case the data
# frame is kept as it is.
if (length(vec) > 0) {
  Mod_try_IGE_tourism <- try_IGE_tourism[-vec, , drop = FALSE]
} else {
  Mod_try_IGE_tourism <- try_IGE_tourism
}
print("Modified dataframe")
# Print the data frame built above (the code used to print `data_frame_mod`,
# an object that is never defined in this script).
print(Mod_try_IGE_tourism)