# Exploratory Analysis and Regression
library(dplyr)
library(tidyverse)
library(knitr)
library(readr)
library(lubridate)
library(ggplot2)
library(psych)
library(Hmisc)
library(corrplot)
library(PerformanceAnalytics)
library(car)
library(magrittr)
library(lmtest)
library(usethis)
# Load the 2011 OAC raw variables used for the analysis.
OAC_Raw_uVariables_2011 <- read.csv(
  "GY7702_2021-22_Assignment_2_v1-1_datapack/2011_OAC_Raw_uVariables-GY7702_2021-22_CW2.csv"
)

# Load the lookup data used to extract the Output Areas of the allocated LAD.
LAD_Allocation_data <- read.csv(
  "GY7702_2021-22_Assignment_2_v1-1_datapack/new.csv"
)
# Keep only the rows belonging to the allocated LAD (code E09000006).
LAD_Allocation_data <- LAD_Allocation_data %>%
  filter(LAD11CD == "E09000006")

# Join the OAC variables onto the allocated Output Areas (OA11CD matched to
# OA), then drop the geography lookup columns that the analysis does not use.
# NOTE(review): `LSO11ANM` looks like a typo for `LSOA11NM`; it is kept as
# written because the lookup file's column names are not visible here.
OwnLadd <- LAD_Allocation_data %>%
  left_join(
    OAC_Raw_uVariables_2011,
    by = c("OA11CD" = "OA")
  ) %>%
  select(
    -c(LSOA11CD, LSO11ANM, MSOA11CD, MSOA11NM, LAD11CD, LAD11NM, LAD11NMW)
  )
# Select the variables needed for the exploratory analysis.
explorData <- OwnLadd %>%
  select(u104:u115, u159:u167)
# Descriptive statistics for every selected variable, including skew and IQR.
# The call is namespace-qualified because Hmisc is attached after psych and
# masks `describe()`; `skew` and `IQR` are arguments of the psych version.
psych::describe(explorData, skew = TRUE, IQR = TRUE)
# Print the structure of the selected variables.
# `str()` only prints and returns NULL, so there is nothing to pass on to
# `knitr::kable()`; the printed output is used directly instead.
str(explorData)
# Set the base-graphics margins (bottom, left, top, right).
# This margin command should do the trick.
par(mar = c(5, 5, 3, 0))

# Histogram of every variable: reshape to long form (one row per
# variable/value pair) and draw one facet per variable with its own x scale.
explorData %>%
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot2::ggplot(ggplot2::aes(x = value)) +
  ggplot2::geom_histogram(binwidth = 5) +
  ggplot2::facet_wrap(~key, scales = "free_x")

# Histograms of all columns in a single call.
hist.data.frame(explorData)
# Normal Q-Q plot (points plus reference line) for each variable, labelled
# with the variable name on the x axis.
for (var_name in names(explorData)) {
  plt <- ggplot2::ggplot(
    explorData,
    ggplot2::aes(sample = .data[[var_name]])
  ) +
    ggplot2::stat_qq() +
    ggplot2::stat_qq_line() +
    ggplot2::xlab(var_name)
  # ggplot objects are not auto-printed inside a loop.
  print(plt)
}
# Normal Q-Q plot for each variable after applying the inverse hyperbolic
# sine transformation, labelled with the variable name on the x axis.
for (var_name in names(explorData)) {
  plt <- ggplot2::ggplot(
    explorData,
    # Apply the inverse hyperbolic sine function to the plotted sample.
    ggplot2::aes(sample = asinh(.data[[var_name]]))
  ) +
    ggplot2::stat_qq() +
    ggplot2::stat_qq_line() +
    ggplot2::xlab(var_name)
  # ggplot objects are not auto-printed inside a loop.
  print(plt)
}
# Descriptive and normality statistics for every selected variable,
# rounded to five decimal places.
stat_view <- round(pastecs::stat.desc(explorData, norm = TRUE), digits = 5)
print(stat_view)
# Kendall correlation matrix of the selected variables, drawn as a shaded
# upper triangle without the diagonal.
explorData %>%
  cor(method = "kendall") %>%
  corrplot(
    method = "shade",
    type = "upper",
    order = "AOE",
    diag = FALSE,
    tl.cex = 0.5,
    tl.col = "black"
  )
# Build the regression data set: the selected variables expressed as a
# percentage of the total population and renamed with a "perc_" prefix.
regression_data <- OwnLadd %>%
  select(Total_Population, u104:u115, u159:u167) %>%
  # Convert each column to represent a percentage of the population.
  mutate(
    across(
      u104:u167,
      function(x) {
        (x / Total_Population) * 100
      }
    )
  ) %>%
  # Rename the converted variables.
  rename_with(
    function(x) {
      paste("perc", x, sep = "_")
    },
    u104:u167
  )
# Descriptive and normality statistics for the regression data set,
# rounded to five decimal places.
stat_view2 <- regression_data %>%
  pastecs::stat.desc(norm = TRUE) %>%
  round(5)
print(stat_view2)
# Keep only the three percentage variables used in the regression.
forregression <- select(regression_data, perc_u106, perc_u112, perc_u162)

# Correlation test between perc_u106 and each of the other two variables.
with(forregression, cor.test(perc_u106, perc_u112))
with(forregression, cor.test(perc_u106, perc_u162))
# 2.31 Regression analysis: perc_u106 (dependent) ~ perc_u112 + perc_u162 (independent)
# Fit the linear model of perc_u106 on perc_u112 and perc_u162. The data
# frame is passed through `data =` so the fitted call records where the
# variables come from.
health_model <- lm(perc_u106 ~ perc_u112 + perc_u162, data = forregression)

# 2.32 Summary of the model
summary(health_model)
# Model diagnostics. The tests are called directly rather than through a
# pipe so that their printed "data:" label names the model instead of ".".

# Shapiro-Wilk normality test on the standardised residuals.
stats::shapiro.test(stats::rstandard(health_model))

# Breusch-Pagan test for heteroscedasticity.
lmtest::bptest(health_model)

# Durbin-Watson test for autocorrelation of the residuals.
lmtest::dwtest(health_model)

# Variance inflation factors for the predictors.
car::vif(health_model)
# Diagnostic plots for the fitted model.
plot(health_model, which = 1) # residuals vs fitted
plot(health_model, which = 2) # normal Q-Q
plot(health_model, which = 5) # residuals vs leverage