diff --git a/.Rhistory b/.Rhistory deleted file mode 100644 index 1492a7d..0000000 --- a/.Rhistory +++ /dev/null @@ -1,512 +0,0 @@ -#' @title find the closest index -#' -#' @description -#' This function finds the closest index to a given value in a vector. -#' -#' @param x a vector -#' @param y a value -#' -#' @return the index of the closest value in the vector -#' -#' @examples -#' find_max_index(c(1, 2, 3, 4, 5), 3.5) -#' @export -find_index <- function(x, y) { -index <- which(x == y) -if (length(index) >= 1) { -index_name <- names(x)[index] -value<-get_quantily_value(index_name) -return(value) -} -else { -closest_index <- which(abs(x - y) == min(abs(x-y))) -closest_index_name <- names(x)[closest_index] -value <- get_quantily_value(closest_index_name) -return(value) -} -} -#' @title find the right rank -#' -#' @description -#' This function finds the right rank of a response value in a quantile random forest. -#' -#' @param response a vector of response values -#' @param outMatrix a matrix of out values -#' @param median_outMatrix a vector of median out values -#' @param rmse_ a vector of rmse values -#' -#' @return a vector of ranks -#' -get_right_rank <- function(response,outMatrix,median_outMatrix,rmse_){ -rank_value <-c() -for (i in 1:length(response)){ -rank_<- find_index(outMatrix[i,],response[i]) -if (length(rank_)>1){ -diff = response[i] -median_outMatrix[i] -if (abs(diff)>3*rmse_ & diff<0 ){ -min_value <- min(rank_) -rank_value<-c(rank_value,min_value) -} else if (abs(diff)>3*rmse_ & diff>0) { -max_value <- max(rank_) -rank_value<-c(rank_value,max_value) -}else { -mean_value <- mean(rank_) -rank_value<-c(rank_value,mean_value) -} -}else { -rank_value<-c(rank_value,rank_) -} -} -return(rank_value) -} -find_quantile_position <- function(x, data) { -ecdf_data <- ecdf(data) -return(ecdf_data(x)) -} -#' @title find outliers -#' -#' @description -#' This function finds outliers in a dataset using quantile random forests. -#' -#' @param data a data frame -#' @param quantiles_type 'all':seq(from = 0.001, to = 0.999, by = 0.001),'other':c(threshold,0.5,1-threshold) -#' @param threshold a threshold for outlier detection -#' @param verbose a boolean value indicating whether to print verbose output -#' @param ... additional arguments passed to the ranger function -#' -#' @return a data frame of outliers -#' -#' @examples -#' outqrf(iris) -#' @export -outqrf <-function(data, -quantiles_type=1000, -threshold =0.025, -verbose = 1, -...){ -data <- as.data.frame(data) -numeric_features <- names(data)[sapply(data,is.numeric)] -threshold_low<-threshold -threshold_high<-1-threshold -rmse <-c() -oob.error <-c() -r.squared <-c() -outliers <- data.frame() -outMatrixs <- list() -if(quantiles_type == 1000){ -quantiles <- seq(0.001, 0.999,0.001) -}else if(quantiles_type == 400){ -quantiles <- c(seq(0.0025,0.9975,0.0025)) -}else{ -quantiles <- c(seq(0.025,0.9975,0.025)) -} -if (verbose) { -cat("\nOutlier identification by quantiles random forests\n") -cat("\n Variables to check:\t\t") -cat(numeric_features, sep = ", ") -cat("\n Variables used to check:\t") -cat(names(data), sep = ", ") -cat("\n\n Checking: ") -} -for (v in numeric_features){ -if (verbose) { -cat(v, " ") -} -covariables <- setdiff(names(data), v) -qrf <- ranger::ranger( -formula = stats::reformulate(covariables, response = v), -data = data, -quantreg = TRUE, -...) -pred <- predict(qrf, data[,covariables], type = "quantiles",quantiles=quantiles) -oob.error <- c(oob.error,qrf$prediction.error) -r.squared <- c(qrf$r.squared,r.squared) -outMatrix <- pred$predictions -outMatrixs[[v]]<-outMatrix -median_outMatrix <- outMatrix[,(length(quantiles)+1)/2] -response<- data[,v] -diffs = response - median_outMatrix -rmse_ <- sqrt(sum(diffs*diffs)/(length(diffs)-1)) -rmse <- c(rmse,rmse_) -rank_value <- get_right_rank(response,outMatrix,median_outMatrix,rmse_) -#rank_value <- find_quantile_position(response,outMatrix) -outlier <- data.frame(row = as.numeric(row.names(data)),col = v,observed = response, predicted = median_outMatrix,rank = rank_value) -outlier<- outlier|>dplyr::filter(rank<=threshold_low| rank>=threshold_high) -outliers <- rbind(outliers,outlier) -} -names(rmse) <- numeric_features -names(oob.error) <- numeric_features -names(r.squared) <- numeric_features -list( -Data = data, -outliers = outliers, -n_outliers = table(outliers$col), -threshold = threshold, -rmse = rmse, -oob.error = oob.error, -r.squared = r.squared, -outMatrixs =outMatrixs -) -} -qrf = outqrf(data,quantiles_type=400) -qrf$outliers -qrf$n_outliers -qrf$oob.error -qrf$r.squared -library(outqrf) -out <- outForest(data) -out$outliers -qrf = outqrf(data,quantiles_type=40) -#' @title get numberic value from string -#' -#' @description -#' This function extracts the numeric value from a string. -#' -#' @param name a string -#' -#' @return a numeric value -#' -#' @examples -#' get_quantily_value("quantiles = 0.001") -#' @export -get_quantily_value <- function(name){ -str<- gsub("[^0-9.]", "", name) -value <- as.numeric(str) -return(value) -} -#' @title find the closest index -#' -#' @description -#' This function finds the closest index to a given value in a vector. -#' -#' @param x a vector -#' @param y a value -#' -#' @return the index of the closest value in the vector -#' -#' @examples -#' find_max_index(c(1, 2, 3, 4, 5), 3.5) -#' @export -find_index <- function(x, y) { -index <- which(x == y) -if (length(index) >= 1) { -index_name <- names(x)[index] -value<-get_quantily_value(index_name) -return(value) -} -else { -closest_index <- which(abs(x - y) == min(abs(x-y))) -closest_index_name <- names(x)[closest_index] -value <- get_quantily_value(closest_index_name) -return(value) -} -} -#' @title find the right rank -#' -#' @description -#' This function finds the right rank of a response value in a quantile random forest. -#' -#' @param response a vector of response values -#' @param outMatrix a matrix of out values -#' @param median_outMatrix a vector of median out values -#' @param rmse_ a vector of rmse values -#' -#' @return a vector of ranks -#' -get_right_rank <- function(response,outMatrix,median_outMatrix,rmse_){ -rank_value <-c() -for (i in 1:length(response)){ -rank_<- find_index(outMatrix[i,],response[i]) -if (length(rank_)>1){ -diff = response[i] -median_outMatrix[i] -if (abs(diff)>3*rmse_ & diff<0 ){ -min_value <- min(rank_) -rank_value<-c(rank_value,min_value) -} else if (abs(diff)>3*rmse_ & diff>0) { -max_value <- max(rank_) -rank_value<-c(rank_value,max_value) -}else { -mean_value <- mean(rank_) -rank_value<-c(rank_value,mean_value) -} -}else { -rank_value<-c(rank_value,rank_) -} -} -return(rank_value) -} -find_quantile_position <- function(x, data) { -ecdf_data <- ecdf(data) -return(ecdf_data(x)) -} -#' @title find outliers -#' -#' @description -#' This function finds outliers in a dataset using quantile random forests. -#' -#' @param data a data frame -#' @param quantiles_type 'all':seq(from = 0.001, to = 0.999, by = 0.001),'other':c(threshold,0.5,1-threshold) -#' @param threshold a threshold for outlier detection -#' @param verbose a boolean value indicating whether to print verbose output -#' @param ... additional arguments passed to the ranger function -#' -#' @return a data frame of outliers -#' -#' @examples -#' outqrf(iris) -#' @export -outqrf <-function(data, -quantiles_type=1000, -threshold =0.025, -verbose = 1, -...){ -data <- as.data.frame(data) -numeric_features <- names(data)[sapply(data,is.numeric)] -threshold_low<-threshold -threshold_high<-1-threshold -rmse <-c() -oob.error <-c() -r.squared <-c() -outliers <- data.frame() -outMatrixs <- list() -if(quantiles_type == 1000){ -quantiles <- seq(0.001, 0.999,0.001) -}else if(quantiles_type == 400){ -quantiles <- c(seq(0.0025,0.9975,0.0025)) -}else{ -quantiles <- c(seq(0.025,0.9975,0.025)) -} -if (verbose) { -cat("\nOutlier identification by quantiles random forests\n") -cat("\n Variables to check:\t\t") -cat(numeric_features, sep = ", ") -cat("\n Variables used to check:\t") -cat(names(data), sep = ", ") -cat("\n\n Checking: ") -} -for (v in numeric_features){ -if (verbose) { -cat(v, " ") -} -covariables <- setdiff(names(data), v) -qrf <- ranger::ranger( -formula = stats::reformulate(covariables, response = v), -data = data, -quantreg = TRUE, -...) -pred <- predict(qrf, data[,covariables], type = "quantiles",quantiles=quantiles) -oob.error <- c(oob.error,qrf$prediction.error) -r.squared <- c(qrf$r.squared,r.squared) -outMatrix <- pred$predictions -outMatrixs[[v]]<-outMatrix -median_outMatrix <- outMatrix[,(length(quantiles)+1)/2] -response<- data[,v] -diffs = response - median_outMatrix -rmse_ <- sqrt(sum(diffs*diffs)/(length(diffs)-1)) -rmse <- c(rmse,rmse_) -#rank_value <- get_right_rank(response,outMatrix,median_outMatrix,rmse_) -rank_value <- find_quantile_position(response,outMatrix) -outlier <- data.frame(row = as.numeric(row.names(data)),col = v,observed = response, predicted = median_outMatrix,rank = rank_value) -outlier<- outlier|>dplyr::filter(rank<=threshold_low| rank>=threshold_high) -outliers <- rbind(outliers,outlier) -} -names(rmse) <- numeric_features -names(oob.error) <- numeric_features -names(r.squared) <- numeric_features -list( -Data = data, -outliers = outliers, -n_outliers = table(outliers$col), -threshold = threshold, -rmse = rmse, -oob.error = oob.error, -r.squared = r.squared, -outMatrixs =outMatrixs -) -} -qrf = outqrf(data,quantiles_type=400) -qrf$n_outliers -qrf$outliers -setwd("E:/github/outqrf") -system("R CMD build outqrf") -system("R CMD build outqrf") -renv::status() -setwd("E:/github") -system("R CMD build outqrf") -setwd("E:/github/outqrf") -system("R CMD check --as-cran") -setwd("E:/github") -system("R CMD check --as-cran") -system("R CMD check --as-cran outqrf") -system("R CMD check --as-cran outqrf") -system("R CMD check --as-cran outqrf") -system("R CMD check --as-cran outqrf") -setwd("E:/github") -system("R CMD check --as-cran outqrf") -setwd("E:/github/outqrf") -system("R CMD check --as-cran outqrf") -setwd("E:/github") -setwd("E:/github/outqrf") -setwd("E:/github") -system("R CMD check --as-cran outqrf") -system("R CMD check --as-cran outqrf") -system("R CMD check --as-cran outqrf") -#' @title Plots outqrf -#' @description -#' This function can plot paired boxplot of an "outqrf" object. -#' It helps us to better observe the relationship between the original and predicted values -#' @param qrf An object of class "outqrf". -#' @returns A ggplot2 object -#' @export -#' @examples -#' irisWithOutliers <- generateOutliers(iris, seed = 2024) -#' qrf <- outqrf(irisWithOutliers) -#' plot(qrf) -plot.outqrf<- function(qrf) { -result_df <- data.frame() -data <- qrf$Data -for (i in seq_along(qrf$outMatrixs)) { -temp_df <- as.data.frame(qrf$outMatrixs[[i]][,qrf$quantiles_type/2]) -if (nrow(result_df) == 0) { -result_df <- temp_df -} else { -result_df <- cbind(result_df, temp_df) -} -} -names(result_df) = names(qrf$outMatrixs) -result_df <- dplyr::mutate(result_df,tag = "predicted") -numeric_features <- names(data)[sapply(data,is.numeric)] -data <- data[numeric_features] -data <- dplyr::mutate(data,tag = "observed") -plot_in <-rbind(result_df,data) -plot_in_longer<- plot_in|>tidyr::pivot_longer(!tag,names_to ="features",values_to ="value" ) -p<- ggpubr::ggpaired(plot_in_longer, x="tag", y="value", -fill="tag", palette = "jco", -line.color = "grey", line.size =0.8, width = 0.4,short.panel.labs = FALSE)+ -ggpubr::stat_compare_means(label = "p.format", paired = TRUE)+ggplot2::theme(legend.position = "none")+ggplot2::facet_wrap(~features, scales = "free") -return(p) -} -irisWithOutliers <- generateOutliers(iris, seed = 2024) -library(outqrf) -#' @title Plots outqrf -#' @description -#' This function can plot paired boxplot of an "outqrf" object. -#' It helps us to better observe the relationship between the original and predicted values -#' @param qrf An object of class "outqrf". -#' @returns A ggplot2 object -#' @export -#' @examples -#' irisWithOutliers <- generateOutliers(iris, seed = 2024) -#' qrf <- outqrf(irisWithOutliers) -#' plot(qrf) -plot.outqrf<- function(qrf) { -result_df <- data.frame() -data <- qrf$Data -for (i in seq_along(qrf$outMatrixs)) { -temp_df <- as.data.frame(qrf$outMatrixs[[i]][,qrf$quantiles_type/2]) -if (nrow(result_df) == 0) { -result_df <- temp_df -} else { -result_df <- cbind(result_df, temp_df) -} -} -names(result_df) = names(qrf$outMatrixs) -result_df <- dplyr::mutate(result_df,tag = "predicted") -numeric_features <- names(data)[sapply(data,is.numeric)] -data <- data[numeric_features] -data <- dplyr::mutate(data,tag = "observed") -plot_in <-rbind(result_df,data) -plot_in_longer<- plot_in|>tidyr::pivot_longer(!tag,names_to ="features",values_to ="value" ) -p<- ggpubr::ggpaired(plot_in_longer, x="tag", y="value", -fill="tag", palette = "jco", -line.color = "grey", line.size =0.8, width = 0.4,short.panel.labs = FALSE)+ -ggpubr::stat_compare_means(label = "p.format", paired = TRUE)+ggplot2::theme(legend.position = "none")+ggplot2::facet_wrap(~features, scales = "free") -return(p) -} -irisWithOutliers <- generateOutliers(iris, seed = 2024) -qrf <- outqrf(irisWithOutliers) -plot(qrf) -#' @title Plots outqrf -#' @description -#' This function can plot paired boxplot of an "outqrf" object. -#' It helps us to better observe the relationship between the original and predicted values -#' @param qrf An object of class "outqrf". -#' @returns A ggplot2 object -#' @export -#' @examples -#' irisWithOutliers <- generateOutliers(iris, seed = 2024) -#' qrf <- outqrf(irisWithOutliers) -#' plot(qrf) -plot.outqrf<- function(x) { -result_df <- data.frame() -data <- x$Data -for (i in seq_along(x$outMatrixs)) { -temp_df <- as.data.frame(x$outMatrixs[[i]][,x$quantiles_type/2]) -if (nrow(result_df) == 0) { -result_df <- temp_df -} else { -result_df <- cbind(result_df, temp_df) -} -} -names(result_df) = names(x$outMatrixs) -result_df <- dplyr::mutate(result_df,tag = "predicted") -numeric_features <- names(data)[sapply(data,is.numeric)] -data <- data[numeric_features] -data <- dplyr::mutate(data,tag = "observed") -plot_in <-rbind(result_df,data) -plot_in_longer<- plot_in|>tidyr::pivot_longer(!tag,names_to ="features",values_to ="value" ) -p<- ggpubr::ggpaired(plot_in_longer, x="tag", y="value", -fill="tag", palette = "jco", -line.color = "grey", line.size =0.8, width = 0.4,short.panel.labs = FALSE)+ -ggpubr::stat_compare_means(label = "p.format", paired = TRUE)+ggplot2::theme(legend.position = "none")+ggplot2::facet_wrap(~features, scales = "free") -return(p) -} -plot(qrf) -importFrom("stats", "predict", "rnorm", "sd") -S3method -system("R CMD check --as-cran outqrf") -#' @title Plots outqrf -#' @description -#' This function can plot paired boxplot of an "outqrf" object. -#' It helps us to better observe the relationship between the original and predicted values -#' @param qrf An object of class "outqrf". -#' @returns A ggplot2 object -#' @export -#' @examples -#' irisWithOutliers <- generateOutliers(iris, seed = 2024) -#' qrf <- outqrf(irisWithOutliers) -#' plot(qrf) -plot<- function(x) { -result_df <- data.frame() -data <- x$Data -for (i in seq_along(x$outMatrixs)) { -temp_df <- as.data.frame(x$outMatrixs[[i]][,x$quantiles_type/2]) -if (nrow(result_df) == 0) { -result_df <- temp_df -} else { -result_df <- cbind(result_df, temp_df) -} -} -names(result_df) = names(x$outMatrixs) -result_df <- dplyr::mutate(result_df,tag = "predicted") -numeric_features <- names(data)[sapply(data,is.numeric)] -data <- data[numeric_features] -data <- dplyr::mutate(data,tag = "observed") -plot_in <-rbind(result_df,data) -plot_in_longer<- plot_in|>tidyr::pivot_longer(!tag,names_to ="features",values_to ="value" ) -p<- ggpubr::ggpaired(plot_in_longer, x="tag", y="value", -fill="tag", palette = "jco", -line.color = "grey", line.size =0.8, width = 0.4,short.panel.labs = FALSE)+ -ggpubr::stat_compare_means(label = "p.format", paired = TRUE)+ggplot2::theme(legend.position = "none")+ggplot2::facet_wrap(~features, scales = "free") -return(p) -} -plot(qrf) -devtools::document() -setwd("E:/github/outqrf") -devtools::document() -library(outqrf) -irisWithOutliers <- generateOutliers(iris, p = 0.05,seed =2024) -qrf <- outqrf(irisWithOutliers,quantiles_type=400) -plot(qrf) -devtools::check() -pkgdown::build_site()