From 299faa3fdf84292ccd84a02f803f4a3d9848a240 Mon Sep 17 00:00:00 2001 From: Sixiang Hu Date: Tue, 22 Sep 2015 20:47:18 +0100 Subject: [PATCH] Add resiPlot function --- NAMESPACE | 5 +++++ R/DataSummary.R | 18 +++++++++--------- R/PopMiss.r | 26 ++++++++------------------ R/{resiPlot => resiPlot.r} | 1 - README.md | 3 +++ man/resiPlot.Rd | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 57 insertions(+), 28 deletions(-) rename R/{resiPlot => resiPlot.r} (99%) create mode 100644 man/resiPlot.Rd diff --git a/NAMESPACE b/NAMESPACE index c4e790f..a595392 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,15 +27,20 @@ export(DetMiss) export(PopMiss) export(dataPlot) export(modelPlot) +export(resiPlot) export(tree2data) importFrom(data.table,":=") importFrom(data.table,as.data.table) +importFrom(data.table,data.table) importFrom(data.table,setkey) importFrom(ggplot2,aes) importFrom(ggplot2,element_text) importFrom(ggplot2,ggplot) importFrom(googleVis,gvisComboChart) +importFrom(grid,grid.newpage) +importFrom(grid,pushViewport) importFrom(grid,unit.pmax) +importFrom(grid,viewport) importFrom(gridExtra,grid.arrange) importFrom(reshape2,melt) importFrom(scales,date_format) diff --git a/R/DataSummary.R b/R/DataSummary.R index 5aae232..0cf61c1 100644 --- a/R/DataSummary.R +++ b/R/DataSummary.R @@ -9,7 +9,7 @@ #' @details This function provides a data summary including min, max, number of unique values and number if missing values. #' The min and max will ignore missing value in the data. The input should be a `data.frame`. #' @author Sixiang Hu -#' @importFrom data.table as.data.table := +#' @importFrom data.table data.table := #' @export DataSummary #' @examples #' DataSummary(cars) @@ -34,7 +34,7 @@ DataSummary.data.frame <- function(data,wt=NULL,sparkline=FALSE){ dsMean <- sapply(data,function(x){ if(is.numeric(x) || is.integer(x)) as.character(round(weighted.mean(x,weight,na.rm = TRUE),6)) else { - x.dt<-data.table(x,weight) + x.dt<-data.table::data.table(x,weight) dsTemp <- as.character(x.dt[,sum(weight),by=x][order(-V1)][1,list(x)]) if(is.null(dsTemp) || is.na(dsTemp)) as.character(x.dt[,sum(weight),by=x][order(-V1)][2,list(x)]) else dsTemp @@ -44,7 +44,7 @@ DataSummary.data.frame <- function(data,wt=NULL,sparkline=FALSE){ dsMax <- sapply(data,function(x){ if(is.numeric(x) || is.integer(x)) as.character(round(max(x,na.rm = TRUE),6)) else { - x.dt<-data.table(x,weight) + x.dt<-data.table::data.table(x,weight) dsTemp <- as.character(x.dt[,sum(weight),by=x][order(-V1)][1,list(x)]) if(is.null(dsTemp) || is.na(dsTemp)) as.character(x.dt[,sum(weight),by=x][order(-V1)][2,list(x)]) else dsTemp @@ -54,7 +54,7 @@ DataSummary.data.frame <- function(data,wt=NULL,sparkline=FALSE){ dsMin <- sapply(data,function(x){ if(is.numeric(x) || is.integer(x)) as.character(round(min(x,na.rm = TRUE),6)) else { - x.dt<-data.table(x,weight) + x.dt<-data.table::data.table(x,weight) dsTemp <- as.character(x.dt[,sum(weight),by=x][order(V1)][1,list(x)]) if(is.null(dsTemp) || is.na(dsTemp)) as.character(x.dt[,sum(weight),by=x][order(V1)][2,list(x)]) else dsTemp @@ -63,7 +63,7 @@ DataSummary.data.frame <- function(data,wt=NULL,sparkline=FALSE){ if (sparkline) { dsStr <- sapply(data,function(x) { - freq <- data.table(x)[,.N,by=x][order(x)][,f:=N/sum(N)] + freq <- data.table::data.table(x)[,.N,by=x][order(x)][,f:=N/sum(N)] if (dim(freq)[1]>=100) return("More than 100 levels") else return(paste0(round(freq[,f],3),collapse= ",")) }) @@ -104,7 +104,7 @@ DataSummary.data.table <- function(data,wt=NULL,sparkline=FALSE){ dsMean <- data[,sapply(.SD,function(x){ if(is.numeric(x) || is.integer(x)) as.character(round(weighted.mean(x,weight,na.rm = TRUE),6)) else { - x.dt<- data.table(x,weight) + x.dt<- data.table::data.table(x,weight) dsTemp <- as.character(x.dt[,sum(weight),by=x][order(-V1)][1,list(x)]) if(is.null(dsTemp) || is.na(dsTemp)) as.character(x.dt[,sum(weight),by=x][order(-V1)][2,list(x)]) else dsTemp @@ -114,7 +114,7 @@ DataSummary.data.table <- function(data,wt=NULL,sparkline=FALSE){ dsMax <- data[,sapply(.SD,function(x){ if(is.numeric(x) || is.integer(x)) as.character(round(max(x,na.rm = TRUE),6)) else { - x.dt<-data.table(x,weight) + x.dt<-data.table::data.table(x,weight) dsTemp <- as.character(x.dt[,sum(weight),by=x][order(-V1)][1,list(x)]) if(is.null(dsTemp) || is.na(dsTemp)) as.character(x.dt[,sum(weight),by=x][order(-V1)][2,list(x)]) else dsTemp @@ -124,7 +124,7 @@ DataSummary.data.table <- function(data,wt=NULL,sparkline=FALSE){ dsMin <- data[,sapply(data,function(x){ if(is.numeric(x) || is.integer(x)) as.character(round(min(x,na.rm = TRUE),6)) else { - x.dt<-data.table(x,weight) + x.dt<-data.table::data.table(x,weight) dsTemp <- as.character(x.dt[,sum(weight),by=x][order(V1)][1,list(x)]) if(is.null(dsTemp) || is.na(dsTemp)) as.character(x.dt[,sum(weight),by=x][order(V1)][2,list(x)]) else dsTemp @@ -133,7 +133,7 @@ DataSummary.data.table <- function(data,wt=NULL,sparkline=FALSE){ if (sparkline) { dsStr <- data[,sapply(data,function(x) { - freq <- data.table(x)[,.N,by=x][order(x)][,f:=N/sum(N)] + freq <- data.table::data.table(x)[,.N,by=x][order(x)][,f:=N/sum(N)] if (dim(freq)[1]>=100) return("More than 100 levels") else return(paste0(round(freq[,f],3),collapse= ",")) })] diff --git a/R/PopMiss.r b/R/PopMiss.r index ffbaff7..81bce0a 100644 --- a/R/PopMiss.r +++ b/R/PopMiss.r @@ -18,22 +18,12 @@ PopMiss <- function(data,na.treatment=c("mean.or.mode","delete","replace"),repla } #' @export -PopMiss.factor<-function(data,na.treatment,replace=NULL){ - num_miss <- length(which(is.na(data))) - if(isTRUE(na.treatment=="delete")) { - return(na.omit(data)) - } - else if(isTRUE(na.treatment=="mean.or.mode")){ - replace <- names(table(data))[order(table(data),decreasing=TRUE)[1]] - } - - if (is.null(replace)) stop("NULL value used for populating NAs. Chosen \"replace\" method, but didn't provide a value for replace parameter?") - data[is.na(data)] <- replace - return(data) +PopMiss.factor<-function(data,na.treatment,replace){ + PopMiss.character(data,na.treatment,replace) } #' @export -PopMiss.character<-function(data,na.treatment,replace=NULL){ +PopMiss.character<-function(data,na.treatment,replace){ num_miss <- sum(is.na(data)) if(isTRUE(na.treatment=="delete")) { return(data[!is.na(data)]) @@ -48,7 +38,7 @@ PopMiss.character<-function(data,na.treatment,replace=NULL){ } #' @export -PopMiss.integer<-function(data,na.treatment,replace=NULL){ +PopMiss.integer<-function(data,na.treatment,replace){ num_miss <- length(which(is.na(data))) if(isTRUE(na.treatment=="delete")) { return(na.omit(data)) @@ -62,7 +52,7 @@ PopMiss.integer<-function(data,na.treatment,replace=NULL){ } #' @export -PopMiss.numeric<-function(data,na.treatment,replace=NULL){ +PopMiss.numeric<-function(data,na.treatment,replace){ num_miss <- length(which(is.na(data))) if(isTRUE(na.treatment=="delete")) { cat(paste(num_miss," rows have been delete.\n\n")) @@ -79,7 +69,7 @@ PopMiss.numeric<-function(data,na.treatment,replace=NULL){ } #' @export -PopMiss.data.frame<-function(data,na.treatment,replace=NULL){ +PopMiss.data.frame<-function(data,na.treatment,replace){ num_miss <- which(sapply(data,function(x) any(is.na(x)))) if(isTRUE(na.treatment=="delete")) { return(na.omit(data)) @@ -111,7 +101,7 @@ PopMiss.data.frame<-function(data,na.treatment,replace=NULL){ } #' @export -PopMiss.data.table <-function(data,na.treatment,replace=NULL){ +PopMiss.data.table <-function(data,na.treatment,replace){ num_miss <- which(data[,sapply(.SD,function(x) any(is.na(x)))]) if(isTRUE(na.treatment=="delete")) { return(na.omit(data)) @@ -144,7 +134,7 @@ PopMiss.data.table <-function(data,na.treatment,replace=NULL){ } #' @export -PopMiss.matrix <- function(data,na.treatment,replace=NULL){ +PopMiss.matrix <- function(data,na.treatment,replace){ if(identical(na.treatment,"delete")) warning("na.treatment is \"delete\" and 'data' is a matrix. This only works correctly if whole rows are missing") matrix(PopMiss(as.vector(data),na.treatment,replace),ncol=ncol(data)) } \ No newline at end of file diff --git a/R/resiPlot b/R/resiPlot.r similarity index 99% rename from R/resiPlot rename to R/resiPlot.r index 9286597..6288764 100644 --- a/R/resiPlot +++ b/R/resiPlot.r @@ -20,7 +20,6 @@ #' @importFrom ggplot2 aes ggplot #' @importFrom grid grid.newpage viewport pushViewport #' @export resiPlot -#' @examples #' resiPlot <- function(act,pred,bucket=20){ diff --git a/README.md b/README.md index fae02d0..79456a4 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,9 @@ iris.mod <- gbm(Species ~ ., distribution="multinomial", data=iris, n.trees=2000 tree_data <- tree2data(iris.mod,1) sankeyNetwork(tree_data[[1]],tree_data[[2]],Source="src",Target="tar",Value="value",NodeID="name") + +* `resiPlot`: This function assess the residual using given actual and predicted values. + ``` ### Getting Started diff --git a/man/resiPlot.Rd b/man/resiPlot.Rd new file mode 100644 index 0000000..f5067a5 --- /dev/null +++ b/man/resiPlot.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/resiPlot.r +\name{resiPlot} +\alias{resiPlot} +\title{resiPlot} +\usage{ +resiPlot(act,pred,bucket=20) +} +\arguments{ +\item{act}{numerical vector for actual observation.} + +\item{pred}{numerical vector for model preidctions. It must have the same length as act.} + +\item{bucket}{Integer. It specifies the number of bucket of the AvsE plot.} +} +\description{ +This function assess the residual using given actual and predicted values. +} +\details{ +Currently, the residual in this function is defined as: Residual = actual - predicted. I don't use `resi` or `reisdual` function +from `stats` package because this function will be used for much wider model assess (e.g. `randomFoest`, `gbm`), but 2 functions +mentioned about can only applied to `glm` and `lm`. May code in some other residual function. + +This function will give 3 plots: +Residual vs Prediction: This plot is used to assess the baise and heterogeneity +Residual histogram: Check the residual distribution. +AvsE plot: The aggregated (by provided number of bucket) average actual and predicted value, with a diagnal line for comparison. +} +\author{ +Sixiang Hu +} +