diff --git a/DESCRIPTION b/DESCRIPTION index 2313af3..88c8485 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: DataMan Type: Package -Title: This is an R package for data and model analysis +Title: a R package for data and model analysis Version: 0.4.1 Date: 2015-12-04 Author: Sixiang Hu @@ -15,6 +15,7 @@ Imports: Rcpp (>= 0.11.6), dplyr, rbokeh, + plotly, ROCR, gbm, randomForest diff --git a/NAMESPACE b/NAMESPACE index 043ba46..6f97029 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,4 @@ # Generated by roxygen2: do not edit by hand -useDynLib(DataMan) -importFrom(Rcpp, evalCpp) -exportPattern("^[[:alpha:]]+") S3method(CramersV,data.frame) S3method(CramersV,default) @@ -27,6 +24,7 @@ export(DataSummary) export(DetMiss) export(PopMiss) export(dataPlot) +export(interPlot) export(liftPlot) export(modelPlot) export(resiPlot) @@ -34,15 +32,20 @@ export(rocPlot) export(tree2data) importFrom(ROCR,performance) importFrom(ROCR,prediction) +importFrom(Rcpp,evalCpp) importFrom(data.table,":=") importFrom(data.table,as.data.table) importFrom(data.table,data.table) +importFrom(data.table,dcast) +importFrom(data.table,key) importFrom(data.table,setkey) importFrom(dplyr,"%>%") importFrom(dplyr,group_by) importFrom(dplyr,left_join) importFrom(dplyr,summarise) importFrom(gbm,pretty.gbm.tree) +importFrom(plotly,layout) +importFrom(plotly,plot_ly) importFrom(randomForest,getTree) importFrom(rbokeh,figure) importFrom(rbokeh,grid_plot) @@ -52,3 +55,4 @@ importFrom(rbokeh,ly_hist) importFrom(rbokeh,ly_lines) importFrom(rbokeh,ly_points) importFrom(rbokeh,y_axis) +useDynLib(DataMan) diff --git a/NEWS.Rmd b/NEWS.md similarity index 87% rename from NEWS.Rmd rename to NEWS.md index 02c43b1..ec330ee 100644 --- a/NEWS.Rmd +++ b/NEWS.md @@ -2,6 +2,8 @@ DataMan 0.4.1 ------------------------------------------------------------- NEW FEATURES +* Add `interPlot` that gives 3D surface plot utilising `plotly` package. + * Add `liftPlot` that compares different model predictions. * Add `rocPlot` that assesses binary classification under `ROCR` package. It plots ROC curves with one or more predictions. AUC will be calculated and showed in legend. @@ -17,3 +19,5 @@ BUG FIXES AND MINOR IMPROVEMENTS * Change the glyphs of points in `modelPlot` * Change `.travis.yaml` to install `rbokeh` from github. + +* Improve the `PopMiss`. diff --git a/R/DataMan.R b/R/DataMan.R index e69de29..cd54e90 100644 --- a/R/DataMan.R +++ b/R/DataMan.R @@ -0,0 +1,8 @@ +#' DataMan a data clearning, analysis and statistical modeling assessing tool +#' +#' @description a data clearning, analysis and statistical modeling assessing tool. +#' @docType package +#' @name DataMan +#' @useDynLib DataMan +#' @importFrom Rcpp evalCpp +NULL \ No newline at end of file diff --git a/R/PopMiss.r b/R/PopMiss.r index 870c453..1e97c8e 100644 --- a/R/PopMiss.r +++ b/R/PopMiss.r @@ -2,7 +2,7 @@ #' #' @description This function allows you to populate missing values. #' @usage PopMiss(data,na.treatment=c("mean.or.mode","delete","replace"),replace=NULL) -#' @param data This could be data frame, matrix or a vector. +#' @param data This could be data frame,data.table, matrix or a vector. #' @param na.treatment One of "mean.or.mode", "delete", or "replace", to specify the method to populate missing values. #' @param replace A single value used for populating ALL missing value. May not be useful for a data frame with missing values in different type of variable. #' @seealso DetMiss @@ -13,7 +13,13 @@ #' PopMiss(a,"mean.or.mode") PopMiss <- function(data,na.treatment=c("mean.or.mode","delete","replace"),replace=NULL){ + if (is.null(na.treatment)) na.treatment <- "mean.or.mode" + na.treatment <- match.arg(na.treatment) + + if ((na.treatment %in% c("replace","r")) && is.null(replace)) + stop("NULL value used for populating NAs. Chosen \"replace\" method, but didn't provide a value for replace parameter?") + UseMethod("PopMiss",data) } @@ -27,14 +33,13 @@ PopMiss.factor<-function(data,na.treatment,replace){ #' @rdname PopMiss PopMiss.character<-function(data,na.treatment,replace){ num_miss <- sum(is.na(data)) - if(isTRUE(na.treatment=="delete")) { + if(identical(na.treatment,"delete")) { return(data[!is.na(data)]) } - else if(isTRUE(na.treatment=="mean.or.mode")){ + else if(identical(na.treatment,"mean.or.mode")){ replace <- names(table(data))[order(table(data),decreasing=TRUE)[1]] } - if (is.null(replace)) stop("NULL value used for populating NAs. Chosen \"replace\" method, but didn't provide a value for replace parameter?") data[is.na(data)] <- replace return(data) } @@ -43,13 +48,12 @@ PopMiss.character<-function(data,na.treatment,replace){ #' @rdname PopMiss PopMiss.integer<-function(data,na.treatment,replace){ num_miss <- length(which(is.na(data))) - if(isTRUE(na.treatment=="delete")) { + if(identical(na.treatment,"delete")) { return(na.omit(data)) } - else if(isTRUE(na.treatment=="mean.or.mode")){ + else if(identical(na.treatment,"mean.or.mode")){ replace <- floor(mean(data,na.rm=TRUE)) } - if (is.null(replace)) stop("NULL value used for populating NAs. Chosen \"replace\" method, but didn't provide a value for replace parameter?") data[is.na(data)] <- replace return(data) } @@ -64,17 +68,13 @@ PopMiss.Date<-function(data,na.treatment,replace){ #' @rdname PopMiss PopMiss.numeric<-function(data,na.treatment,replace){ num_miss <- length(which(is.na(data))) - if(isTRUE(na.treatment=="delete")) { - cat(paste(num_miss," rows have been delete.\n\n")) + if(identical(na.treatment,"delete")) { return(na.omit(data)) } - else if(isTRUE(na.treatment=="mean.or.mode")){ + else if(identical(na.treatment,"mean.or.mode")){ replace <- mean(data,na.rm=TRUE) } - if (is.null(replace)) stop("NULL value used for populating NAs. Chosen \"replace\" method, but didn't provide a value for replace parameter?") - data[is.na(data)] <- replace - cat(paste(num_miss," row have been populated by value:",replace,".\n\n")) return(data) } @@ -82,31 +82,31 @@ PopMiss.numeric<-function(data,na.treatment,replace){ #' @rdname PopMiss PopMiss.data.frame<-function(data,na.treatment,replace){ num_miss <- which(sapply(data,function(x) any(is.na(x)))) - if(isTRUE(na.treatment=="delete")) { + if(identical(na.treatment,"delete")) { return(na.omit(data)) } - else if(isTRUE(na.treatment=="mean.or.mode")){ - str_name <- names(data) - for(i in num_miss){ - if(class(data[,str_name[i]]) %in% c("character","Date","factor")) - replace <- names(table(data[,str_name[i]]))[order(table(data[,str_name[i]]),decreasing=TRUE)[1]] - else if(class(data[,str_name[i]])=="integer" ) - replace <- floor(mean(data[,str_name[i]],na.rm=TRUE)) + else if(identical(na.treatment,"mean.or.mode")){ + ind <- dim(data)[2] + for(i in 1:ind){ + if(class(data[[i]]) %in% c("character","Date","factor")) + replace <- names(table(data[[i]]))[order(table(data[[i]]),decreasing=TRUE)[1]] + else if(class(data[[i]])=="integer" ) + replace <- floor(mean(data[[i]],na.rm=TRUE)) else - replace <- mean(data[,str_name[i]],na.rm=TRUE) - data[is.na(data[,str_name[i]]),str_name[i]] <- replace + replace <- mean(data[[i]],na.rm=TRUE) + data[[i]][is.na(data[[i]])] <- replace } return(data) } else{ - if (is.null(replace)) stop("NULL value used for populating NAs. Chosen \"replace\" method, but didn't provide a value for replace parameter?") - warning("Provided replacement will be used for all missing value.") - str_name <- names(data) - for(i in num_miss){ - if ("Date" %in% class(data[,str_name[i]]) ) data[is.na(data[,str_name[i]]),str_name[i]] <- as.Date("1960-01-01") - else data[is.na(data[,str_name[i]]),str_name[i]] <- replace + ind <- dim(data)[2] + for(i in 1:ind){ + if ("Date" %in% class(data[[i]]) ) + data[[i]][is.na(data[[i]])] <- as.Date("1960-01-01") + else + data[[i]][is.na(data[[i]])] <- replace } return(data) } @@ -115,42 +115,14 @@ PopMiss.data.frame<-function(data,na.treatment,replace){ #' @export #' @rdname PopMiss PopMiss.data.table <-function(data,na.treatment,replace){ - num_miss <- which(data[,sapply(.SD,function(x) any(is.na(x)))]) - if(isTRUE(na.treatment=="delete")) { - return(na.omit(data)) - } - else if(isTRUE(na.treatment=="mean.or.mode")){ - str_name <- names(data) - for(i in num_miss){ - if( data[,class(get(str_name[i]))] %in% c("character","Date","factor")) - replace <- as.data.frame(data[,.N,get(str_name[i])][order(-N)])[1,1] - else if( data[,class(get(str_name[i]))]=="integer" ) - replace <- data[,floor(mean(get(str_name[i]),na.rm=TRUE))] - else - replace <- data[,mean(get(str_name[i]),na.rm=TRUE)] - - data[which(is.na(data[,i,with=FALSE])),i] <- replace - } - - return(data) - } - else{ - if (is.null(replace)) stop("NULL value used for populating NAs. Chosen \"replace\" method, but didn't provide a value for replace parameter?") - - warning("Provided replacement will be used for all missing value.") - str_name <- names(data) - for(i in num_miss){ - if ("Date" %in% class(data[,i,with=FALSE])) data[which(is.na(data[,i,with=FALSE])),i] <- as.Date("1960-01-01") - data[which(is.na(data[,i,with=FALSE])),i] <- replace - } - return(data) - } + PopMiss.data.frame(data,na.treatment,replace) } #' @export #' @rdname PopMiss PopMiss.matrix <- function(data,na.treatment,replace){ - if(identical(na.treatment,"delete")) warning("na.treatment is \"delete\" and 'data' is a matrix. This only works correctly if whole rows are missing") + if(identical(na.treatment,"delete")) + warning("na.treatment is \"delete\" and 'data' is a matrix. This only works correctly if whole rows are missing") matrix(PopMiss(as.vector(data),na.treatment,replace),ncol=ncol(data)) } diff --git a/R/RcppExports.R b/R/RcppExports.R index b167beb..f553241 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,12 +1,10 @@ # This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -#' @rdname CramersV CramersV_C <- function(x, y) { .Call('DataMan_CramersV_C', PACKAGE = 'DataMan', x, y) } -#' @rdname CramersV CramersV_DF <- function(dm) { .Call('DataMan_CramersV_DF', PACKAGE = 'DataMan', dm) } diff --git a/R/dataPlot.r b/R/dataPlot.r index 972e245..e6a8739 100644 --- a/R/dataPlot.r +++ b/R/dataPlot.r @@ -78,6 +78,9 @@ dataPlot <- function(data,xvar,yvar,byvar=NULL,weights=NULL, length.out=newGroupNum ) x <- cut(x,new_band,include.lowest = TRUE) + + # solve the order issue when plotting using rbokeh. + # can also set the label in cut function. x <- as.character(paste(as.integer(x),as.character(x),sep=";")) } diff --git a/R/interPlot.R b/R/interPlot.R new file mode 100644 index 0000000..c8800c4 --- /dev/null +++ b/R/interPlot.R @@ -0,0 +1,60 @@ +#' interPlot +#' +#' @description Interactive 3D plot by calling `plotly` package functions. +#' @usage interPlot(data,xvar,yvar,zvar) +#' @param data a data frame. +#' @param xvar either an integer to specify the position of the variable in the data frame, or the name of the variable. +#' @param yvar either an integer to specify the position of the variable in the data frame, or the name of the variable. +#' @param zvar either an integer to specify the position of the variable in the data frame, or the name of the variable. +#' @details This functions gives a 3D interactive view of between factors. +#' This is really useful when modeller wants to assess the interaction terms. +#' Hence the "zvar" can be actual response data or model predictions. +#' @author Sixiang Hu +#' @importFrom data.table data.table setkey key := dcast +#' @importFrom plotly plot_ly layout +#' @export interPlot +#' @examples +#' +#' interPlot(mtcars,"wt","mpg","vs") + +interPlot <- function(data,xvar,yvar,zvar){ + # Error Trap + if( .isDFnull(data) ) stop("data set provided is null.") + if( is.null(xvar) ) stop("X variable provided is null.") + if( is.null(yvar) ) stop("Y variable provided is null.") + if( is.null(zvar) ) stop("Z variable provided is null.") + + # Find data column + posi <- .VarPosition(data,xvar) + x <- data[[posi$posi]] + + posi <- .VarPosition(data,yvar) + y <- data[[posi$posi]] + + posi <- .VarPosition(data,zvar) + z <- data[[posi$posi]] + + dt <- data.table::data.table(x = x, + y = y, + z = z) + data.table::setkey(dt,x,y) + + dt2 <- data.table::dcast(dt,x~y,fun.aggregate=mean, + na.rm=TRUE, + value.var="z", + drop=FALSE) + + zvalue <- suppressWarnings(DataMan::PopMiss(as.matrix(dt2[,-1,with=FALSE]), + na.treatment = "replace", + 0)) + + plotly::plot_ly( + x=dt2[[1]], + y=colnames(dt2), + z=zvalue, + type = "surface", + showlegend=FALSE) %>% + plotly::layout(xaxis=list(title=xvar), + yaxis=list(title=yvar), + zaxis=list(title=zvar)) +} \ No newline at end of file diff --git a/man/CramersV.Rd b/man/CramersV.Rd index fba529f..7fa000e 100644 --- a/man/CramersV.Rd +++ b/man/CramersV.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CramersV.r, R/RcppExports.R +% Please edit documentation in R/CramersV.r \name{CramersV} \alias{CramersV} \alias{CramersV.data.frame} \alias{CramersV.default} \alias{CramersV.matrix} -\alias{CramersV_C} -\alias{CramersV_DF} \title{Cramers' V Test} \usage{ CramersV(x,y=NULL) @@ -16,10 +14,6 @@ CramersV(x,y=NULL) \method{CramersV}{data.frame}(x, y = NULL) \method{CramersV}{matrix}(x, y = NULL) - -CramersV_C(x, y) - -CramersV_DF(dm) } \arguments{ \item{x}{It could be non-numerical variable, data.frame or matrix.} diff --git a/man/DataMan-package.Rd b/man/DataMan-package.Rd deleted file mode 100644 index dbde310..0000000 --- a/man/DataMan-package.Rd +++ /dev/null @@ -1,23 +0,0 @@ -\name{DataMan-package} -\alias{DataMan-package} -\alias{DataMan} -\docType{package} -\title{ -This is an R package for data cleaning and preliminary data analysis. -} -\description{ -This package provides some simple function used for: Data Cleaning - detecting and populating missing value in a given data frame or vector; Data Analysis - an enhanced summary function to provide more info about variables in the given data. -} -\details{ -\tabular{ll}{ -Package: \tab DataMan\cr -Type: \tab Package\cr -Version: \tab 0.1\cr -Date: \tab 2014-12-23\cr -License: \tab GPL 2\cr -} -} -\author{ -Sixiang Hu -} - diff --git a/man/DataMan.Rd b/man/DataMan.Rd new file mode 100644 index 0000000..d643102 --- /dev/null +++ b/man/DataMan.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataMan.R +\docType{package} +\name{DataMan} +\alias{DataMan} +\alias{DataMan-package} +\title{DataMan a data clearning, analysis and statistical modeling assessing tool} +\description{ +a data clearning, analysis and statistical modeling assessing tool. +} + diff --git a/man/PopMiss.Rd b/man/PopMiss.Rd index 16663d4..a10fce4 100644 --- a/man/PopMiss.Rd +++ b/man/PopMiss.Rd @@ -31,7 +31,7 @@ PopMiss(data,na.treatment=c("mean.or.mode","delete","replace"),replace=NULL) \method{PopMiss}{matrix}(data, na.treatment, replace) } \arguments{ -\item{data}{This could be data frame, matrix or a vector.} +\item{data}{This could be data frame,data.table, matrix or a vector.} \item{na.treatment}{One of "mean.or.mode", "delete", or "replace", to specify the method to populate missing values.} diff --git a/man/interPlot.Rd b/man/interPlot.Rd new file mode 100644 index 0000000..0548bd8 --- /dev/null +++ b/man/interPlot.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interPlot.R +\name{interPlot} +\alias{interPlot} +\title{interPlot} +\usage{ +interPlot(data,xvar,yvar,zvar) +} +\arguments{ +\item{data}{a data frame.} + +\item{xvar}{either an integer to specify the position of the variable in the data frame, or the name of the variable.} + +\item{yvar}{either an integer to specify the position of the variable in the data frame, or the name of the variable.} + +\item{zvar}{either an integer to specify the position of the variable in the data frame, or the name of the variable.} +} +\description{ +Interactive 3D plot by calling `plotly` package functions. +} +\details{ +This functions gives a 3D interactive view of between factors. +This is really useful when modeller wants to assess the interaction terms. +Hence the "zvar" can be actual response data or model predictions. +} +\examples{ + +interPlot(mtcars,"wt","mpg","vs") +} +\author{ +Sixiang Hu +} +