Skip to content

Commit

Permalink
Merge pull request #1012 from HealthCatalyst/levy1011vignettes
Browse files Browse the repository at this point in the history
Levy1011vignettes
  • Loading branch information
Michael Levy authored Apr 3, 2018
2 parents 3980496 + b872252 commit 21976fa
Show file tree
Hide file tree
Showing 40 changed files with 545 additions and 838 deletions.
25 changes: 12 additions & 13 deletions R/flash_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@
#'
#' @return A model_list object
#' @details This function has two major differences from
#' \code{\link{tune_models}}: \enumerate{\item{It uses default hyperparameter
#' values to train models instead of using cross-validation to optimize
#' hyperparameter values for predictive performance.} \item{It is much
#' faster.}}
#' \code{\link{tune_models}}: 1. It uses fixed hyperparameter values to train
#' models instead of using cross-validation to optimize hyperparameter values
#' for predictive performance, and, as a result, 2. It is much faster.
#'
#' @examples
#' \dontrun{
#' # Prepare data
#' prepped_data <- prep_data(pima_diabetes, patient_id, outcome = diabetes)
#'
Expand All @@ -60,15 +60,14 @@
#' summary(models)
#'
#' # Speed comparison of no tuning with flash_models vs. tuning with tune_models:
#' \dontrun{
#' # ~40 seconds:
#' system.time(
#' tune_models(prepped_data, diabetes)
#' )
#' # ~6 seconds:
#' system.time(
#' flash_models(prepped_data, diabetes)
#' )
#' # ~40 seconds:
#' system.time(
#' tune_models(prepped_data, diabetes)
#' )
#' # ~6 seconds:
#' system.time(
#' flash_models(prepped_data, diabetes)
#' )
#' }
flash_models <- function(d,
outcome,
Expand Down
30 changes: 18 additions & 12 deletions R/machine_learn.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,37 @@
#' wraps. For finer control of model tuning use \code{\link{tune_models}}.
#'
#' @examples
#' # Split data into training and test sets using a subset of the data for speed
#' training_data <- pima_diabetes[1:50, ]
#' test_data <- pima_diabetes[51:60, ]
#' # Split the data into training and test sets, using just 100 rows for speed
#' d <- split_train_test(d = pima_diabetes[1:100, ],
#' outcome = diabetes,
#' percent_train = .9)
#'
#' ### Classification ###
#'
#' # Clean and prep the data, tune algorithms over hyperparameter values to predict diabetes
#' diabetes_models <- machine_learn(training_data, outcome = diabetes)
#' # Clean and prep the training data, specifying that patient_id is an ID column,
#' # and tune algorithms over hyperparameter values to predict diabetes
#' diabetes_models <- machine_learn(d$train, patient_id, outcome = diabetes)
#'
#' # Inspect model specification and performance
#' diabetes_models
#'
#' # Make predictions (predicted probability of diabetes) on test data
#' predict(diabetes_models, test_data)
#' predict(diabetes_models, d$test)
#'
#' ### Regression ###
#'
#' # Predict numeric outcomes simply by specifying the name of the outcome variable
#' age_model <- machine_learn(training_data, outcome = age)
#' # If the outcome variable is numeric, regression models will be trained
#' age_model <- machine_learn(d$train, patient_id, outcome = age)
#'
#' # If new data isn't specified, get predictions on training data. Plot predictions
#' # If new data isn't specified, get predictions on training data
#' predict(age_model)
#'
#' ### Faster model training without tuning hyperparameters ###
#'
#' # Train models at set hyperparameter values by setting tune to FALSE.
#' # This is faster (especially on larger datasets), but produces models with less predictive accuracy.
#' machine_learn(training_data, outcome = diabetes, tune = FALSE)
#' # Train models at set hyperparameter values by setting tune to FALSE. This is
#' # faster (especially on larger datasets), but produces models with less
#' # predictive accuracy.
#' machine_learn(d$train, patient_id, outcome = diabetes, tune = FALSE)
machine_learn <- function(d, ..., outcome, models,
tune = TRUE, n_folds = 5, tune_depth = 10,
impute = TRUE) {
Expand Down
3 changes: 1 addition & 2 deletions R/model_list_generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,8 @@ summary.model_list <- function(object, ...) {
#' @importFrom purrr map_df
#' @export
#' @examples
#' models <- tune_models(mtcars, mpg)
#' models <- tune_models(mtcars, mpg, models = "knn", tune_depth = 5)
#' plot(models)
#' plot(as.model_list(models$`Random Forest`))
plot.model_list <- function(x, print = TRUE, ...) {
if (!length(x))
stop("x is empty.")
Expand Down
11 changes: 6 additions & 5 deletions R/plot_predictions.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@
#' @export
#'
#' @details The following arguments can be provided to customize the plot: For
#' regression: title, point_size, point_alpha, font_size. For
#' classification: title, fill_colors, fill_alpha, curve_flex, font_size. For
#' details on how to use them, see \code{\link{plot_regression_predictions}}
#' or \code{\link{plot_classification_predictions}}.
#' regression: title, point_size, point_alpha, font_size. For classification:
#' title, fill_colors, fill_alpha, curve_flex, font_size. For details on how
#' to use them, see \code{\link{plot_regression_predictions}} or
#' \code{\link{plot_classification_predictions}}.
#'
#' @examples
#' models <- machine_learn(pima_diabetes[1:50, ], patient_id, outcome = plasma_glucose)
#' models <- machine_learn(pima_diabetes[1:50, ], patient_id, outcome = plasma_glucose,
#' models = "rf", tune = FALSE)
#' predictions <- predict(models)
#' plot(predictions)
#' plot(predictions, title = "This model's predictions regress to the mean",
Expand Down
11 changes: 7 additions & 4 deletions R/predict.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,15 @@
#' returning your predictions with the newdata in its original format.
#'
#' @examples
#' # Tune models using only the first 50 rows to keep computation fast
#' models <- machine_learn(pima_diabetes[1:50, ], outcome = diabetes)
#' # Make prediction on the next 20 rows. This uses the best-performing model from
#' # Tune models using only the first 20 rows to keep computation fast
#'
#' models <- machine_learn(pima_diabetes[1:20, ], patient_id, outcome = diabetes)
#'
#' # Make prediction on the next 5 rows. This uses the best-performing model from
#' # cross-validation during tuning, and it also prepares the new data in the same way
#' # as the training data was prepared.
#' predictions <- predict(models, newdata = pima_diabetes[51:70, ])
#'
#' predictions <- predict(models, newdata = pima_diabetes[21:25, ])
#' predictions
#' plot(predictions)
predict.model_list <- function(object, newdata, prepdata, ...) {
Expand Down
7 changes: 4 additions & 3 deletions R/split_train_test.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#' @param d Data frame
#' @param outcome Target column, unquoted. Split will be stratified across this
#' variable
#' @param p Proportion of rows in d to put into training. Default is 0.8
#' @param percent_train Proportion of rows in d to put into training. Default is 0.8
#' @param seed Optional, if provided the function will return the same split
#' each time it is called
#'
Expand All @@ -14,7 +14,7 @@
#'
#' @examples
#' split_train_test(mtcars, am, .9)
split_train_test <- function(d, outcome, p = .8, seed) {
split_train_test <- function(d, outcome, percent_train = .8, seed) {
outcome <- rlang::enquo(outcome)
if (rlang::quo_is_missing(outcome))
stop("You must provide an outcome variable to tune_models.")
Expand All @@ -23,6 +23,7 @@ split_train_test <- function(d, outcome, p = .8, seed) {
stop(outcome_chr, " isn't a column in d.")
if (!missing(seed))
set.seed(seed)
train_rows <- caret::createDataPartition(dplyr::pull(d, !!outcome), p = p)[[1]]
train_rows <- caret::createDataPartition(dplyr::pull(d, !!outcome),
p = percent_train)[[1]]
list(train = d[train_rows, ], test = d[-train_rows, ])
}
7 changes: 4 additions & 3 deletions readme.Rmd → README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->
<!-- README.md is generated from README.Rmd. Please edit the .Rmd and knit it to generate the .md. -->

```{r, include = FALSE}
knitr::opts_chunk$set(collapse = TRUE, comment = "# >",
fig.height = 4, fig.width = 6)
fig.height = 4, fig.width = 6,
fig.path = "man/figures/README-")
options(tibble.print_max = 5)
library(healthcareai)
```
Expand Down Expand Up @@ -45,7 +46,7 @@ models

Make predictions and examine predictive performance:

```{r, fig.height = 3}
```{r plot_predictions, fig.height = 3}
predictions <- predict(models)
plot(predictions)
```
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->
<!-- README.md is generated from README.Rmd. Please edit the .Rmd and knit it to generate the .md. -->

# healthcareai <img src="man/figures/logo.png" align="right" />

Expand Down Expand Up @@ -58,15 +58,15 @@ models
# > Performance Metric: ROC
# > Number of Observations: 768
# > Number of Features: 12
# > Models Trained: 2018-04-02 11:02:25
# > Models Trained: 2018-04-02 16:57:26
# >
# > Models tuned via 5-fold cross-validation over 10 combinations of hyperparameter values.
# > Best model: Random Forest
# > ROC = 0.85
# > Optimal hyperparameter values:
# > mtry = 3
# > mtry = 5
# > splitrule = extratrees
# > min.node.size = 19
# > min.node.size = 11
```

Make predictions and examine predictive performance:
Expand All @@ -76,7 +76,7 @@ predictions <- predict(models)
plot(predictions)
```

![](readme_files/figure-gfm/unnamed-chunk-3-1.png)<!-- -->
![](man/figures/README-plot_predictions-1.png)<!-- -->

## Learn More

Expand Down
Loading

0 comments on commit 21976fa

Please sign in to comment.