Skip to content

Commit

Permalink
Merge pull request #1012 from HealthCatalyst/levy1011vignettes
Browse files Browse the repository at this point in the history
Levy1011vignettes
  • Loading branch information
Michael Levy authored Apr 3, 2018
2 parents 3980496 + b872252 commit 21976fa
Show file tree
Hide file tree
Showing 40 changed files with 545 additions and 838 deletions.
25 changes: 12 additions & 13 deletions R/flash_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@
#'
#' @return A model_list object
#' @details This function has two major differences from
#' \code{\link{tune_models}}: \enumerate{\item{It uses default hyperparameter
#' values to train models instead of using cross-validation to optimize
#' hyperparameter values for predictive performance.} \item{It is much
#' faster.}}
#' \code{\link{tune_models}}: 1. It uses fixed hyperparameter values to train
#' models instead of using cross-validation to optimize hyperparameter values
#' for predictive performance, and, as a result, 2. It is much faster.
#'
#' @examples
#' \dontrun{
#' # Prepare data
#' prepped_data <- prep_data(pima_diabetes, patient_id, outcome = diabetes)
#'
Expand All @@ -60,15 +60,14 @@
#' summary(models)
#'
#' # Speed comparison of no tuning with flash_models vs. tuning with tune_models:
#' \dontrun{
#' # ~40 seconds:
#' system.time(
#' tune_models(prepped_data, diabetes)
#' )
#' # ~6 seconds:
#' system.time(
#' flash_models(prepped_data, diabetes)
#' )
#' # ~40 seconds:
#' system.time(
#' tune_models(prepped_data, diabetes)
#' )
#' # ~6 seconds:
#' system.time(
#' flash_models(prepped_data, diabetes)
#' )
#' }
flash_models <- function(d,
outcome,
Expand Down
30 changes: 18 additions & 12 deletions R/machine_learn.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,37 @@
#' wraps. For finer control of model tuning use \code{\link{tune_models}}.
#'
#' @examples
#' # Split data into training and test sets using a subset of the data for speed
#' training_data <- pima_diabetes[1:50, ]
#' test_data <- pima_diabetes[51:60, ]
#' # Split the data into training and test sets, using just 100 rows for speed
#' d <- split_train_test(d = pima_diabetes[1:100, ],
#' outcome = diabetes,
#' percent_train = .9)
#'
#' ### Classification ###
#'
#' # Clean and prep the data, tune algorithms over hyperparameter values to predict diabetes
#' diabetes_models <- machine_learn(training_data, outcome = diabetes)
#' # Clean and prep the training data, specifying that patient_id is an ID column,
#' # and tune algorithms over hyperparameter values to predict diabetes
#' diabetes_models <- machine_learn(d$train, patient_id, outcome = diabetes)
#'
#' # Inspect model specification and performance
#' diabetes_models
#'
#' # Make predictions (predicted probability of diabetes) on test data
#' predict(diabetes_models, test_data)
#' predict(diabetes_models, d$test)
#'
#' ### Regression ###
#'
#' # Predict numeric outcomes simply by specifying the name of the outcome variable
#' age_model <- machine_learn(training_data, outcome = age)
#' # If the outcome variable is numeric, regression models will be trained
#' age_model <- machine_learn(d$train, patient_id, outcome = age)
#'
#' # If new data isn't specified, get predictions on training data. Plot predictions
#' # If new data isn't specified, get predictions on training data
#' predict(age_model)
#'
#' ### Faster model training without tuning hyperparameters ###
#'
#' # Train models at set hyperparameter values by setting tune to FALSE.
#' # This is faster (especially on larger datasets), but produces models with less predictive accuracy.
#' machine_learn(training_data, outcome = diabetes, tune = FALSE)
#' # Train models at set hyperparameter values by setting tune to FALSE. This is
#' # faster (especially on larger datasets), but produces models with less
#' # predictive accuracy.
#' machine_learn(d$train, patient_id, outcome = diabetes, tune = FALSE)
machine_learn <- function(d, ..., outcome, models,
tune = TRUE, n_folds = 5, tune_depth = 10,
impute = TRUE) {
Expand Down
3 changes: 1 addition & 2 deletions R/model_list_generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,8 @@ summary.model_list <- function(object, ...) {
#' @importFrom purrr map_df
#' @export
#' @examples
#' models <- tune_models(mtcars, mpg)
#' models <- tune_models(mtcars, mpg, models = "knn", tune_depth = 5)
#' plot(models)
#' plot(as.model_list(models$`Random Forest`))
plot.model_list <- function(x, print = TRUE, ...) {
if (!length(x))
stop("x is empty.")
Expand Down
11 changes: 6 additions & 5 deletions R/plot_predictions.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@
#' @export
#'
#' @details The following arguments can be provided to customize the plot: For
#' regression: title, point_size, point_alpha, font_size. For
#' classification: title, fill_colors, fill_alpha, curve_flex, font_size. For
#' details on how to use them, see \code{\link{plot_regression_predictions}}
#' or \code{\link{plot_classification_predictions}}.
#' regression: title, point_size, point_alpha, font_size. For classification:
#' title, fill_colors, fill_alpha, curve_flex, font_size. For details on how
#' to use them, see \code{\link{plot_regression_predictions}} or
#' \code{\link{plot_classification_predictions}}.
#'
#' @examples
#' models <- machine_learn(pima_diabetes[1:50, ], patient_id, outcome = plasma_glucose)
#' models <- machine_learn(pima_diabetes[1:50, ], patient_id, outcome = plasma_glucose,
#' models = "rf", tune = FALSE)
#' predictions <- predict(models)
#' plot(predictions)
#' plot(predictions, title = "This model's predictions regress to the mean",
Expand Down
11 changes: 7 additions & 4 deletions R/predict.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,15 @@
#' returning your predictions with the newdata in its original format.
#'
#' @examples
#' # Tune models using only the first 50 rows to keep computation fast
#' models <- machine_learn(pima_diabetes[1:50, ], outcome = diabetes)
#' # Make prediction on the next 20 rows. This uses the best-performing model from
#' # Tune models using only the first 20 rows to keep computation fast
#'
#' models <- machine_learn(pima_diabetes[1:20, ], patient_id, outcome = diabetes)
#'
#' # Make prediction on the next 5 rows. This uses the best-performing model from
#' # cross-validation during tuning, and it also prepares the new data in the same way
#' # as the training data was prepared.
#' predictions <- predict(models, newdata = pima_diabetes[51:70, ])
#'
#' predictions <- predict(models, newdata = pima_diabetes[21:25, ])
#' predictions
#' plot(predictions)
predict.model_list <- function(object, newdata, prepdata, ...) {
Expand Down
7 changes: 4 additions & 3 deletions R/split_train_test.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#' @param d Data frame
#' @param outcome Target column, unquoted. Split will be stratified across this
#' variable
#' @param p Proportion of rows in d to put into training. Default is 0.8
#' @param percent_train Proportion of rows in d to put into training. Default is 0.8
#' @param seed Optional, if provided the function will return the same split
#' each time it is called
#'
Expand All @@ -14,7 +14,7 @@
#'
#' @examples
#' split_train_test(mtcars, am, .9)
split_train_test <- function(d, outcome, p = .8, seed) {
split_train_test <- function(d, outcome, percent_train = .8, seed) {
outcome <- rlang::enquo(outcome)
if (rlang::quo_is_missing(outcome))
stop("You must provide an outcome variable to tune_models.")
Expand All @@ -23,6 +23,7 @@ split_train_test <- function(d, outcome, p = .8, seed) {
stop(outcome_chr, " isn't a column in d.")
if (!missing(seed))
set.seed(seed)
train_rows <- caret::createDataPartition(dplyr::pull(d, !!outcome), p = p)[[1]]
train_rows <- caret::createDataPartition(dplyr::pull(d, !!outcome),
p = percent_train)[[1]]
list(train = d[train_rows, ], test = d[-train_rows, ])
}
7 changes: 4 additions & 3 deletions readme.Rmd → README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->
<!-- README.md is generated from README.Rmd. Please edit the .Rmd and knit it to generate the .md. -->

```{r, include = FALSE}
knitr::opts_chunk$set(collapse = TRUE, comment = "# >",
fig.height = 4, fig.width = 6)
fig.height = 4, fig.width = 6,
fig.path = "man/figures/README-")
options(tibble.print_max = 5)
library(healthcareai)
```
Expand Down Expand Up @@ -45,7 +46,7 @@ models

Make predictions and examine predictive performance:

```{r, fig.height = 3}
```{r plot_predictions, fig.height = 3}
predictions <- predict(models)
plot(predictions)
```
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->
<!-- README.md is generated from README.Rmd. Please edit the .Rmd and knit it to generate the .md. -->

# healthcareai <img src="man/figures/logo.png" align="right" />

Expand Down Expand Up @@ -58,15 +58,15 @@ models
# > Performance Metric: ROC
# > Number of Observations: 768
# > Number of Features: 12
# > Models Trained: 2018-04-02 11:02:25
# > Models Trained: 2018-04-02 16:57:26
# >
# > Models tuned via 5-fold cross-validation over 10 combinations of hyperparameter values.
# > Best model: Random Forest
# > ROC = 0.85
# > Optimal hyperparameter values:
# > mtry = 3
# > mtry = 5
# > splitrule = extratrees
# > min.node.size = 19
# > min.node.size = 11
```

Make predictions and examine predictive performance:
Expand All @@ -76,7 +76,7 @@ predictions <- predict(models)
plot(predictions)
```

![](readme_files/figure-gfm/unnamed-chunk-3-1.png)<!-- -->
![](man/figures/README-plot_predictions-1.png)<!-- -->

## Learn More

Expand Down
Loading

0 comments on commit 21976fa

Please sign in to comment.