diff --git a/R/flash_models.R b/R/flash_models.R
index fe89f6b28..1f4cf0953 100644
--- a/R/flash_models.R
+++ b/R/flash_models.R
@@ -28,12 +28,12 @@
 #'
 #' @return A model_list object
 #' @details This function has two major differences from
-#'   \code{\link{tune_models}}: \enumerate{\item{It uses default hyperparameter
-#'   values to train models instead of using cross-validation to optimize
-#'   hyperparameter values for predictive performance.} \item{It is much
-#'   faster.}}
+#'   \code{\link{tune_models}}: 1. It uses fixed hyperparameter values to train
+#'   models instead of using cross-validation to optimize hyperparameter values
+#'   for predictive performance, and, as a result, 2. It is much faster.
 #'
 #' @examples
+#' \dontrun{
 #' # Prepare data
 #' prepped_data <- prep_data(pima_diabetes, patient_id, outcome = diabetes)
 #'
@@ -60,15 +60,14 @@
 #' summary(models)
 #'
 #' # Speed comparison of no tuning with flash_models vs. tuning with tune_models:
-#' \dontrun{
-#' # ~40 seconds:
-#' system.time(
-#'   tune_models(prepped_data, diabetes)
-#' )
-#' # ~6 seconds:
-#' system.time(
-#'   flash_models(prepped_data, diabetes)
-#' )
+#' # ~40 seconds:
+#' system.time(
+#'   tune_models(prepped_data, diabetes)
+#' )
+#' # ~6 seconds:
+#' system.time(
+#'   flash_models(prepped_data, diabetes)
+#' )
 #' }

 flash_models <- function(d,
                          outcome,
diff --git a/R/machine_learn.R b/R/machine_learn.R
index 6d44ad62f..498fc56a8 100644
--- a/R/machine_learn.R
+++ b/R/machine_learn.R
@@ -30,31 +30,37 @@
 #'   wraps. For finer control of model tuning use \code{\link{tune_models}}.
 #'
 #' @examples
-#' # Split data into training and test sets using a subset of the data for speed
-#' training_data <- pima_diabetes[1:50, ]
-#' test_data <- pima_diabetes[51:60, ]
+#' # Split the data into training and test sets, using just 100 rows for speed
+#' d <- split_train_test(d = pima_diabetes[1:100, ],
+#'                       outcome = diabetes,
+#'                       percent_train = .9)
 #'
 #' ### Classification ###
 #'
-#' # Clean and prep the data, tune algorithms over hyperparameter values to predict diabetes
-#' diabetes_models <- machine_learn(training_data, outcome = diabetes)
+#' # Clean and prep the training data, specifying that patient_id is an ID column,
+#' # and tune algorithms over hyperparameter values to predict diabetes
+#' diabetes_models <- machine_learn(d$train, patient_id, outcome = diabetes)
+#'
+#' # Inspect model specification and performance
+#' diabetes_models
 #'
 #' # Make predictions (predicted probability of diabetes) on test data
-#' predict(diabetes_models, test_data)
+#' predict(diabetes_models, d$test)
 #'
 #' ### Regression ###
 #'
-#' # Predict numeric outcomes simply by specifying the name of the outcome variable
-#' age_model <- machine_learn(training_data, outcome = age)
+#' # If the outcome variable is numeric, regression models will be trained
+#' age_model <- machine_learn(d$train, patient_id, outcome = age)
 #'
-#' # If new data isn't specifed, get predictions on training data. Plot predictions
+#' # If new data isn't specified, get predictions on training data
 #' predict(age_model)
 #'
 #' ### Faster model training without tuning hyperparameters ###
 #'
-#' # Train models at set hyperparameter values by setting tune to FALSE.
-#' # This is faster (especially on larger datasets), but produces models with less predictive accuracy.
-#' machine_learn(training_data, outcome = diabetes, tune = FALSE)
+#' # Train models at set hyperparameter values by setting tune to FALSE. This is
+#' # faster (especially on larger datasets), but produces models with less
+#' # predictive accuracy.
+#' machine_learn(d$train, patient_id, outcome = diabetes, tune = FALSE)

 machine_learn <- function(d, ..., outcome, models, tune = TRUE, n_folds = 5,
                           tune_depth = 10, impute = TRUE) {
diff --git a/R/model_list_generics.R b/R/model_list_generics.R
index dacfe5ca6..6088e871d 100644
--- a/R/model_list_generics.R
+++ b/R/model_list_generics.R
@@ -98,9 +98,8 @@ summary.model_list <- function(object, ...) {
 #' @importFrom purrr map_df
 #' @export
 #' @examples
-#' models <- tune_models(mtcars, mpg)
+#' models <- tune_models(mtcars, mpg, models = "knn", tune_depth = 5)
 #' plot(models)
-#' plot(as.model_list(models$`Random Forest`))
 plot.model_list <- function(x, print = TRUE, ...) {
   if (!length(x))
     stop("x is empty.")
diff --git a/R/plot_predictions.R b/R/plot_predictions.R
index 27a09d821..132aa5dbd 100644
--- a/R/plot_predictions.R
+++ b/R/plot_predictions.R
@@ -11,13 +11,14 @@
 #' @export
 #'
 #' @details The following arguments can be provided to customize the plot: For
-#'   regression: title, point_size, point_alpha, font_size. For
-#'   classification: title, fill_colors, fill_alpha, curve_flex, font_size. For
-#'   details on how to use them, see \code{\link{plot_regression_predictions}}
-#'   or \code{\link{plot_classification_predictions}}.
+#'   regression: title, point_size, point_alpha, font_size. For classification:
+#'   title, fill_colors, fill_alpha, curve_flex, font_size. For details on how
+#'   to use them, see \code{\link{plot_regression_predictions}} or
+#'   \code{\link{plot_classification_predictions}}.
 #'
 #' @examples
-#' models <- machine_learn(pima_diabetes[1:50, ], patient_id, outcome = plasma_glucose)
+#' models <- machine_learn(pima_diabetes[1:50, ], patient_id, outcome = plasma_glucose,
+#'                         models = "rf", tune = FALSE)
 #' predictions <- predict(models)
 #' plot(predictions)
 #' plot(predictions, title = "This model's predictions regress to the mean",
diff --git a/R/predict.R b/R/predict.R
index d01c1bac3..d6465e799 100644
--- a/R/predict.R
+++ b/R/predict.R
@@ -31,12 +31,15 @@
 #'   returning your predictions with the newdata in its original format.
 #'
 #' @examples
-#' # Tune models using only the first 50 rows to keep computation fast
-#' models <- machine_learn(pima_diabetes[1:50, ], outcome = diabetes)
-#' # Make prediction on the next 20 rows. This uses the best-performing model from
+#' # Tune models using only the first 20 rows to keep computation fast
+#'
+#' models <- machine_learn(pima_diabetes[1:20, ], patient_id, outcome = diabetes)
+#'
+#' # Make predictions on the next 5 rows. This uses the best-performing model from
 #' # tuning cross validation, and it also prepares the new data in the same way as
 #' # the training data was prepared.
-#' predictions <- predict(models, newdata = pima_diabetes[51:70, ])
+#'
+#' predictions <- predict(models, newdata = pima_diabetes[21:25, ])
 #' predictions
 #' plot(predictions)
 predict.model_list <- function(object, newdata, prepdata, ...) {
diff --git a/R/split_train_test.R b/R/split_train_test.R
index d671210cd..6a6c48864 100644
--- a/R/split_train_test.R
+++ b/R/split_train_test.R
@@ -3,7 +3,7 @@
 #' @param d Data frame
 #' @param outcome Target column, unquoted. Split will be stratified across this
 #'   variable
-#' @param p Proportion of rows in d to put into training. Default is 0.8
+#' @param percent_train Proportion of rows in d to put into training.
+#'   Default is 0.8
 #' @param seed Optional, if provided the function will return the same split
 #'   each time it is called
 #'
@@ -14,7 +14,7 @@
 #'
 #' @examples
 #' split_train_test(mtcars, am, .9)
-split_train_test <- function(d, outcome, p = .8, seed) {
+split_train_test <- function(d, outcome, percent_train = .8, seed) {
   outcome <- rlang::enquo(outcome)
   if (rlang::quo_is_missing(outcome))
     stop("You must provide an outcome variable to tune_models.")
@@ -23,6 +23,7 @@ split_train_test <- function(d, outcome, p = .8, seed) {
     stop(outcome_chr, " isn't a column in d.")
   if (!missing(seed)) set.seed(seed)
-  train_rows <- caret::createDataPartition(dplyr::pull(d, !!outcome), p = p)[[1]]
+  train_rows <- caret::createDataPartition(dplyr::pull(d, !!outcome),
+                                           p = percent_train)[[1]]
   list(train = d[train_rows, ], test = d[-train_rows, ])
 }
diff --git a/readme.Rmd b/README.Rmd
similarity index 95%
rename from readme.Rmd
rename to README.Rmd
index 183e67b9e..a35460e6a 100644
--- a/readme.Rmd
+++ b/README.Rmd
@@ -2,11 +2,12 @@
 output: github_document
 ---

-
 ```{r, include = FALSE}
 knitr::opts_chunk$set(collapse = TRUE, comment = "# >",
-                      fig.height = 4, fig.width = 6)
+                      fig.height = 4, fig.width = 6,
+                      fig.path = "man/figures/README-")
 options(tibble.print_max = 5)
 library(healthcareai)
 ```
@@ -45,7 +46,7 @@ models

 Make predictions and examine predictive performance:

-```{r, fig.height = 3}
+```{r plot_predictions, fig.height = 3}
 predictions <- predict(models)
 plot(predictions)
 ```
diff --git a/README.md b/README.md
index 286da0392..4836d54cf 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
+

 # healthcareai
@@ -58,15 +58,15 @@ models
 # > Performance Metric: ROC
 # > Number of Observations: 768
 # > Number of Features: 12
-# > Models Trained: 2018-04-02 11:02:25
+# > Models Trained: 2018-04-02 16:57:26
 # >
 # > Models tuned via 5-fold cross validation over 10 combinations of hyperparameter values.
 # > Best model: Random Forest
 # > ROC = 0.85
 # > Optimal hyperparameter values:
-# >   mtry = 3
+# >   mtry = 5
 # >   splitrule = extratrees
-# >   min.node.size = 19
+# >   min.node.size = 11
 ```

 Make predictions and examine predictive performance:

@@ -76,7 +76,7 @@ predictions <- predict(models)
 plot(predictions)
 ```

-![](readme_files/figure-gfm/unnamed-chunk-3-1.png)
+![](man/figures/README-plot_predictions-1.png)

 ## Learn More
diff --git a/docs/dev/CHANGELOG.html b/docs/dev/CHANGELOG.html
deleted file mode 100644
index 036d78230..000000000
--- a/docs/dev/CHANGELOG.html
+++ /dev/null
@@ -1,298 +0,0 @@
[298 lines of pkgdown-generated HTML omitted. The deleted page rendered the project changelog: 2.0.0 (2018-02-01) major breaking changes, R6 out and S3 in; 1.2.0 (2017-10-19) added Limone, a lime-like model interpretation tool; 1.1.0 (2017-10-11) deploy logging and Appveyor-only MSSQL tests; 1.0.0 (2017-08-02) XGBoost multiclass support, k-means clustering, and DBI-based SQL tools; 0.1.12 (2017-05-08) prediction getters and CSV output; 0.1.11 (2017-03-02) changelog and travis.yml added.]
diff --git a/docs/dev/articles/healthcareai.html b/docs/dev/articles/healthcareai.html
index c9c9e7128..1fcf04f99 100644
--- a/docs/dev/articles/healthcareai.html
+++ b/docs/dev/articles/healthcareai.html
@@ -103,63 +103,63 @@

Getting Started with healthcareai

library(healthcareai)

healthcareai comes with a built-in dataset documenting diabetes among adult Pima females. Once you attach the package, the dataset is available in the variable pima_diabetes. Let’s take a look at the data with the str function. There are 768 records in 10 variables including one identifier column, several nominal variables, and substantial missingness (represented in R by NA).

str(pima_diabetes)
-#>  Classes 'tbl_df', 'tbl' and 'data.frame':   768 obs. of  10 variables:
-#>   $ patient_id    : int  1 2 3 4 5 6 7 8 9 10 ...
-#>   $ pregnancies   : int  6 1 8 1 0 5 3 10 2 8 ...
-#>   $ plasma_glucose: int  148 85 183 89 137 116 78 115 197 125 ...
-#>   $ diastolic_bp  : int  72 66 64 66 40 74 50 NA 70 96 ...
-#>   $ skinfold      : int  35 29 NA 23 35 NA 32 NA 45 NA ...
-#>   $ insulin       : int  NA NA NA 94 168 NA 88 NA 543 NA ...
-#>   $ weight_class  : chr  "obese" "overweight" "normal" "overweight" ...
-#>   $ pedigree      : num  0.627 0.351 0.672 0.167 2.288 ...
-#>   $ age           : int  50 31 32 21 33 30 26 29 53 54 ...
-#>   $ diabetes      : chr  "Y" "N" "Y" "N" ...
+# > Classes 'tbl_df', 'tbl' and 'data.frame': 768 obs. of 10 variables: +# > $ patient_id : int 1 2 3 4 5 6 7 8 9 10 ... +# > $ pregnancies : int 6 1 8 1 0 5 3 10 2 8 ... +# > $ plasma_glucose: int 148 85 183 89 137 116 78 115 197 125 ... +# > $ diastolic_bp : int 72 66 64 66 40 74 50 NA 70 96 ... +# > $ skinfold : int 35 29 NA 23 35 NA 32 NA 45 NA ... +# > $ insulin : int NA NA NA 94 168 NA 88 NA 543 NA ... +# > $ weight_class : chr "obese" "overweight" "normal" "overweight" ... +# > $ pedigree : num 0.627 0.351 0.672 0.167 2.288 ... +# > $ age : int 50 31 32 21 33 30 26 29 53 54 ... +# > $ diabetes : chr "Y" "N" "Y" "N" ...

Easy Machine Learning

If you don’t want to fuss with details any more than necessary, machine_learn is the function for you. It makes it as easy as possible to implement machine learning models by putting all the details in the background so that you don’t have to worry about them. Of course it might be wise to worry about them, and we’ll get to how to do that further down, but for now, you can automatically take care of problems in the data, do basic feature engineering, and tune multiple machine learning models using cross validation with machine_learn.

machine_learn always gets the name of the data frame, then any columns that should not be used by the model (uninformative columns, such as IDs), then the variable to be predicted with outcome =. If you want machine_learn to run faster, you can have that—at the expense of a bit of predictive power—by setting its tune argument to FALSE.

quick_models <- machine_learn(pima_diabetes, patient_id, outcome = diabetes)
-#>  Training new data prep recipe
-#>  Variable(s) ignored in prep_data won't be used to tune models: patient_id
-#>  diabetes looks categorical, so training classification algorithms.
-#>  Running cross validation for Random Forest
-#>  Running cross validation for k-Nearest Neighbors
+# > Training new data prep recipe +# > Variable(s) ignored in prep_data won't be used to tune models: patient_id +# > diabetes looks categorical, so training classification algorithms. +# > Running cross validation for Random Forest +# > Running cross validation for k-Nearest Neighbors

machine_learn has told us that it has created a recipe for data preparation (this allows us to do exactly the same data cleaning and feature engineering when you want predictions on a new dataset), is ignoring patient_id when tuning models as we told it to, is training classification algorithms because the outcome variable diabetes is categorical, and has executed cross validation for two machine learning models: random forests and k-nearest neighbors. Let’s see what the models look like.

quick_models
-#>  Algorithms Trained: Random Forest, k-Nearest Neighbors
-#>  Target: diabetes
-#>  Class: Classification
-#>  Performance Metric: ROC
-#>  Number of Observations: 768
-#>  Number of Features: 12
-#>  Models Trained: 2018-04-02 11:00:11 
-#>  
-#>  Models tuned via 5-fold cross validation over 9 combinations of hyperparameter values.
-#>  Best model: Random Forest
-#>  ROC = 0.84
-#>  Optimal hyperparameter values:
-#>    mtry = 5
-#>    splitrule = extratrees
-#>    min.node.size = 12
+# > Algorithms Trained: Random Forest, k-Nearest Neighbors +# > Target: diabetes +# > Class: Classification +# > Performance Metric: ROC +# > Number of Observations: 768 +# > Number of Features: 12 +# > Models Trained: 2018-04-02 18:37:21 +# > +# > Models tuned via 5-fold cross validation over 9 combinations of hyperparameter values. +# > Best model: Random Forest +# > ROC = 0.84 +# > Optimal hyperparameter values: +# > mtry = 5 +# > splitrule = extratrees +# > min.node.size = 12

Everything looks as expected, and the best model is a random forest that achieves performance of AUROC = 0.84. Not bad for one line of code.

Now that we have our models, we can make predictions using the predict function. If you provide a new data frame to predict, it will make predictions on the new data; otherwise, it will make predictions on the training data.

predictions <- predict(quick_models)
 predictions
-#>  "predicted_diabetes" predicted by Random Forest last trained: 2018-04-02 11:00:11
-#>  Performance in training: ROC = 0.84
-#>  # A tibble: 768 x 14
-#>    diabetes predicted_diabetes pregnancies plasma_glucose diastolic_bp
-#>  * <fct>                 <dbl>       <int>          <dbl>        <dbl>
-#>  1 Y                   0.796             6           148.          72.
-#>  2 N                   0.0740            1            85.          66.
-#>  3 Y                   0.608             8           183.          64.
-#>  4 N                   0.00639           1            89.          66.
-#>  5 Y                   0.717             0           137.          40.
-#>  # ... with 763 more rows, and 9 more variables: skinfold <dbl>,
-#>  #   insulin <dbl>, pedigree <dbl>, age <int>, weight_class_normal <dbl>,
-#>  #   weight_class_obese <dbl>, weight_class_overweight <dbl>,
-#>  #   weight_class_other <dbl>, weight_class_hcai_missing <dbl>
+# > "predicted_diabetes" predicted by Random Forest last trained: 2018-04-02 18:37:21
+# > Performance in training: ROC = 0.84
+# > # A tibble: 768 x 14
+# >   diabetes predicted_diabetes pregnancies plasma_glucose diastolic_bp
+# > * <fct>                 <dbl>       <int>          <dbl>        <dbl>
+# > 1 Y                   0.796             6           148.          72.
+# > 2 N                   0.0740            1            85.          66.
+# > 3 Y                   0.608             8           183.          64.
+# > 4 N                   0.00639           1            89.          66.
+# > 5 Y                   0.717             0           137.          40.
+# > # ... with 763 more rows, and 9 more variables: skinfold <dbl>,
+# > #   insulin <dbl>, pedigree <dbl>, age <int>, weight_class_normal <dbl>,
+# > #   weight_class_obese <dbl>, weight_class_overweight <dbl>,
+# > #   weight_class_other <dbl>, weight_class_hcai_missing <dbl>

We get a message about when the model was trained and how well it performed in training, and we get back a data frame that looks sort of like the original, but has a new column predicted_diabetes that contains the model-generated probability each individual has diabetes, and contains changes that were made preparing the data for model training, e.g. missingness has been filled in and weight_class has been split into a series of “dummy” variables.
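Because the output of predict is an ordinary data frame (a tibble), standard subsetting works if you only need the probabilities themselves, for example to hand them off to another system. A minimal sketch using the predictions object from above:

# Pull the model-generated probabilities out as a plain numeric vector
predicted_probs <- predictions$predicted_diabetes
head(predicted_probs)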

We can plot how effectively the model is able to separate diabetic from non-diabetic individuals by calling the plot function on the output of predict.

plot(predictions)
@@ -170,17 +170,17 @@

Data Profiling

It is always a good idea to be aware of where there are missing values in data. The missingness function helps with that. In addition to looking for values R sees as missing, it looks for other values that might represent missing, such as "NULL", and issues a warning if it finds any.

missingness(pima_diabetes)
-#>           variable percent_missing
-#>  1      patient_id             0.0
-#>  2     pregnancies             0.0
-#>  3        pedigree             0.0
-#>  4             age             0.0
-#>  5        diabetes             0.0
-#>  6  plasma_glucose             0.7
-#>  7    weight_class             1.4
-#>  8    diastolic_bp             4.6
-#>  9        skinfold            29.6
-#>  10        insulin            48.7
+# >           variable percent_missing
+# > 1       patient_id             0.0
+# > 2      pregnancies             0.0
+# > 3         pedigree             0.0
+# > 4              age             0.0
+# > 5         diabetes             0.0
+# > 6   plasma_glucose             0.7
+# > 7     weight_class             1.4
+# > 8     diastolic_bp             4.6
+# > 9         skinfold            29.6
+# > 10         insulin            48.7

It’s good that we don’t have any missingness in our ID or outcome columns. We’ll see how missingness in predictors is addressed further down.

@@ -197,7 +197,7 @@

+# > Training new data prep recipe

The “recipe” that the above message refers to is a set of instructions for how to transform a dataset the way we just transformed our training data. Any machine learning that we do (within healthcareai) on prepped_training_data will retain that recipe and apply it before making predictions on new data. That means that when you have models making predictions in production, you don’t have to figure out how to transform the data or worry about encountering missing data or new category levels.
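To make that round trip concrete, here is a minimal sketch assembled from calls shown elsewhere in this vignette (flash_models is used for speed; tune_models works the same way):

# The prepped data frame carries the recipe with it
prepped_training_data <- prep_data(pima_diabetes, patient_id, outcome = diabetes)

# Models trained on prepped data store that recipe
models <- flash_models(prepped_training_data, diabetes)

# New, raw data is prepped with the stored recipe before predictions are made
predict(models, newdata = pima_diabetes[1:5, ])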

@@ -211,10 +211,10 @@

                     models = "RF", tune_depth = 25, metric = "PR")
-#>  Variable(s) ignored in prep_data won't be used to tune models: patient_id
-#>  diabetes looks categorical, so training classification algorithms.
-#>  You've chosen to tune 125 models (n_folds = 5 x tune_depth = 25 x length(models) = 1) on a 692 row dataset. This may take a while...
-#>  Running cross validation for Random Forest

+# > Variable(s) ignored in prep_data won't be used to tune models: patient_id
+# > diabetes looks categorical, so training classification algorithms.
+# > You've chosen to tune 125 models (n_folds = 5 x tune_depth = 25 x length(models) = 1) on a 692 row dataset. This may take a while...
+# > Running cross validation for Random Forest

We get a message saying the training may take a while because we’re training so many models, but in this case it takes just about 20 seconds to train all those models.
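If you want to check that cost on your own machine, you can wrap the training call in base R’s system.time; a rough sketch mirroring the call above (actual runtimes will vary with hardware):

system.time(
  models <- tune_models(prepped_training_data, outcome = diabetes,
                        models = "RF", tune_depth = 25, metric = "PR")
)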

We can examine how the model performs across hyperparameters by plotting the model object. It looks like extratrees is a superior split rule for this model, and larger values of minimum node size tend to do better.

plot(models)
@@ -228,23 +228,23 @@

                       outcome = diabetes, models = "RF", metric = "PR")
-#>  Variable(s) ignored in prep_data won't be used to tune models: patient_id
-#>  diabetes looks categorical, so training classification algorithms.
-#>  Algorithms Trained: Random Forest
-#>  Target: diabetes
-#>  Class: Classification
-#>  Performance Metric: PR
-#>  Number of Observations: 692
-#>  Number of Features: 13
-#>  Models Trained: 2018-04-02 11:00:37
-#>
-#>  Models have not been tuned. Performance estimated via 5-fold cross validation at fixed hyperparameter values.
-#>  Best model: Random Forest
-#>  PR = 0.89
-#>  User-selected hyperparameter values:
-#>    mtry = 5
-#>    splitrule = extratrees
-#>    min.node.size = 10
+# > Variable(s) ignored in prep_data won't be used to tune models: patient_id
+# > diabetes looks categorical, so training classification algorithms.
+# > Algorithms Trained: Random Forest
+# > Target: diabetes
+# > Class: Classification
+# > Performance Metric: PR
+# > Number of Observations: 692
+# > Number of Features: 13
+# > Models Trained: 2018-04-02 18:37:46
+# >
+# > Models have not been tuned. Performance estimated via 5-fold cross validation at fixed hyperparameter values.
+# > Best model: Random Forest
+# > PR = 0.89
+# > User-selected hyperparameter values:
+# >   mtry = 5
+# >   splitrule = extratrees
+# >   min.node.size = 10

In this case we sacrificed just 0.01 AUPR versus tuning the models. In our experience, that’s on the small side of typical. A good workflow is often to do all of your development using flash_models, and as a final step before putting a model into production, retrain the model using tune_models.
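In code, that workflow is just a change of function name at the final step (a sketch reusing the arguments from above):

# During development: fast iteration at fixed hyperparameter values
models <- flash_models(prepped_training_data, outcome = diabetes,
                       models = "RF", metric = "PR")

# Before production: spend the extra time to tune the same setup
models <- tune_models(prepped_training_data, outcome = diabetes,
                      models = "RF", metric = "PR")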

@@ -253,24 +253,24 @@

Prediction

predict will automatically use the best-performing model from training (evaluated out-of-fold in cross validation). If no new data is passed to predict it will make predictions on the training dataset. The predicted probabilities appear in the predicted_diabetes column.

predict(models)
-#>  "predicted_diabetes" predicted by Random Forest last trained: 2018-04-02 11:00:34
-#>  Performance in training: PR = 0.9
-#>  # A tibble: 692 x 15
-#>    diabetes predicted_diabetes pregnancies plasma_glucose diastolic_bp
-#>  * <fct>                 <dbl>       <dbl>          <dbl>        <dbl>
-#>  1 N                   0.0677       -0.843         -1.19        -0.521
-#>  2 Y                   0.642         1.22           2.01        -0.686
-#>  3 N                   0.00366      -0.843         -1.05        -0.521
-#>  4 Y                   0.723        -1.14           0.509       -2.66 
-#>  5 N                   0.228         0.338         -0.175        0.138
-#>  # ... with 687 more rows, and 10 more variables: skinfold <dbl>,
-#>  #   insulin <dbl>, pedigree <dbl>, age <dbl>, weight_class_normal <dbl>,
-#>  #   weight_class_obese <dbl>, weight_class_overweight <dbl>,
-#>  #   weight_class_underweight <dbl>, weight_class_hcai_missing <dbl>,
-#>  #   weight_class_other <dbl>
+# > "predicted_diabetes" predicted by Random Forest last trained: 2018-04-02 18:37:43
+# > Performance in training: PR = 0.9
+# > # A tibble: 692 x 15
+# >   diabetes predicted_diabetes pregnancies plasma_glucose diastolic_bp
+# > * <fct>                 <dbl>       <dbl>          <dbl>        <dbl>
+# > 1 N                   0.0677       -0.843         -1.19        -0.521
+# > 2 Y                   0.642         1.22           2.01        -0.686
+# > 3 N                   0.00366      -0.843         -1.05        -0.521
+# > 4 Y                   0.723        -1.14           0.509       -2.66
+# > 5 N                   0.228         0.338         -0.175        0.138
+# > # ... with 687 more rows, and 10 more variables: skinfold <dbl>,
+# > #   insulin <dbl>, pedigree <dbl>, age <dbl>, weight_class_normal <dbl>,
+# > #   weight_class_obese <dbl>, weight_class_overweight <dbl>,
+# > #   weight_class_underweight <dbl>, weight_class_hcai_missing <dbl>,
+# > #   weight_class_other <dbl>

To get predictions on a new dataset, pass the new data to predict, and it will automatically be prepared based on the recipe generated on the training data. We can plot the predictions to see how well our model is doing, and we see that it’s separating diabetic from non-diabetic individuals pretty well, although there are a fair number of non-diabetics with high predicted probabilities of diabetes. This may be due to optimizing for precision recall, or may indicate pre-diabetic patients.

test_predictions <- predict(models, split_data$test)
-#>  Prepping data based on provided recipe
+# > Prepping data based on provided recipe
 plot(test_predictions)

@@ -279,44 +279,44 @@

A Regression Example

All the examples above have been classification tasks, predicting a yes/no outcome. Here’s an example of a full regression modeling pipeline on a silly problem: predicting individuals’ ages. The code is very similar to classification.

regression_models <- machine_learn(pima_diabetes, patient_id, outcome = age)
-#>  Training new data prep recipe
-#>  Variable(s) ignored in prep_data won't be used to tune models: patient_id
-#>  age looks numeric, so training regression algorithms.
-#>  Running cross validation for Random Forest
-#>  Running cross validation for k-Nearest Neighbors
+# > Training new data prep recipe
+# > Variable(s) ignored in prep_data won't be used to tune models: patient_id
+# > age looks numeric, so training regression algorithms.
+# > Running cross validation for Random Forest
+# > Running cross validation for k-Nearest Neighbors
 summary(regression_models)
-#>  Models trained: 2018-04-02 11:00:50
-#>  
-#>  Models tuned via 5-fold cross validation over 10 combinations of hyperparameter values.
-#>  Best performance: RMSE = 9.07
-#>  By Random Forest with hyperparameters:
-#>    mtry = 10
-#>    splitrule = extratrees
-#>    min.node.size = 10
-#>  
-#>  Out-of-fold performance of all trained models:
-#>  
-#>  $`Random Forest`
-#>  # A tibble: 10 x 9
-#>    min.node.size  mtry splitrule   RMSE Rsquared   MAE RMSESD RsquaredSD
-#>  *         <int> <int> <fct>      <dbl>    <dbl> <dbl>  <dbl>      <dbl>
-#>  1            10    10 extratrees  9.07    0.404  6.43  0.640     0.0358
-#>  2             8    11 extratrees  9.09    0.402  6.43  0.626     0.0396
-#>  3            12     5 extratrees  9.13    0.405  6.56  0.666     0.0272
-#>  4            10    13 variance    9.33    0.376  6.60  0.633     0.0358
-#>  5             7    10 variance    9.34    0.374  6.61  0.583     0.0303
-#>  # ... with 5 more rows, and 1 more variable: MAESD <dbl>
-#>  
-#>  $`k-Nearest Neighbors`
-#>  # A tibble: 10 x 9
-#>     kmax distance kernel       RMSE Rsquared   MAE RMSESD RsquaredSD MAESD
-#>  * <dbl>    <dbl> <fct>       <dbl>    <dbl> <dbl>  <dbl>      <dbl> <dbl>
-#>  1   16.    2.60  inv          9.44    0.363  6.65  0.811     0.0649 0.551
-#>  2   14.    1.73  gaussian     9.44    0.361  6.66  0.717     0.0593 0.452
-#>  3   13.    1.58  triangular   9.49    0.355  6.66  0.764     0.0697 0.461
-#>  4   10.    0.933 rectangular  9.55    0.346  6.79  0.637     0.0438 0.412
-#>  5    6.    1.68  inv          9.64    0.340  6.74  0.723     0.0677 0.465
-#>  # ... with 5 more rows
+# > Models trained: 2018-04-02 18:37:59
+# >
+# > Models tuned via 5-fold cross validation over 10 combinations of hyperparameter values.
+# > Best performance: RMSE = 9.07
+# > By Random Forest with hyperparameters:
+# >   mtry = 10
+# >   splitrule = extratrees
+# >   min.node.size = 10
+# >
+# > Out-of-fold performance of all trained models:
+# >
+# > $`Random Forest`
+# > # A tibble: 10 x 9
+# >   min.node.size  mtry splitrule   RMSE Rsquared   MAE RMSESD RsquaredSD
+# > *         <int> <int> <fct>      <dbl>    <dbl> <dbl>  <dbl>      <dbl>
+# > 1            10    10 extratrees  9.07    0.404  6.43  0.640     0.0358
+# > 2             8    11 extratrees  9.09    0.402  6.43  0.626     0.0396
+# > 3            12     5 extratrees  9.13    0.405  6.56  0.666     0.0272
+# > 4            10    13 variance    9.33    0.376  6.60  0.633     0.0358
+# > 5             7    10 variance    9.34    0.374  6.61  0.583     0.0303
+# > # ... with 5 more rows, and 1 more variable: MAESD <dbl>
+# >
+# > $`k-Nearest Neighbors`
+# > # A tibble: 10 x 9
+# >    kmax distance kernel       RMSE Rsquared   MAE RMSESD RsquaredSD MAESD
+# > * <dbl>    <dbl> <fct>       <dbl>    <dbl> <dbl>  <dbl>      <dbl> <dbl>
+# > 1   16.    2.60  inv          9.44    0.363  6.65  0.811     0.0649 0.551
+# > 2   14.    1.73  gaussian     9.44    0.361  6.66  0.717     0.0593 0.452
+# > 3   13.    1.58  triangular   9.49    0.355  6.66  0.764     0.0697 0.461
+# > 4   10.    0.933 rectangular  9.55    0.346  6.79  0.637     0.0438 0.412
+# > 5    6.    1.68  inv          9.64    0.340  6.74  0.723     0.0677 0.465
+# > # ... with 5 more rows

Let’s make a prediction on a hypothetical new patient. Note that the model handles missingness in insulin and a new category level in weight_class without a problem (but warns about it).

new_patient <- data.frame(
   pregnancies = 0,
@@ -328,17 +328,17 @@ 

  pedigree = .2,
  diabetes = "N")
predict(regression_models, new_patient)
-#>  Warning in ready_with_prep(object, newdata, mi): The following variables(s) had the following value(s) in predict that were not observed in training.
-#>  weight_class: ???
-#>  Prepping data based on provided recipe
-#>  "predicted_age" predicted by Random Forest last trained: 2018-04-02 11:00:50
-#>  Performance in training: RMSE = 9.07
-#>  # A tibble: 1 x 9
-#>    predicted_age pregnancies plasma_glucose diastolic_bp skinfold insulin
-#>  * <dbl>         <dbl>       <dbl>          <dbl>        <dbl>    <lgl>
-#>  1 23.9          0.          80.            55.          24.     NA
-#>  # ... with 3 more variables: weight_class <fct>, pedigree <dbl>,
-#>  #   diabetes <fct>

+# > Warning in ready_with_prep(object, newdata, mi): The following variables(s) had the following value(s) in predict that were not observed in training.
+# > weight_class: ???
+# > Prepping data based on provided recipe
+# > "predicted_age" predicted by Random Forest last trained: 2018-04-02 18:37:59
+# > Performance in training: RMSE = 9.07
+# > # A tibble: 1 x 9
+# >   predicted_age pregnancies plasma_glucose diastolic_bp skinfold insulin
+# > * <dbl>         <dbl>       <dbl>          <dbl>        <dbl>    <lgl>
+# > 1 23.9          0.          80.            55.          24.     NA
+# > # ... with 3 more variables: weight_class <fct>, pedigree <dbl>,
+# > #   diabetes <fct>
diff --git a/docs/dev/index.html b/docs/dev/index.html
index 9d777463b..e0c00f37d 100644
--- a/docs/dev/index.html
+++ b/docs/dev/index.html
@@ -93,7 +93,7 @@
-
+
+# > min.node.size = 11

Make predictions and examine predictive performance:

predictions <- predict(models)
 plot(predictions)
-

+

diff --git a/docs/dev/index_files/figure-html/unnamed-chunk-3-1.png b/docs/dev/index_files/figure-html/unnamed-chunk-3-1.png
index 968599a95..811753174 100644
Binary files a/docs/dev/index_files/figure-html/unnamed-chunk-3-1.png and b/docs/dev/index_files/figure-html/unnamed-chunk-3-1.png differ
diff --git a/docs/dev/reference/figures/README-plot predictions-1.png b/docs/dev/reference/figures/README-plot predictions-1.png
new file mode 100644
index 000000000..13fbefd95
Binary files /dev/null and b/docs/dev/reference/figures/README-plot predictions-1.png differ
diff --git a/docs/dev/reference/figures/README-plot_predictions-1.png b/docs/dev/reference/figures/README-plot_predictions-1.png
new file mode 100644
index 000000000..5816c4544
Binary files /dev/null and b/docs/dev/reference/figures/README-plot_predictions-1.png differ
diff --git a/docs/dev/reference/flash_models.html b/docs/dev/reference/flash_models.html
index 3ad2b0894..5ede582f2 100644
--- a/docs/dev/reference/flash_models.html
+++ b/docs/dev/reference/flash_models.html
@@ -186,10 +186,9 @@

Value

Details

 This function has two major differences from
-tune_models:
-  1. It uses default hyperparameter values to train models instead of using
-     cross-validation to optimize hyperparameter values for predictive
-     performance.
-  2. It is much faster.
+tune_models: 1. It uses fixed hyperparameter values to train
+models instead of using cross-validation to optimize hyperparameter values
+for predictive performance, and, as a result, 2. It is much faster.

See also

@@ -198,24 +197,13 @@


Examples

-# Prepare data
-prepped_data <- prep_data(pima_diabetes, patient_id, outcome = diabetes)
-#> Training new data prep recipe
+# NOT RUN {
+# Prepare data
+prepped_data <- prep_data(pima_diabetes, patient_id, outcome = diabetes)
+
 # Simplest use. Get models quickly at default hyperparameter values
-flash_models(prepped_data, diabetes)
-#> Variable(s) ignored in prep_data won't be used to tune models: patient_id
-#> diabetes looks categorical, so training classification algorithms.
-#> Algorithms Trained: Random Forest, k-Nearest Neighbors
-#> Target: diabetes
-#> Class: Classification
-#> Performance Metric: ROC
-#> Number of Observations: 768
-#> Number of Features: 12
-#> Models Trained: 2018-04-02 10:59:04
-#>
-#> Models have not been tuned. Performance estimated via 5-fold cross validation at fixed hyperparameter values.
-#> Best model: Random Forest
-#> ROC = 0.84
-#> User-selected hyperparameter values:
-#>   mtry = 5
-#>   splitrule = extratrees
-#>   min.node.size = 10
+flash_models(prepped_data, diabetes)
+
 # Set non-default hyperparameter values by passing a list of lists to \code{hyperparameters}
 models <- flash_models(d = prepped_data,
@@ -232,35 +220,18 @@

                        kernel = "gaussian"
                      )
                    )
-  )
-#> Variable(s) ignored in prep_data won't be used to tune models: patient_id
-#> diabetes looks categorical, so training classification algorithms.
-summary(models)
-#> Models trained: 2018-04-02 10:59:06
-#>
-#> Models have not been tuned. Performance estimated via 5-fold cross validation at fixed hyperparameter values.
-#> Best algorithm: Random Forest with ROC = 0.84
-#>
-#> Out-of-fold performance of all trained models:
-#>
-#> $`Random Forest`
-#> # A tibble: 1 x 9
-#>    mtry splitrule min.node.size   ROC  Sens  Spec  ROCSD SensSD SpecSD
-#> * <dbl> <fct>             <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>
-#> 1    3. gini                 1. 0.836 0.868 0.590 0.0313 0.0466 0.0602
-#>
-#> $`k-Nearest Neighbors`
-#> # A tibble: 1 x 9
-#>    kmax distance kernel     ROC  Sens  Spec  ROCSD SensSD SpecSD
-#> * <dbl>    <dbl> <fct>    <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>
-#> 1    3.       2. gaussian 0.784 0.828 0.593 0.0165 0.0286 0.0564
-#>
-# Speed comparison of no tuning with flash_models vs. tuning with tune_models:
-# NOT RUN {
-# ~40 seconds:
-system.time(
-  tune_models(prepped_data, diabetes)
-)
-# ~6 seconds:
-system.time(
-  flash_models(prepped_data, diabetes)
-)
+  )
+summary(models)
+
+# Speed comparison of no tuning with flash_models vs. tuning with tune_models:
+# ~40 seconds:
+system.time(
+  tune_models(prepped_data, diabetes)
+)
+# ~6 seconds:
+system.time(
+  flash_models(prepped_data, diabetes)
+)
 # }