Commit

v2.1 site update

Michael Levy committed Jun 29, 2018
1 parent 36ef01b commit b723d25
Showing 102 changed files with 5,265 additions and 2,455 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: healthcareai
Type: Package
Title: Tools for Healthcare Machine Learning
-Version: 2.0.0
-Date: 2018-04-20
+Version: 2.1.0
+Date: 2018-06-29
Authors@R: c(
person("Levi", "Thatcher", email = "levi.thatcher@healthcatalyst.com", role = "aut"),
person("Michael", "Levy", email = "michael.levy@healthcatalyst.com", role = c("aut", "cre")),
26 changes: 26 additions & 0 deletions NEWS.md
@@ -1,3 +1,29 @@
# healthcareai 2.1.0

#### Added

- Identify values of high-cardinality variables that will make good features, even with multiple values per observation with `add_best_levels` and `get_best_levels`.
- glmnet for regularized linear and logistic regression.
- `interpret` and `plot.interpret` to extract glmnet estimates.
- XGBoost for regression and classification models.
- `variable_importance` returns random forest or xgboost importances, whichever model performs better.
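
The additions above can be exercised together; a minimal sketch (the calls mirror the examples shown elsewhere in this commit, but exact arguments may differ from the released API):

```r
library(healthcareai)

# Train models, now including xgboost and glmnet alongside random forest
m <- machine_learn(pima_diabetes, patient_id, outcome = diabetes)

# Extract glmnet coefficient estimates, and plot them
interpret(m)
plot(interpret(m))

# Importances from random forest or xgboost, whichever performs better
variable_importance(m)
```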

#### Changed

- `predict` can now write an extensive log file. When that option is activated, as in production, `predict` is a safe function that always completes: if an error occurs, it returns a zero-row data frame that is otherwise the same as what would have been returned (provided `prep_data` or `machine_learn` was used).
- Control how low variance must be to remove columns by providing a numeric value to the `remove_near_zero_variance` argument of `prep_data`.
- Fixed bug in missingness that caused very small values to round to zero.
- Messages about time required for model training are improved.
- `separate_drgs` returns `NA` for complication when the DRG is missing.
- Removed some redundant training data from `model_list` objects.
- `methods` is attached on attaching the package so that scripts operate the same in Rscript, R GUI, and R Studio.
- Minor changes to maintain compatibility with `ggplot2`, `broom`, and `recipes`.
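
A hedged sketch of the `remove_near_zero_variance` control described above (the threshold here is illustrative only, not a recommended default):

```r
library(healthcareai)

# Provide a numeric value to control how low variance must be
# before a column is removed during data preparation
d <- prep_data(pima_diabetes, patient_id, outcome = diabetes,
               remove_near_zero_variance = 0.02)
```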

#### Removed

- Removed support for k-nearest neighbors.
- Removed support for the maxstat splitting rule in random forests.

# healthcareai 2.0.0

A whole new architecture featuring a simpler API, more rigor under the hood, and attractive plots.
2 changes: 1 addition & 1 deletion R/globals.R
@@ -8,7 +8,7 @@ utils::globalVariables(c("outside", "percent_missing", "variable", ".outcome",
"any_valid", "badness", "best_levels", "fraction_positive",
"log_dist_from_in_all", "log_loss", "mean_ssd",
"predictor_of", "present_in", "original_data_str",
- "coefficient"))
+ "coefficient", "model", "obs"))


printer <- utils::getFromNamespace("printer", "recipes")
2 changes: 1 addition & 1 deletion R/interpret.R
@@ -29,7 +29,7 @@
#' @seealso \code{\link{plot.interpret}}
#'
#' @examples
-#' m <- machine_learn(pima_diabetes, patient_id, outcome = age, models = "glm")
+#' m <- machine_learn(pima_diabetes, patient_id, outcome = diabetes, models = "glm")
#' interpret(m)
#' interpret(m, .2)
#' interpret(m) %>%
2 changes: 1 addition & 1 deletion R/tune_models.R
@@ -65,7 +65,7 @@
#' # Prepare data for tuning
#' d <- prep_data(pima_diabetes, patient_id, outcome = diabetes)
#'
-#' # Tune random forest and k-nearest neighbors classification models
+#' # Tune random forest, xgboost, and regularized regression classification models
#' m <- tune_models(d, outcome = diabetes)
#'
#' # Get some info about the tuned models
9 changes: 5 additions & 4 deletions README.md
@@ -35,21 +35,22 @@ line of code:
``` r
models <- machine_learn(pima_diabetes, patient_id, outcome = diabetes)
models
-# > Algorithms Trained: Random Forest, k-Nearest Neighbors, glmnet
+# > Algorithms Trained: Random Forest, eXtreme Gradient Boosting, and glmnet
# > Model Name: diabetes
# > Target: diabetes
# > Class: Classification
# > Performance Metric: AUROC
# > Number of Observations: 768
# > Number of Features: 12
-# > Models Trained: 2018-06-08 19:10:01
+# > Models Trained: 2018-06-29 17:19:43
# >
# > Models tuned via 5-fold cross validation over 10 combinations of hyperparameter values.
# > Best model: Random Forest
# > AUPR = 0.71, AUROC = 0.84
# > Optimal hyperparameter values:
-# > mtry = 4
+# > mtry = 2
# > splitrule = extratrees
-# > min.node.size = 8
+# > min.node.size = 6
```

Make predictions and examine predictive performance:
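
The README's prediction code is collapsed in this diff view; a plausible sketch of that step, based on the package's `predict` method for `model_list` objects shown elsewhere in this commit (the plotting call is an assumption):

```r
library(healthcareai)

models <- machine_learn(pima_diabetes, patient_id, outcome = diabetes)

# Out-of-fold predictions on the training data
predictions <- predict(models)

# Examine predictive performance visually
plot(predictions)
```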
4 changes: 3 additions & 1 deletion _pkgdown.yml
@@ -53,8 +53,9 @@ reference:
- predict.model_list
- title: Model Interpretation
contents:
-  - evaluate
   - get_variable_importance
+  - interpret
+  - evaluate
- title: Visualization
contents:
- plot.model_list
@@ -64,6 +65,7 @@
- control_chart
- title: Data Preparation
contents:
- add_best_levels
- prep_data
- impute
- split_train_test
90 changes: 56 additions & 34 deletions docs/CONTRIBUTING.html

45 changes: 28 additions & 17 deletions docs/LICENSE-text.html