diff --git a/articles/aorsf.html b/articles/aorsf.html index f3961c84..3de319d0 100644 --- a/articles/aorsf.html +++ b/articles/aorsf.html @@ -114,10 +114,8 @@
Everything in aorsf
begins with the orsf()
function. Here we begin with an oblique RF for survival using the
-pbc_orsf
data, an oblique RF for classification using the
-penguins_orsf
data, and FILL IN FOR REGRESSION. Note that
-n_tree
is 5 for convenience in these examples, but should
-be >= 500 in practice.
pbc_orsf
data. Note that n_tree
is 5 for
+convenience in these examples, but should be >= 500 in practice.
library(aorsf)
@@ -136,15 +134,18 @@ Oblique RFs for
#> N trees: 5
#> N predictors total: 17
#> N predictors per node: 5
-#> Average leaves per tree: 19.8
+#> Average leaves per tree: 20.8
#> Min observations in leaf: 5
#> Min events in leaf: 1
-#> OOB stat value: 0.76
+#> OOB stat value: 0.82
#> OOB stat type: Harrell's C-index
#> Variable importance: anova
#>
-#> -----------------------------------------
-
+#> -----------------------------------------
Next, an oblique RF for classification using the
+penguins_orsf
data:
+
# An oblique classification RF
penguin_fit <- orsf(data = penguins_orsf,
n_tree = 5,
@@ -159,16 +160,17 @@ Oblique RFs for
#> N trees: 5
#> N predictors total: 7
#> N predictors per node: 3
-#> Average leaves per tree: 6
+#> Average leaves per tree: 5.8
#> Min observations in leaf: 5
#> OOB stat value: 0.99
#> OOB stat type: AUC-ROC
#> Variable importance: anova
#>
-#> -----------------------------------------
-
+#> -----------------------------------------
And for regression, we use the mtcars
data:
+
# An oblique regression RF
-
cars_fit <- orsf(data = mtcars,
n_tree = 5,
formula = mpg ~ .)
@@ -181,9 +183,9 @@ Oblique RFs for
#> N trees: 5
#> N predictors total: 10
#> N predictors per node: 4
-#> Average leaves per tree: 4.8
+#> Average leaves per tree: 5.2
#> Min observations in leaf: 5
-#> OOB stat value: 0.75
+#> OOB stat value: 0.70
#> OOB stat type: RSQ
#> Variable importance: anova
#>
@@ -192,7 +194,7 @@ Oblique RFs for
data
. This is a design choice that makes it easier to use
orsf
with pipes (i.e., %>%
or
|>
). For instance,
-
+
library(dplyr)
@@ -217,48 +219,46 @@ Variable importance
+
orsf_vi_negate(pbc_fit)
-#> bili copper age protime spiders
-#> 0.1168221744 0.0640918012 0.0318717527 0.0295703184 0.0199482278
-#> ascites stage trig ast hepato
-#> 0.0145030496 0.0138362817 0.0093934850 0.0081600305 0.0081045745
-#> edema albumin trt chol platelet
-#> 0.0074171879 0.0070565813 0.0049965458 0.0043845830 0.0007886543
-#> sex alk.phos
-#> -0.0023614972 -0.0040932561
+#> bili sex protime age trig hepato
+#> 0.091256795 0.037885991 0.033193742 0.025753962 0.024653311 0.023304006
+#> copper platelet albumin chol stage ascites
+#> 0.021780249 0.018800353 0.018477750 0.016248435 0.015860457 0.015326803
+#> ast edema trt spiders alk.phos
+#> 0.010218841 0.005566226 0.004845179 -0.003034859 -0.003217300
You can also compute variable importance using
permutation, a more classical approach that noises up a
predictor and then assigns the resulting degradation in prediction
accuracy to be the importance of that predictor.
-
+
orsf_vi_permute(pbc_fit)
-#> bili copper age ascites albumin
-#> 0.0681612536 0.0264039589 0.0154990015 0.0145135549 0.0128863883
-#> ast spiders stage edema protime
-#> 0.0112889819 0.0042083643 0.0036260906 0.0031464934 0.0029252926
-#> trt platelet sex hepato chol
-#> -0.0002451595 -0.0002523982 -0.0005419264 -0.0010103185 -0.0012341940
-#> alk.phos trig
-#> -0.0033725370 -0.0039837212
+#> bili albumin age copper ascites
+#> 0.0504199752 0.0249318229 0.0236104615 0.0185219224 0.0166280749
+#> protime spiders chol ast stage
+#> 0.0105379706 0.0093098117 0.0085422793 0.0066676560 0.0045685176
+#> trig edema hepato sex platelet
+#> 0.0043659357 0.0040124274 0.0026238052 0.0023832364 0.0006564563
+#> alk.phos trt
+#> -0.0041506586 -0.0052309067
A faster alternative to permutation and negation importance is
ANOVA importance, which computes the proportion of times each variable
obtains a low p-value (p < 0.01) while the forest is grown.
-
+
orsf_vi_anova(pbc_fit)
-#> ascites copper albumin bili edema age hepato
-#> 0.50000000 0.41176471 0.35294118 0.35294118 0.29417989 0.26315789 0.23529412
-#> spiders protime chol stage alk.phos ast platelet
-#> 0.21428571 0.21052632 0.16666667 0.13333333 0.06250000 0.05263158 0.04545455
-#> trig sex trt
-#> 0.04545455 0.00000000 0.00000000
+#> edema ascites bili spiders stage copper sex
+#> 0.43990930 0.36363636 0.33333333 0.30000000 0.29411765 0.26086957 0.16666667
+#> platelet ast age hepato trig chol protime
+#> 0.15000000 0.15000000 0.14285714 0.13636364 0.13043478 0.12500000 0.12000000
+#> alk.phos albumin trt
+#> 0.05882353 0.05263158 0.00000000
@@ -289,7 +289,7 @@ What about the original ORSF?
+
orsf_net <- orsf(data = pbc_orsf,
formula = Surv(time, status) ~ . - id,
diff --git a/articles/fast.html b/articles/fast.html
index b125925e..d9725d8f 100644
--- a/articles/fast.html
+++ b/articles/fast.html
@@ -127,7 +127,7 @@ Don’t specify a control
# control_fast() is much faster
time_net['elapsed'] / time_fast['elapsed']
#> elapsed
-#> 53.4
+#> 51.35
n_thread
@@ -155,7 +155,7 @@ n_thread
#> N trees: 5
#> N predictors total: 17
#> N predictors per node: 5
-#> Average leaves per tree: 21.6
+#> Average leaves per tree: 20
#> Min observations in leaf: 5
#> Min events in leaf: 1
#> OOB stat value: 0.78
@@ -206,7 +206,7 @@ The out-of-bag estimate of 1 (the default method to evaluate -out-of-bag predictions) is 0.7923697.
+out-of-bag predictions) is 0.8011103.