From a4f379e34dfaaac590c147fc4f67ab02c5862985 Mon Sep 17 00:00:00 2001
From: bcjaeger Oblique RFs for
prediction accuracy to more computational approaches.
Everything in aorsf
begins with the orsf()
function. Here we begin with an oblique RF for survival using the
-pbc_orsf
data, an oblique RF for classification using the
-penguins_orsf
data, and FILL IN FOR REGRESSION. Note that
-n_tree
is 5 for convenience in these examples, but should
-be >= 500 in practice.
pbc_orsf
data. Note that n_tree
is 5 for
+convenience in these examples, but should be >= 500 in practice.
library(aorsf)
@@ -136,15 +134,18 @@ Oblique RFs for
#> N trees: 5
#> N predictors total: 17
#> N predictors per node: 5
-#> Average leaves per tree: 19.8
+#> Average leaves per tree: 20.8
#> Min observations in leaf: 5
#> Min events in leaf: 1
-#> OOB stat value: 0.76
+#> OOB stat value: 0.82
#> OOB stat type: Harrell's C-index
#> Variable importance: anova
#>
-#> -----------------------------------------
-
+#> -----------------------------------------
Next, an oblique RF for classification using the
+penguins_orsf
data:
+
# An oblique classification RF
penguin_fit <- orsf(data = penguins_orsf,
n_tree = 5,
@@ -159,16 +160,17 @@ Oblique RFs for
#> N trees: 5
#> N predictors total: 7
#> N predictors per node: 3
-#> Average leaves per tree: 6
+#> Average leaves per tree: 5.8
#> Min observations in leaf: 5
#> OOB stat value: 0.99
#> OOB stat type: AUC-ROC
#> Variable importance: anova
#>
-#> -----------------------------------------
-
+#> -----------------------------------------
and for regression, we use the mtcars
data:
+
# An oblique regression RF
-
cars_fit <- orsf(data = mtcars,
n_tree = 5,
formula = mpg ~ .)
@@ -181,9 +183,9 @@ Oblique RFs for
#> N trees: 5
#> N predictors total: 10
#> N predictors per node: 4
-#> Average leaves per tree: 4.8
+#> Average leaves per tree: 5.2
#> Min observations in leaf: 5
-#> OOB stat value: 0.75
+#> OOB stat value: 0.70
#> OOB stat type: RSQ
#> Variable importance: anova
#>
@@ -192,7 +194,7 @@ Oblique RFs for
data
. This is a design choice that makes it easier to use
orsf
with pipes (i.e., %>%
or
|>
). For instance,
-
+
library(dplyr)
@@ -217,48 +219,46 @@ Variable importance
+
orsf_vi_negate(pbc_fit)
-#> bili copper age protime spiders
-#> 0.1168221744 0.0640918012 0.0318717527 0.0295703184 0.0199482278
-#> ascites stage trig ast hepato
-#> 0.0145030496 0.0138362817 0.0093934850 0.0081600305 0.0081045745
-#> edema albumin trt chol platelet
-#> 0.0074171879 0.0070565813 0.0049965458 0.0043845830 0.0007886543
-#> sex alk.phos
-#> -0.0023614972 -0.0040932561
+#> bili sex protime age trig hepato
+#> 0.091256795 0.037885991 0.033193742 0.025753962 0.024653311 0.023304006
+#> copper platelet albumin chol stage ascites
+#> 0.021780249 0.018800353 0.018477750 0.016248435 0.015860457 0.015326803
+#> ast edema trt spiders alk.phos
+#> 0.010218841 0.005566226 0.004845179 -0.003034859 -0.003217300
You can also compute variable importance using
permutation, a more classical approach that noises up a
predictor and then assigned the resulting degradation in prediction
accuracy to be the importance of that predictor.
-
+
orsf_vi_permute(pbc_fit)
-#> bili copper age ascites albumin
-#> 0.0681612536 0.0264039589 0.0154990015 0.0145135549 0.0128863883
-#> ast spiders stage edema protime
-#> 0.0112889819 0.0042083643 0.0036260906 0.0031464934 0.0029252926
-#> trt platelet sex hepato chol
-#> -0.0002451595 -0.0002523982 -0.0005419264 -0.0010103185 -0.0012341940
-#> alk.phos trig
-#> -0.0033725370 -0.0039837212
+#> bili albumin age copper ascites
+#> 0.0504199752 0.0249318229 0.0236104615 0.0185219224 0.0166280749
+#> protime spiders chol ast stage
+#> 0.0105379706 0.0093098117 0.0085422793 0.0066676560 0.0045685176
+#> trig edema hepato sex platelet
+#> 0.0043659357 0.0040124274 0.0026238052 0.0023832364 0.0006564563
+#> alk.phos trt
+#> -0.0041506586 -0.0052309067
A faster alternative to permutation and negation importance is
ANOVA importance, which computes the proportion of times each variable
obtains a low p-value (p < 0.01) while the forest is grown.
-
+
orsf_vi_anova(pbc_fit)
-#> ascites copper albumin bili edema age hepato
-#> 0.50000000 0.41176471 0.35294118 0.35294118 0.29417989 0.26315789 0.23529412
-#> spiders protime chol stage alk.phos ast platelet
-#> 0.21428571 0.21052632 0.16666667 0.13333333 0.06250000 0.05263158 0.04545455
-#> trig sex trt
-#> 0.04545455 0.00000000 0.00000000
+#> edema ascites bili spiders stage copper sex
+#> 0.43990930 0.36363636 0.33333333 0.30000000 0.29411765 0.26086957 0.16666667
+#> platelet ast age hepato trig chol protime
+#> 0.15000000 0.15000000 0.14285714 0.13636364 0.13043478 0.12500000 0.12000000
+#> alk.phos albumin trt
+#> 0.05882353 0.05263158 0.00000000
@@ -289,7 +289,7 @@ What about the original ORSF?
+
orsf_net <- orsf(data = pbc_orsf,
formula = Surv(time, status) ~ . - id,
diff --git a/articles/fast.html b/articles/fast.html
index b125925e..d9725d8f 100644
--- a/articles/fast.html
+++ b/articles/fast.html
@@ -127,7 +127,7 @@ Don’t specify a control
# control_fast() is much faster
time_net['elapsed'] / time_fast['elapsed']
#> elapsed
-#> 53.4
+#> 51.35
n_thread
@@ -155,7 +155,7 @@ n_thread
#> N trees: 5
#> N predictors total: 17
#> N predictors per node: 5
-#> Average leaves per tree: 21.6
+#> Average leaves per tree: 20
#> Min observations in leaf: 5
#> Min events in leaf: 1
#> OOB stat value: 0.78
@@ -206,7 +206,7 @@ The out-of-bag estimate of 1 (the default method to evaluate -out-of-bag predictions) is 0.7923697.
+out-of-bag predictions) is 0.8011103.
zl2kZFsC`7uW1kzE;c2Yw?m{nm@THug&d~b~eXXgO4`a4gs`oJIdN1!Z2S^Yn1jK&x
z-Ll!#;VbFmw_aC6`p{1uN9WJ$`Hk!M$|X?!*ysF}WQOOP7GoQrf#A9c_t{DeF}$ZH
zGMU>(KFyV=cIaY#D^RA@&aI4`Xk%N316WOlk@iA5=25~Kq<%@pz+4#Ax_e`eX%%B8
zKiU#%OP)n>W)axN9uatRt$td>5P>&9ITa_*oPiqZlvE^LvEG*3)8L?Wx1^3RY{koq
z25&4pV>52mef0z& d;84&QNc8usS5HEDd?I=K{tB9S`feJm37qY
zX9+(7AeZaoac*jDdL_AKDVB@WFIy h=)P|u&mKHnag=*KM!hGO_NLW%gK5&z9EY)nU6?43tQ`d_k_UX6=_n_Pxkq
z>Oujx(-9LDYKE-sSK{=i&1e>;v8olDwsmJmq7f?@)}%U=nKlVW%L@&3EFa5JY$Ctj
z@uyxy)V~Yx2+feZkVgc~513J}hEOu|RCZ-69DKdgp{OuMO80{aFIwv?@EB?9#u}mz
ztUI>nax?0j0?*>4tu6kR!A|=VX6ccj(2>b)8=PDF8fPKcvE$o<#!PzHQu4i44SwuO_cSpWW8svY+5QzS+J|RF8bZZaHjw{amPXFuTR=@Ls
z`kG04MWuX+15j50kFS*b@9Eckzuu++*)q6rjmSuNw;?*vD*>=WXU_`G0%KZi&JO#bxG(`{9ojR}R!8{vkvF~o(&I7ip>z$q8R
zNmrz!oQ>{19nrHL_(av#vPrFQ({l6@o!0|ED(>vc1IM~n&Y+|1B)_iuuLmUnZmWapZQ7G+~0ZzjEVR{&+29r^j~y=BQQXxB43uZniz{bbXkr
z**7C*81?%^V2}u$b)6%reiO9CFwi_1apwo3*xl=oK6?ps>=rL0K|PkxvYAh}L@z1k
zdMdRvZmugbR}_lY1oX~-2F5PH@;6x|huzz76ZQi4j`10aEw2v^*H6tX&yQqP&?gh7
zKF7U`xh*4c0lWBlc_lg%e^Gc+hem*XiP6+qm*r}FOr(x2+>vccB9yz`TAA7Eq)JMgN5cpapspiM098e)f2op>nL670sFD&>qIaHi
z$K1rn2gwtJBh^|>_i08<=r+JMwm8(p8pC*aa>)-*D#eQz;^AP=5qi
zyv76puH(ucF
zdM2~*89Kv<)9p`-jR&|{nG
z_&WhXeRM!X74kBA+qAVWi*AB=7V>EU0zfMp;*vJ~=vgRNjeI7Si?V5+;YaPDftVL!
zIpto!SfH~2A2ReB;*vRy!)!lzrp^oTjw-47;U)?p{1eGqd-U$M7ts313q_l&y{~~!
z{3h6ypCeSL`v@#gZ+UChiL<}c!G|r7MSEcZXdZ?EQXK`g(Y`EOgCREo05_iK1Jp-G
zOc0@ix>ItgKQy{jJuN~w)Cip?{#g~WHn2aL6F9sP%hHms$$hiN=N6=X6jbkyMjd-j
z@_sh6VX>Chv^j{5D){%?f!*9mqbXV3of>HDXW5>c4uVN&bxBY?t9z4`Yri65H0APM
zuyb`hz+4gFdDHLIoR9q+IQoy# >q{bfe#7$-KSm=AF!KyhagmI5eNY%Diy}u6AxOdk{p?RJw8pwIrfKc8
zDzS%i?u6QtnzPLWLbCH^J~K<90B)`3(r$~z70KG~Td<&5-V4f*!ZQeis)d*MBkbJ|
zKBOA(>WQ{YHEH6dZO
Cv}>A^KLz#o2X!_C`PfqFCy}aVO|x!J-x)Y=>=)pm
z{6<^{oCWw#b&!n^Zi6T)x8DEAKNvVeL4-nQaI{(|zrkmfs$6Q5&%F+-