From 62f2a7201d302b12e03394b5adc37793017b7e06 Mon Sep 17 00:00:00 2001 From: blind-contours Date: Fri, 17 Feb 2023 10:40:36 -0800 Subject: [PATCH] update again. --- paper/paper.bib | 20 ++++++++------------ paper/paper.md | 4 ++-- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 2401e0e..ac3ce47 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -40,18 +40,14 @@ @article{Bobb2014 volume = {16}, year = {2014} } -@article{keil2020, -author = {{Alexander P. Keil, Jessie P. Buckley, Katie M. O'Brien, Kelly K. Ferguson, Shanshan Zhao}, and Alexandra J. White}, -doi = {10.1289/EHP8739}, -file = {:Users/davidmccoy/Downloads/EHP5838.pdf:pdf}, -issn = {15529924}, -journal = {Environmental Health Perspectives}, -number = {3}, -pages = {1--10}, -pmid = {33688746}, -title = {{A quantile-based g-computation approach to addressing the effects of exposure mixtures}}, -volume = {129}, -year = {2021} +@article{keil2020quantile, + title={A quantile-based g-computation approach to addressing the effects of exposure mixtures}, + author={Keil, Alexander P and Buckley, Jessie P and O'Brien, Katie M and Ferguson, Kelly K and Zhao, Shanshan and White, Alexandra J}, + journal={Environmental Health Perspectives}, + volume={128}, + number={4}, + pages={047004}, + year={2020} } @article{Athey2016, diff --git a/paper/paper.md b/paper/paper.md index 0e78aa2..082f8c7 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,13 +30,13 @@ bibliography: paper.bib # Summary -Statistical causal inference of mixed exposures has been limited by reliance on parametric models and, until recently, by researchers considering only one exposure at a time, usually estimated as a beta coefficient in a generalized linear regression model (GLM). This independent assessment of exposures poorly estimates the joint impact of a collection of the same exposures in a realistic exposure setting. 
Marginal methods for mixture variable selection such as ridge/lasso regression are biased by linear assumptions and the interactions modeled are chosen by the user. Clustering methods such as principal component regression lose both interpretability and valid inference. Newer mixture methods such as quantile g-computation [@keil2020] are biased by linear/additive assumptions. More flexible methods such as Bayesian kernel machine regression (BKMR)[@Bobb2014] are sensitive to the choice of tuning parameters, are computationally taxing and lack an interpretable and robust summary statistic of dose-response relationships. No methods currently exist which finds the best flexible model to adjust for covariates while applying a non-parametric model that targets for interactions in a mixture and delivers valid inference for a target parameter. Non-parametric methods such as decision trees are a useful tool to evaluate combined exposures by finding partitions in the joint-exposure (mixture) space that best explain the variance in an outcome. However, current methods using decision trees to assess statistical inference for interactions are biased and are prone to overfitting by using the full data to both identify nodes in the tree and make statistical inference given these nodes. Other methods have used an independent test set to derive inference which does not use the full data. The `CVtreeMLE` `R` package provides researchers in (bio)statistics, epidemiology, and environmental health sciences with access to state-of-the-art statistical methodology for evaluating the causal effects of a data-adaptively determined mixed exposure using decision trees. Our target audience are those analysts who would normally use a potentially biased GLM based model for a mixed exposure. 
Instead, we hope to provide users with a non-parametric statistical machine where users simply specify the exposures, covariates and outcome, `CVtreeMLE` then determines if a best fitting decision tree exists and delivers interpretable results. +Statistical causal inference of mixed exposures has been limited by reliance on parametric models and, until recently, by researchers considering only one exposure at a time, usually estimated as a beta coefficient in a generalized linear regression model (GLM). This independent assessment of exposures poorly estimates the joint impact of a collection of the same exposures in a realistic exposure setting. Marginal methods for mixture variable selection such as ridge/lasso regression are biased by linear assumptions and the interactions modeled are chosen by the user. Clustering methods such as principal component regression lose both interpretability and valid inference. Newer mixture methods such as quantile g-computation [@keil2020quantile] are biased by linear/additive assumptions. More flexible methods such as Bayesian kernel machine regression (BKMR) [@Bobb2014] are sensitive to the choice of tuning parameters, are computationally taxing and lack an interpretable and robust summary statistic of dose-response relationships. No method currently exists that finds the best flexible model to adjust for covariates while applying a non-parametric model that targets interactions in a mixture and delivers valid inference for a target parameter. Non-parametric methods such as decision trees are a useful tool to evaluate combined exposures by finding partitions in the joint-exposure (mixture) space that best explain the variance in an outcome. However, current methods using decision trees to assess statistical inference for interactions are biased and prone to overfitting because they use the full data to both identify nodes in the tree and make statistical inference given these nodes. 
Other methods have used an independent test set to derive inference, an approach that does not use the full data. The `CVtreeMLE` `R` package provides researchers in (bio)statistics, epidemiology, and environmental health sciences with access to state-of-the-art statistical methodology for evaluating the causal effects of a data-adaptively determined mixed exposure using decision trees. Our target audience is analysts who would normally use a potentially biased GLM-based model for a mixed exposure. Instead, we hope to provide users with a non-parametric statistical machine where users simply specify the exposures, covariates and outcome; `CVtreeMLE` then determines whether a best-fitting decision tree exists and delivers interpretable results. Although users do not need strong knowledge of the underlying theory, `CVtreeMLE` builds off the general theorem of cross-validated minimum loss-based estimation (CV-TMLE) which allows for the full utilization of loss-based ensemble machine learning to obtain the initial estimators needed for our target parameter without risk of overfitting. `CVtreeMLE` uses V-fold cross-validation and partitions the full data into parameter-generating samples and estimation samples. For example, when V=10, integers 1-10 are randomly assigned to each observation with equal probability. In fold 1, observations assigned to 1 are used in the estimation sample and all other observations are used in the parameter-generating sample. This process rotates through the data until all the folds are complete. In the parameter-generating sample, decision trees are applied to a mixed exposure to obtain rules and estimators are created for our statistical target parameter. The rules from decision trees are then applied to the estimation sample where the statistical target parameter is estimated. 
`CVtreeMLE` makes possible the non-parametric estimation of the causal effects of a mixed exposure producing results that are both interpretable and guaranteed to converge to the truth (under assumptions) at a particular rate as sample size increases. Additionally, `CVtreeMLE` allows for discovery of important mixtures of exposure *and also* provides robust statistical inference for the impact of these mixtures. # Statement of Need -In many disciplines there is a demonstrable need to ascertain the causal effects of a mixed exposure. Advancement in the area of mixed exposures is challenged by real-world joint exposure scenarios where complex agonistic or antagonistic relationships between mixture components can occur. More flexible methods which can fit these interactions may be less biased, but results are typically difficult to interpret, which has led researchers to favor more biased methods based on GLM's. Current software tools for mixtures rarely report performance tests using data that reflect the complexities of real-world exposures [@Yu2022; @keil2020; @carlin2013unraveling]. In many instances, new methods are not tested against a ground-truth target parameter under various mixture conditions. New areas of statistical research, rooted in non/semi-parametric efficiency theory for statistical functionals, allow for robust estimation of data-adaptive parameters. That is, it is possible to use the data to both define and estimate a target parameter. This is important in mixtures when the most important set of variables and levels in these variables are almost always unknown. Thus, the development of asymptotically linear estimators for data-adaptive parameters are critical for the field of mixed exposure statistics. However, the development of open-source software which translates semi-parametric statistical theory into well-documented functional software is a formidable challenge. 
Such implementation requires understanding of causal inference, semi-parametric statistical theory, machine learning, and the intersection of these disciplines. The `CVtreeMLE` `R` package provides researchers with an open-source tool for evaluating the causal effects of a mixed exposure by treating decision trees as a data-adaptive target parameter to define exposure. The `CVtreeMLE` package is well documented and includes a vignette detailing semi-parametric theory for data-adaptive parameters, examples of output, results with interpretations under various real-life mixture scenarios, and comparison to existing methods. +In many disciplines there is a demonstrable need to ascertain the causal effects of a mixed exposure. Advancement in the area of mixed exposures is challenged by real-world joint exposure scenarios where complex agonistic or antagonistic relationships between mixture components can occur. More flexible methods which can fit these interactions may be less biased, but results are typically difficult to interpret, which has led researchers to favor more biased methods based on GLMs. Current software tools for mixtures rarely report performance tests using data that reflect the complexities of real-world exposures [@Yu2022; @keil2020quantile; @carlin2013unraveling]. In many instances, new methods are not tested against a ground-truth target parameter under various mixture conditions. New areas of statistical research, rooted in non/semi-parametric efficiency theory for statistical functionals, allow for robust estimation of data-adaptive parameters. That is, it is possible to use the data to both define and estimate a target parameter. This is important for mixtures, where the most important set of variables and the levels of these variables are almost always unknown. Thus, the development of asymptotically linear estimators for data-adaptive parameters is critical for the field of mixed exposure statistics. 
However, the development of open-source software which translates semi-parametric statistical theory into well-documented functional software is a formidable challenge. Such implementation requires understanding of causal inference, semi-parametric statistical theory, machine learning, and the intersection of these disciplines. The `CVtreeMLE` `R` package provides researchers with an open-source tool for evaluating the causal effects of a mixed exposure by treating decision trees as a data-adaptive target parameter to define exposure. The `CVtreeMLE` package is well documented and includes a vignette detailing semi-parametric theory for data-adaptive parameters, examples of output, results with interpretations under various real-life mixture scenarios, and comparison to existing methods. # Background