diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index fc1b284..c651df5 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -15,29 +15,14 @@ jobs: fail-fast: false matrix: version: - - '1.9' + - '1' os: - ubuntu-latest + - macOS-latest arch: - - x64 + - 'x64' steps: - uses: actions/checkout@v2 - - name: Install curl - run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev libharfbuzz-dev libfribidi-dev - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - r-version: 4.2.3 - - name: Install R dependencies - run: | - install.packages("devtools", repos="http://cran.us.r-project.org", dependecies=TRUE) - require(devtools) - install_version("hal9001", version = "0.4.1", repos = "http://cran.us.r-project.org") - shell: Rscript {0} - - name: Set R_HOME - run: echo "R_HOME=$(R RHOME)" >> $GITHUB_ENV - - name: Set LD_LIBRARY - run: echo "LD_LIBRARY_PATH=$R_HOME/lib" >> $GITHUB_ENV - uses: julia-actions/setup-julia@v1 with: version: ${{ matrix.version }} @@ -45,10 +30,16 @@ jobs: - uses: julia-actions/cache@v1 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v2 with: files: lcov.info + - uses: julia-actions/julia-processcoverage@v1 + # - name: Build App + # run: julia --project --startup-file=no deps/build_app.jl + # - uses: actions/upload-artifact@v4 + # with: + # name: tmle-${{ matrix.os }}-${{ matrix.arch }} + # path: tmle docs: name: Documentation runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 0a77c2f..eb170c5 100644 --- a/.gitignore +++ b/.gitignore @@ -12,12 +12,14 @@ deps/build.log deps/downloads/ deps/usr/ deps/src/ +tmle/ # Build artifacts for creating documentation generated by the Documenter package docs/build/ docs/site/ +docs/Manifest.toml test/Manifest.toml -sysimage/Manifest.toml - Manifest.toml + +build/ diff --git a/Project.toml b/Project.toml index 43498b2..e968a6d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TargetedEstimation" uuid = "2573d147-4098-46ba-9db2-8608d210ccac" authors = ["Olivier Labayle"] -version = "0.7.4" +version = "0.8.0" [deps] ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" @@ -9,11 +9,13 @@ Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" +Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" GLMNet = "8d5ece8b-de18-5317-b113-243142960cc6" -HighlyAdaptiveLasso = "c5dac772-1445-43c4-b698-9440de7877f6" +HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" @@ -24,31 +26,36 @@ MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91" Mmap = "a63ad114-7e13-5084-954f-fe012c677804" MultipleTesting = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b" Optim = "429524aa-4258-5aef-a3af-852621145aeb" +PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" TMLE = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +YAML = 
"ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] +PackageCompiler = "2.1.16" ArgParse = "1.1.4" Arrow = "2.5.2" CSV = "0.10" CategoricalArrays = "0.10" Combinatorics = "1.0.2" +Configurations = "0.17.6" DataFrames = "1.3.4" -EvoTrees = "0.16" +EvoTrees = "0.16.5" GLMNet = "0.7" -HighlyAdaptiveLasso = "0.2.0" JLD2 = "0.4.22" +JSON = "0.21.4" MKL = "0.6" -MLJ = "0.19" -MLJBase = "0.21" -MLJLinearModels = "0.9" +MLJ = "0.20.0" +MLJBase = "1.0.1" +MLJLinearModels = "0.10.0" MLJModelInterface = "1.8.0" MLJModels = "0.16" MLJXGBoostInterface = "0.3.4" -MultipleTesting = "0.5.1" +MultipleTesting = "0.6.0" Optim = "1.7" -TMLE = "0.11.4" +TMLE = "0.14.0" Tables = "1.10.1" +YAML = "0.4.9" julia = "1.7, 1" diff --git a/data/sample_dataset.csv b/data/sample_dataset.csv new file mode 100644 index 0000000..6dbdd98 --- /dev/null +++ b/data/sample_dataset.csv @@ -0,0 +1,101 @@ +T,W_1,W_2,Y +false,0.18102554215580358,0.5450806703063027,2.9940372099784027 +false,0.3674899461902501,0.6384329204193084,3.6556468914971756 +false,0.6690584311411027,0.43792082835867485,3.6566892558262984 +false,0.04273056581197765,0.7757285573916282,3.4160794440612667 +false,0.4379909608992427,0.3047310991183463,2.789125420976082 +true,0.4832901297478609,0.7451730217771686,1.5859557539740305 +false,0.7639737291557767,0.8904601966212045,5.188296343573969 +true,0.9244830959536434,0.13974660123511873,0.4982270316308855 +false,0.5150451217580685,0.6374757520926937,3.935855162247756 +false,0.49504171746863457,0.8565155620522915,4.561765097844799 +false,0.7013399928855184,0.8136974906450547,4.829305702798148 +false,0.6935693200259092,0.7770630474285287,4.708377370910661 +false,0.8762727975157072,0.6985801341473621,4.853277788606795 +true,0.9442193089864095,0.152194194900529,4.037983372295075 +false,0.7392125921999604,0.15113069121626244,2.928937148645259 +false,0.5595446681103335,0.32420235015472953,3.0840500870314176 +true,0.5068756305502102,0.7933086736255439,4.003336818622111 +true,0.6460238520948196,0.09241238685905295,1.8598644023282838 +false,0.26637285700482627,0.32928469720586406,2.5254756391406956 +true,0.25314126112406954,0.005639780239579784,2.0525644282124227 +false,0.3516179276178317,0.562947511861783,3.3891190015292962 +true,0.5273890936851706,0.5791653992166597,2.236487556841665 +true,0.7210925534616786,0.0815702398426259,3.0907289031725105 +false,0.18851150251759496,0.4507062485489093,2.7103706395089318 +false,0.637996251650204,0.2796408144910678,3.1179620782873023 +false,0.42784339776115243,0.05608349255807643,2.021998670506748 +false,0.36344680674044993,0.8176529855849277,4.187058502288485 +false,0.5776470899697952,0.5038402847145775,3.664411058533266 +false,0.4552051132605446,0.80168727438732,4.324422128772599 +true,0.16582015869838074,0.6517102569667845,3.4493844928436257 +true,0.3485177133589714,0.6950429952352082,3.6465473336626584 +false,0.7370786697973803,0.38967172496607816,3.634455876824883 +false,0.8097943410644535,0.30785356657086305,3.5631085002530503 +true,0.17116662327378251,0.38189260775289746,1.4561452308524758 +true,0.2239059463776638,0.37754440958420843,2.9358196480011927 +false,0.8482049921559374,0.7931999164105743,5.073800536916948 +false,0.6129496106778634,0.5582656110841486,3.9006286335176275 +false,0.9525421237374148,0.7075371298070849,5.027958774493786 +false,0.329471340006501,0.8244276908646733,4.133141608706676 +true,0.9183731870724761,0.5155924027190455,2.1766450039509775 +true,0.5406318800132754,0.7964424508760488,4.684487622068989 
+false,0.820474440393214,0.3278374872665033,3.6201478925383053 +false,0.22304578643880224,0.15454141257308707,1.9141989300490125 +true,0.5987759444612732,0.8176931599179378,4.883829492724324 +true,0.5391280234619427,0.0800996880924989,-0.15332305595018275 +true,0.6195348270893413,0.04758713076380294,1.5044413418755769 +false,0.4197760589260566,0.6919387484370496,3.913552132910235 +false,0.45325909384306007,0.8196586617380355,4.361479188698554 +false,0.458012070794656,0.16787410906435518,2.3890361585879396 +false,0.8360692316060747,0.12572716005598905,3.0494749469842053 +false,0.68704221750134,0.9336977783694771,5.162792856806991 +false,0.3539590866764071,0.4938068514848526,3.201878898498728 +false,0.15146985093210463,0.9318499781184257,4.0825781392628055 +true,0.7036713552821277,0.3110022402796051,4.247461621524256 +false,0.7858058549340399,0.7913869099880062,4.9487597218140555 +false,0.5516353577049822,0.7651183843708445,4.394357280603648 +false,0.33689370624999193,0.8200595760169511,4.122787054444181 +false,0.7103550345192344,0.5218538906399544,3.9851980589961844 +false,0.3437537135972244,0.7082383555963896,3.8223674411348094 +true,0.40543796744015514,0.07340489667656125,-0.44867162036508734 +false,0.418787685820859,0.9537197956213714,4.671368501348916 +false,0.3461876693258523,0.17116028512837467,2.196479242014222 +false,0.256693308150987,0.7535261803886308,3.7658816939059676 +false,0.15717578481324845,0.9086295629550201,4.041940493212971 +false,0.06397027012871725,0.748570362698747,3.3694111418545805 +true,0.5960710257852946,0.6663504027833114,3.367016019904959 +true,0.3313524247810329,0.6591751071404244,1.591879202485502 +true,0.09653466861970061,0.6363397790684187,2.9197943852934563 +false,0.2444274100956212,0.2098958139673206,2.1388072276125003 +true,0.128071302925437,0.2212452403166849,0.2377448923043096 +true,0.42570257768498054,0.19674477205988938,1.387006507647053 +true,0.9265378351770237,0.2414821377114318,3.7979295737435383 +false,0.49186280724413045,0.12746729440853555,2.384783686379066 +false,0.13454928219280093,0.4781845302027954,2.710629287272798 +true,0.7767793160877585,0.524392439209832,2.5943698653629785 +false,0.5975255336244989,0.390410633670742,3.3711308436058625 +true,0.36037123968128437,0.08296139972284933,-0.759956439158037 +false,0.0346420282305675,0.0617778348993705,1.2471920828108296 +false,0.002639461613289207,0.7009785602029246,3.0993420302705896 +false,0.6043418433725678,0.6486688636856162,4.156769560508067 +false,0.34013328112005636,0.1345850741469954,2.08159579861953 +false,0.08884383382645145,0.3088038486943412,2.1056178972765487 +false,0.27042373335313585,0.006888583580566321,1.556777817484051 +false,0.2906905645217257,0.541835309258762,3.2236471704885887 +false,0.1159966466957052,0.5135297293779133,2.783128796238891 +true,0.8470732321945746,0.42816797170836707,2.0301351712154556 +true,0.8139519778944555,0.24560986612792113,1.7245624951948406 +false,0.060180250784984235,0.49673727324525174,2.622413005826214 +true,0.42137186429269047,0.16330846948665134,0.8216594084825344 +true,0.6798160152993227,0.5456659244000286,2.8674322144135957 +false,0.7229464588051613,0.43577650941532386,3.7550371808310166 +false,0.7375979790215319,0.9110656955098189,5.216039115140225 +false,0.5523948722167735,0.9189451331877909,4.858501050384263 +false,0.970875486702566,0.06627679288609234,3.143553167360774 +true,0.7978957981860126,0.16648221966941223,0.3279875381813927 +false,0.6832983780571866,0.6493963093415174,4.318015004991571 
+false,0.523122205661108,0.05689713675107577,2.230304870061633 +false,0.8553411083874956,0.7608458973060162,4.992917779986071 +false,0.2884613639525233,0.40667243126317154,2.79960851563243 +true,0.4312330027658198,0.24848292057152732,2.172648627086597 diff --git a/deps/build_app.jl b/deps/build_app.jl new file mode 100644 index 0000000..8e3ea90 --- /dev/null +++ b/deps/build_app.jl @@ -0,0 +1,6 @@ +using PackageCompiler +PackageCompiler.create_app(".", "tmle", + executables = ["tmle" => "julia_main"], + precompile_execution_file="deps/execute.jl", + include_lazy_artifacts=true +) diff --git a/deps/build_sysimage.jl b/deps/build_sysimage.jl new file mode 100644 index 0000000..208628a --- /dev/null +++ b/deps/build_sysimage.jl @@ -0,0 +1,7 @@ +using PackageCompiler +PackageCompiler.create_sysimage( + ["TargetedEstimation"], + cpu_target="generic", + sysimage_path="TMLESysimage.so", + precompile_execution_file="deps/execute.jl", +) diff --git a/sysimage/precompile_exec_file.jl b/deps/execute.jl similarity index 61% rename from sysimage/precompile_exec_file.jl rename to deps/execute.jl index a45912e..70f58fe 100644 --- a/sysimage/precompile_exec_file.jl +++ b/deps/execute.jl @@ -1,6 +1,7 @@ -import TargetedEstimation +using TargetedEstimation +@info "Running precompilation script." +# Run workload TEST_DIR = joinpath(pkgdir(TargetedEstimation), "test") push!(LOAD_PATH, TEST_DIR) -cd(TEST_DIR) include(joinpath(TEST_DIR, "runtests.jl")) \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 8634ee2..f711fc9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM julia:1.9.0-bullseye +FROM julia:1.10-bullseye ARG DEBIAN_FRONTEND=noninteractive @@ -6,42 +6,23 @@ ENV TZ=Europe/Amsterdam ENV JULIA_DEPOT_PATH=/opt -RUN apt-get update && apt-get install -y wget unzip procps +RUN apt-get update && \ + apt-get -y install gcc mono-mcs vim procps wget lsb-release software-properties-common gnupg && \ + rm -rf /var/lib/apt/lists/* -# Install R and hal9001 +RUN bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" -RUN apt-get install -y r-base \ - r-base-core \ - r-recommended \ - r-base-dev - -RUN apt-get install -y libssl-dev \ - libxml2-dev \ - libcurl4-openssl-dev \ - libgit2-dev \ - libharfbuzz-dev \ - libfribidi-dev \ - libfontconfig1-dev \ - libfreetype6-dev \ - libpng-dev \ - libtiff5-dev \ - libjpeg-dev - -RUN R -e "install.packages('devtools', repos='http://cran.us.r-project.org', dependecies=TRUE); \ - require(devtools);\ - install_version('hal9001', version = '0.4.1', repos = 'http://cran.us.r-project.org')" - -# Import project, build and precompile +# Import the project COPY . 
/TargetedEstimation.jl
 WORKDIR /TargetedEstimation.jl
 
-# Precompile project
-RUN julia -q --project -e'using Pkg; Pkg.instantiate(); Pkg.resolve(); Pkg.precompile()'
+# Precompile the project
+RUN julia --project -e'using Pkg; Pkg.instantiate(); Pkg.resolve(); Pkg.precompile()'
 
-# Precompile Sysimage project
-RUN julia -q --project=/TargetedEstimation.jl/sysimage -e'using Pkg;Pkg.instantiate(); Pkg.resolve(); Pkg.precompile()'
+# Build CLI
+RUN julia --project --startup-file=no deps/build_sysimage.jl
 
-# Build Sysimage
-RUN julia --project -t auto --startup-file=no sysimage/create_sysimage.jl
+# Test the CLI runs
+RUN julia --startup-file=no --project -JTMLESysimage.so tmle.jl tmle data/sample_dataset.csv
\ No newline at end of file
diff --git a/docs/Project.toml b/docs/Project.toml
index 9aa1999..6932532 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -2,4 +2,4 @@
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 
 [compat]
-Documenter = "1.1.2"
\ No newline at end of file
+Documenter = "1.2.1"
diff --git a/docs/make.jl b/docs/make.jl
index 1403e7b..e278c5d 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -15,7 +15,7 @@ makedocs(
     modules = [TargetedEstimation],
     pages=[
         "Home" => "index.md",
-        "Command Line Interfaces" => ["environment.md", "tmle_estimation.md", "sieve_variance.md", "merge.md"],
+        "Command Line Interface" => ["cli.md", "tmle_estimation.md", "sieve_variance.md", "make_summary.md"],
         "MLJ Extensions" => ["models.md", "resampling.md"],
     ],
     pagesonly=true,
diff --git a/docs/src/cli.md b/docs/src/cli.md
new file mode 100644
index 0000000..17dc2d6
--- /dev/null
+++ b/docs/src/cli.md
@@ -0,0 +1,30 @@
+# The Command Line Interface (CLI)
+
+## CLI Installation
+
+### Via Docker (requires Docker)
+
+While we are getting close to providing a standalone application, the most reliable way to use the app is still via the provided [Docker container](https://hub.docker.com/r/olivierlabayle/targeted-estimation/tags). In this container, the command line interface is accessible and can be used directly. For example via:
+
+```bash
+docker run -it --rm -v HOST_DIR:CONTAINER_DIR olivierlabayle/targeted-estimation:TAG tmle --help
+```
+
+where `HOST_DIR:CONTAINER_DIR` will map the host directory `HOST_DIR` to the container's `CONTAINER_DIR` and `TAG` is the currently released version of the project.
+
+### Build (requires Julia)
+
+Alternatively, provided you have Julia installed, you can build the app via:
+
+```bash
+julia --project deps/build_app.jl
+```
+
+Below is a description of the functionalities offered by the CLI.
+
+## CLI Description
+
+```@contents
+Pages = ["tmle_estimation.md", "sieve_variance.md", "make_summary.md"]
+Depth = 5
+```
diff --git a/docs/src/environment.md b/docs/src/environment.md
deleted file mode 100644
index 24a40dc..0000000
--- a/docs/src/environment.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# The Run Environment
-
-## General usage
-
-At this point in time, the package depends on several R dependencies which makes it difficult to package as a single Julia executable. We thus rely on a docker container for the execution of the various command line interfaces. Some familiarity with [Docker](https://docs.docker.com/get-started/) or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/quick_start.html) is thus beneficial.
-
-- The container is available for download from the [Docker registry](https://hub.docker.com/r/olivierlabayle/targeted-estimation/tags).
-- In this container, the project is stored in `/TargetedEstimation.jl`, as such, any script can be run using the following template command: `julia --startup-file=no --project=/TargetedEstimation.jl /TargetedEstimation.jl/scripts/SCRIPT_NAME.jl`. Dont forget to mount the output directory in order to retrieve the output data.
-
-Example Docker command:
-
-```bash
-docker run -it --rm -v HOST_DIR:CONTAINER_DIR olivierlabayle/targeted-estimation:0.7 \
-julia --project=/TargetedEstimation.jl /TargetedEstimation.jl/scripts/tmle.jl --help
-```
-
-## Alternatives
-
-Here are a couple alternatives to using the Docker container:
-
-- If you are not using the HAL algorithm, you can simply clone this repository and instantiate the project in order to use the scripts or any other functionality.
-- If you are using the HAL algorithm you can use the `docker/Dockerfile` as a guide for your local installation.
diff --git a/docs/src/estimators/glm.jl b/docs/src/estimators/glm.jl
deleted file mode 100644
index 6aea32a..0000000
--- a/docs/src/estimators/glm.jl
+++ /dev/null
@@ -1,14 +0,0 @@
-tmle_spec = (
-    # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage
-    cache = false,
-    # Controls whether the fluctuation is weighted or not
-    weighted_fluctuation = false,
-    # Propensity score threshold
-    threshold = 1e-8,
-    # For the estimation of E[Y|W, T]: continuous target
-    Q_continuous = LinearRegressor(),
-    # For the estimation of E[Y|W, T]: binary target
-    Q_binary = LogisticClassifier(lambda=0.),
-    # For the estimation of p(T| W)
-    G = LogisticClassifier(lambda=0.)
-)
\ No newline at end of file
diff --git a/docs/src/estimators/glmnet.jl b/docs/src/estimators/glmnet.jl
deleted file mode 100644
index 27a89f3..0000000
--- a/docs/src/estimators/glmnet.jl
+++ /dev/null
@@ -1,14 +0,0 @@
-tmle_spec = (
-    # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage
-    cache = false,
-    # Controls whether the fluctuation is weighted or not
-    weighted_fluctuation = false,
-    # Propensity score threshold
-    threshold = 1e-8,
-    # For the estimation of E[Y|W, T]: continuous target
-    Q_continuous = GLMNetRegressor(resampling=CV(nfolds=3)),
-    # For the estimation of E[Y|W, T]: binary target
-    Q_binary = GLMNetClassifier(resampling=StratifiedCV(nfolds=3)),
-    # For the estimation of p(T| W)
-    G = GLMNetClassifier(resampling=StratifiedCV(nfolds=3))
-)
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index 10952d5..84cd4cb 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,11 +1,8 @@
 # TargetedEstimation.jl
 
-The goal of this package, eventually, is to provide a standalone executable to run large scale Targeted Minimum Loss-based Estimation ([TMLE](https://link.springer.com/book/10.1007/978-1-4419-9782-1)) on tabular datasets. To learn more about TMLE, please visit [TMLE.jl](https://targene.github.io/TMLE.jl/stable/), the companion package.
+The goal of this package is to provide a standalone executable to run large scale Targeted Minimum Loss-based Estimation ([TMLE](https://link.springer.com/book/10.1007/978-1-4419-9782-1)) on tabular datasets. To learn more about TMLE, please visit [TMLE.jl](https://targene.github.io/TMLE.jl/stable/), the companion package.
-The various command line interfaces provided here are described in the following sections and can be run in the associated [Docker container](https://hub.docker.com/r/olivierlabayle/targeted-estimation/tags):
-
-- [Targeted Minimum Loss Based Estimation](@ref): The main command line interface provided in this project to run TMLE.
-- [Sieve Variance Plateau Estimation](@ref): Variance correction for non i.i.d. data.
+- Jump to [The Command Line Interface (CLI)](@ref)
 
 We also provide extensions to the [MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/) universe that are particularly useful in statistical genetics (but not restricted to it):
diff --git a/docs/src/make_summary.md b/docs/src/make_summary.md
new file mode 100644
index 0000000..d93db23
--- /dev/null
+++ b/docs/src/make_summary.md
@@ -0,0 +1,11 @@
+# Merging TMLE outputs
+
+## Usage
+
+```bash
+tmle make-summary --help
+```
+
+```@docs
+make_summary
+```
diff --git a/docs/src/merge.md b/docs/src/merge.md
deleted file mode 100644
index 924bf6b..0000000
--- a/docs/src/merge.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Merging TMLE and SVP outputs
-
-If multiple `scripts/tmle.jl` and potentially `scripts/sieve_variance.jl` have been run, you may want to combine the generated CSV outputs in a single result file. This is the purpose of this command line interface.
-
-## Usage
-
-You can merge summary CSV files by running:
-
-```bash
-julia scripts/merge_summaries.jl TMLE_PREFIX OUT --sieve-prefix=SIEVE_PREFIX
-```
-
-where:
-
-- `TMLE_PREFIX`: is a prefix to all output CSV files generated by the `scripts/tmle.jl` script.
-- `OUT`: is a path to the output file that will be generated.
-- `--sieve-prefix`: is an optional prefix to the CSV output of the `scripts/sieve_variance.jl` script.
diff --git a/docs/src/models.md b/docs/src/models.md
index 4d978bd..a660e13 100644
--- a/docs/src/models.md
+++ b/docs/src/models.md
@@ -11,11 +11,10 @@ Because [TMLE.jl](https://targene.github.io/TMLE.jl/stable/) is based on top of
 - [EvoTrees.jl](https://evovest.github.io/EvoTrees.jl/stable/): A pure Julia implementation of histogram based gradient boosting trees (subset of XGBoost)
 - [GLMNet](https://github.com/JuliaStats/GLMNet.jl): A Julia wrapper of the [glmnet](https://glmnet.stanford.edu/articles/glmnet.html) package. See the [GLMNet](@ref) section.
 - [MLJModels](https://github.com/JuliaAI/MLJModels.jl): General utilities such as the `OneHotEncoder` or `InteractionTransformer`.
-- [HighlyAdaptiveLasso](https://github.com/olivierlabayle/HighlyAdaptiveLasso.jl): A Julia wrapper of the [HAL](https://tlverse.org/hal9001/) algorithm, experimental.
 
 Further support for more packages can be added on request, please fill an [issue](https://github.com/TARGENE/TargetedEstimation.jl/issues).
 
-Also, because the [Estimator File](@ref) is a pure Julia file, it is possible to use it in order to install additional package that can be used to define additional models.
+Also, because the estimator file used by the TMLE CLI is a pure Julia file, it is possible to use it to install additional packages that can be used to define additional models.
 
 Finally, we also provide some additional models described in [Additional models provided by TargetedEstimation.jl](@ref).
diff --git a/docs/src/sieve_variance.md b/docs/src/sieve_variance.md
index 3abd1fa..780812e 100644
--- a/docs/src/sieve_variance.md
+++ b/docs/src/sieve_variance.md
@@ -2,22 +2,12 @@ If the i.i.d. 
(independent and identically distributed) hypothesis is not satisfied, most of the traditional statistical inference theory falls apart. This is typically possible in population genetics where a study may contain related individuals. Here we leverage a non-parametric method called [Sieve Variance Plateau](https://biostats.bepress.com/ucbbiostat/paper322/) (SVP) estimation. The hypothesis is that the dependence between individuals is sufficiently small, so that our targeted estimator will still be asymptotically unbiased, but its variance will be under estimated. In brief, the SVP estimator computes a variance estimate for a range of thresholds 𝜏, by considering individuals to be independent if their distance exceeds 𝜏. As the distance threshold 𝜏 increases, fewer individuals are assumed to be independent. The maximum of this curve is the most conservative estimate of the variance of the target parameter estimator and constitutes our SVP corrected variance estimator. -## Usage - -At the moment, this script is restricted to the analysis of population genetics datasets mostly in the context of [TarGene](https://targene.github.io/targene-pipeline/stable/sieve_variance/). It can be run with the following command: +## [Usage](@id svp_command) ```bash -julia scripts/sieve_variance.jl PREFIX GRM_PREFIX OUT_PREFIX - --nb-estimators=100 - --max-tau=1.0 - --verbosity=1 +tmle sieve-variance-plateau --help ``` -where: - -- `PREFIX`: A prefix to HDF5 files generated by `scripts/tmle.jl` (potentially multiple). -- `GRM_PREFIX`: A prefix to the aggregated Genetic Relationship Matrix. -- `OUT_PREFIX`: Output prefix to save SVP curves and final variance estimates. -- `--nb-estimators`: The number of points per SVP curve. -- `--max-tau`: Maximum distance between individuals to consider. -- `--verbosity`: Verbosity level. +```@docs +sieve_variance_plateau +``` diff --git a/docs/src/tmle_estimation.md b/docs/src/tmle_estimation.md index b22592f..e7b6ae3 100644 --- a/docs/src/tmle_estimation.md +++ b/docs/src/tmle_estimation.md @@ -4,96 +4,19 @@ This is the main script in this package, it provides a command line interface fo ## Usage -Provided you have the package and all dependencies installed or in the provided docker container, you can run TMLE via the following command: - ```bash -julia scripts/tmle.jl DATAFILE PARAMFILE OUTFILE - --estimator-file=docs/estimators/glmnet.jl - --hdf5-out=output.hdf5 - --pval-threshold=0.05 - --chunksize=100 - --verbosity=1 +tmle tmle --help ``` -where: - -- `DATAFILE`: A CSV (.csv) or Arrow (.arrow) file containing the tabular data. The format will be infered from the extension. -- `PARAMFILE`: A serialized [YAML](https://targene.github.io/TMLE.jl/stable/user_guide/#Reading-Parameters-from-YAML-files) or [bin](https://docs.julialang.org/en/v1/stdlib/Serialization/) file containing the estimands to be estimated. The YAML file can be written by hand or programmatically using the [TMLE.parameters_to_yaml](https://targene.github.io/TMLE.jl/stable/api/#TMLE.parameters_to_yaml-Tuple{Any,%20Any}) function. -- `OUTFILE`: The output .csv file (see [Output file](@ref)) -- `--estimator-file`: A Julia file describing the TMLE specifications (see [Estimator File](@ref)). -- `--hdf5-out`: if provided, a path to a file to save the influence curves. -- `--pval-threshold`: Only "significant" (< this threshold) estimates will actually have their influence curves stored in the previous file. 
-- `--chunksize`: To manage memory, the results are appended to the output files in batches the size of which can be controlled via this option. -- `--verbosity`: The verbosity level. - -## Output file - -The output file is a plain CSV file containing one line per estimand in the input `PARAMFILE`. The file contains the following columns: - -- `PARAMETER_TYPE`: The estimand type (e.g. "ATE", "IATE", ...). -- `TREATMENTS`: A "_&_" separated string containing all treatment variables associated with the estimand. -- `CASE`: A "_&_" separated string containing the treatment variables' case values in the same order as `TREATMENTS`. -- `CONTROL`: A "_&_" separated string containing the treatment variables' control values in the same order as `TREATMENTS`. -- `TARGET`: The outcome variable. -- `CONFOUNDERS`: A "_&_" separated string containing the confounding variables. -- `COVARIATES`: A "_&_" separated string containing the extra covariates used to estimate the outcome's mean. -- `INITIAL_ESTIMATE`: The initial estimate before the targeting step. -- `TMLE_ESTIMATE`: The targeted estimate. -- `TMLE_STD`: The standard deviation associated with the targeted estimate. -- `TMLE_PVALUE`: The p-value associated with the targeted estimate. -- `TMLE_LWB`: The 95% confidence interval lower bound associated with the targeted estimate. -- `TMLE_UPB`: The 95% confidence interval upper bound associated with the targeted estimate. -- `ONESTEP_ESTIMATE`: The one step estimate. -- `ONESTEP_STD`: The standard deviation associated with the one step estimate. -- `ONESTEP_PVALUE`: The p-value associated with the one step estimate. -- `ONESTEP_LWB`: The 95% confidence interval lower bound associated with the one step estimate. -- `ONESTEP_UPB`: The 95% confidence interval upper bound associated with the one step estimate. -- `LOG`: A log message if estimation failed. - -## Estimator File - -TMLE is an adaptive procedure that depends on the specification of learning algorithms for the estimation of the nuisance parameters (see [TMLE.jl](https://targene.github.io/TMLE.jl/stable/) for a description of the assumed setting). In our case, there are two nuisance parameters for which we need to specify learning algorithms: - -- `E[Y|T, W, C]`: The mean outcome given the treatment, confounders and extra covariates. It is commonly denoted by `Q` in the Targeted Learning litterature. -- `p(T|W)`: The propensity score. It is commonly denoted by `G` in the Targeted Learning litterature. - -### Description of the file - -In order to provide maximum flexibility as to the choice of learning algorithms, the estimator file is a plain [Julia](https://julialang.org/) file. This file is optional and omitting it defaults to using generalized linear models. If provided, it must define a [NamedTuple](https://docs.julialang.org/en/v1/base/base/#Core.NamedTuple) called `tmle_spec` containing any of the following fields as follows (default configuration): - -```julia - -tmle_spec = ( - Q_continuous = LinearRegressor(), - Q_binary = LogisticClassifier(lambda=0.), - G = LogisticClassifier(lambda=0.), - threshold = 1e-8, - cache = false, - weighted_fluctuation = false -) +```@docs +tmle ``` -where: - -- `Q_continuous`: is a MLJ model used for the estimation of `E[Y|T, W, C]` when the outcome `Y` is continuous. -- `Q_binary`: is a MLJ model used for the estimation of `E[Y|T, W, C]` when the outcome `Y` is binary. -- `G`: is a MLJ model used for the estimation of `p(T|W)`. 
-- `threshold`: is the minimum value the propensity score `G` is allowed to take.
-- `cache`: controls caching of data by [MLJ machines](https://alan-turing-institute.github.io/MLJ.jl/dev/machines/). Setting it to `true` may result in faster runtime but higher memory usage.
-- `weighted_fluctuation`: controls whether the fluctuation for `Q` is a weighted glm or not. If some of the treatment values are rare it may lead to more robust estimation.
-
-Typically, `Q_continuous`, `Q_binary` and `G` will be adjusted and other fields can be left unspecified.
-
-### Ready to use estimator files
+## Note on TMLE Outputs
 
-We recognize not everyone will be familiar with [Julia](https://julialang.org/). We thus provide a set of ready to use estimator files that can be simplified or extended as needed:
+We can output results in three different formats: HDF5, JSON and JLS. By default no output is written, so you need to specify at least one. An output can be generated by specifying an output filename for it. For instance `--outputs.json.filename=output.json` will output a JSON file. Note that you can generate multiple formats at once, e.g. `--outputs.json.filename=output.json --outputs.hdf5.filename=output.hdf5` will output both JSON and HDF5 result files. Another important output option is the `pval_threshold`. Each estimation result is accompanied by an influence curve vector, and by default these vectors are erased before the results are saved because they typically take up too much space and are not usually needed. On some occasions you might want to keep them; this can be achieved by specifying the output's `pval_threshold`. For instance `--outputs.hdf5.pval_threshold=1.` will keep all such vectors because all p-values lie between 0 and 1.
 
-- Super Learning: [with](./estimators/superlearning-with-interactions-for-Q.jl) and [without](./estimators/superlearning.jl) interaction terms in the GLM models for Q.
-- Super Learning for G and GLMNet for Q: [here](./estimators/G-superlearning-Q-glmnet.jl).
-- Super Learning for G and GLM for Q: [here](./estimators/G-superlearning-Q-glm.jl).
-- GLMNet: [with](./estimators/glmnet-with-interactions-for-Q.jl) and [without](./estimators/glmnet.jl) interaction terms in the GLM models for Q.
-- GLM: [with](./estimators/glm-with-interactions-for-Q.jl) and [without](./estimators/glm.jl) interaction terms in the GLM models for Q.
-- XGBoost: [with tuning](./estimators/tuned-xgboost.jl).
+In order to run sieve variance plateau correction after a TMLE run you need to save the results in HDF5 format with influence curve vectors. Furthermore, you will need to save the sample-ids associated with each result. A complete option set for this could be: `--outputs.hdf5.filename=output.hdf5 --outputs.hdf5.pval_threshold=0.05 --sample_ids=true`. In this case, only those results with an individual p-value of less than ``0.05`` will keep track of their influence curves and be considered for sieve variance correction.
 
 ## Runtime
 
@@ -113,8 +36,8 @@ In what follows, `Y` is an outcome of interest, `W` a set of confounding variabl
 
 For all the following experiments:
 
-- The Julia script can be found at `experiments/runtime.jl`.
-- The various estimators used below are further described in [Ready to use estimator files](@ref).
+- The Julia script can be found at [experiments/runtime.jl](https://github.com/TARGENE/TargetedEstimation.jl/tree/main/experiments/runtime.jl). 
+- The various estimators used below are further described in the [estimators-configs](https://github.com/TARGENE/TargetedEstimation.jl/tree/main/estimators-configs) folder.
 
 ### Multiple treatment contrasts
 
@@ -138,12 +61,12 @@ In a PheWAS, one is interested in the effect of a genetic variation across many
 
 With this setup in mind, the computational complexity is mostly driven by the specification of the learning algorithms for `Q`, which will have to be fitted for each outcome. For 10 outcomes, we estimate the 3 Average Treatment Effects corresponding to the 3 possible treatment contrasts defined in the previous section. There are thus two levels of reuse of `G` and `Q` in this study design. In the table below are presented some runtimes for various specifications of `G` and `Q` using a single cpu. The "Unit runtime" is the average runtime across all estimands and can roughly be extrapolated to bigger studies.
 
-| Estimator file | Unit runtime (s) | Extrapolated runtime to 1000 outcomes |
+| Estimator | Unit runtime (s) | Extrapolated runtime to 1000 outcomes |
 | --- | :---: | :---: |
-| `docs/src/estimators/glm.jl` | 4.65 | ≈ 1h20 |
-| `docs/src/estimators/glmnet.jl` | 7.19 | ≈ 2h |
-| `docs/src/estimators/G-superlearning-Q-glmnet.jl` | 50.05| ≈ 13h45 |
-| `docs/src/estimators/superlearning.jl` | 168.98 | ≈ 46h |
+| `glm` | 4.65 | ≈ 1h20 |
+| `glmnet` | 7.19 | ≈ 2h |
+| `G-superlearning-Q-glmnet` | 50.05 | ≈ 13h45 |
+| `superlearning` | 168.98 | ≈ 46h |
 
 Depending on the exact setup, this means one can probably afford to use Super Learning for at least the estimation of `G` (and potentially also for `Q` for a single PheWAS). This turns out to be a great news because TMLE is a double robust estimator. As a reminder, it means that only one of the estimators for `G` or `Q` needs to converge sufficiently fast to the ground truth to guarantee that our estimates will be asymptotically unbiased.
 
@@ -168,9 +91,9 @@ Again, we estimate the 3 Average Treatment Effects corresponding to the 3 possib
 
 | Estimator file | Continuous outcome unit runtime (s) | Binary outcome unit runtime (s) | Projected Time on HPC (200 folds //) |
 | --- | :---: | :---: | :---: |
-| `docs/src/estimators/glm.jl` | 5.64 | 6.14 | ≈ 6h30 |
-| `docs/src/estimators/glmnet.jl` | 17.46 | 22.24 | ≈ 22h |
-| `docs/src/estimators/G-superlearning-Q-glmnet.jl` | 430.54 | 438.67 | ≈ 20 days |
-| `docs/src/estimators/superlearning.jl` | 511.26 | 567.72 | ≈ 24 days |
+| `glm` | 5.64 | 6.14 | ≈ 6h30 |
+| `glmnet` | 17.46 | 22.24 | ≈ 22h |
+| `G-superlearning-Q-glmnet` | 430.54 | 438.67 | ≈ 20 days |
+| `superlearning` | 511.26 | 567.72 | ≈ 24 days |
 
 We can see that modern high performance computing platforms definitely enable this study design when using GLMs or GLMNets. It is unlikely however, that you will be able to use Super Learning for any of `P(V|W)` or `E[Y|V, W]` if you don't have privileged access to such platform. While the double robustness guarantees will generally not be satisfied, our estimate will still be targeted, which means that its bias will be reduced compared to classic inference using a parametric model. 
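To make the output options above concrete, here is a sketch of a complete run over the bundled sample dataset, set up so that a sieve variance correction can be applied afterwards. It combines only flags quoted in the documentation above and assumes the `tmle` executable built in the CLI installation instructions is on `PATH`:

```bash
# Run TMLE with the default glmnet estimators; keep influence curves and
# sample ids for estimates with p < 0.05 so they can feed the SVP step.
tmle tmle data/sample_dataset.csv \
    --estimators=glmnet \
    --outputs.hdf5.filename=output.hdf5 \
    --outputs.hdf5.pval_threshold=0.05 \
    --sample_ids=true
```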
diff --git a/docs/src/estimators/G-superlearning-Q-glm.jl b/estimators-configs/G-superlearning-Q-glm.jl similarity index 71% rename from docs/src/estimators/G-superlearning-Q-glm.jl rename to estimators-configs/G-superlearning-Q-glm.jl index 83d44d0..02da072 100644 --- a/docs/src/estimators/G-superlearning-Q-glm.jl +++ b/estimators-configs/G-superlearning-Q-glm.jl @@ -1,13 +1,7 @@ xgboost_classifier = XGBoostClassifier(tree_method="hist") -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache = false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 1e-8, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome Q_continuous = LinearRegressor(), # For the estimation of E[Y|W, T]: binary target Q_binary = LogisticClassifier(lambda=0.), @@ -30,4 +24,8 @@ tmle_spec = ( cache=false ) ) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), ) \ No newline at end of file diff --git a/docs/src/estimators/G-superlearning-Q-glmnet.jl b/estimators-configs/G-superlearning-Q-glmnet.jl similarity index 72% rename from docs/src/estimators/G-superlearning-Q-glmnet.jl rename to estimators-configs/G-superlearning-Q-glmnet.jl index 47094ef..bb8c495 100644 --- a/docs/src/estimators/G-superlearning-Q-glmnet.jl +++ b/estimators-configs/G-superlearning-Q-glmnet.jl @@ -1,13 +1,7 @@ xgboost_classifier = XGBoostClassifier(tree_method="hist") -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache = false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 1e-8, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome Q_continuous = GLMNetRegressor(resampling=CV(nfolds=3)), # For the estimation of E[Y|W, T]: binary target Q_binary = GLMNetClassifier(resampling=StratifiedCV(nfolds=3)), @@ -30,4 +24,8 @@ tmle_spec = ( cache=false ) ) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), ) \ No newline at end of file diff --git a/docs/src/estimators/glm-with-interactions-for-Q.jl b/estimators-configs/glm-with-interactions-for-Q.jl similarity index 56% rename from docs/src/estimators/glm-with-interactions-for-Q.jl rename to estimators-configs/glm-with-interactions-for-Q.jl index edaeca7..8959c76 100644 --- a/docs/src/estimators/glm-with-interactions-for-Q.jl +++ b/estimators-configs/glm-with-interactions-for-Q.jl @@ -1,11 +1,5 @@ -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache = false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 1e-8, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome Q_continuous = Pipeline( RestrictedInteractionTransformer(order=2, primary_variables_patterns=[r"^rs[0-9]+"]), LinearRegressor(), @@ -19,4 +13,8 @@ tmle_spec = ( ), # For the estimation of p(T| W) G = LogisticClassifier(lambda=0.) 
+) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), ) \ No newline at end of file diff --git a/estimators-configs/glm.jl b/estimators-configs/glm.jl new file mode 100644 index 0000000..9a8b166 --- /dev/null +++ b/estimators-configs/glm.jl @@ -0,0 +1,12 @@ +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome + Q_continuous = LinearRegressor(), + # For the estimation of E[Y|W, T]: binary target + Q_binary = LogisticClassifier(lambda=0.), + # For the estimation of p(T| W) + G = LogisticClassifier(lambda=0.) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), +) \ No newline at end of file diff --git a/docs/src/estimators/glmnet-with-interactions-for-Q.jl b/estimators-configs/glmnet-with-interactions-for-Q.jl similarity index 60% rename from docs/src/estimators/glmnet-with-interactions-for-Q.jl rename to estimators-configs/glmnet-with-interactions-for-Q.jl index 003cd7e..b255974 100644 --- a/docs/src/estimators/glmnet-with-interactions-for-Q.jl +++ b/estimators-configs/glmnet-with-interactions-for-Q.jl @@ -1,11 +1,5 @@ -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache = false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 1e-8, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome Q_continuous = Pipeline( RestrictedInteractionTransformer(order=2, primary_variables_patterns=[r"^rs[0-9]+"]), GLMNetRegressor(resampling=CV(nfolds=3)), @@ -19,4 +13,8 @@ tmle_spec = ( ), # For the estimation of p(T| W) G = GLMNetClassifier(resampling=StratifiedCV(nfolds=3)) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), ) \ No newline at end of file diff --git a/estimators-configs/glmnet.jl b/estimators-configs/glmnet.jl new file mode 100644 index 0000000..5fd2584 --- /dev/null +++ b/estimators-configs/glmnet.jl @@ -0,0 +1,12 @@ +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome + Q_continuous = GLMNetRegressor(resampling=CV(nfolds=3)), + # For the estimation of E[Y|W, T]: binary outcome + Q_binary = GLMNetClassifier(resampling=StratifiedCV(nfolds=3)), + # For the estimation of p(T| W) + G = GLMNetClassifier(resampling=StratifiedCV(nfolds=3)) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), +) \ No newline at end of file diff --git a/docs/src/estimators/superlearning-with-interactions-for-Q.jl b/estimators-configs/superlearning-with-interactions-for-Q.jl similarity index 88% rename from docs/src/estimators/superlearning-with-interactions-for-Q.jl rename to estimators-configs/superlearning-with-interactions-for-Q.jl index f3d75ce..df2372a 100644 --- a/docs/src/estimators/superlearning-with-interactions-for-Q.jl +++ b/estimators-configs/superlearning-with-interactions-for-Q.jl @@ -1,14 +1,8 @@ xgboost_regressor = XGBoostRegressor(tree_method="hist") xgboost_classifier = XGBoostClassifier(tree_method="hist") -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache = false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 
1e-8, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome Q_continuous = Stack( metalearner = LinearRegressor(fit_intercept=false), resampling = CV(nfolds=3), @@ -81,4 +75,8 @@ tmle_spec = ( cache=false ) ) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), ) \ No newline at end of file diff --git a/docs/src/estimators/superlearning.jl b/estimators-configs/superlearning.jl similarity index 83% rename from docs/src/estimators/superlearning.jl rename to estimators-configs/superlearning.jl index 4bb72f6..5f2ce4c 100644 --- a/docs/src/estimators/superlearning.jl +++ b/estimators-configs/superlearning.jl @@ -1,14 +1,8 @@ xgboost_regressor = XGBoostRegressor(tree_method="hist") xgboost_classifier = XGBoostClassifier(tree_method="hist") -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache = false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 1e-8, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome Q_continuous = Stack( metalearner = LinearRegressor(fit_intercept=false), resampling = CV(nfolds=3), @@ -27,7 +21,7 @@ tmle_spec = ( cache=false ) ), - # For the estimation of E[Y|W, T]: binary target + # For the estimation of E[Y|W, T]: binary outcome Q_binary = Stack( metalearner = LogisticClassifier(lambda=0., fit_intercept=false), resampling = StratifiedCV(nfolds=3), @@ -65,4 +59,8 @@ tmle_spec = ( cache=false ) ) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), ) \ No newline at end of file diff --git a/docs/src/estimators/tuned-xgboost.jl b/estimators-configs/tuned-xgboost.jl similarity index 77% rename from docs/src/estimators/tuned-xgboost.jl rename to estimators-configs/tuned-xgboost.jl index 6432206..d7318c5 100644 --- a/docs/src/estimators/tuned-xgboost.jl +++ b/estimators-configs/tuned-xgboost.jl @@ -1,14 +1,8 @@ xgboost_regressor = XGBoostRegressor(tree_method="hist") xgboost_classifier = XGBoostClassifier(tree_method="hist") -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache = false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 1e-8, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous outcome Q_continuous = TunedModel( model = xgboost_regressor, resampling = CV(nfolds=3), @@ -44,4 +38,8 @@ tmle_spec = ( measure = log_loss, cache=false ) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8), ) \ No newline at end of file diff --git a/experiments/runtime.jl b/experiments/runtime.jl index e641673..e3e2b62 100644 --- a/experiments/runtime.jl +++ b/experiments/runtime.jl @@ -2,10 +2,10 @@ using ArgParse using TargetedEstimation const ESTIMATORS = [ - "docs/src/estimators/glm.jl", - "docs/src/estimators/glmnet.jl", - "docs/src/estimators/G-superlearning-Q-glmnet.jl", - "docs/src/estimators/superlearning.jl" + "glm", + "glmnet", + "G-superlearning-Q-glmnet", + "superlearning" ] const PARAMETERS = [ "experiments/parameters.phewas.yaml", 
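The configurations relocated to `estimators-configs/` above all follow the same contract: build the nuisance-function models, then expose an `ESTIMATORS` NamedTuple. Because such a file is plain Julia (as noted in `docs/src/models.md`), users can supply their own. Below is a minimal sketch of a hypothetical `my-estimators.jl`, reusing only constructors that appear in this diff; the NamedTuple key is the estimator name that options such as `--estimator-key` later refer to:

```julia
# Models for Q = E[Y|T, W, C] (continuous and binary outcomes) and for G = p(T|W).
default_models = TMLE.default_models(
    Q_continuous = LinearRegressor(),
    Q_binary     = LogisticClassifier(lambda=0.),
    G            = GLMNetClassifier(resampling=StratifiedCV(nfolds=3))
)

# One entry per estimator to run; here a single weighted TMLE with a
# propensity-score lower bound, as in the bundled configurations.
ESTIMATORS = (
    TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=1e-8),
)
```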
diff --git a/scripts/merge_summaries.jl b/scripts/merge_summaries.jl deleted file mode 100644 index 34fd604..0000000 --- a/scripts/merge_summaries.jl +++ /dev/null @@ -1,26 +0,0 @@ -using ArgParse -using TargetedEstimation - -function parse_commandline() - s = ArgParseSettings( - description = "Merge files outputs by tmle.jl and sieve_variance.jl in a single file.", - commands_are_required = false) - - @add_arg_table s begin - "tmle-prefix" - help = "Prefix to files output by tmle.jl" - required = true - "out" - help = "Output file to be generated" - required = true - "--sieve-prefix" - help = "Prefix to files output by sieve_variance.jl" - required = false - arg_type = String - end - - return parse_args(s) -end - -parsed_args = parse_commandline() -merge_csv_files(parsed_args) \ No newline at end of file diff --git a/scripts/sieve_variance.jl b/scripts/sieve_variance.jl deleted file mode 100644 index f6a551e..0000000 --- a/scripts/sieve_variance.jl +++ /dev/null @@ -1,41 +0,0 @@ -using TargetedEstimation -using ArgParse - -function parse_commandline() - s = ArgParseSettings(description="Compute the Sieve Variance Plateau estimate for each phenotype in the result file") - - @add_arg_table s begin - "prefix" - help = "Prefix to the .hdf5 files generated by the `tmle.jl` script" - arg_type = String - required = true - "grm-prefix" - arg_type = String - help = "Prefix of the aggregated GRM" - required = true - "out-prefix" - arg_type = String - help = "output filename" - required = true - "--nb-estimators", "-n" - arg_type = Int - help = "Number of variance estimators to compute" - default = 10 - "--max-tau", "-m" - arg_type = Float64 - help = "Maximum distance of individuals to take into account (maximum=2)"* - "It was witnessed that beyond 0.9, weird limit effects happen" - default = 0.8 - "--verbosity", "-v" - arg_type = Int - help = "Verbosity level" - default = 1 - end - - return parse_args(s) -end - - -parsed_args = parse_commandline() - -sieve_variance_plateau(parsed_args) diff --git a/scripts/tmle.jl b/scripts/tmle.jl deleted file mode 100644 index d16849f..0000000 --- a/scripts/tmle.jl +++ /dev/null @@ -1,50 +0,0 @@ -using ArgParse -using TargetedEstimation - -function parse_commandline() - s = ArgParseSettings( - description = "Targeted Learning estimation", - commands_are_required = false, - version = "0.2", - add_version = true) - - @add_arg_table s begin - "data" - help = "Path to dataset file (.csv|.arrow)" - required = true - "param-file" - help = "A file (.yaml|.bin) listing all parameters to estimate." - required = true - "csv-out" - help = "Path to output `.csv` file" - required = true - "--estimator-file" - help = "A file (.jl) describing the tmle estimator to use, README.md" - arg_type= String - required = false - "--hdf5-out" - help = "If the influence curves also need to be stored (see also: --pval-threshold)" - arg_type = String - default = nothing - "--pval-threshold" - help = "Only those parameters passing the threshold will have their influence curve saved." - default = 1. 
- arg_type = Float64 - "--chunksize" - help = "Results will be appended to outfiles every chunk" - default = 100 - arg_type = Int - "--verbosity", "-v" - help = "Verbosity level" - arg_type = Int - default = 1 - end - - return parse_args(s) -end - -parsed_args = parse_commandline() - -tmle_estimation(parsed_args) - - diff --git a/src/TargetedEstimation.jl b/src/TargetedEstimation.jl index 6c40623..a3ee31c 100644 --- a/src/TargetedEstimation.jl +++ b/src/TargetedEstimation.jl @@ -4,13 +4,13 @@ if occursin("Intel", Sys.cpu_info()[1].model) using MKL end +using ArgParse using DataFrames using MLJBase using MLJ using CSV using Arrow using TMLE -using HighlyAdaptiveLasso using EvoTrees using MLJXGBoostInterface using MLJLinearModels @@ -24,19 +24,27 @@ using MultipleTesting using Combinatorics using Tables using Random +using YAML +using JSON +using Configurations import MLJModelInterface +import Base.tryparse -include("tmle.jl") +include("failed_estimate.jl") +include("cache_managers.jl") +include("outputs.jl") +include("runner.jl") include("utils.jl") include("sieve_variance.jl") -include("merge.jl") +include("summary.jl") include("resampling.jl") include(joinpath("models", "glmnet.jl")) include(joinpath("models", "adaptive_interaction_transformer.jl")) include(joinpath("models", "biallelic_snp_encoder.jl")) +include("cli.jl") -export tmle_estimation, sieve_variance_plateau, merge_csv_files +export Runner, tmle, sieve_variance_plateau, make_summary, main export GLMNetRegressor, GLMNetClassifier export RestrictedInteractionTransformer, BiAllelicSNPEncoder export AdaptiveCV, AdaptiveStratifiedCV, JointStratifiedCV diff --git a/src/cache_managers.jl b/src/cache_managers.jl new file mode 100644 index 0000000..50d7fd3 --- /dev/null +++ b/src/cache_managers.jl @@ -0,0 +1,76 @@ +abstract type CacheManager end + +struct ReleaseUnusableCacheManager <: CacheManager + cache::Dict + η_counts::Dict + ReleaseUnusableCacheManager(η_counts) = new(Dict(), η_counts) +end + +function release!(cache_manager::ReleaseUnusableCacheManager, Ψ) + # Always drop fluctuations + haskey(cache_manager.cache, :last_fluctuation) && pop!(cache_manager.cache, :last_fluctuation) + + # Drop Basic nuisance functions + for η in TMLE.nuisance_functions_iterator(Ψ) + cache_manager.η_counts[η] -= 1 + if cache_manager.η_counts[η] == 0 + delete!(cache_manager.cache, η) + end + end + + # Drop aggregate nuisance function + for η in keys(cache_manager.cache) + if η isa TMLE.CMRelevantFactors + delete!(cache_manager.cache, η) + end + end +end + +struct MaxSizeCacheManager <: CacheManager + cache::Dict + max_size::Int + MaxSizeCacheManager(max_size) = new(Dict(), max_size) +end + +function release!(cache_manager::MaxSizeCacheManager, Ψ) + # Prioritize the release of the last fluctuation + if haskey(cache_manager.cache, :last_fluctuation) + pop!(cache_manager.cache, :last_fluctuation) + end + # Drop aggregate nuisance function + for η in keys(cache_manager.cache) + if η isa TMLE.CMRelevantFactors + delete!(cache_manager.cache, η) + end + end + # Drop the rest randomly until the size is acceptable + while length(cache_manager.cache) > cache_manager.max_size + pop!(cache_manager.cache) + end +end + +struct NoCacheManager <: CacheManager + cache::Dict + NoCacheManager() = new(Dict()) +end + +function release!(cache_manager::NoCacheManager, Ψ) + empty!(cache_manager.cache) +end + +function make_cache_manager(estimands, string) + if string == "release-unusable" + return ReleaseUnusableCacheManager(TMLE.nuisance_function_counts(estimands)) + 
elseif string == "no-cache"
+        return NoCacheManager()
+    else
+        maxsize = try parse(Int, string)
+        catch E
+            # NB: the `string` argument shadows `Base.string` here, hence the `*` concatenation.
+            throw(ArgumentError("Could not convert the provided cache value to an integer: " * string))
+        end
+        return MaxSizeCacheManager(maxsize)
+    end
+end
+
+
+
diff --git a/src/cli.jl b/src/cli.jl
new file mode 100644
index 0000000..51d84b1
--- /dev/null
+++ b/src/cli.jl
@@ -0,0 +1,180 @@
+function cli_settings()
+    s = ArgParseSettings(description="TMLE CLI.")
+
+    @add_arg_table! s begin
+        "tmle"
+            action = :command
+            help = "Run TMLE."
+
+        "svp"
+            action = :command
+            help = "Run Sieve Variance Plateau."
+
+        "merge"
+            action = :command
+            help = "Merges TMLE outputs together."
+    end
+
+    @add_arg_table! s["tmle"] begin
+        "dataset"
+            arg_type = String
+            required = true
+            help = "Path to the dataset (either .csv or .arrow)"
+
+        "--estimands"
+            arg_type = String
+            help = "A string (`factorialATE`) or a serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)"
+            default = "factorialATE"
+
+        "--estimators"
+            arg_type = String
+            help = "A julia file containing the estimators to use."
+            default = "glmnet"
+
+        "--verbosity"
+            arg_type = Int
+            default = 0
+            help = "Verbosity level"
+
+        "--hdf5-output"
+            arg_type = String
+            help = "HDF5 file output."
+
+        "--json-output"
+            arg_type = String
+            help = "JSON file output."
+
+        "--jls-output"
+            arg_type = String
+            help = "JLS file output."
+
+        "--chunksize"
+            arg_type = Int
+            help = "Results are written in batches of size chunksize."
+            default = 100
+
+        "--rng"
+            arg_type = Int
+            help = "Random seed (Only used for estimands ordering at the moment)."
+            default = 123
+
+        "--cache-strategy"
+            arg_type = String
+            help = "Caching Strategy for the nuisance functions, any of (`release-unusable`, `no-cache`, `max-size`)."
+            default = "release-unusable"
+
+        "--sort-estimands"
+            help = "Sort estimands to minimize cache usage (A brute force approach will be used, resulting in exponentially long sorting time)."
+            action = :store_true
+    end
+
+    @add_arg_table! s["svp"] begin
+        "input-prefix"
+            arg_type = String
+            help = "Input prefix to HDF5 files generated by the tmle CLI."
+
+        "--out"
+            arg_type = String
+            help = "Output filename."
+            default = "svp.hdf5"
+
+        "--grm-prefix"
+            arg_type = String
+            help = "Prefix to the aggregated GRM."
+            default = "GRM"
+
+        "--verbosity"
+            arg_type = Int
+            default = 0
+            help = "Verbosity level"
+
+        "--n-estimators"
+            arg_type = Int
+            default = 10
+            help = "Number of variance estimators to build for each estimate."
+
+        "--max-tau"
+            arg_type = Float64
+            default = 0.8
+            help = "Maximum distance between any two individuals."
+
+        "--estimator-key"
+            arg_type = String
+            help = "Estimator to use to proceed with sieve variance correction."
+            default = "TMLE"
+    end
+
+    @add_arg_table! s["merge"] begin
+        "prefix"
+            arg_type = String
+            help = "Prefix to .hdf5 files to be used to create the summary file."
+
+        "--hdf5-output"
+            arg_type = String
+            help = "HDF5 file output."
+
+        "--json-output"
+            arg_type = String
+            help = "JSON file output."
+
+        "--jls-output"
+            arg_type = String
+            help = "JLS file output."
+    end
+
+    return s
+end
+
+
+makeOutput(T::Type, ::Nothing) = T()
+
+function makeOutput(T::Type, str)
+    args = split(str, ",")
+    kwargs = Dict(fn => tryparse(ft, val) for (val, fn, ft) ∈ zip(args, fieldnames(T), fieldtypes(T)))
+    return T(;kwargs...) 
+
+make_outputs(hdf5_string, json_string, jls_string) = Outputs(
+    hdf5=makeOutput(HDF5Output, hdf5_string),
+    json=makeOutput(JSONOutput, json_string),
+    jls=makeOutput(JLSOutput, jls_string)
+)
+
+function main(args=ARGS)
+    settings = parse_args(args, cli_settings())
+    cmd = settings["%COMMAND%"]
+    cmd_settings = settings[cmd]
+    if cmd ∈ ("tmle", "merge")
+        outputs = make_outputs(cmd_settings["hdf5-output"], cmd_settings["json-output"], cmd_settings["jls-output"])
+        if cmd == "tmle"
+            tmle(cmd_settings["dataset"];
+                estimands=cmd_settings["estimands"],
+                estimators=cmd_settings["estimators"],
+                verbosity=cmd_settings["verbosity"],
+                outputs=outputs,
+                chunksize=cmd_settings["chunksize"],
+                rng=cmd_settings["rng"],
+                cache_strategy=cmd_settings["cache-strategy"],
+                sort_estimands=cmd_settings["sort-estimands"]
+            )
+        else
+            make_summary(cmd_settings["prefix"];
+                outputs=outputs
+            )
+        end
+    else
+        sieve_variance_plateau(cmd_settings["input-prefix"];
+            out=cmd_settings["out"],
+            grm_prefix=cmd_settings["grm-prefix"],
+            verbosity=cmd_settings["verbosity"],
+            n_estimators=cmd_settings["n-estimators"],
+            max_tau=cmd_settings["max-tau"],
+            estimator_key=cmd_settings["estimator-key"]
+        )
+    end
+end
+
+function julia_main()::Cint
+    main()
+    return 0
+end
\ No newline at end of file
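The same entry point can be exercised from a Julia session by passing the argument vector explicitly, which is convenient for testing. A sketch, where all file names are placeholders:

```julia
# Equivalent to the shell invocation:
#   tmle dataset.csv --estimands estimands.yaml --json-output results.json,0.05
main(["tmle", "dataset.csv",
      "--estimands", "estimands.yaml",
      "--json-output", "results.json,0.05"])
```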
diff --git a/src/failed_estimate.jl b/src/failed_estimate.jl
new file mode 100644
index 0000000..b512e93
--- /dev/null
+++ b/src/failed_estimate.jl
@@ -0,0 +1,11 @@
+struct FailedEstimate
+    estimand::TMLE.Estimand
+    msg::String
+end
+
+TMLE.to_dict(x::FailedEstimate) = Dict(
+    :estimand => TMLE.to_dict(x.estimand),
+    :error => x.msg
+)
+
+TMLE.emptyIC(result::FailedEstimate, pval_threshold) = result
diff --git a/src/merge.jl b/src/merge.jl
deleted file mode 100644
index 3c34649..0000000
--- a/src/merge.jl
+++ /dev/null
@@ -1,66 +0,0 @@
-
-function files_matching_prefix_and_suffix(prefix, suffix)
-    dirname_, prefix_ = splitdir(prefix)
-    dirname__ = dirname_ == "" ? "." : dirname_
-    files = filter(
-        x -> startswith(x, prefix_) && endswith(x, suffix),
-        readdir(dirname__)
-    )
-    return [joinpath(dirname_, x) for x in files]
-end
-
-read_output_with_types(file) =
-    CSV.read(file, DataFrame, types=Dict(key => String for key in joining_keys()))
-
-function load_csv_files(data, files)
-    for file in files
-        new_data = read_output_with_types(file)
-        if size(new_data, 1) > 0
-            data = vcat(data, new_data)
-        end
-    end
-    return data
-end
-
-joining_keys() = ["PARAMETER_TYPE", "TREATMENTS", "CASE", "CONTROL", "TARGET", "CONFOUNDERS", "COVARIATES"]
-
-function merge_csv_files(parsed_args)
-    tmle_files = files_matching_prefix_and_suffix(
-        parsed_args["tmle-prefix"],
-        ".csv"
-    )
-    # Load tmle data
-    data = load_csv_files(empty_tmle_output(), tmle_files)
-    # Load sieve data
-    sieveprefix = parsed_args["sieve-prefix"]
-    if sieveprefix !== nothing
-        sieve_files = files_matching_prefix_and_suffix(
-            parsed_args["sieve-prefix"],
-            ".csv"
-        )
-        sieve_data = load_csv_files(empty_sieve_output(), sieve_files)
-        if size(sieve_data, 1) > 0
-            data = leftjoin(data, sieve_data, on=joining_keys(), matchmissing=:equal)
-        end
-    end
-
-    # Pvalue Adjustment by Target
-    for gp in groupby(data, :TARGET)
-        gp.TRAIT_ADJUSTED_TMLE_PVALUE = gp[:, :TMLE_PVALUE]
-        pvalues = collect(skipmissing(gp.TMLE_PVALUE))
-        if length(pvalues) > 0
-            adjusted_pvalues = adjust(pvalues, BenjaminiHochberg())
-            adjusted_pval_index = 1
-            for index in eachindex(gp.TRAIT_ADJUSTED_TMLE_PVALUE)
-                gp.TRAIT_ADJUSTED_TMLE_PVALUE[index] === missing && continue
-                gp.TRAIT_ADJUSTED_TMLE_PVALUE[index] = adjusted_pvalues[adjusted_pval_index]
-                adjusted_pval_index += 1
-            end
-        end
-    end
-
-    # Write to output file
-    CSV.write(parsed_args["out"], data)
-
-    return 0
-end
\ No newline at end of file
diff --git a/src/models/glmnet.jl b/src/models/glmnet.jl
index 4232ffd..189aca1 100644
--- a/src/models/glmnet.jl
+++ b/src/models/glmnet.jl
@@ -73,6 +73,10 @@ end
 function MLJBase.fit(model::GLMNetModel, verbosity::Int, X, y)
     folds = getfolds(model.resampling, X, y)
     res = glmnetcv(MLJBase.matrix(X), y; folds=folds, model.params...)
+    # This is currently not caught by the GLMNet package
+    if length(res.meanloss) == 0
+        error("glmnetcv's mean loss is empty: convergence probably failed at the first lambda for some fold.")
+    end
     return make_fitresult(model, res, y), nothing, nothing
 end
+""" +initialize(output) = check_file_exists(output.filename) + +##################################################################### +#####  JSON OUTPUT #### +##################################################################### + +@option struct JSONOutput + filename::Union{Nothing, String} = nothing + pval_threshold::Union{Nothing, Float64} = nothing +end + +""" + initialize(output::JSONOutput) + +Checks that file does not exist and inialize the json file +""" +function initialize(output::JSONOutput) + check_file_exists(output.filename) + initialize_json(output.filename) +end + +initialize_json(filename::Nothing) = nothing + +initialize_json(filename::String) = open(filename, "w") do io + print(io, '[') +end + +function update_file(output::JSONOutput, results; finalize=false) + output.filename === nothing && return + open(output.filename, "a") do io + for result in results + result = TMLE.emptyIC(result, output.pval_threshold) + JSON.print(io, TMLE.to_dict(result)) + print(io, ',') + end + if finalize + skip(io, -1) # get rid of the last comma which JSON doesn't allow + print(io, ']') + end + end +end + +##################################################################### +#####  HDF5 OUTPUT #### +##################################################################### + +@option struct HDF5Output + filename::Union{Nothing, String} = nothing + pval_threshold::Union{Nothing, Float64} = nothing + sample_ids::Bool = false + compress::Bool = false +end + +function update_file(output::HDF5Output, results; finalize=false) + jldopen(output.filename, "a+", compress=output.compress) do io + batches_keys = keys(io) + latest_index = isempty(batches_keys) ? 0 : maximum(parse(Int, split(key, "_")[2]) for key in batches_keys) + io[string("Batch_", latest_index + 1)] = results + end +end + +function update_file(output::HDF5Output, results, dataset) + output.filename === nothing && return + results = post_process(results, dataset, output.pval_threshold, output.sample_ids) + update_file(output, results) +end + +##################################################################### +#####  JLS OUTPUT #### +##################################################################### + +@option struct JLSOutput + filename::Union{Nothing, String} = nothing + pval_threshold::Union{Nothing, Float64} = nothing + sample_ids::Bool = false +end + +function update_file(output::JLSOutput, results; finalize=false) + open(output.filename, "a") do io + for result in results + serialize(io, result) + end + end +end + +function update_file(output::JLSOutput, results, dataset) + output.filename === nothing && return + results = post_process(results, dataset, output.pval_threshold, output.sample_ids) + update_file(output, results) +end + + +##################################################################### +#####  OUTPUTS #### +##################################################################### + +@option struct Outputs + json::JSONOutput = JSONOutput() + hdf5::HDF5Output = HDF5Output() + jls::JLSOutput = JLSOutput() +end + +""" + initialize(output::Outputs) + +Initializes all outputs in output. 
+""" +function initialize(outputs::Outputs) + initialize(outputs.json) + initialize(outputs.jls) + initialize(outputs.hdf5) +end + +function post_process(results, dataset, pval_threshold, save_sample_ids) + results = [TMLE.emptyIC(result, pval_threshold) for result ∈ results] + if save_sample_ids + sample_ids = get_sample_ids(dataset, results) + results = [(result..., SAMPLE_IDS=s_ids) for (result, s_ids) in zip(results, sample_ids)] + end + return results +end + +sample_ids_from_variables(dataset, variables) = dropmissing(dataset[!, [:SAMPLE_ID, variables...]]).SAMPLE_ID + +function get_sample_ids(dataset, results) + previous_variables = nothing + sample_ids = [] + current_ref_id = 0 + for (index, result) in enumerate(results) + current_variables = variables(first(result).estimand) + if previous_variables != current_variables + push!(sample_ids, sample_ids_from_variables(dataset, current_variables)) + current_ref_id = index + previous_variables = current_variables + else + push!(sample_ids, current_ref_id) + end + end + return sample_ids +end \ No newline at end of file diff --git a/src/runner.jl b/src/runner.jl new file mode 100644 index 0000000..e91c09d --- /dev/null +++ b/src/runner.jl @@ -0,0 +1,176 @@ +mutable struct Runner + estimators::NamedTuple + estimands::Vector{TMLE.Estimand} + dataset::DataFrame + cache_manager::CacheManager + chunksize::Int + outputs::Outputs + verbosity::Int + failed_nuisance::Set + function Runner(dataset; + estimands="factorialATE", + estimators="glmnet", + verbosity=0, + outputs=Outputs(), + chunksize=100, + rng=123, + cache_strategy="release-unusable", + sort_estimands=false + ) + # Retrieve TMLE specifications + estimators = TargetedEstimation.load_tmle_spec(file=estimators) + # Load dataset + dataset = TargetedEstimation.instantiate_dataset(dataset) + # Read parameter files + estimands = TargetedEstimation.build_estimands_list(estimands, dataset) + if sort_estimands + estimands = groups_ordering(estimands; + brute_force=true, + do_shuffle=true, + rng=MersenneTwister(rng), + verbosity=verbosity + ) + end + cache_manager = make_cache_manager(estimands, cache_strategy) + + failed_nuisance = Set([]) + + return new(estimators, estimands, dataset, cache_manager, chunksize, outputs, verbosity, failed_nuisance) + end +end + +function save(runner::Runner, results, partition, finalize) + # Append JSON Output + update_file(runner.outputs.json, results; finalize=finalize) + # Append JLS Output + update_file(runner.outputs.jls, results, runner.dataset) + # Append HDF5 Output + update_file(runner.outputs.hdf5, results, runner.dataset) +end + +function try_estimation(runner, Ψ, estimator) + try + result, _ = estimator(Ψ, runner.dataset, + cache=runner.cache_manager.cache, + verbosity=runner.verbosity, + ) + return result + catch e + # Some nuisance function fits may fail. We do not interrupt on them but log instead. + if e isa TMLE.FitFailedError + # This also allows to skip fast the next estimands requiring the same nuisance functions. 
+
+function save(runner::Runner, results, partition, finalize)
+    # Append JSON Output
+    update_file(runner.outputs.json, results; finalize=finalize)
+    # Append JLS Output
+    update_file(runner.outputs.jls, results, runner.dataset)
+    # Append HDF5 Output
+    update_file(runner.outputs.hdf5, results, runner.dataset)
+end
+
+function try_estimation(runner, Ψ, estimator)
+    try
+        result, _ = estimator(Ψ, runner.dataset,
+            cache=runner.cache_manager.cache,
+            verbosity=runner.verbosity,
+        )
+        return result
+    catch e
+        # Some nuisance function fits may fail. We do not interrupt the run but log the failure instead.
+        if e isa TMLE.FitFailedError
+            # Recording the failure also lets us quickly skip subsequent estimands that require the same nuisance functions.
+            if !(e.model isa TMLE.Fluctuation)
+                push!(runner.failed_nuisance, e.estimand)
+            end
+            return FailedEstimate(Ψ, e.msg)
+        # On other errors, rethrow
+        else
+            rethrow(e)
+        end
+    end
+end
+
+function skip_fast(runner, Ψ)
+    ηs = TMLE.nuisance_functions_iterator(Ψ)
+    any(η ∈ runner.failed_nuisance for η in ηs) && return true
+    return false
+end
+
+function (runner::Runner)(partition)
+    results = Vector{NamedTuple}(undef, size(partition, 1))
+    for (partition_index, param_index) in enumerate(partition)
+        Ψ = runner.estimands[param_index]
+        if skip_fast(runner, Ψ)
+            results[partition_index] = NamedTuple{keys(runner.estimators)}([FailedEstimate(Ψ, "Skipped due to shared failed nuisance fit.") for _ in 1:length(runner.estimators)])
+            continue
+        end
+        # Make sure data types are appropriate for the estimand
+        TargetedEstimation.coerce_types!(runner.dataset, Ψ)
+        # Run all estimators for this estimand
+        estimators_results = []
+        for estimator in runner.estimators
+            result = try_estimation(runner, Ψ, estimator)
+            push!(estimators_results, result)
+        end
+        # Update results
+        results[partition_index] = NamedTuple{keys(runner.estimators)}(estimators_results)
+        # Release cache
+        release!(runner.cache_manager, Ψ)
+        # Try to release C memory
+        GC.gc()
+        if Sys.islinux()
+            ccall(:malloc_trim, Cvoid, (Cint,), 0)
+        end
+    end
+    return results
+end
+
+function (runner::Runner)()
+    # Initialize output files
+    initialize(runner.outputs)
+    # Split the worklist into partitions
+    nparams = size(runner.estimands, 1)
+    partitions = collect(Iterators.partition(1:nparams, runner.chunksize))
+    for partition in partitions
+        results = runner(partition)
+        save(runner, results, partition, partition===partitions[end])
+    end
+end
+
+
+"""
+    tmle(dataset;
+        estimands="factorialATE",
+        estimators="glmnet",
+        verbosity=0,
+        outputs=Outputs(),
+        chunksize=100,
+        rng=123,
+        cache_strategy="release-unusable",
+        sort_estimands=false
+    )
+
+TMLE CLI.
+
+# Args
+
+- `dataset`: Data file (either .csv or .arrow)
+
+# Options
+
+- `--estimands`: A string ("factorialATE") or a serialized TMLE.Configuration (accepted formats: .json | .yaml | .jls)
+- `--estimators`: A Julia file containing the estimators to use.
+- `-v, --verbosity`: Verbosity level.
+- `-o, --outputs`: Outputs to be generated.
+- `--chunksize`: Results are written in batches of size chunksize.
+- `-r, --rng`: Random seed (only used for estimands ordering at the moment).
+- `-c, --cache-strategy`: Caching strategy for the nuisance functions, any of ("release-unusable", "no-cache", "max-size").
+
+# Flags
+
+- `-s, --sort-estimands`: Sort estimands to minimize cache usage (a brute-force approach will be used, resulting in exponentially long sorting time).
+"""
+function tmle(dataset::String;
+    estimands::String="factorialATE",
+    estimators::String="glmnet",
+    verbosity::Int=0,
+    outputs::Outputs=Outputs(),
+    chunksize::Int=100,
+    rng::Int=123,
+    cache_strategy::String="release-unusable",
+    sort_estimands::Bool=false
+    )
+    runner = Runner(dataset;
+        estimands=estimands,
+        estimators=estimators,
+        verbosity=verbosity,
+        outputs=outputs,
+        chunksize=chunksize,
+        rng=rng,
+        cache_strategy=cache_strategy,
+        sort_estimands=sort_estimands
+    )
+    runner()
+    verbosity >= 1 && @info "Done."
+ return +end diff --git a/src/sieve_variance.jl b/src/sieve_variance.jl index 27c4c04..ea41eb9 100644 --- a/src/sieve_variance.jl +++ b/src/sieve_variance.jl @@ -23,44 +23,6 @@ function align_ic(ic, sample_ids, grm_ids) return coalesce.(aligned_ic, 0) end -sieve_dataframe() = DataFrame( - PARAMETER_TYPE=String[], - TREATMENTS=String[], - CASE=String[], - CONTROL=Union{String, Missing}[], - TARGET=String[], - CONFOUNDERS=String[], - COVARIATES=Union{String, Missing}[], - TMLE_ESTIMATE=Float64[], -) - -empty_sieve_output() = DataFrame( - PARAMETER_TYPE=String[], - TREATMENTS=String[], - CASE=String[], - CONTROL=Union{String, Missing}[], - TARGET=String[], - CONFOUNDERS=String[], - COVARIATES=Union{String, Missing}[], - SIEVE_STD = Float64[], - SIEVE_PVALUE = Float64[], - SIEVE_LWB = Float64[], - SIEVE_UPB = Float64[], -) - -function push_sieveless!(output, Ψ, Ψ̂) - target = string(Ψ.target) - param_type = param_string(Ψ) - treatments = treatment_string(Ψ) - case = case_string(Ψ) - control = control_string(Ψ) - confounders = confounders_string(Ψ) - covariates = covariates_string(Ψ) - push!(output, ( - param_type, treatments, case, control, target, confounders, covariates, Ψ̂ - )) -end - """ bit_distances(sample_grm, nτs) @@ -80,40 +42,59 @@ end default_τs(nτs;max_τ=2) = Float32[max_τ*(i-1)/(nτs-1) for i in 1:nτs] +retrieve_sample_ids(sample_ids::AbstractVector, batch_results) = sample_ids + +retrieve_sample_ids(index::Int, batch_results) = batch_results[index].SAMPLE_IDS -function build_work_list(prefix, grm_ids) +function update_work_lists_with!(result::TMLE.ComposedEstimate, sample_ids, batch_results, grm_ids, results, influence_curves, n_obs) + for estimate in result.estimates + update_work_lists_with!(estimate, sample_ids, batch_results, grm_ids, results, influence_curves, n_obs) + end +end + +function update_work_lists_with!(result, sample_ids, batch_results, grm_ids, results, influence_curves, n_obs) + if length(result.IC) > 0 + sample_ids = string.(retrieve_sample_ids(sample_ids, batch_results)) + push!(influence_curves, align_ic(result.IC, sample_ids, grm_ids)) + push!(n_obs, size(sample_ids, 1)) + push!(results, result) + end +end + +function build_work_list(prefix, grm_ids; estimator_key=:TMLE) dirname_, prefix_ = splitdir(prefix) dirname__ = dirname_ == "" ? "." : dirname_ hdf5files = filter( x -> startswith(x, prefix_) && endswith(x, ".hdf5"), readdir(dirname__) ) - hdf5files = [joinpath(dirname_, x) for x in hdf5files] + hdf5files = sort([joinpath(dirname_, x) for x in hdf5files]) influence_curves = Vector{Float32}[] n_obs = Int[] - sieve_df = sieve_dataframe() + results = [] for hdf5file in hdf5files jldopen(hdf5file) do io - # templateΨs = io["parameters"] - # results = io["results"] for key in keys(io) - result_group = io[key] - tmleresult = io[key]["result"] - Ψ = tmleresult.parameter - sample_ids = haskey(result_group, "sample_ids") ? 
result_group["sample_ids"] : - io[string(result_group["sample_ids_idx"])]["sample_ids"] - sample_ids = string.(sample_ids) - Ψ̂ = TMLE.estimate(tmleresult.tmle) - - push!(influence_curves, align_ic(tmleresult.tmle.IC, sample_ids, grm_ids)) - push!(n_obs, size(sample_ids, 1)) - push_sieveless!(sieve_df, Ψ, Ψ̂) + batch_results = io[key] + for nt_result in batch_results + result = nt_result[estimator_key] + result isa FailedEstimate && continue + sample_ids = nt_result.SAMPLE_IDS + update_work_lists_with!( + result, + sample_ids, + batch_results, + grm_ids, results, + influence_curves, + n_obs + ) + end end end end influence_curves = length(influence_curves) > 0 ? reduce(vcat, transpose(influence_curves)) : Matrix{Float32}(undef, 0, 0) - return sieve_df, influence_curves, n_obs + return results, influence_curves, n_obs end @@ -184,7 +165,6 @@ function compute_variances(influence_curves, grm, τs, n_obs) return variances end - function grm_rows_bounds(n_samples) bounds = Pair{Int, Int}[] start_idx = 1 @@ -198,59 +178,77 @@ function grm_rows_bounds(n_samples) return bounds end - -function save_results(outprefix, output, τs, variances) - CSV.write(string(outprefix, ".csv"), output) - jldopen(string(outprefix, ".hdf5"), "w") do io +function save_results(filename, results, τs, variances) + jldopen(filename, "w") do io io["taus"] = τs io["variances"] = variances + io["results"] = results end end +corrected_stderrors(variances) = + sqrt.(view(maximum(variances, dims=1), 1, :)) -corrected_stderrors(variances, n_obs) = - sqrt.(view(maximum(variances, dims=1), 1, :) ./ n_obs) - -function update_sieve_df!(df, stds) - n = size(stds, 1) - df.SIEVE_STD = Vector{Float64}(undef, n) - df.SIEVE_PVALUE = Vector{Float64}(undef, n) - df.SIEVE_LWB = Vector{Float64}(undef, n) - df.SIEVE_UPB = Vector{Float64}(undef, n) - - for index in 1:n - std = stds[index] - estimate = df.TMLE_ESTIMATE[index] - testresult = OneSampleZTest(estimate, std, 1) - lwb, upb = confint(testresult) - df.SIEVE_STD[index] = std - df.SIEVE_PVALUE[index] = pvalue(testresult) - df.SIEVE_LWB[index] = lwb - df.SIEVE_UPB[index] = upb - end +with_updated_std(estimate::T, std) where T = T( + estimate.estimand, + estimate.estimate, + convert(Float64, std), + estimate.n, + Float64[] +) - select!(df, Not(:TMLE_ESTIMATE)) -end +with_updated_std(results, stds, estimator_key) = + [NamedTuple{(estimator_key,)}([with_updated_std(result, std)]) for (result, std) in zip(results, stds)] -function sieve_variance_plateau(parsed_args) - prefix = parsed_args["prefix"] - outprefix = parsed_args["out-prefix"] - verbosity = parsed_args["verbosity"] - τs = default_τs(parsed_args["nb-estimators"];max_τ=parsed_args["max-tau"]) - grm, grm_ids = readGRM(parsed_args["grm-prefix"]) +""" + sieve_variance_plateau(input_prefix; + out="svp.hdf5", + grm_prefix="GRM", + verbosity=0, + n_estimators=10, + max_tau=0.8, + estimator_key="TMLE" + ) + +Sieve Variance Plateau CLI. + +# Args + +- `input-prefix`: Input prefix to HDF5 files generated by the tmle CLI. + +# Options + +- `-o, --out`: Output filename. +- `-g, --grm-prefix`: Prefix to the aggregated GRM. +- `-v, --verbosity`: Verbosity level. +- `-n, --n_estimators`: Number of variance estimators to build for each estimate. +- `-m, --max_tau`: Maximum distance between any two individuals. +- `-e, --estimator-key`: Estimator to use to proceed with sieve variance correction. 
+""" +function sieve_variance_plateau(input_prefix::String; + out::String="svp.hdf5", + grm_prefix::String="GRM", + verbosity::Int=0, + n_estimators::Int=10, + max_tau::Float64=0.8, + estimator_key::String="TMLE" + ) + estimator_key = Symbol(estimator_key) + τs = default_τs(n_estimators;max_τ=max_tau) + grm, grm_ids = readGRM(grm_prefix) verbosity > 0 && @info "Preparing work list." - sieve_df, influence_curves, n_obs = build_work_list(prefix, grm_ids) + results, influence_curves, n_obs = build_work_list(input_prefix, grm_ids, estimator_key=estimator_key) if length(influence_curves) > 0 verbosity > 0 && @info "Computing variance estimates." variances = compute_variances(influence_curves, grm, τs, n_obs) - std_errors = corrected_stderrors(variances, n_obs) - update_sieve_df!(sieve_df, std_errors) + std_errors = corrected_stderrors(variances) + results = with_updated_std(results, std_errors, estimator_key) else variances = Float32[] end - save_results(outprefix, sieve_df, τs, variances) + save_results(out, results, τs, variances) verbosity > 0 && @info "Done." return 0 diff --git a/src/summary.jl b/src/summary.jl new file mode 100644 index 0000000..a83a383 --- /dev/null +++ b/src/summary.jl @@ -0,0 +1,67 @@ + +function files_matching_prefix_and_suffix(prefix, suffix) + dirname_, prefix_ = splitdir(prefix) + dirname__ = dirname_ == "" ? "." : dirname_ + files = filter( + x -> startswith(x, prefix_) && endswith(x, suffix), + readdir(dirname__) + ) + return [joinpath(dirname_, x) for x in files] +end + +read_output_with_types(file) = + CSV.read(file, DataFrame, types=Dict(key => String for key in joining_keys())) + +""" + make_summary( + prefix; + outputs=Outputs(json=JSONOutput(filename="summary.json")) + ) + +Combines multiple TMLE .hdf5 output files in a single file. Multiple formats can be output at once. + +# Args + +- `prefix`: Prefix to .hdf5 files to be used to create the summary file + +# Options + +- `-o, --outputs`: Ouptuts configuration. +""" +function make_summary( + prefix::String; + outputs::Outputs=Outputs() + ) + + # Initialize output files + initialize(outputs) + actual_outputs = [getfield(outputs, field) for field ∈ fieldnames(Outputs) + if getfield(outputs, field).filename !== nothing] + + # Get all input .hdf5 files + dirname_, prefix_ = splitdir(prefix) + dirname__ = dirname_ == "" ? "." 
: dirname_ + files = sort(filter( + x -> startswith(x, prefix_), + readdir(dirname__) + )) + nfiles = length(files) + + # Write to files + for (file_index, filename) in enumerate(files) + filepath = joinpath(dirname_, filename) + jldopen(filepath) do io + batch_keys = collect(keys(io)) + nbatches = length(batch_keys) + for (batch_index, batch_key) in enumerate(batch_keys) + results = io[batch_key] + finalize = file_index == nfiles && batch_index == nbatches + for output in actual_outputs + update_file(output, results; finalize=finalize) + end + end + end + end + + return 0 +end \ No newline at end of file diff --git a/src/tmle.jl b/src/tmle.jl deleted file mode 100644 index 9f0a62e..0000000 --- a/src/tmle.jl +++ /dev/null @@ -1,94 +0,0 @@ -struct MissingTMLEResult - parameter::TMLE.Parameter -end - -function try_tmle!(cache; verbosity=1, threshold=1e-8, weighted_fluctuation=false) - try - tmle_result, _ = tmle!(cache; verbosity=verbosity, threshold=threshold, weighted_fluctuation=weighted_fluctuation) - return tmle_result, missing - catch e - @warn string("Failed to run Targeted Estimation for parameter:", cache.Ψ) - return MissingTMLEResult(cache.Ψ), string(e) - end -end - - -function partition_tmle!( - cache, - tmle_results, - logs, - partition, - tmle_spec, - parameters, - variables; - verbosity=0) - for (partition_index, param_index) in enumerate(partition) - previous_target_is_binary = isdefined(cache, :Ψ) ? cache.Ψ.target ∈ variables.binarytargets : nothing - Ψ = parameters[param_index] - # Update cache with new Ψ - update!(cache, Ψ) - # Maybe update cache with new η_spec - target_is_binary = Ψ.target ∈ variables.binarytargets - if !isdefined(cache, :η_spec) || !(target_is_binary === previous_target_is_binary) - Q_spec = target_is_binary ? tmle_spec.Q_binary : tmle_spec.Q_continuous - η_spec = NuisanceSpec(Q_spec, tmle_spec.G, cache=tmle_spec.cache) - update!(cache, η_spec) - end - # Run TMLE - tmle_result, log = TargetedEstimation.try_tmle!( - cache; - verbosity=verbosity, - threshold=tmle_spec.threshold, - weighted_fluctuation=tmle_spec.weighted_fluctuation - ) - # Update results - tmle_results[partition_index] = tmle_result - logs[partition_index] = log - - # Try clean C memory - GC.gc() - if Sys.islinux() - ccall(:malloc_trim, Cvoid, (Cint,), 0) - end - end -end - -function tmle_estimation(parsed_args) - datafile = parsed_args["data"] - paramfile = parsed_args["param-file"] - estimatorfile = parsed_args["estimator-file"] - verbosity = parsed_args["verbosity"] - csv_file = parsed_args["csv-out"] - jld2_file = parsed_args["hdf5-out"] - pval_threshold = parsed_args["pval-threshold"] - chunksize = parsed_args["chunksize"] - - # Load dataset - dataset = TargetedEstimation.instantiate_dataset(datafile) - # Read parameter files - parameters = TargetedEstimation.read_parameters(paramfile, dataset) - optimize_ordering!(parameters) - - # Get covariate, confounder and treatment columns - variables = TargetedEstimation.variables(parameters, dataset) - TargetedEstimation.coerce_types!(dataset, variables) - - # Retrieve TMLE specifications - tmle_spec = TargetedEstimation.load_tmle_spec(estimatorfile) - - cache = TMLECache(dataset) - nparams = size(parameters, 1) - for partition in Iterators.partition(1:nparams, chunksize) - partition_size = size(partition, 1) - tmle_results = Vector{Union{TMLE.TMLEResult, MissingTMLEResult}}(undef, partition_size) - logs = Vector{Union{String, Missing}}(undef, partition_size) - partition_tmle!(cache, tmle_results, logs, partition, tmle_spec, parameters, 
variables; verbosity=verbosity) - # Append CSV result with partition - append_csv(csv_file, tmle_results, logs) - # Append HDF5 result if save-ic is true - update_jld2_output(jld2_file, partition, tmle_results, dataset; pval_threshold=pval_threshold) - end - - verbosity >= 1 && @info "Done." - return 0 -end diff --git a/src/utils.jl b/src/utils.jl index 9ca8dea..8fa48ca 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,188 +1,111 @@ - - ##################################################################### -#####  CSV OUTPUT #### +#####  Read TMLE Estimands Configuration #### ##################################################################### - -empty_tmle_output(;size=0) = DataFrame( - PARAMETER_TYPE=Vector{String}(undef, size), - TREATMENTS=Vector{String}(undef, size), - CASE=Vector{String}(undef, size), - CONTROL=Vector{Union{Missing, String}}(undef, size), - TARGET=Vector{String}(undef, size), - CONFOUNDERS=Vector{String}(undef, size), - COVARIATES=Vector{Union{Missing, String}}(undef, size), - INITIAL_ESTIMATE=Vector{Union{Missing, Float64}}(undef, size), - TMLE_ESTIMATE=Vector{Union{Missing, Float64}}(undef, size), - TMLE_STD=Vector{Union{Missing, Float64}}(undef, size), - TMLE_PVALUE=Vector{Union{Missing, Float64}}(undef, size), - TMLE_LWB=Vector{Union{Missing, Float64}}(undef, size), - TMLE_UPB=Vector{Union{Missing, Float64}}(undef, size), - ONESTEP_ESTIMATE=Vector{Union{Missing, Float64}}(undef, size), - ONESTEP_STD=Vector{Union{Missing, Float64}}(undef, size), - ONESTEP_PVALUE=Vector{Union{Missing, Float64}}(undef, size), - ONESTEP_LWB=Vector{Union{Missing, Float64}}(undef, size), - ONESTEP_UPB=Vector{Union{Missing, Float64}}(undef, size), - LOG=Vector{Union{Missing, String}}(undef, size) -) - -covariates_string(Ψ; join_string="_&_") = - length(Ψ.covariates) != 0 ? join(Ψ.covariates, join_string) : missing - -function param_string(param::T) where T <: TMLE.Parameter - str = string(T) - return startswith(str, "TMLE.") ? 
str[6:end] : str
+function convert_treatment_values(treatment_levels::NamedTuple{names, <:Tuple{Vararg{NamedTuple}}}, treatment_types) where names
+    return [(
+        case = convert(treatment_types[tn], treatment_levels[tn].case),
+        control = convert(treatment_types[tn], treatment_levels[tn].control)
+        )
+        for tn in names]
 end
 
-case(nt::NamedTuple) = nt.case
-case(x) = x
-case_string(Ψ; join_string="_&_") = join((case(x) for x in values(Ψ.treatment)), join_string)
+convert_treatment_values(treatment_levels::NamedTuple{names,}, treatment_types) where names =
+    [convert(treatment_types[tn], treatment_levels[tn]) for tn in names]
 
-control_string(t::Tuple{Vararg{NamedTuple}}; join_string="_&_") =
-    join((val.control for val in t), join_string)
+MissingSCMError() = ArgumentError(string("A Structural Causal Model should be provided in the configuration file in order to identify causal estimands."))
 
-control_string(t; join_string="_&_") = missing
+get_identification_method(method::Nothing) = BackdoorAdjustment()
+get_identification_method(method) = method
 
-control_string(Ψ::TMLE.Parameter; join_string="_&_") =
-    control_string(values(Ψ.treatment); join_string=join_string)
-
-treatment_string(Ψ; join_string="_&_") = join(keys(Ψ.treatment), join_string)
-confounders_string(Ψ; join_string="_&_") = join(Ψ.confounders, join_string)
-
-
-function statistics_from_estimator(estimator)
-    Ψ̂ = TMLE.estimate(estimator)
-    std = √(var(estimator))
-    testresult = OneSampleTTest(estimator)
-    pval = pvalue(testresult)
-    l, u = confint(testresult)
-    return (Ψ̂, std, pval, l, u)
+function read_estimands_config(filename)
+    if endswith(filename, ".json")
+        TMLE.read_json(filename, use_mmap=false)
+    elseif endswith(filename, ".yaml")
+        TMLE.read_yaml(filename)
+    elseif endswith(filename, ".jls")
+        return deserialize(filename)
+    else
+        throw(ArgumentError(string("Can't read from ", last(splitext(filename)), " file")))
+    end
 end
 
-function statistics_from_result(result::TMLE.TMLEResult)
-    Ψ̂₀ = result.initial
-    # TMLE stats
-    tmle_stats = statistics_from_estimator(result.tmle)
-    # OneStep stats
-    onestep_stats = statistics_from_estimator(result.onestep)
-    return Ψ̂₀, tmle_stats, onestep_stats
+function fix_treatment_values!(treatment_types::AbstractDict, Ψ::ComposedEstimand, dataset)
+    new_args = Tuple(fix_treatment_values!(treatment_types, arg, dataset) for arg in Ψ.args)
+    return ComposedEstimand(Ψ.f, new_args)
 end
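The helpers above coerce the case/control levels read from a configuration file to the column types actually present in the dataset. A hypothetical illustration:

```julia
# Integer levels parsed from, e.g., a YAML file are converted to the dataset's column type (here Bool).
convert_treatment_values((T = (case = 1, control = 0),), Dict(:T => Bool))
# -> [(case = true, control = false)]
```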
 
-statistics_from_result(result::MissingTMLEResult) =
-    missing,
-    (missing, missing, missing, missing, missing),
-    (missing, missing, missing, missing, missing)
-
-function append_csv(filename, tmle_results, logs)
-    data = empty_tmle_output(size=size(tmle_results, 1))
-    for (i, (result, log)) in enumerate(zip(tmle_results, logs))
-        Ψ = result.parameter
-        param_type = param_string(Ψ)
-        treatments = treatment_string(Ψ)
-        case = case_string(Ψ)
-        control = control_string(Ψ)
-        confounders = confounders_string(Ψ)
-        covariates = covariates_string(Ψ)
-        Ψ̂₀, tmle_stats, onestep_stats = statistics_from_result(result)
-        data[i, :] = (
-            param_type, treatments, case, control, string(Ψ.target), confounders, covariates,
-            Ψ̂₀, tmle_stats..., onestep_stats..., log
-        )
+"""
+Uses the values found in the dataset to create a new estimand with adjusted values.
+"""
+function fix_treatment_values!(treatment_types::AbstractDict, Ψ, dataset)
+    treatment_names = keys(Ψ.treatment_values)
+    for tn in treatment_names
+        haskey(treatment_types, tn) ? nothing : treatment_types[tn] = eltype(dataset[!, tn])
     end
-    CSV.write(filename, data, append=true, header=!isfile(filename))
+    new_treatment = NamedTuple{treatment_names}(
+        convert_treatment_values(Ψ.treatment_values, treatment_types)
+    )
+    return typeof(Ψ)(
+        outcome = Ψ.outcome,
+        treatment_values = new_treatment,
+        treatment_confounders = Ψ.treatment_confounders,
+        outcome_extra_covariates = Ψ.outcome_extra_covariates
+    )
 end
 
+"""
+    proofread_estimands(filename, dataset)
 
-#####################################################################
-#####                        JLD2 OUTPUT                         ####
-#####################################################################
-
-update_jld2_output(jld2_file::Nothing, partition, tmle_results, dataset; pval_threshold=0.05) = nothing
-
-function update_jld2_output(jld2_file::String, partition, tmle_results, dataset; pval_threshold=0.05)
-    if jld2_file !== nothing
-        jldopen(jld2_file, "a+", compress=true) do io
-            # Append only with results passing the threshold
-            previous_variables = nothing
-            sample_ids_idx = nothing
-
-            for (partition_index, param_index) in enumerate(partition)
-                r = tmle_results[partition_index]
-                if (r isa TMLE.TMLEResult) && (pvalue(OneSampleZTest(r.tmle)) <= pval_threshold)
-                    current_variables = variables(r.parameter)
-                    if previous_variables != current_variables
-                        sample_ids = TargetedEstimation.get_sample_ids(dataset, current_variables)
-                        io["$param_index/sample_ids"] = sample_ids
-                        sample_ids_idx = param_index
-                    end
-                    io["$param_index/result"] = r
-                    io["$param_index/sample_ids_idx"] = sample_ids_idx
-
-                    previous_variables = current_variables
-                end
-            end
-        end
-    end
-end
 
+Reads estimands from file and ensures that the treatment values in the config file
+respect the treatment types in the dataset.
+"""
+function proofread_estimands(filename, dataset)
+    config = read_estimands_config(filename)
+    adjustment_method = get_identification_method(config.adjustment)
+    estimands = Vector{TMLE.Estimand}(undef, length(config.estimands))
+    treatment_types = Dict()
+    for (index, Ψ) in enumerate(config.estimands)
+        statisticalΨ = identify(Ψ, config.scm, method=adjustment_method)
+        estimands[index] = fix_treatment_values!(treatment_types, statisticalΨ, dataset)
+    end
+    return estimands
 end
 
-#####################################################################
-#####                      Read Parameters                       ####
-#####################################################################
-
-
-function treatment_values(Ψ::Union{IATE, ATE}, treatment_names, treatment_types)
-    return [(
-        case = convert(treatment_types[tn], Ψ.treatment[tn].case),
-        control = convert(treatment_types[tn], Ψ.treatment[tn].control)
-    )
-    for tn in treatment_names]
-end
+"""
+This explicitly requires that the following columns belong to the dataset:
 
-treatment_values(Ψ::CM, treatment_names, treatment_types) =
-    [convert(treatment_types[tn], Ψ.treatment[tn]) for tn in treatment_names]
+- `T`: for the treatment variable
+- `Y`: for the outcome variable
+- `^W`: for the confounding variables
 
+All ATE parameters are generated.
""" - parameters_from_yaml(param_file, dataset) +function TMLE.factorialATE(dataset) + colnames = names(dataset) + "T" ∈ colnames || throw(ArgumentError("No column 'T' found in the dataset for the treatment variable.")) + "Y" ∈ colnames || throw(ArgumentError("No column 'Y' found in the dataset for the outcome variable.")) + confounding_variables = Tuple(name for name in colnames if occursin(r"^W", name)) + length(confounding_variables) > 0 || throw(ArgumentError("Could not find any confounding variable (starting with 'W') in the dataset.")) + + return [factorialATE(dataset, (:T, ), :Y; confounders=confounding_variables)] +end -Reads parameters from file and ensures that the parameters treatment in the config file -respect the treatment types in the dataset. -""" -function read_parameters(param_file, dataset) - parameters = if any(endswith(param_file, ext) for ext in ("yaml", "yml")) - parameters_from_yaml(param_file) +function build_estimands_list(estimands_pattern, dataset) + estimands = if estimands_pattern == "factorialATE" + factorialATE(dataset) else - deserialize(param_file) + proofread_estimands(estimands_pattern, dataset) end - - treatment_types = Dict() - for index in eachindex(parameters) - Ψ = parameters[index] - treatment_names = keys(Ψ.treatment) - for tn in treatment_names - haskey(treatment_types, tn) ? nothing : treatment_types[tn] = eltype(dataset[!, tn]) - end - new_treatment = NamedTuple{treatment_names}( - treatment_values(Ψ, treatment_names, treatment_types) - ) - parameters[index] = typeof(Ψ)( - target = Ψ.target, - treatment = new_treatment, - confounders = Ψ.confounders, - covariates = Ψ.covariates - ) - end - return collect(parameters) + return estimands end ##################################################################### #####  ADDITIONAL METHODS #### ##################################################################### -function get_sample_ids(data, variables) - cols = [:SAMPLE_ID, variables.target, variables.treatments..., variables.confounders..., variables.covariates...] 
- return dropmissing(data[!, cols]).SAMPLE_ID -end +TMLE.emptyIC(nt::NamedTuple{names}, pval_threshold) where names = + NamedTuple{names}([TMLE.emptyIC(result, pval_threshold) for result in nt]) """ instantiate_dataset(path::String) @@ -194,13 +117,15 @@ instantiate_dataset(path::String) = isbinary(col, dataset) = Set(unique(skipmissing(dataset[!, col]))) == Set([0, 1]) +make_categorical(x::CategoricalVector, ordered) = x +make_categorical(x, ordered) = categorical(x, ordered=ordered) function make_categorical!(dataset, colname::Union{String, Symbol}; infer_ordered=false) ordered = false if infer_ordered ordered = eltype(dataset[!, colname]) <: Real end - dataset[!, colname] = categorical(dataset[!, colname], ordered=ordered) + dataset[!, colname] = make_categorical(dataset[!, colname], ordered) end function make_categorical!(dataset, colnames; infer_ordered=false) @@ -209,8 +134,10 @@ function make_categorical!(dataset, colnames; infer_ordered=false) end end +make_float(x) = float(x) + make_float!(dataset, colname::Union{String, Symbol}) = - dataset[!, colname] = float(dataset[!, colname]) + dataset[!, colname] = make_float(dataset[!, colname]) function make_float!(dataset, colnames) for colname in colnames @@ -218,55 +145,45 @@ function make_float!(dataset, colnames) end end -function coerce_types!(dataset, variables) - # Treatment columns are converted to categorical - make_categorical!(dataset, variables.treatments, infer_ordered=true) - # Confounders and Covariates are converted to Float64 - make_float!(dataset, vcat(variables.confounders, variables.covariates)) - # Binary targets are converted to categorical - make_categorical!(dataset, variables.binarytargets, infer_ordered=false) - # Continuous targets are converted to Float64 - make_float!(dataset, variables.continuoustargets) +function coerce_types!(dataset, Ψ::ComposedEstimand) + for arg in Ψ.args + coerce_types!(dataset, arg) + end end -variables(Ψ::TMLE.Parameter) = ( - target = Ψ.target, - covariates = Ψ.covariates, - confounders = Ψ.confounders, - treatments = keys(Ψ.treatment) - ) - -function variables(parameters::Vector{<:TMLE.Parameter}, dataset) - treatments = Set{Symbol}() - confounders = Set{Symbol}() - covariates = Set{Symbol}() - binarytargets = Set{Symbol}() - continuoustargets = Set{Symbol}() - for Ψ in parameters - push!(treatments, keys(Ψ.treatment)...) - push!(confounders, Ψ.confounders...) - length(Ψ.covariates) > 0 && push!(covariates, Ψ.covariates...) - isbinary(Ψ.target, dataset) ? 
push!(binarytargets, Ψ.target) : push!(continuoustargets, Ψ.target) - end - return ( - treatments=treatments, - confounders=confounders, - covariates=covariates, - binarytargets=binarytargets, - continuoustargets - ) +function coerce_types!(dataset, Ψ) + # Make Treatments categorical but preserve order + categorical_variables = Set(keys(Ψ.treatment_values)) + make_categorical!(dataset, categorical_variables, infer_ordered=true) + # Make Confounders and extra covariates continuous + continuous_variables = Set(Iterators.flatten(values(Ψ.treatment_confounders))) + union!(continuous_variables, Ψ.outcome_extra_covariates) + make_float!(dataset, continuous_variables) + # Make outcome categorical if binary but do not infer order + if TMLE.is_binary(dataset, Ψ.outcome) + make_categorical!(dataset, Ψ.outcome, infer_ordered=false) + else + make_float!(dataset, Ψ.outcome) + end end -load_tmle_spec(file::Nothing) = ( - cache = false, - weighted_fluctuation = false, - threshold = 1e-8, - Q_continuous = LinearRegressor(), - Q_binary = LogisticClassifier(lambda=0.), - G = LogisticClassifier(lambda=0.) - ) +variables(Ψ::TMLE.ComposedEstimand) = union((variables(arg) for arg in Ψ.args)...) -function load_tmle_spec(file) +variables(Ψ::TMLE.Estimand) = Set([ + Ψ.outcome, + keys(Ψ.treatment_values)..., + Ψ.outcome_extra_covariates..., + Iterators.flatten(values(Ψ.treatment_confounders))... + ]) + +function load_tmle_spec(;file="glmnet") + file = endswith(file, ".jl") ? file : joinpath( + pkgdir(TargetedEstimation), + "estimators-configs", + string(file, ".jl")) include(abspath(file)) - return merge(load_tmle_spec(nothing), tmle_spec::NamedTuple) -end \ No newline at end of file + return ESTIMATORS +end + +TMLE.to_dict(nt::NamedTuple{names, <:Tuple{Vararg{Union{TMLE.EICEstimate, FailedEstimate, TMLE.ComposedEstimate}}}}) where names = + Dict(key => TMLE.to_dict(val) for (key, val) ∈ zip(keys(nt), nt)) \ No newline at end of file diff --git a/sysimage/Project.toml b/sysimage/Project.toml deleted file mode 100644 index fbb19f0..0000000 --- a/sysimage/Project.toml +++ /dev/null @@ -1,2 +0,0 @@ -[deps] -PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d" diff --git a/sysimage/create_sysimage.jl b/sysimage/create_sysimage.jl deleted file mode 100644 index 2537aa8..0000000 --- a/sysimage/create_sysimage.jl +++ /dev/null @@ -1,11 +0,0 @@ - -SYSIMAGE_DIR = dirname(@__FILE__) -push!(LOAD_PATH, SYSIMAGE_DIR) - -using PackageCompiler - -create_sysimage( - ["TargetedEstimation"]; - sysimage_path="TargetedEstimationSysimage.so", - precompile_execution_file=joinpath(SYSIMAGE_DIR, "precompile_exec_file.jl") -) \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 465caa7..440b866 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -6,14 +6,14 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" -MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c" MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91" -RCall = "6f49c342-dc21-5d91-9882-a32aef131414" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" StableRNGs = 
"860ef19b-820b-49d6-a774-d7a799459cd3" TMLE = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" @@ -23,6 +23,4 @@ YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] Distributions = "0.25" LogExpFunctions = "0.3" -MLJGLMInterface = "0.3" -RCall = "0.13" StableRNGs = "1.0" diff --git a/test/cache_managers.jl b/test/cache_managers.jl new file mode 100644 index 0000000..1567cad --- /dev/null +++ b/test/cache_managers.jl @@ -0,0 +1,113 @@ +module TestRunner + +using TargetedEstimation +using Test +using TMLE + +@testset "Test NoCacheManager" begin + cache_manager = TargetedEstimation.NoCacheManager() + cache_manager.cache["Toto"] = 1 + cache_manager.cache["Tata"] = 2 + TargetedEstimation.release!(cache_manager, nothing) + @test cache_manager.cache == Dict() + # Check this does not throw + TargetedEstimation.release!(cache_manager, nothing) +end + +@testset "Test MaxSizeCacheManager" begin + cache_manager = TargetedEstimation.MaxSizeCacheManager(3) + Y_T₁T₂ = TMLE.ConditionalDistribution(:Y, (:T₁, :T₂, :W)) + cache_manager.cache[Y_T₁T₂] = 1 + T₁_W = TMLE.ConditionalDistribution(:T₁, (:W,)) + cache_manager.cache[T₁_W] = 1 + T₂_W = TMLE.ConditionalDistribution(:T₂, (:W,)) + cache_manager.cache[T₂_W] = 1 + η = TMLE.CMRelevantFactors( + Y_T₁T₂, + (T₁_W, T₂_W) + ) + cache_manager.cache[η] = 1 + cache_manager.cache[:last_fluctuation] = 1 + length(cache_manager.cache) == 5 + TargetedEstimation.release!(cache_manager, nothing) + # CMRelevantFactors and fluctuation dropped + @test cache_manager.cache == Dict( + TMLE.ConditionalDistribution(:Y, (:T₁, :T₂, :W)) => 1, + TMLE.ConditionalDistribution(:T₂, (:W,)) => 1, + TMLE.ConditionalDistribution(:T₁, (:W,)) => 1 + ) +end + +@testset "Test ReleaseUnusableCacheManager" begin + estimands = [ + ATE( + outcome=:Y, + treatment_values=(T₁=(case=1, control=0), T₂=(case=1, control=0)), + treatment_confounders=(T₁=[:W], T₂=[:W]) + ), + ATE( + outcome=:Y, + treatment_values=(T₁=(case=1, control=0), T₂=(case=2, control=0)), + treatment_confounders=(T₁=[:W], T₂=[:W]) + ), + ATE( + outcome=:Y, + treatment_values=(T₁=(case=1, control=0),), + treatment_confounders=(T₁=[:W],) + ), + ATE( + outcome=:Ynew, + treatment_values=(T₃=(case=1, control=0),), + treatment_confounders=(T₃=[:W],) + ) + ] + η_counts = TMLE.nuisance_function_counts(estimands) + cache_manager = TargetedEstimation.ReleaseUnusableCacheManager(η_counts) + # Estimation of the first estimand will fill the cache with the following + Y_T₁T₂ = TMLE.ConditionalDistribution(:Y, (:T₁, :T₂, :W)) + cache_manager.cache[Y_T₁T₂] = 1 + T₁_W = TMLE.ConditionalDistribution(:T₁, (:W,)) + cache_manager.cache[T₁_W] = 1 + T₂_W = TMLE.ConditionalDistribution(:T₂, (:W,)) + cache_manager.cache[T₂_W] = 1 + η = TMLE.CMRelevantFactors( + Y_T₁T₂, + (T₁_W, T₂_W) + ) + cache_manager.cache[η] = 1 + cache_manager.cache[:last_fluctuation] = 1 + @test length(cache_manager.cache) == 5 + # After estimation of the first estimand, the fluctuation and composite factor are released + TargetedEstimation.release!(cache_manager, estimands[1]) + @test cache_manager.cache == Dict( + TMLE.ConditionalDistribution(:Y, (:T₁, :T₂, :W)) => 1, + TMLE.ConditionalDistribution(:T₂, (:W,)) => 1, + TMLE.ConditionalDistribution(:T₁, (:W,)) => 1 + ) + + # Estimation of the second estimand will restore the composite factor + cache_manager.cache[η] = 1 + cache_manager.cache[:last_fluctuation] = 1 + # Y_T₁T₂ and T₂_W are no longer needed + TargetedEstimation.release!(cache_manager, estimands[2]) + @test cache_manager.cache == 
Dict(TMLE.ConditionalDistribution(:T₁, (:W,)) => 1) + + # Estimation of the third estimand will fill the cache with the following + Y_T₁ = TMLE.ConditionalDistribution(:Y, (:T₁, :W)) + cache_manager.cache[Y_T₁] = 1 + η = TMLE.CMRelevantFactors( + Y_T₁, + (T₁_W, ) + ) + cache_manager.cache[η] = 1 + cache_manager.cache[:last_fluctuation] = 1 + # Y_T₁ and T₁_W are no longer needed + TargetedEstimation.release!(cache_manager, estimands[3]) + @test cache_manager.cache == Dict() + # Check this does not throw + TargetedEstimation.release!(cache_manager, estimands[1]) +end + +end + +true \ No newline at end of file diff --git a/test/config/failing_parameters.yaml b/test/config/failing_parameters.yaml deleted file mode 100644 index 9991cd1..0000000 --- a/test/config/failing_parameters.yaml +++ /dev/null @@ -1,6 +0,0 @@ - Parameters: - - type: ATE - target: EXTREME_BINARY - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] \ No newline at end of file diff --git a/test/config/ose_config.jl b/test/config/ose_config.jl new file mode 100644 index 0000000..28f2367 --- /dev/null +++ b/test/config/ose_config.jl @@ -0,0 +1,13 @@ + +default_models = TMLE.default_models( + # For the estimation of E[Y|W, T]: continuous target + Q_continuous = LinearRegressor(), + # For the estimation of E[Y|W, T]: binary target + Q_binary = LogisticClassifier(), + # For the estimation of p(T| W) + G = LogisticClassifier() +) + +ESTIMATORS = ( + OSE = OSE(models=default_models), +) \ No newline at end of file diff --git a/test/config/parameters.bin b/test/config/parameters.bin deleted file mode 100644 index 3fe2558..0000000 Binary files a/test/config/parameters.bin and /dev/null differ diff --git a/test/config/parameters.yaml b/test/config/parameters.yaml deleted file mode 100644 index 8399487..0000000 --- a/test/config/parameters.yaml +++ /dev/null @@ -1,29 +0,0 @@ -Parameters: - - type: IATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] - - type: IATE - target: "BINARY/TARGET" - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] - - type: ATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1),) - confounders: [W1, W2] - - type: IATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1), T2 = (control = 1, case = 0)) - confounders: [W1, W2] - - type: IATE - target: "BINARY/TARGET" - treatment: (T1 = (control = 0, case = 1), T2 = (control = 1, case = 0)) - confounders: [W1, W2] - covariates: [C1] - - type: ATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] \ No newline at end of file diff --git a/test/config/problematic_tmle_ose_config.jl b/test/config/problematic_tmle_ose_config.jl new file mode 100644 index 0000000..a517cf4 --- /dev/null +++ b/test/config/problematic_tmle_ose_config.jl @@ -0,0 +1,14 @@ +default_models = TMLE.default_models( + Q_continuous = LinearRegressor(), + # For the estimation of E[Y|W, T]: binary target + Q_binary = LogisticClassifier(), + # This will fail + G = LogisticClassifier() +) + +models = merge(default_models, (T2 = LinearRegressor(),)) + +ESTIMATORS = ( + TMLE = TMLEE(models=models, weighted=true, ps_lowerbound=0.001), + OSE = OSE(models=models) +) \ No newline at end of file diff --git a/test/config/sieve_tests_parameters_1.yaml 
b/test/config/sieve_tests_parameters_1.yaml deleted file mode 100644 index 510b500..0000000 --- a/test/config/sieve_tests_parameters_1.yaml +++ /dev/null @@ -1,32 +0,0 @@ -Parameters: - - type: IATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] - - type: IATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1), T2 = (control = 1, case = 0)) - confounders: [W1, W2] - covariates: [C1] - - type: ATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] - - type: IATE - target: "BINARY/TARGET" - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] - - type: IATE - target: "BINARY/TARGET" - treatment: (T1 = (control = 0, case = 1), T2 = (control = 1, case = 0)) - confounders: [W1, W2] - covariates: [C1] - - type: ATE - target: "BINARY/TARGET" - treatment: (T1 = (control = 0, case = 1), T2 = (control = 0, case = 1)) - confounders: [W1, W2] - covariates: [C1] - \ No newline at end of file diff --git a/test/config/sieve_tests_parameters_2.yaml b/test/config/sieve_tests_parameters_2.yaml deleted file mode 100644 index 371868c..0000000 --- a/test/config/sieve_tests_parameters_2.yaml +++ /dev/null @@ -1,9 +0,0 @@ -Parameters: - - type: ATE - target: CONTINUOUS, TARGET - treatment: (T1 = (control = 0, case = 1),) - confounders: [W1] - - type: CM - target: CONTINUOUS, TARGET - treatment: (T1 = 0,) - confounders: [W1] \ No newline at end of file diff --git a/test/config/tmle_config_2.jl b/test/config/tmle_config_2.jl deleted file mode 100644 index d77aaa0..0000000 --- a/test/config/tmle_config_2.jl +++ /dev/null @@ -1,44 +0,0 @@ - -evotree = EvoTreeClassifier(nrounds=10) - -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true`` may result in faster execution but higher memory usage - cache = true, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = true, - # Propensity score threshold - threshold = 1e-8, - # For the estimation of E[Y|W, T]: continuous target - Q_continuous = Stack( - metalearner = LinearRegressor(fit_intercept=false), - cache = true, - resampling = AdaptiveCV(), - interaction_glmnet = Pipeline( - interaction_transformer = InteractionTransformer(order=3), - glmnet = GLMNetRegressor(), - cache = true - ), - evo_1 = EvoTreeRegressor(nrounds=10, lambda=0., gamma=0.3), - evo_2 = EvoTreeRegressor(nrounds=10, lambda=1., gamma=0.3), - evo_3 = EvoTreeRegressor(nrounds=20, lambda=0., gamma=0.3), - evo_4 = EvoTreeRegressor(nrounds=20, lambda=1., gamma=0.3), - constant = ConstantRegressor(), - hal = HALRegressor(max_degree=1, smoothness_orders=1, num_knots=[10, 5], lambda=10, cv_select=false) - ), - # For the estimation of E[Y|W, T]: binary target - Q_binary = Pipeline( - interaction_transformer = InteractionTransformer(order=2), - glmnet = GLMNetClassifier(), - cache = false - ), - # For the estimation of p(T| W) - G = TunedModel( - model = evotree, - resampling = CV(), - tuning = Grid(goal=5), - range = [range(evotree, :max_depth, lower=3, upper=5), range(evotree, :lambda, lower=1e-5, upper=10, scale=:log)], - measure = log_loss, - cache=true - ) -) - diff --git a/test/config/tmle_config.jl b/test/config/tmle_ose_config.jl similarity index 76% rename from test/config/tmle_config.jl rename to test/config/tmle_ose_config.jl index 4281e80..1997b70 100644 --- 
a/test/config/tmle_config.jl +++ b/test/config/tmle_ose_config.jl @@ -1,16 +1,10 @@ evotree = EvoTreeClassifier(nrounds=10) -tmle_spec = ( - # Controls caching of data by MLJ machines: turning to `true` may result in faster execution but higher memory usage - cache=false, - # Controls whether the fluctuation is weighted or not - weighted_fluctuation = false, - # Propensity score threshold - threshold = 0.001, - # For the estimation of E[Y|W, T]: continuous target +default_models = TMLE.default_models( Q_continuous = Stack( metalearner = LinearRegressor(fit_intercept=false), resampling = CV(nfolds=2), + cache = false, interaction_glmnet = Pipeline( interaction_transformer = RestrictedInteractionTransformer(order=3, primary_variables_patterns=[r"^rs[0-9]+"]), glmnet = GLMNetRegressor(), @@ -19,7 +13,6 @@ tmle_spec = ( evo_10 = EvoTreeRegressor(nrounds=10), evo_20 = EvoTreeRegressor(nrounds=20), constant = ConstantRegressor(), - hal = HALRegressor(max_degree=1, smoothness_orders=1, num_knots=[10, 5], lambda=10, cv_select=false) ), # For the estimation of E[Y|W, T]: binary target Q_binary = Stack( @@ -32,7 +25,6 @@ tmle_spec = ( cache = false ), constant = ConstantClassifier(), - hal = HALClassifier(max_degree=1, smoothness_orders=1, num_knots=[10, 5], lambda=10, cv_select=false), gridsearch_evo = TunedModel( model = evotree, resampling = CV(), @@ -46,6 +38,7 @@ tmle_spec = ( G = Stack( metalearner = LogisticClassifier(lambda=0., fit_intercept=false), resampling = StratifiedCV(nfolds=2), + cache = false, interaction_glmnet = Pipeline( interaction_transformer = RestrictedInteractionTransformer( order=2, @@ -58,4 +51,9 @@ tmle_spec = ( constant = ConstantClassifier(), evo = EvoTreeClassifier(nrounds=10) ) +) + +ESTIMATORS = ( + TMLE = TMLEE(models=default_models, weighted=true, ps_lowerbound=0.001), + OSE = OSE(models=default_models) ) \ No newline at end of file diff --git a/test/data/merge/empty_sieve.csv b/test/data/merge/empty_sieve.csv deleted file mode 100644 index 4b160ac..0000000 --- a/test/data/merge/empty_sieve.csv +++ /dev/null @@ -1 +0,0 @@ -PARAMETER_TYPE,TREATMENTS,CASE,CONTROL,TARGET,CONFOUNDERS,COVARIATES diff --git a/test/data/merge/sieve_output_1.csv b/test/data/merge/sieve_output_1.csv deleted file mode 100644 index 119c5fa..0000000 --- a/test/data/merge/sieve_output_1.csv +++ /dev/null @@ -1,7 +0,0 @@ -PARAMETER_TYPE,TREATMENTS,CASE,CONTROL,TARGET,CONFOUNDERS,COVARIATES,SIEVE_STD,SIEVE_PVALUE,SIEVE_LWB,SIEVE_UPB -IATE,T2_&_T1,1_&_1,0_&_0,"CONTINUOUS, TARGET",W1_&_W2,C1,0.10514479130506516,0.0,0.39844897646996624,0.42804034052713935 -IATE,T2_&_T1,0_&_1,1_&_0,"CONTINUOUS, TARGET",W1_&_W2,C1,0.10514479130506516,0.0,-0.42804034052713935,-0.39844897646996624 -ATE,T2_&_T1,1_&_1,0_&_0,"CONTINUOUS, TARGET",W1_&_W2,C1,0.14041906595230103,0.0,-0.6661267914170061,-0.6266080320986587 -IATE,T2_&_T1,1_&_1,0_&_0,BINARY/TARGET,W1_&_W2,C1,0.3126221001148224,0.9941422670223119,-0.04427033991279833,0.04393992135752185 -IATE,T2_&_T1,0_&_1,1_&_0,BINARY/TARGET,W1_&_W2,C1,0.3126221001148224,0.9941422670223119,-0.04393992135752185,0.04427033991279833 -ATE,T2_&_T1,1_&_1,0_&_0,BINARY/TARGET,W1_&_W2,C1,0.3183199465274811,2.0186741955776768e-7,-0.16400271059341004,-0.07418473022532235 \ No newline at end of file diff --git a/test/data/merge/sieve_output_2.csv b/test/data/merge/sieve_output_2.csv deleted file mode 100644 index 1de809d..0000000 --- a/test/data/merge/sieve_output_2.csv +++ /dev/null @@ -1,3 +0,0 @@ 
-PARAMETER_TYPE,TREATMENTS,CASE,CONTROL,TARGET,CONFOUNDERS,COVARIATES,SIEVE_STD,SIEVE_PVALUE,SIEVE_LWB,SIEVE_UPB -ATE,T1,1,0,"CONTINUOUS, TARGET",W1,,0.17398861050605774,0.0,-1.1780933630666999,-1.1291269782236455 -CM,T1,0,,"CONTINUOUS, TARGET",W1,,0.09048639982938766,0.0,3.4078416054701566,3.433307593526622 diff --git a/test/data/merge/tmle_output_1.csv b/test/data/merge/tmle_output_1.csv deleted file mode 100644 index 21d2ae0..0000000 --- a/test/data/merge/tmle_output_1.csv +++ /dev/null @@ -1,7 +0,0 @@ -PARAMETER_TYPE,TREATMENTS,CASE,CONTROL,TARGET,CONFOUNDERS,COVARIATES,INITIAL_ESTIMATE,TMLE_ESTIMATE,TMLE_STD,TMLE_PVALUE,TMLE_LWB,TMLE_UPB,ONESTEP_ESTIMATE,ONESTEP_STD,ONESTEP_PVALUE,ONESTEP_LWB,ONESTEP_UPB,LOG -IATE,T2_&_T1,1_&_1,0_&_0,"CONTINUOUS, TARGET",W1_&_W2,C1,0.31289224196843934,0.4132446584985528,0.11215083413905078,0.0002973305980956673,0.19204601585918746,0.6344433011379181,0.847922052214297,0.020796900602100377,0.1808979087784819,0.935635289898083,0.26988547749823344, -IATE,T2_&_T1,0_&_1,1_&_0,"CONTINUOUS, TARGET",W1_&_W2,C1,-0.31289224196843934,-0.4132446584985528,0.11215083413905078,0.0002973305980956673,-0.6344433011379181,-0.19204601585918746,0.1132683682000456,0.007992877453115943,0.05298134725065751,0.3761329000024115,0.8446783494259822, -ATE,T2_&_T1,1_&_1,0_&_0,"CONTINUOUS, TARGET",W1_&_W2,C1,-0.6913496525247373,-0.6463674117578324,0.14465023358495563,1.340594727468874e-5,-0.9316656493686948,-0.36106917414697,0.5750731876257001,0.6616018441386402,0.6626823260683342,0.9943324985582943,0.379330384132208, -IATE,T2_&_T1,1_&_1,0_&_0,BINARY/TARGET,W1_&_W2,C1,0.015114902768326591,-0.0001652092776382386,0.3106592646611987,0.9995762353961254,-0.6129084528900649,0.6125780343347885,0.18524882713929447,0.6791824198934945,0.375539677029601,0.2563919842828919,0.48004747095683487, -IATE,T2_&_T1,0_&_1,1_&_0,BINARY/TARGET,W1_&_W2,C1,-0.015114902768326591,0.0001652092776382386,0.3106592646611987,0.9995762353961254,-0.6125780343347885,0.6129084528900649,0.8483223420477747,0.6377742233856645,0.653653091532227,0.6594583118531032,0.3862219522578093, -ATE,T2_&_T1,1_&_1,0_&_0,BINARY/TARGET,W1_&_W2,C1,-0.07124029524113125,-0.1190937204093662,0.3182495428000389,0.7086573850781657,-0.7468080019909507,0.5086205611722183,0.7833975115669672,0.13752408975674002,0.8906874812178406,0.7407394467826026,0.6391102550858685, diff --git a/test/data/merge/tmle_output_2.csv b/test/data/merge/tmle_output_2.csv deleted file mode 100644 index 4e76aa3..0000000 --- a/test/data/merge/tmle_output_2.csv +++ /dev/null @@ -1,3 +0,0 @@ -PARAMETER_TYPE,TREATMENTS,CASE,CONTROL,TARGET,CONFOUNDERS,COVARIATES,INITIAL_ESTIMATE,TMLE_ESTIMATE,TMLE_STD,TMLE_PVALUE,TMLE_LWB,TMLE_UPB,ONESTEP_ESTIMATE,ONESTEP_STD,ONESTEP_PVALUE,ONESTEP_LWB,ONESTEP_UPB,LOG -ATE,T1,1,0,"CONTINUOUS, TARGET",W1,,-1.170325854136744,,,,,,,,,,,"Error" -CM,T1,0,,"CONTINUOUS, TARGET",W1,,3.4304882451014653,3.4205745994983894,0.08649674229047534,1.6698354099787253e-94,3.249974334825743,3.5911748641710357,0.11925931782610122,0.1908267610598129,0.3548787761302413,0.6543239505251285,0.8668053182115685, diff --git a/test/data/sieve_variances.hdf5 b/test/data/sieve_variances.hdf5 deleted file mode 100644 index ea0776c..0000000 Binary files a/test/data/sieve_variances.hdf5 and /dev/null differ diff --git a/test/load_tmle_spec.jl b/test/load_tmle_spec.jl deleted file mode 100644 index 71988cc..0000000 --- a/test/load_tmle_spec.jl +++ /dev/null @@ -1,113 +0,0 @@ -module TestsStackBuilding - -using Test -using TargetedEstimation -using MLJ -using MLJGLMInterface 
-using MLJLinearModels -using EvoTrees - -@testset "Test tmle_spec_from_yaml: Only Stacks" begin - tmle_spec = TargetedEstimation.load_tmle_spec(joinpath("config", "tmle_config.jl")) - - @test tmle_spec.threshold == 0.001 - @test tmle_spec.weighted_fluctuation == false - # Test binary target TMLE's Qstack - Q_binary = tmle_spec.Q_binary - @test Q_binary.cache == false - ## Checking Qstack.metalearner - @test Q_binary.metalearner isa LogisticClassifier - @test Q_binary.metalearner.fit_intercept == false - ## Checking Qstack.resampling - @test Q_binary.resampling isa StratifiedCV - @test Q_binary.resampling.nfolds == 2 - ## Checking Qstack EvoTree models - @test Q_binary.gridsearch_evo.tuning.goal == 5 - @test Q_binary.gridsearch_evo.cache == false - @test Q_binary.gridsearch_evo.model.nrounds == 10 - @test Q_binary.gridsearch_evo.resampling isa CV - ranges = Q_binary.gridsearch_evo.range - @test ranges[2].lower == 1e-5 - @test ranges[2].upper == 10 - @test ranges[2].scale == :log - @test ranges[1].lower == 3 - @test ranges[1].upper == 5 - @test ranges[1].scale == :linear - ## Checking Qstack Interaction Logistic models - @test Q_binary.interaction_glmnet isa MLJ.ProbabilisticPipeline - @test Q_binary.interaction_glmnet.interaction_transformer.order == 2 - ## Checking Qstack HAL model - @test Q_binary.hal.lambda == 10 - @test Q_binary.hal.smoothness_orders == 1 - @test Q_binary.hal.cv_select == false - @test Q_binary.hal.num_knots == [10, 5] - - # Test continuous target TMLE's Qstack - Q_continuous = tmle_spec.Q_continuous - ## Checking Qstack.metalearner - @test Q_continuous.metalearner isa MLJLinearModels.LinearRegressor - @test Q_continuous.metalearner.fit_intercept == false - - ## Checking Qstack.resampling - @test Q_continuous.resampling isa CV - @test Q_continuous.resampling.nfolds == 2 - ## Checking Qstack EvoTree models - @test Q_continuous.evo_10.nrounds == 10 - @test Q_continuous.evo_20.nrounds == 20 - ## Checking Qstack Interaction Linear model - @test Q_continuous.interaction_glmnet isa MLJ.DeterministicPipeline - @test Q_continuous.interaction_glmnet.interaction_transformer.order == 3 - @test Q_continuous.interaction_glmnet.interaction_transformer.primary_variables == [] - @test Q_continuous.interaction_glmnet.interaction_transformer.primary_variables_patterns == [r"^rs[0-9]+"] - ## Checking Qstack HAL model - @test Q_continuous.hal.lambda == 10 - @test Q_continuous.hal.smoothness_orders == 1 - @test Q_continuous.hal.cv_select == false - @test Q_continuous.hal.num_knots == [10, 5] - - # TMLE G Stack - G = tmle_spec.G - ## Checking Gstack.metalearner - @test G.metalearner isa LogisticClassifier - @test G.metalearner.fit_intercept == false - ## Checking Gstack.resampling - @test G.resampling isa StratifiedCV - @test G.resampling.nfolds == 2 - ## Checking Gstack models - @test G.interaction_glmnet.interaction_transformer.order == 2 - @test G.interaction_glmnet.interaction_transformer.primary_variables == [:T1, :T2] - @test G.interaction_glmnet.interaction_transformer.primary_variables_patterns == [r"C"] - @test G.evo.nrounds == 10 - - @test tmle_spec.cache == false -end - -@testset "Test tmle_spec_from_yaml: Simple models and GridSearch" begin - tmle_spec = TargetedEstimation.load_tmle_spec(joinpath("config", "tmle_config_2.jl")) - @test tmle_spec.G.cache == true - @test tmle_spec.weighted_fluctuation == true - @test tmle_spec.G.measure isa LogLoss - @test tmle_spec.G.tuning.goal == 5 - @test tmle_spec.G.model.nrounds == 10 - lambda_range = tmle_spec.G.range[2] - @test 
lambda_range.lower == 1e-5 - @test lambda_range.upper == 10 - @test lambda_range.scale == :log - depth_range = tmle_spec.G.range[1] - @test depth_range.lower == 3 - @test depth_range.upper == 5 - @test depth_range.scale == :linear - - @test tmle_spec.Q_binary isa MLJ.ProbabilisticPipeline - @test tmle_spec.threshold == 1e-8 - - @test tmle_spec.Q_continuous.cache == true - @test tmle_spec.Q_continuous.interaction_glmnet.cache == true - - @test tmle_spec.cache == true -end - -end; - -true - diff --git a/test/merge.jl b/test/merge.jl deleted file mode 100644 index 038ed03..0000000 --- a/test/merge.jl +++ /dev/null @@ -1,110 +0,0 @@ -module TestMergeCSVFiles - -using TargetedEstimation -using Test -using CSV -using DataFrames - -@testset "Test merge_csv_files, no sieve file" begin - parsed_args = Dict( - "tmle-prefix" => joinpath("data", "merge", "tmle"), - "sieve-prefix" => nothing, - "out" => "output.csv" - ) - merge_csv_files(parsed_args) - output = CSV.read(parsed_args["out"], DataFrame) - @test names(output) == [ - "PARAMETER_TYPE", "TREATMENTS", "CASE", - "CONTROL", "TARGET", "CONFOUNDERS", - "COVARIATES", "INITIAL_ESTIMATE", - "TMLE_ESTIMATE", "TMLE_STD", "TMLE_PVALUE", "TMLE_LWB", "TMLE_UPB", - "ONESTEP_ESTIMATE", "ONESTEP_STD", "ONESTEP_PVALUE", "ONESTEP_LWB", "ONESTEP_UPB", - "LOG", "TRAIT_ADJUSTED_TMLE_PVALUE" - ] - @test size(output, 1) == 8 - - for (pval, adjusted_pval) in zip(output.TMLE_PVALUE, output.TRAIT_ADJUSTED_TMLE_PVALUE) - if pval === missing - @test adjusted_pval === missing - else - @test pval <= adjusted_pval - end - end - - @test output.PARAMETER_TYPE == [ - "IATE", "IATE", "ATE", - "IATE", "IATE", "ATE", - "ATE", "CM" - ] - rm(parsed_args["out"]) -end - -@testset "Test merge_csv_files, sieve file" begin - sieve_colnames = [ - "PARAMETER_TYPE", "TREATMENTS", "CASE", - "CONTROL", "TARGET", "CONFOUNDERS", - "COVARIATES", "INITIAL_ESTIMATE", - "TMLE_ESTIMATE", "TMLE_STD", "TMLE_PVALUE", "TMLE_LWB", "TMLE_UPB", - "ONESTEP_ESTIMATE", "ONESTEP_STD", "ONESTEP_PVALUE", "ONESTEP_LWB", "ONESTEP_UPB", - "LOG", "SIEVE_STD", "SIEVE_PVALUE", "SIEVE_LWB", "SIEVE_UPB", "TRAIT_ADJUSTED_TMLE_PVALUE" - ] - parsed_args = Dict( - "tmle-prefix" => joinpath("data", "merge", "tmle"), - "sieve-prefix" => joinpath("data", "merge", "sieve"), - "out" => "output.csv" - ) - merge_csv_files(parsed_args) - output = CSV.read(parsed_args["out"], DataFrame) - @test names(output) == sieve_colnames - @test size(output, 1) == 8 - @test output.SIEVE_STD isa Vector{Float64} - @test output.PARAMETER_TYPE == [ - "IATE", "IATE", "ATE", - "IATE", "IATE", "ATE", - "ATE", "CM" - ] - - parsed_args = Dict( - "tmle-prefix" => joinpath("data", "merge", "tmle"), - "sieve-prefix" => joinpath("data", "merge", "sieve_output_2"), - "out" => "output.csv" - ) - merge_csv_files(parsed_args) - output = CSV.read(parsed_args["out"], DataFrame) - @test names(output) == sieve_colnames - @test size(output, 1) == 8 - @test all(x===missing for x in output.SIEVE_STD[3:end]) - - rm(parsed_args["out"]) -end - -@testset "Test merge_csv_files, empty sieve file" begin - parsed_args = Dict( - "tmle-prefix" => joinpath("data", "merge", "tmle"), - "sieve-prefix" => joinpath("data", "merge", "empty"), - "out" => "output.csv" - ) - merge_csv_files(parsed_args) - output = CSV.read(parsed_args["out"], DataFrame) - @test names(output) == [ - "PARAMETER_TYPE", "TREATMENTS", "CASE", - "CONTROL", "TARGET", "CONFOUNDERS", - "COVARIATES", "INITIAL_ESTIMATE", - "TMLE_ESTIMATE", "TMLE_STD", "TMLE_PVALUE", "TMLE_LWB", "TMLE_UPB", - "ONESTEP_ESTIMATE", 
"ONESTEP_STD", "ONESTEP_PVALUE", "ONESTEP_LWB", "ONESTEP_UPB", - "LOG", "TRAIT_ADJUSTED_TMLE_PVALUE" - ] - @test size(output, 1) == 8 - @test output.PARAMETER_TYPE == [ - "IATE", "IATE", "ATE", - "IATE", "IATE", "ATE", - "ATE", "CM" - ] - - rm(parsed_args["out"]) -end - - -end - -true \ No newline at end of file diff --git a/test/models/biallelic_snp_encoder.jl b/test/models/biallelic_snp_encoder.jl index f929508..27d544b 100644 --- a/test/models/biallelic_snp_encoder.jl +++ b/test/models/biallelic_snp_encoder.jl @@ -17,7 +17,7 @@ using MLJBase fit!(mach, verbosity=0) fitresult = fitted_params(mach).fitresult @test fitresult == Dict(:rs1234 => 'A', :rs4567 => 'C') - Xt = transform(mach) + Xt = MLJBase.transform(mach) @test Xt.rs1234[1:3] == [1, 0, 0] @test Xt.rs1234[4] === missing @test Xt.rs4567 == [0, 1, 2, 2] @@ -49,3 +49,4 @@ end end +true \ No newline at end of file diff --git a/test/outputs.jl b/test/outputs.jl new file mode 100644 index 0000000..73b9207 --- /dev/null +++ b/test/outputs.jl @@ -0,0 +1,60 @@ +module TestOutputs + +using TargetedEstimation +using Test +using JSON + +TESTDIR = joinpath(pkgdir(TargetedEstimation), "test") + +include(joinpath(TESTDIR, "testutils.jl")) + +@testset "Test initialize" begin + outputs = TargetedEstimation.Outputs( + json = TargetedEstimation.JSONOutput(filename="output.json"), + jls = TargetedEstimation.JLSOutput(filename="output.jls"), + hdf5 = TargetedEstimation.HDF5Output(filename="output.hdf5"), + ) + + TargetedEstimation.initialize(outputs) + + @test isfile(outputs.json.filename) + @test_throws TargetedEstimation.FileExistsError(outputs.json.filename) TargetedEstimation.initialize(outputs) + rm(outputs.json.filename) + + touch(outputs.jls.filename) + @test_throws TargetedEstimation.FileExistsError(outputs.jls.filename) TargetedEstimation.initialize(outputs) + rm(outputs.jls.filename) + rm(outputs.json.filename) + + touch(outputs.hdf5.filename) + @test_throws TargetedEstimation.FileExistsError(outputs.hdf5.filename) TargetedEstimation.initialize(outputs) + rm(outputs.hdf5.filename) + rm(outputs.json.filename) +end + +@testset "Test JSON update_file" begin + results = [] + for Ψ in statistical_estimands_only_config().estimands + push!(results, ( + TMLE=TMLE.TMLEstimate(Ψ, rand(), rand(), 10, Float64[]), + OSE=TMLE.OSEstimate(Ψ, rand(), rand(), 10, Float64[]) + )) + end + tmpdir = mktempdir(cleanup=true) + jsonoutput = TargetedEstimation.JSONOutput(filename=joinpath(tmpdir, "output_test.json")) + TargetedEstimation.initialize_json(jsonoutput.filename) + TargetedEstimation.update_file(jsonoutput, results[1:3]) + TargetedEstimation.update_file(jsonoutput, results[4:end]; finalize=true) + loaded_results = TMLE.read_json(jsonoutput.filename, use_mmap=false) + @test size(loaded_results) == size(results) + for (result, loaded_result) in zip(results, loaded_results) + @test result.TMLE.estimate == loaded_result[:TMLE].estimate + @test result.TMLE.std == loaded_result[:TMLE].std + @test result.OSE.estimate == loaded_result[:OSE].estimate + @test result.OSE.std == loaded_result[:OSE].std + end +end + +end + +true \ No newline at end of file diff --git a/test/resampling.jl b/test/resampling.jl index 1e13e8d..4976dc3 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -76,7 +76,6 @@ end @test stratification_col == ["_0", "_0", "_1", "_0", "_1", "_0", "_missing"] end - @testset "Test JointStratifiedCV" begin X = ( X1 = [0, 0, 1, 0, 1, 0, missing], @@ -105,6 +104,6 @@ end @test length(ttp) == 5 end -true +end -end \ No newline at end of file +true 
\ No newline at end of file diff --git a/test/runner.jl b/test/runner.jl new file mode 100644 index 0000000..01bd6da --- /dev/null +++ b/test/runner.jl @@ -0,0 +1,305 @@ +module TestsTMLE + +using Test +using TargetedEstimation +using TMLE +using JLD2 +using CSV +using Serialization +using YAML +using JSON + +TESTDIR = joinpath(pkgdir(TargetedEstimation), "test") + +CONFIGDIR = joinpath(TESTDIR, "config") + +include(joinpath(TESTDIR, "testutils.jl")) + +@testset "Integration Test" begin + build_dataset(;n=1000, format="csv") + tmpdir = mktempdir(cleanup=true) + estimands_filename = joinpath(tmpdir, "configuration.yaml") + TMLE.write_yaml(estimands_filename, statistical_estimands_only_config()) + outputs = TargetedEstimation.Outputs( + json=TargetedEstimation.JSONOutput(filename="output.json"), + hdf5=TargetedEstimation.HDF5Output(filename="output.hdf5", pval_threshold=1., sample_ids=true), + jls=TargetedEstimation.JLSOutput(filename="output.jls", pval_threshold=1e-5), + ) + runner = Runner( + "data.csv"; + estimands=estimands_filename, + estimators=joinpath(CONFIGDIR, "tmle_ose_config.jl"), + outputs=outputs, + cache_strategy="release-unusable", + ) + partition = 4:6 + results = runner(partition) + for result in results + @test result.TMLE isa TMLE.TMLEstimate + @test result.OSE isa TMLE.OSEstimate + end + + # Save outputs + TargetedEstimation.initialize(outputs) + TargetedEstimation.save(runner, results, partition, true) + + # Test Save to JSON + loaded_results = TMLE.read_json(outputs.json.filename, use_mmap=false) + for (result, loaded_result) in zip(results, loaded_results) + @test loaded_result[:TMLE] isa TMLE.TMLEstimate + @test result.TMLE.estimate == loaded_result[:TMLE].estimate + @test loaded_result[:TMLE].IC == [] + + @test loaded_result[:OSE] isa TMLE.OSEstimate + @test result.OSE.estimate == loaded_result[:OSE].estimate + @test loaded_result[:OSE].IC == [] + end + + # Test Save to JLS + loaded_results = [] + open(outputs.jls.filename) do io + while !eof(io) + push!(loaded_results, deserialize(io)) + end + end + for (index, (result, loaded_result)) in enumerate(zip(results, loaded_results)) + @test loaded_result[:TMLE] isa TMLE.TMLEstimate + @test result.TMLE.estimate == loaded_result[:TMLE].estimate + @test loaded_result[:OSE] isa TMLE.OSEstimate + @test result.OSE.estimate == loaded_result[:OSE].estimate + @test !haskey(loaded_result, :SAMPLE_IDS) + if index ∈ (1, 2) + @test loaded_result[:TMLE].IC == [] + @test loaded_result[:OSE].IC == [] + else + @test length(loaded_result[:TMLE].IC) > 0 + @test length(loaded_result[:OSE].IC) > 0 + end + end + + # Test Save to HDF5 + hdf5file = jldopen(outputs.hdf5.filename, "r") + loaded_results = hdf5file[string("Batch_1")] + for (param_index, result) in enumerate(loaded_results) + @test result.TMLE isa TMLE.TMLEstimate + @test results[param_index].TMLE.estimate == result.TMLE.estimate + + @test result.OSE isa TMLE.OSEstimate + @test results[param_index].OSE.estimate == result.OSE.estimate + end + + @test loaded_results[1].SAMPLE_IDS == collect(2:1000) + @test size(loaded_results[1].TMLE.IC, 1) == 999 + + @test loaded_results[2].SAMPLE_IDS == 1 + @test size(loaded_results[2].TMLE.IC, 1) == 999 + + @test loaded_results[3].SAMPLE_IDS == collect(1:1000) + @test size(loaded_results[3].TMLE.IC, 1) == 1000 + + close(hdf5file) + + # Clean + rm("data.csv") + rm(outputs.jls.filename) + rm(outputs.json.filename) + rm(outputs.hdf5.filename) +end + +@testset "Test tmle: varying dataset format and chunksize" begin + tmpdir = mktempdir(cleanup=true) +
estimands_filename = joinpath(tmpdir, "configuration.json") + configuration = statistical_estimands_only_config() + TMLE.write_json(estimands_filename, configuration) + outputs = TargetedEstimation.Outputs( + json=TargetedEstimation.JSONOutput(filename="output.json"), + hdf5=TargetedEstimation.HDF5Output(filename="output.hdf5", pval_threshold=1.), + ) + estimatorfile = joinpath(CONFIGDIR, "tmle_ose_config.jl") + # Run tests over CSV and Arrow data formats + for format in ("csv", "arrow") + datafile = string("data.", format) + build_dataset(;n=1000, format=format) + for chunksize in (4, 10) + tmle(datafile; + estimands=estimands_filename, + estimators=estimatorfile, + outputs=outputs, + chunksize=chunksize, + ) + + results_from_hdf5 = jldopen(outputs.hdf5.filename) do io + map(keys(io)) do key + io[key] + end + end + results_from_hdf5 = vcat(results_from_hdf5...) + results_from_json = TMLE.read_json(outputs.json.filename, use_mmap=false) + + for i in 1:6 + Ψ = configuration.estimands[i] + for estimator_name in (:OSE, :TMLE) + @test Ψ == results_from_hdf5[i][estimator_name].estimand == results_from_json[i][estimator_name].estimand + @test results_from_hdf5[i][estimator_name].estimate == results_from_json[i][estimator_name].estimate + end + end + + # Clean + rm(outputs.hdf5.filename) + rm(outputs.json.filename) + end + GC.gc() # release the memory-mapped Arrow file so it can be deleted + rm(datafile) + end +end + +@testset "Test tmle: lower p-value threshold, JSON output only" begin + build_dataset(;n=1000, format="csv") + tmpdir = mktempdir(cleanup=true) + estimandsfile = joinpath(tmpdir, "configuration.json") + configuration = statistical_estimands_only_config() + TMLE.write_json(estimandsfile, configuration) + estimatorfile = joinpath(CONFIGDIR, "ose_config.jl") + datafile = "data.csv" + + # Using the main entry point + main([ + "tmle", + datafile, + "--estimands", estimandsfile, + "--estimators", estimatorfile, + "--json-output", "output.json,1e-15"] + ) + + # Essential results + results_from_json = TMLE.read_json("output.json", use_mmap=false) + n_non_empty_ICs = 0 + for result in results_from_json + if result[:OSE].IC != [] + n_non_empty_ICs += 1 + end + end + @test n_non_empty_ICs > 0 + + rm(datafile) + rm("output.json") +end + +@testset "Test tmle: Failing estimands" begin + build_dataset(;n=1000, format="csv") + outputs = TargetedEstimation.Outputs( + json=TargetedEstimation.JSONOutput(filename="output.json"), + hdf5=TargetedEstimation.HDF5Output(filename="output.hdf5") + ) + tmpdir = mktempdir(cleanup=true) + estimandsfile = joinpath(tmpdir, "configuration.json") + configuration = statistical_estimands_only_config() + TMLE.write_json(estimandsfile, configuration) + estimatorfile = joinpath(CONFIGDIR, "problematic_tmle_ose_config.jl") + datafile = "data.csv" + + runner = Runner(datafile; + estimands=estimandsfile, + estimators=estimatorfile, + outputs=outputs + ); + runner() + + # Test failed nuisance estimates (T2 model) + @test runner.failed_nuisance == Set([ + TMLE.ConditionalDistribution(:T2, (:W1, :W2)) + ]) + + # Check results from JSON + results_from_json = TMLE.read_json(outputs.json.filename, use_mmap=false) + for estimator in (:OSE, :TMLE) + @test results_from_json[1][estimator][:error] == "Could not fit the following propensity score model: P₀(T2 | W1, W2)" + @test results_from_json[1][estimator][:estimand] isa TMLE.Estimand + @test results_from_json[2][estimator] isa TMLE.EICEstimate + for i in 3:6 + @test results_from_json[i][estimator][:error] == "Skipped due to shared failed nuisance fit."
+ @test results_from_json[i][estimator][:estimand] isa TMLE.Estimand + end + end + + # Check results from HDF5 + jldopen(outputs.hdf5.filename) do io + results_from_hdf5 = io["Batch_1"] + for estimator in (:OSE, :TMLE) + @test results_from_hdf5[1][estimator] isa TargetedEstimation.FailedEstimate + @test results_from_hdf5[2][estimator] isa TMLE.EICEstimate + for i in 3:6 + @test results_from_hdf5[i][estimator] isa TargetedEstimation.FailedEstimate + @test results_from_hdf5[i][estimator].estimand isa TMLE.Estimand + end + end + end + # Clean + rm(outputs.json.filename) + rm(outputs.hdf5.filename) + rm(datafile) +end + +@testset "Test tmle: Causal and Composed Estimands" begin + build_dataset(;n=1000, format="csv") + tmpdir = mktempdir(cleanup=true) + estimandsfile = joinpath(tmpdir, "configuration.jls") + + configuration = causal_and_composed_estimands_config() + serialize(estimandsfile, configuration) + estimatorfile = joinpath(CONFIGDIR, "ose_config.jl") + datafile = "data.csv" + + # Using the main entry point + main([ + "tmle", + datafile, + "--estimands", estimandsfile, + "--estimators", estimatorfile, + "--chunksize", "2", + "--json-output", "output.json", + "--hdf5-output", "output.hdf5", + "--jls-output", "output.jls" + ]) + + # JLS Output + results = [] + open("output.jls") do io + while !eof(io) + push!(results, deserialize(io)) + end + end + for (index, Ψ) ∈ enumerate(configuration.estimands) + @test results[index].OSE.estimand == identify(Ψ, configuration.scm) + end + # The components of the diff should match the estimands 1 and 2 + for index in 1:2 + ATE_from_diff = results[3].OSE.estimates[index] + ATE_standalone = results[index].OSE + @test ATE_from_diff.estimand == ATE_standalone.estimand + @test ATE_from_diff.estimate == ATE_standalone.estimate + @test ATE_from_diff.std == ATE_standalone.std + end + @test results[3].OSE isa TMLE.ComposedEstimate + + # JSON Output + results_from_json = TMLE.read_json("output.json", use_mmap=false) + @test length(results_from_json) == 3 + + # HDF5 + jldopen("output.hdf5") do io + @test length(io["Batch_1"]) == 2 + composed_result = only(io["Batch_2"]) + @test composed_result.OSE.cov == results[3].OSE.cov + end + + rm(datafile) + rm("output.jls") + rm("output.json") + rm("output.hdf5") +end + + +end; + +true \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 7c461fe..50cc785 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,9 +1,17 @@ -include("tmle.jl") -include("load_tmle_spec.jl") -include("utils.jl") -include("sieve_variance.jl") -include("merge.jl") -include("resampling.jl") -include(joinpath("models", "glmnet.jl")) -include(joinpath("models", "adaptive_interaction_transformer.jl")) -include(joinpath("models", "biallelic_snp_encoder.jl")) +using TargetedEstimation +using Test + +TESTDIR = joinpath(pkgdir(TargetedEstimation), "test") + +@time begin + @test include(joinpath(TESTDIR, "outputs.jl")) + @test include(joinpath(TESTDIR, "cache_managers.jl")) + @test include(joinpath(TESTDIR, "utils.jl")) + @test include(joinpath(TESTDIR, "sieve_variance.jl")) + @test include(joinpath(TESTDIR, "runner.jl")) + @test include(joinpath(TESTDIR, "summary.jl")) + @test include(joinpath(TESTDIR, "resampling.jl")) + @test include(joinpath(TESTDIR, "models", "glmnet.jl")) + @test include(joinpath(TESTDIR, "models", "adaptive_interaction_transformer.jl")) + @test include(joinpath(TESTDIR, "models", "biallelic_snp_encoder.jl")) +end \ No newline at end of file diff --git a/test/sieve_variance.jl b/test/sieve_variance.jl 
index b90153d..0095067 100644 --- a/test/sieve_variance.jl +++ b/test/sieve_variance.jl @@ -11,6 +11,10 @@ using StableRNGs using Distributions using LogExpFunctions +TESTDIR = joinpath(pkgdir(TargetedEstimation), "test") + +include(joinpath(TESTDIR, "testutils.jl")) + function build_dataset(sample_ids) rng = StableRNG(123) n = size(sample_ids, 1) @@ -38,30 +42,27 @@ function build_dataset(sample_ids) C1 = C₁, ) - dataset[!, "CONTINUOUS, TARGET"] = y₁ - dataset[!, "BINARY/TARGET"] = categorical(y₂) + dataset[!, "CONTINUOUS, OUTCOME"] = y₁ + dataset[!, "BINARY/OUTCOME"] = categorical(y₂) CSV.write("data.csv", dataset) end -function build_tmle_output_file(sample_ids, param_file, outprefix) +function build_tmle_output_file(sample_ids, estimandfile, outprefix; + pval=1., + estimatorfile=joinpath(TESTDIR, "config", "tmle_ose_config.jl") + ) build_dataset(sample_ids) - # Only one continuous phenotype / machines not saved / no adaptive cv - parsed_args = Dict( - "data" => "data.csv", - "param-file" => param_file, - "estimator-file" => joinpath("config", "tmle_config.jl"), - "csv-out" => string(outprefix, ".csv"), - "verbosity" => 0, - "hdf5-out" => string(outprefix, ".hdf5"), - "pval-threshold" => 1., - "chunksize" => 100 + outputs = TargetedEstimation.Outputs( + hdf5=TargetedEstimation.HDF5Output(filename=string(outprefix, ".hdf5"), pval_threshold=pval, sample_ids=true), + ) + tmle("data.csv"; + estimands=estimandfile, + estimators=estimatorfile, + outputs=outputs ) - - TargetedEstimation.tmle_estimation(parsed_args) end - function basic_variance_implementation(matrix_distance, influence_curve, n_obs) variance = 0.f0 n_samples = size(influence_curve, 1) @@ -91,7 +92,7 @@ end function test_initial_output(output, expected_output) # Metadata columns - for col in [:PARAMETER_TYPE, :TREATMENTS, :CASE, :CONTROL, :TARGET, :CONFOUNDERS, :COVARIATES] + for col in [:PARAMETER_TYPE, :TREATMENTS, :CASE, :CONTROL, :OUTCOME, :CONFOUNDERS, :COVARIATES] for index in eachindex(output[!, col]) if expected_output[index, col] === missing @test expected_output[index, col] === output[index, col] @@ -101,9 +102,8 @@ function test_initial_output(output, expected_output) end end end - @testset "Test readGRM" begin - prefix = joinpath("data", "grm", "test.grm") + prefix = joinpath(TESTDIR, "data", "grm", "test.grm") GRM, ids = TargetedEstimation.readGRM(prefix) @test eltype(ids.SAMPLE_ID) == String @test size(GRM, 1) == 18915 @@ -111,65 +111,55 @@ end end @testset "Test build_work_list" begin - grm_ids = TargetedEstimation.GRMIDs(joinpath("data", "grm", "test.grm.id")) - param_file_1 = joinpath("config", "sieve_tests_parameters_1.yaml") - outprefix_1 = "tmle_output_1" - prefix = "tmle_output" - # CASE_1: only one file - build_tmle_output_file(grm_ids.SAMPLE_ID, param_file_1, outprefix_1) - # Since pval = 1., all parameters are considered for sieve variance - sieve_df, influence_curves, n_obs = TargetedEstimation.build_work_list(prefix, grm_ids) - @test n_obs == [193, 193, 193, 194, 194, 194] + grm_ids = TargetedEstimation.GRMIDs(joinpath(TESTDIR, "data", "grm", "test.grm.id")) + tmpdir = mktempdir(cleanup=true) + configuration = statistical_estimands_only_config() + + # CASE_1: pval = 1. 
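+ # (with pval_threshold = 1 no estimate is filtered out, so all 6 estimands should enter the sieve computation)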
+ # Simulate multiple runs that occurred + config_1 = TMLE.Configuration(estimands=configuration.estimands[1:3]) + estimandsfile_1 = joinpath(tmpdir, "configuration_1.json") + TMLE.write_json(estimandsfile_1, config_1) + build_tmle_output_file(grm_ids.SAMPLE_ID, estimandsfile_1, "tmle_output_1") + + config_2 = TMLE.Configuration(estimands=configuration.estimands[4:end]) + estimandsfile_2 = joinpath(tmpdir, "configuration_2.json") + TMLE.write_json(estimandsfile_2, config_2) + build_tmle_output_file(grm_ids.SAMPLE_ID, estimandsfile_2, "tmle_output_2") + + results, influence_curves, n_obs = TargetedEstimation.build_work_list("tmle_output", grm_ids) + # Check n_obs + @test n_obs == [194, 194, 194, 193, 193, 194] # Check influence curves - io = jldopen(string(outprefix_1, ".hdf5")) - for key in keys(io) - result = io[key]["result"] - IC = result.tmle.IC - # missing sample - if result.parameter.target == Symbol("BINARY/TARGET") - IC = vcat(0, IC) - end - @test convert(Vector{Float32}, IC) == influence_curves[parse(Int, key), :] + expected_influence_curves = [size(r.IC, 1) == 194 ? r.IC : vcat(0, r.IC) for r in results] + for rowindex in 1:6 + @test convert(Vector{Float32}, expected_influence_curves[rowindex]) == influence_curves[rowindex, :] end - close(io) - # Check output - some_expected_cols = DataFrame( - PARAMETER_TYPE = ["IATE", "IATE", "ATE", "IATE", "IATE", "ATE"], - TREATMENTS = ["T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2"], - CASE=["true_&_true", "true_&_false", "true_&_true", "true_&_true", "true_&_false", "true_&_true"], - CONTROL=["false_&_false", "false_&_true", "false_&_false", "false_&_false", "false_&_true", "false_&_false"], - TARGET = ["BINARY/TARGET", "BINARY/TARGET", "BINARY/TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET"], - CONFOUNDERS = ["W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2"], - COVARIATES = ["C1", "C1", "C1", "C1", "C1", "C1"] - ) - test_initial_output(sieve_df, some_expected_cols) - - # CASE_2: add another file - param_file_2 = joinpath("config", "sieve_tests_parameters_2.yaml") - outprefix_2 = "tmle_output_2" - build_tmle_output_file(grm_ids.SAMPLE_ID, param_file_2, outprefix_2) - # This p-value filters the influence curves for the binary outcome - sieve_df, influence_curves, n_obs = TargetedEstimation.build_work_list(prefix, grm_ids) - @test size(influence_curves) == (8, 194) - @test n_obs == [193, 193, 193, 194, 194, 194, 194, 194] - - # Check output - some_expected_cols = DataFrame( - PARAMETER_TYPE = ["IATE", "IATE", "ATE", "IATE", "IATE", "ATE", "ATE", "CM"], - TREATMENTS = ["T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1", "T1"], - CASE = ["true_&_true", "true_&_false", "true_&_true", "true_&_true", "true_&_false", "true_&_true", "true", "false"], - CONTROL = ["false_&_false", "false_&_true", "false_&_false", "false_&_false", "false_&_true", "false_&_false", "false", missing], - TARGET = ["BINARY/TARGET", "BINARY/TARGET", "BINARY/TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET"], - CONFOUNDERS = ["W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1", "W1"], - COVARIATES = ["C1", "C1", "C1", "C1", "C1", "C1", missing, missing] - ) - test_initial_output(sieve_df, some_expected_cols) - + # Check results + @test all(x isa TMLE.TMLEstimate for x in results) + @test all(size(x.IC, 1) > 0 for x in results) # clean - rm(string(outprefix_1, ".hdf5")) - rm(string(outprefix_1, ".csv")) -
rm(string(outprefix_2, ".hdf5")) - rm(string(outprefix_2, ".csv")) + rm("tmle_output_1.hdf5") + rm("tmle_output_2.hdf5") + + # CASE_2: pval = 0.1 + pval = 0.1 + estimandsfile = joinpath(tmpdir, "configuration.json") + TMLE.write_json(estimandsfile, configuration) + build_tmle_output_file(grm_ids.SAMPLE_ID, estimandsfile, "tmle_output"; pval=pval) + results, influence_curves, n_obs = TargetedEstimation.build_work_list("tmle_output", grm_ids) + # Check n_obs + @test n_obs == [194, 193, 193, 194] + # Check influence curves + expected_influence_curves = [size(r.IC, 1) == 194 ? r.IC : vcat(0, r.IC) for r in results] + for rowindex in 1:4 + @test convert(Vector{Float32}, expected_influence_curves[rowindex]) == influence_curves[rowindex, :] + end + # Check results + @test all(x isa TMLE.TMLEstimate for x in results) + @test all(size(x.IC, 1) > 0 for x in results) + # Clean + rm("tmle_output.hdf5") rm("data.csv") end @@ -257,7 +247,6 @@ end # Check by hand for a single τ=0.5 @test variances[2, :] ≈ Float32[0.03666667, 0.045, 0.045] - end @testset "Test grm_rows_bounds" begin @@ -271,79 +260,116 @@ end end @testset "Test corrected_stderrors" begin - io = jldopen(joinpath("data", "sieve_variances.hdf5")) - variances = io["variances"] - n_obs = [10, 10, 10, 10, 10, 100, 100, 1000, 1000, 1000] - stderrors = TargetedEstimation.corrected_stderrors(variances, n_obs) + variances = [ + 1. 2. 6. + 4. 5. 3. + ] + stderrors = TargetedEstimation.corrected_stderrors(variances) # sanity check - @test size(stderrors, 1) == 10 + @test stderrors == sqrt.([4., 5., 6.]) +end - # check for the first curve - stderrors[1] == sqrt(maximum(variances[:,1])/n_obs[1]) +@testset "Test SVP" begin + # Generate data + grm_ids = TargetedEstimation.GRMIDs(joinpath(TESTDIR, "data", "grm", "test.grm.id")) + tmpdir = mktempdir(cleanup=true) + configuration = statistical_estimands_only_config() + pval = 0.1 + config_1 = TMLE.Configuration(estimands=configuration.estimands[1:3]) + estimandsfile_1 = joinpath(tmpdir, "configuration_1.json") + TMLE.write_json(estimandsfile_1, config_1) + build_tmle_output_file(grm_ids.SAMPLE_ID, estimandsfile_1, "tmle_output_1"; pval=pval) + + config_2 = TMLE.Configuration(estimands=configuration.estimands[4:end]) + estimandsfile_2 = joinpath(tmpdir, "configuration_2.json") + TMLE.write_json(estimandsfile_2, config_2) + build_tmle_output_file(grm_ids.SAMPLE_ID, estimandsfile_2, "tmle_output_2"; pval=pval) + + # Using the main command + main([ + "svp", + "tmle_output", + "--grm-prefix", joinpath(TESTDIR, "data", "grm", "test.grm"), + "--max-tau", "0.75" + ]) + + io = jldopen("svp.hdf5") + # Check τs + @test io["taus"] == TargetedEstimation.default_τs(10; max_τ=0.75) + # Check variances + @test size(io["variances"]) == (10, 4) + # Check results + svp_results = io["results"] + + tmleout1 = jldopen(x -> x["Batch_1"], "tmle_output_1.hdf5") + tmleout2 = jldopen(x -> x["Batch_1"], "tmle_output_2.hdf5") + src_results = [tmleout1..., tmleout2...]
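+ # Each SVP-corrected result should match exactly one source result on its estimand: + # the point estimate and n are unchanged, only the std is corrected and the IC is dropped.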
+ + for svp_result in svp_results + src_result_index = findall(x.TMLE.estimand == svp_result.TMLE.estimand for x in src_results) + src_result = src_results[only(src_result_index)] + @test src_result.TMLE.std != svp_result.TMLE.std + @test src_result.TMLE.estimate == svp_result.TMLE.estimate + @test src_result.TMLE.n == svp_result.TMLE.n + @test svp_result.TMLE.IC == [] + end close(io) + # clean + rm("svp.hdf5") + rm("tmle_output_1.hdf5") + rm("tmle_output_2.hdf5") + rm("data.csv") end -@testset "Test sieve_variance_plateau" begin +@testset "Test SVP: causal and composed estimands" begin # Generate data - nb_estimators = 10 - grm_ids = TargetedEstimation.GRMIDs(joinpath("data", "grm", "test.grm.id")) - param_file_1 = joinpath("config", "sieve_tests_parameters_1.yaml") - tmle_outprefix_1 = "tmle_output_1" - param_file_2 = joinpath("config", "sieve_tests_parameters_2.yaml") - tmle_outprefix_2 = "tmle_output_2" - build_tmle_output_file(grm_ids.SAMPLE_ID, param_file_1, tmle_outprefix_1) - build_tmle_output_file(grm_ids.SAMPLE_ID, param_file_2, tmle_outprefix_2) - - outprefix = "sieve_output" - parsed_args = Dict( - "prefix" => "tmle_output", - "pval" => 1e-10, - "grm-prefix" => "data/grm/test.grm", - "out-prefix" => outprefix, - "nb-estimators" => nb_estimators, - "max-tau" => 0.75, - "verbosity" => 0 + grm_ids = TargetedEstimation.GRMIDs(joinpath(TESTDIR, "data", "grm", "test.grm.id")) + tmpdir = mktempdir(cleanup=true) + configuration = causal_and_composed_estimands_config() + pval = 1. + configfile = joinpath(tmpdir, "configuration.json") + TMLE.write_json(configfile, configuration) + build_tmle_output_file( + grm_ids.SAMPLE_ID, + configfile, + "tmle_output"; + estimatorfile=joinpath(TESTDIR, "config", "ose_config.jl") ) - sieve_variance_plateau(parsed_args) - # check hdf5 file - io = jldopen(string(outprefix, ".hdf5")) - @test io["taus"] == TargetedEstimation.default_τs(nb_estimators; max_τ=parsed_args["max-tau"]) - @test size(io["variances"]) == (10, 8) - close(io) - # check csv file - output = TargetedEstimation.read_output_with_types(string(outprefix, ".csv")) - some_expected_cols = DataFrame( - PARAMETER_TYPE = ["IATE", "IATE", "ATE", "IATE", "IATE", "ATE", "ATE", "CM"], - TREATMENTS = ["T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1_&_T2", "T1", "T1"], - CASE = ["true_&_true", "true_&_false", "true_&_true", "true_&_true", "true_&_false", "true_&_true", "true", "false"], - CONTROL = ["false_&_false", "false_&_true", "false_&_false", "false_&_false", "false_&_true", "false_&_false", "false", missing], - TARGET = ["BINARY/TARGET", "BINARY/TARGET", "BINARY/TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET", "CONTINUOUS, TARGET"], - CONFOUNDERS = ["W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1_&_W2", "W1", "W1"], - COVARIATES = ["C1", "C1", "C1", "C1", "C1", "C1", missing, missing] - ) - test_initial_output(output, some_expected_cols) - @test output.SIEVE_PVALUE isa Vector{Float64} - @test output.SIEVE_LWB isa Vector{Float64} - @test output.SIEVE_UPB isa Vector{Float64} - @test output.SIEVE_STD isa Vector{Float64} - - tmle_output = TargetedEstimation.load_csv_files( - TargetedEstimation.empty_tmle_output(), - ["tmle_output_1.csv", "tmle_output_2.csv"] - ) + # Using the main command + main([ + "svp", + "tmle_output", + "--grm-prefix", joinpath(TESTDIR, "data", "grm", "test.grm"), + "--max-tau", "0.75", + "--estimator-key", "OSE" + ]) + + # The ComposedEstimate std is not updated but each component is. 
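+ # (presumably because the composed std is recomputed from its components' covariance)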
+ src_results = jldopen(x -> x["Batch_1"], "tmle_output.hdf5") + io = jldopen("svp.hdf5") + svp_results = io["results"] + standalone_estimates = svp_results[1:2] + from_composite = svp_results[3:4] + @test standalone_estimates[1].OSE.estimand == from_composite[1].OSE.estimand + @test standalone_estimates[2].OSE.estimand == from_composite[2].OSE.estimand + + # Check std has been updated + for i in 1:2 + @test standalone_estimates[i].OSE.estimand == src_results[i].OSE.estimand + @test standalone_estimates[i].OSE.estimate == src_results[i].OSE.estimate + @test standalone_estimates[i].OSE.std != src_results[i].OSE.std + end - joined = leftjoin(tmle_output, output, on=TargetedEstimation.joining_keys(), matchmissing=:equal) - @test all(joined.SIEVE_PVALUE .> 0 ) + close(io) + # clean - rm(string(outprefix, ".csv")) - rm(string(outprefix, ".hdf5")) - rm(string(tmle_outprefix_1, ".hdf5")) - rm(string(tmle_outprefix_1, ".csv")) - rm(string(tmle_outprefix_2, ".hdf5")) - rm(string(tmle_outprefix_2, ".csv")) + rm("svp.hdf5") + rm("tmle_output.hdf5") rm("data.csv") end end + +true diff --git a/test/summary.jl b/test/summary.jl new file mode 100644 index 0000000..40f3a73 --- /dev/null +++ b/test/summary.jl @@ -0,0 +1,89 @@ +module TestMergeCSVFiles + +using TargetedEstimation +using Test +using CSV +using DataFrames +using Serialization +using JLD2 + +TESTDIR = joinpath(pkgdir(TargetedEstimation), "test") + +CONFIGDIR = joinpath(TESTDIR, "config") + +include(joinpath(TESTDIR, "testutils.jl")) + +@testset "Test make_summary" begin + build_dataset() + datafile = "data.csv" + estimatorfile = joinpath(CONFIGDIR, "ose_config.jl") + tmpdir = mktempdir(cleanup=true) + # First Run + tmle_output_1 = TargetedEstimation.Outputs(hdf5=TargetedEstimation.HDF5Output(filename="tmle_output_1.hdf5")) + config_1 = statistical_estimands_only_config() + configfile_1 = joinpath(tmpdir, "configuration_1.json") + TMLE.write_json(configfile_1, config_1) + tmle(datafile; + estimands=configfile_1, + estimators=estimatorfile, + outputs=tmle_output_1, + chunksize=3 + ) + + # Second Run + tmle_output_2 = TargetedEstimation.Outputs(hdf5=TargetedEstimation.HDF5Output(filename="tmle_output_2.hdf5")) + config_2 = causal_and_composed_estimands_config() + configfile_2 = joinpath(tmpdir, "configuration_2.json") + TMLE.write_json(configfile_2, config_2) + tmle(datafile; + estimands=configfile_2, + estimators=estimatorfile, + outputs=tmle_output_2 + ) + + # Using the main entry point + main([ + "merge", + "tmle_output", + "--json-output", "summary.json", + "--jls-output", "summary.jls", + "--hdf5-output", "summary.hdf5" + ]) + + # Test correctness + hdf5file_1 = jldopen("tmle_output_1.hdf5") + hdf5file_2 = jldopen("tmle_output_2.hdf5") + inputs = vcat(hdf5file_1["Batch_1"], hdf5file_1["Batch_2"], hdf5file_2["Batch_1"]) + + json_outputs = TMLE.read_json("summary.json", use_mmap=false) + jls_outputs = [] + open("summary.jls") do io + while !eof(io) + push!(jls_outputs, deserialize(io)) + end + end + hdf5_output = jldopen("summary.hdf5") + hdf5_outputs = vcat((hdf5_output[key] for key in keys(hdf5_output))...) 
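+ # 9 results expected: 6 statistical estimands from the first run and 3 (two ATEs plus their difference) from the second.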
+ + @test length(inputs) == 9 + for (input, jls_output, hdf5_out, json_output) in zip(inputs, jls_outputs, hdf5_outputs, json_outputs) + @test input.OSE.estimand == jls_output.OSE.estimand == hdf5_out.OSE.estimand == json_output[:OSE].estimand + end + + close(hdf5file_1) + close(hdf5file_2) + close(hdf5_output) + + # cleanup + rm("tmle_output_1.hdf5") + rm("tmle_output_2.hdf5") + rm("summary.hdf5") + rm("summary.jls") + rm("summary.json") + rm(datafile) +end + + +end + +true \ No newline at end of file diff --git a/test/testutils.jl b/test/testutils.jl new file mode 100644 index 0000000..ef5b992 --- /dev/null +++ b/test/testutils.jl @@ -0,0 +1,130 @@ +using TMLE +using StableRNGs +using DataFrames +using Distributions +using LogExpFunctions +using CSV +using Arrow +using CategoricalArrays + +function statistical_estimands_only_config() + configuration = Configuration( + estimands=[ + IATE( + outcome = Symbol("CONTINUOUS, OUTCOME"), + treatment_values = ( + T1 = (case = true, control = false), + T2 = (case = true, control = false)), + treatment_confounders = (T1 = (:W1, :W2), T2 = (:W1, :W2)), + outcome_extra_covariates = (:C1,) + ), + ATE( + outcome = Symbol("CONTINUOUS, OUTCOME"), + treatment_values = (T1 = (case = true, control = false),), + treatment_confounders = (T1 = (:W1, :W2),), + outcome_extra_covariates = () + ), + IATE( + outcome = Symbol("CONTINUOUS, OUTCOME"), + treatment_values = ( + T1 = (case = true, control = false), + T2 = (case = false, control = true) + ), + treatment_confounders = (T1 = (:W1, :W2), T2 = (:W1, :W2)), + outcome_extra_covariates = () + ), + IATE( + outcome = Symbol("BINARY/OUTCOME"), + treatment_values = ( + T1 = (case = true, control = false), + T2 = (case = false, control = true) + ), + treatment_confounders = (T1 = (:W1, :W2), T2 = (:W1, :W2)), + outcome_extra_covariates = (:C1,) + ), + IATE( + outcome = Symbol("BINARY/OUTCOME"), + treatment_values = ( + T1 = (case = true, control = false), + T2 = (case = true, control = false)), + treatment_confounders = (T1 = (:W1, :W2), T2 = (:W1, :W2)), + outcome_extra_covariates = (:C1,) + ), + CM( + outcome = Symbol("CONTINUOUS, OUTCOME"), + treatment_values = ( + T1 = true, + T2 = false), + treatment_confounders = (T1 = (:W1, :W2), T2 = (:W1, :W2)), + outcome_extra_covariates = (:C1,) + ) + ] + ) + return configuration +end + +function causal_and_composed_estimands_config() + ATE₁ = ATE( + outcome = Symbol("CONTINUOUS, OUTCOME"), + treatment_values = (T1 = (case = true, control = false),), + ) + ATE₂ = ATE( + outcome = Symbol("CONTINUOUS, OUTCOME"), + treatment_values = (T1 = (case = false, control = true),), + ) + diff = ComposedEstimand(-, (ATE₁, ATE₂)) + scm = StaticSCM( + outcomes = ["CONTINUOUS, OUTCOME"], + treatments = ["T1"], + confounders = [:W1, :W2] + ) + configuration = Configuration( + estimands = [ATE₁, ATE₂, diff], + scm = scm + ) + return configuration +end + +""" +CONTINUOUS_OUTCOME: +- IATE(0->1, 0->1) = E[W₂] = 0.5 +- ATE(0->1, 0->1) = -4 E[C₁] + 1 + E[W₂] = -2 + 1 + 0.5 = -0.5 + +BINARY_OUTCOME: +- IATE(0->1, 0->1) = +- ATE(0->1, 0->1) = + +""" +function build_dataset(;n=1000, format="csv") + rng = StableRNG(123) + # Confounders + W₁ = rand(rng, Uniform(), n) + W₂ = rand(rng, Uniform(), n) + # Covariates + C₁ = rand(rng, n) + # Treatment | Confounders + T₁ = rand(rng, Uniform(), n) .< logistic.(0.5sin.(W₁) .- 1.5W₂) + T₂ = rand(rng, Uniform(), n) .< logistic.(-3W₁ - 1.5W₂) + # target | Confounders, Covariates, Treatments + μ = 1 .+ 2W₁ .+ 3W₂ .- 4C₁.*T₁ .+ T₁ + T₂.*W₂.*T₁ + y₁ = μ .+ 
rand(rng, Normal(0, 0.01), n) + y₂ = rand(rng, Uniform(), n) .< logistic.(μ) + # Add some missingness + y₂ = vcat(missing, y₂[2:end]) + + dataset = DataFrame( + SAMPLE_ID = 1:n, + T1 = categorical(T₁), + T2 = categorical(T₂), + W1 = W₁, + W2 = W₂, + C1 = C₁, + ) + # Comma in name + dataset[!, "CONTINUOUS, OUTCOME"] = categorical(y₁) + # Slash in name + dataset[!, "BINARY/OUTCOME"] = categorical(y₂) + dataset[!, "EXTREME_BINARY"] = categorical(vcat(0, ones(n-1))) + + format == "csv" ? CSV.write("data.csv", dataset) : Arrow.write("data.arrow", dataset) +end \ No newline at end of file diff --git a/test/tmle.jl b/test/tmle.jl deleted file mode 100644 index 281041b..0000000 --- a/test/tmle.jl +++ /dev/null @@ -1,244 +0,0 @@ -module TestsTMLE - -using Test -using TargetedEstimation -using TMLE -using JLD2 -using StableRNGs -using Distributions -using LogExpFunctions -using CategoricalArrays -using DataFrames -using CSV -using Arrow - -function test_tmle_output(param_index, jldio, data, expected_param, sample_ids_idx) - jld2_res = jldio[string(param_index)] - csv_row = data[param_index, :] - Ψ = jld2_res["result"].parameter - @test jld2_res["result"] isa TMLE.TMLEResult - @test jld2_res["result"].tmle.Ψ̂ isa Float64 - @test Ψ == expected_param - @test jld2_res["sample_ids_idx"] == sample_ids_idx - sample_ids = jldio[string(jld2_res["sample_ids_idx"])]["sample_ids"] - if expected_param.target == Symbol("BINARY/TARGET") - @test sample_ids == 2:1000 - else - @test sample_ids == 1:1000 - end - @test jld2_res["result"] isa TMLE.TMLEResult - - if csv_row.COVARIATES === missing - @test TargetedEstimation.covariates_string(Ψ) === csv_row.COVARIATES - else - @test TargetedEstimation.covariates_string(Ψ) == csv_row.COVARIATES - end - @test TargetedEstimation.param_string(Ψ) == csv_row.PARAMETER_TYPE - @test TargetedEstimation.case_string(Ψ) == csv_row.CASE - @test TargetedEstimation.control_string(Ψ) == csv_row.CONTROL - @test TargetedEstimation.treatment_string(Ψ) == csv_row.TREATMENTS - @test TargetedEstimation.confounders_string(Ψ) == csv_row.CONFOUNDERS - @test csv_row.TMLE_ESTIMATE == jld2_res["result"].tmle.Ψ̂ -end - -""" -CONTINUOUS_TARGET: -- IATE(0->1, 0->1) = E[W₂] = 0.5 -- ATE(0->1, 0->1) = -4 E[C₁] + 1 + E[W₂] = -2 + 1 + 0.5 = -0.5 - -BINARY_TARGET: -- IATE(0->1, 0->1) = -- ATE(0->1, 0->1) = - -""" -function build_dataset(;n=1000, format="csv") - rng = StableRNG(123) - # Confounders - W₁ = rand(rng, Uniform(), n) - W₂ = rand(rng, Uniform(), n) - # Covariates - C₁ = rand(rng, n) - # Treatment | Confounders - T₁ = rand(rng, Uniform(), n) .< logistic.(0.5sin.(W₁) .- 1.5W₂) - T₂ = rand(rng, Uniform(), n) .< logistic.(-3W₁ - 1.5W₂) - # target | Confounders, Covariates, Treatments - μ = 1 .+ 2W₁ .+ 3W₂ .- 4C₁.*T₁ .+ T₁ + T₂.*W₂.*T₁ - y₁ = μ .+ rand(rng, Normal(0, 0.01), n) - y₂ = rand(rng, Uniform(), n) .< logistic.(μ) - # Add some missingness - y₂ = vcat(missing, y₂[2:end]) - - dataset = DataFrame( - SAMPLE_ID = 1:n, - T1 = categorical(T₁), - T2 = categorical(T₂), - W1 = W₁, - W2 = W₂, - C1 = C₁, - ) - # Comma in name - dataset[!, "CONTINUOUS, TARGET"] = categorical(y₁) - # Slash in name - dataset[!, "BINARY/TARGET"] = categorical(y₂) - dataset[!, "EXTREME_BINARY"] = categorical(vcat(0, ones(n-1))) - - format == "csv" ? CSV.write("data.csv", dataset) : Arrow.write("data.arrow", dataset) -end - -@testset "Test partition_tmle!" 
begin - build_dataset(;n=1000, format="csv") - dataset = TargetedEstimation.instantiate_dataset("data.csv") - parameters = TargetedEstimation.read_parameters(joinpath("config", "parameters.yaml"), dataset) - variables = TargetedEstimation.variables(parameters, dataset) - TargetedEstimation.coerce_types!(dataset, variables) - tmle_spec = TargetedEstimation.load_tmle_spec(joinpath("config", "tmle_config.jl")) - cache = TMLECache(dataset) - - tmle_results = Vector{Union{TMLE.TMLEResult, TargetedEstimation.MissingTMLEResult}}(undef, 3) - logs = Vector{Union{String, Missing}}(undef, 3) - part = 4:6 - TargetedEstimation.partition_tmle!(cache, tmle_results, logs, part, tmle_spec, parameters, variables; verbosity=0) - @test [x.tmle.Ψ̂ for x in tmle_results] isa Vector{Float64} - @test [x.parameter for x in tmle_results] == parameters[part] - @test [x.onestep.Ψ̂ for x in tmle_results] isa Vector{Float64} - @test all(x === missing for x in logs) - rm("data.csv") -end - -@testset "Test tmle_estimation" begin - expected_parameters = [ - ATE(Symbol("CONTINUOUS, TARGET"), (T1 = (case = true, control = false),), [:W1, :W2], Symbol[]), - IATE(Symbol("BINARY/TARGET"), (T1 = (case = true, control = false), T2 = (case = true, control = false)), [:W1, :W2], [:C1]), - IATE(Symbol("BINARY/TARGET"), (T1 = (case = true, control = false), T2 = (case = false, control = true)), [:W1, :W2], [:C1]), - IATE(Symbol("CONTINUOUS, TARGET"), (T1 = (case = true, control = false), T2 = (case = false, control = true)), [:W1, :W2], Symbol[]), - IATE(Symbol("CONTINUOUS, TARGET"), (T1 = (case = true, control = false), T2 = (case = true, control = false)), [:W1, :W2], [:C1]), - ATE(Symbol("CONTINUOUS, TARGET"), (T1 = (case = true, control = false), T2 = (case = true, control = false)), [:W1, :W2], [:C1]) - ] - expected_param_sample_ids_idx = [1, 2, 2, 4, 5, 5] - # Run tests over CSV and Arrow data formats - for format in ("csv", "arrow") - build_dataset(;n=1000, format=format) - parsed_args = Dict( - "data" => string("data.", format), - "param-file" => nothing, - "estimator-file" => joinpath("config", "tmle_config.jl"), - "csv-out" => "output.csv", - "verbosity" => 0, - "hdf5-out" => "output.hdf5", - "pval-threshold" => 1., - "chunksize" => nothing - ) - for param_file in ("parameters.yaml", "parameters.bin") - for chunksize in (4, 10) - # Only one continuous phenotype / machines not saved / no adaptive cv - parsed_args["param-file"] = joinpath("config", param_file) - parsed_args["chunksize"] = chunksize - - tmle_estimation(parsed_args) - - # Given the threshold is 1, all - # estimation results will make the threshold - jldio = jldopen(parsed_args["hdf5-out"]) - data = CSV.read(parsed_args["csv-out"], DataFrame) - - @test all(data[i, :TMLE_ESTIMATE] != data[j, :TMLE_ESTIMATE] for i in 1:5 for j in i+1:6) - - for (param_index, (Ψ, sample_ids_idx)) in enumerate(zip(expected_parameters, expected_param_sample_ids_idx)) - test_tmle_output(param_index, jldio, data, Ψ, sample_ids_idx) - end - # Clean - rm(parsed_args["csv-out"]) - rm(parsed_args["hdf5-out"]) - end - end - rm(parsed_args["data"]) - end -end - -@testset "Test tmle_estimation: No hdf5 file" begin - build_dataset(;n=1000, format="csv") - # Only one continuous phenotype / machines not saved / no adaptive cv - param_file = "parameters.yaml" - parsed_args = Dict( - "data" => "data.csv", - "param-file" => joinpath("config", param_file), - "estimator-file" => joinpath("config", "tmle_config_2.jl"), - "csv-out" => "output.csv", - "verbosity" => 0, - "hdf5-out" => nothing, - 
"pval-threshold" => 1., - "chunksize" => 10 - ) - - tmle_estimation(parsed_args) - - ## Check CSV file - data = CSV.read(parsed_args["csv-out"], DataFrame) - @test names(TargetedEstimation.empty_tmle_output()) == names(data) - @test size(data) == (6, 19) - all(x === missing for x in data.LOG) - # Clean - rm(parsed_args["csv-out"]) - rm(parsed_args["data"]) -end - - -@testset "Test tmle_estimation: lower p-value threhsold" begin - build_dataset(;n=1000, format="csv") - parsed_args = Dict( - "data" => "data.csv", - "param-file" => joinpath("config", "parameters.yaml"), - "estimator-file" => joinpath("config", "tmle_config.jl"), - "csv-out" => "output.csv", - "verbosity" => 0, - "hdf5-out" => "output.hdf5", - "pval-threshold" => 1e-15, - "chunksize" => 10 - ) - - tmle_estimation(parsed_args) - - # Essential results - data = CSV.read(parsed_args["csv-out"], DataFrame) - jldio = jldopen(parsed_args["hdf5-out"]) - @test !haskey(jldio, "2") - @test !haskey(jldio, "3") - @test !haskey(jldio, "4") - @test !haskey(jldio, "5") - @test !haskey(jldio, "6") - - @test jldio["1"]["result"].tmle.Ψ̂ == data[1, :TMLE_ESTIMATE] - - rm(parsed_args["data"]) - rm(parsed_args["csv-out"]) - rm(parsed_args["hdf5-out"]) -end - -@testset "Test tmle_estimation: Failing parameters" begin - build_dataset(;n=1000, format="csv") - parsed_args = Dict( - "data" => "data.csv", - "param-file" => joinpath("config", "failing_parameters.yaml"), - "estimator-file" => joinpath("config", "tmle_config.jl"), - "csv-out" => "output.csv", - "verbosity" => 0, - "hdf5-out" => nothing, - "pval-threshold" => 1e-10, - "chunksize" => 10 - ) - - tmle_estimation(parsed_args) - - # Essential results - data = CSV.read(parsed_args["csv-out"], DataFrame) - @test size(data) == (1, 19) - @test data[1, :TMLE_ESTIMATE] === missing - - rm(parsed_args["data"]) - rm(parsed_args["csv-out"]) - -end - -end; - -true \ No newline at end of file diff --git a/test/utils.jl b/test/utils.jl index 480fb24..9279170 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -5,75 +5,152 @@ using TargetedEstimation using TMLE using DataFrames using CSV +using MLJBase +using MLJLinearModels using CategoricalArrays -@testset "Test CSV writing" begin - Ψ = IATE( - target=:Y, - treatment=(T₁=(case=1, control=0), T₂=(case="AC", control="CC")), - confounders=[:W₁, :W₂] - ) - @test TargetedEstimation.covariates_string(Ψ) === missing - @test TargetedEstimation.param_string(Ψ) == "IATE" - @test TargetedEstimation.case_string(Ψ) == "1_&_AC" - @test TargetedEstimation.control_string(Ψ) == "0_&_CC" - @test TargetedEstimation.treatment_string(Ψ) == "T₁_&_T₂" - @test TargetedEstimation.confounders_string(Ψ) == "W₁_&_W₂" - - Ψ = CM( - target=:Y, - treatment=(T₁=1, T₂="AC"), - confounders=[:W₁, :W₂], - covariates=[:C₁] - ) +check_type(treatment_value, ::Type{T}) where T = @test treatment_value isa T + +check_type(treatment_values::NamedTuple, ::Type{T}) where T = + @test treatment_values.case isa T && treatment_values.control isa T + +PKGDIR = pkgdir(TargetedEstimation) +TESTDIR = joinpath(PKGDIR, "test") + +include(joinpath(TESTDIR, "testutils.jl")) + +@testset "Test load_tmle_spec" begin + # Default + noarg_estimators = TargetedEstimation.load_tmle_spec() + default_models = noarg_estimators.TMLE.models + @test noarg_estimators.TMLE isa TMLEE + @test default_models.Q_binary_default.glm_net_classifier isa GLMNetClassifier + @test default_models.Q_continuous_default.glm_net_regressor isa GLMNetRegressor + @test default_models.G_default isa GLMNetClassifier + # From template name + for file in 
readdir(joinpath(PKGDIR, "estimators-configs")) + configname = replace(file, ".jl" => "") + estimators = TargetedEstimation.load_tmle_spec(;file=configname) + @test estimators.TMLE isa TMLEE + end + # From explicit file + estimators = TargetedEstimation.load_tmle_spec(file=joinpath(TESTDIR, "config", "tmle_ose_config.jl")) + @test estimators.TMLE isa TMLE.TMLEE + @test estimators.OSE isa TMLE.OSE + @test estimators.TMLE.weighted === true + @test estimators.TMLE.models.G_default === estimators.OSE.models.G_default + @test estimators.TMLE.models.G_default isa MLJBase.ProbabilisticStack +end +@testset "Test convert_treatment_values" begin + treatment_types = Dict(:T₁=> Union{Missing, Bool}, :T₂=> Int) + newT = TargetedEstimation.convert_treatment_values((T₁=1,), treatment_types) + @test newT isa Vector{Bool} + @test newT == [1] - @test TargetedEstimation.covariates_string(Ψ) === "C₁" - @test TargetedEstimation.param_string(Ψ) == "CM" - @test TargetedEstimation.case_string(Ψ) == "1_&_AC" - @test TargetedEstimation.control_string(Ψ) === missing - @test TargetedEstimation.treatment_string(Ψ) == "T₁_&_T₂" - @test TargetedEstimation.confounders_string(Ψ) == "W₁_&_W₂" + newT = TargetedEstimation.convert_treatment_values((T₁=(case=1, control=0.),), treatment_types) + @test newT isa Vector{NamedTuple{(:case, :control), Tuple{Bool, Bool}}} + @test newT == [(case = true, control = false)] + newT = TargetedEstimation.convert_treatment_values((T₁=(case=1, control=0.), T₂=(case=true, control=0)), treatment_types) + @test newT isa Vector{NamedTuple{(:case, :control)}} + @test newT == [(case = true, control = false), (case = 1, control = 0)] end -@testset "Test variables" begin - parameters = [ - IATE( - target=:Y, - treatment=(T₁=(case=1, control=0), T₂=(case="AC", control="CC")), - confounders=[:W₁, :W₂]), - CM( - target=:Y₂, - treatment=(T₁=1, T₃="AC"), - confounders=[:W₃, :W₂], - covariates=[:C₁]) - ] - dataset = DataFrame(Y=[1.1, 2.2, missing], Y₂=[1, 0, missing]) - variables = TargetedEstimation.variables(parameters, dataset) - @test variables == ( - treatments = Set([:T₃, :T₁, :T₂]), - confounders = Set([:W₁, :W₃, :W₂]), - covariates = Set([:C₁]), - binarytargets = Set([:Y₂]), - continuoustargets = Set([:Y]) +@testset "Test proofread_estimands" for extension in ("yaml", "json") + # Write estimands file + filename = "statistical_estimands.$extension" + eval(Meta.parse("TMLE.write_$extension"))(filename, statistical_estimands_only_config()) + + dataset = DataFrame(T1 = [1., 0.], T2=[true, false]) + estimands = TargetedEstimation.proofread_estimands(filename, dataset) + for estimand in estimands + if haskey(estimand.treatment_values, :T1) + check_type(estimand.treatment_values.T1, Float64) + end + if haskey(estimand.treatment_values, :T2) + check_type(estimand.treatment_values.T2, Bool) + end + end + # Clean estimands file + rm(filename) +end + +@testset "Test factorialATE" begin + dataset = DataFrame(C=[1, 2, 3, 4],) + @test_throws ArgumentError TargetedEstimation.build_estimands_list("factorialATE", dataset) + dataset.T = [0, 1, missing, 2] + @test_throws ArgumentError TargetedEstimation.build_estimands_list("factorialATE", dataset) + dataset.Y = [0, 1, 2, 2] + dataset.W1 = [1, 1, 1, 1] + dataset.W_2 = [1, 1, 1, 1] + composedATE = TargetedEstimation.build_estimands_list("factorialATE", dataset)[1] + @test composedATE.args == ( + TMLE.StatisticalATE(:Y, (T = (case = 1, control = 0),), (T = (:W1, :W_2),), ()), + TMLE.StatisticalATE(:Y, (T = (case = 2, control = 1),), (T = (:W1, :W_2),), ()) ) +end 
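+# coerce_types! is expected to cast treatments to categorical columns and numeric +# variables (confounders, covariates, continuous outcomes) to Float64, as checked below.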
+@testset "Test coerce_types!" begin + Ψ = IATE( + outcome=:Ycont, + treatment_values=(T₁=(case=1, control=0), T₂=(case="AC", control="CC")), + treatment_confounders=(T₁=[:W₁, :W₂], T₂=[:W₁, :W₂]), + ) + + dataset = DataFrame( + Ycont = [1.1, 2.2, missing], + Ycat = [1., 0., missing], + T₁ = [1, 0, missing], + T₂ = [missing, "AC", "CC"], + W₁ = [1., 0., 0.], + W₂ = [missing, 0., 0.], + C = [1, 2, 3] + ) + TargetedEstimation.coerce_types!(dataset, Ψ) + + @test dataset.T₁ isa CategoricalArray + @test dataset.T₂ isa CategoricalArray + for var in [:W₁, :W₂, :Ycont] + @test eltype(dataset[!, var]) <: Union{Missing, Float64} + end - variables = TargetedEstimation.variables(parameters[1]) - @test variables == ( - target = :Y, - covariates = Symbol[], - confounders = [:W₁, :W₂], - treatments = (:T₁, :T₂) + Ψ = IATE( + outcome=:Ycat, + treatment_values=(T₂=(case="AC", control="CC"), ), + treatment_confounders=(T₂=[:W₂],), + outcome_extra_covariates=[:C] ) + TargetedEstimation.coerce_types!(dataset, Ψ) + + @test dataset.Ycat isa CategoricalArray + @test eltype(dataset.C) <: Union{Missing, Float64} end -@testset "Test get_sample_ids" begin - variables = ( - target = :Y, - covariates = Symbol[], - confounders = [:W₁, :W₂], - treatments = (:T₁, :T₂) +@testset "Test misc" begin + Ψ = ATE( + outcome = :Y, + treatment_values = ( + T₁ = (case=1, control=0), + T₂ = (case=1, control=0)), + treatment_confounders = ( + T₁=[:W₁, :W₂], + T₂=[:W₂, :W₃] + ), + outcome_extra_covariates = [:C] + ) + variables = TargetedEstimation.variables(Ψ) + @test variables == Set([:Y, :C, :T₁, :T₂, :W₁, :W₂, :W₃]) + Ψ = ATE( + outcome = :Y, + treatment_values = ( + T₁ = (case=1, control=0), + T₂ = (case=1, control=0)), + treatment_confounders = ( + T₁=[:W₁, :W₂], + T₂=[:W₁, :W₂] + ), ) + variables = TargetedEstimation.variables(Ψ) + @test variables == Set([:Y, :T₁, :T₂, :W₁, :W₂]) data = DataFrame( SAMPLE_ID = [1, 2, 3, 4, 5], Y = [1, 2, 3, missing, 5], @@ -82,75 +159,13 @@ end T₁ = [1, 2, 3, 4, 5], T₂ = [1, 2, 3, 4, missing], ) - sample_ids = TargetedEstimation.get_sample_ids(data, variables) + sample_ids = TargetedEstimation.sample_ids_from_variables(data, variables) @test sample_ids == [2, 3] data.W₁ = [1, 2, missing, 4, 5] - sample_ids = TargetedEstimation.get_sample_ids(data, variables) + sample_ids = TargetedEstimation.sample_ids_from_variables(data, variables) @test sample_ids == [2] end -@testset "Test treatment_values" begin - treatment_types = Dict(:T₁=> Union{Missing, Bool}, :T₂=> Int) - Ψ = CM(target=:Y, treatment=(T₁=1,), confounders=[:W₁]) - newT = TargetedEstimation.treatment_values(Ψ, (:T₁,), treatment_types) - @test newT isa Vector{Bool} - @test newT == [1] - - Ψ = ATE(target=:Y, treatment=(T₁=(case=1, control=0.),), confounders=[:W₁]) - newT = TargetedEstimation.treatment_values(Ψ, (:T₁,), treatment_types) - @test newT isa Vector{NamedTuple{(:case, :control), Tuple{Bool, Bool}}} - @test newT == [(case = true, control = false)] - - Ψ = ATE(target=:Y, treatment=(T₁=(case=1, control=0.), T₂=(case=true, control=0)), confounders=[:W₁]) - newT = TargetedEstimation.treatment_values(Ψ, (:T₁, :T₂), treatment_types) - @test newT isa Vector{NamedTuple{(:case, :control)}} - @test newT == [(case = true, control = false), (case = 1, control = 0)] -end - -@testset "Test read_parameters" for param_file in ("parameters.yaml", "parameters.bin") - param_file = joinpath("config", param_file) - dataset = DataFrame(T1 = [1., 0.], T2=[true, false]) - params = TargetedEstimation.read_parameters(param_file, dataset) - for param in 
-        if haskey(param.treatment, :T1)
-            @test param.treatment.T1.case isa Float64
-            @test param.treatment.T1.control isa Float64
-        end
-        if haskey(param.treatment, :T2)
-            @test param.treatment.T2.case isa Bool
-            @test param.treatment.T2.control isa Bool
-        end
-    end
-end
-
-
-@testset "Test write_target_results with missing values" begin
-    filename = "test.csv"
-    parameters = [
-        CM(
-            target=:Y,
-            treatment=(T₁=1, T₂="AC"),
-            confounders=[:W₁, :W₂],
-            covariates=[:C₁]
-        )]
-    tmle_results = [TargetedEstimation.MissingTMLEResult(parameters[1])]
-    logs = ["Error X"]
-    TargetedEstimation.append_csv(filename, tmle_results, logs)
-    out = CSV.read(filename, DataFrame)
-    expected_out = ["CM", "T₁_&_T₂", "1_&_AC", missing, "Y", "W₁_&_W₂", "C₁",
-                    missing, missing, missing, missing, missing, missing,
-                    missing, missing, missing, missing, missing,
-                    "Error X"]
-    for (x, y) in zip(first(out), expected_out)
-        if x === missing
-            @test x === y
-        else
-            @test x == y
-        end
-    end
-    rm(filename)
-end
-
 @testset "Test make_categorical! and make_float!" begin
     dataset = DataFrame(
         T₁ = [1, 1, 0, 0],
@@ -176,6 +191,9 @@ end
     TargetedEstimation.make_float!(dataset, [:C₁])
     @test eltype(dataset.C₁) == Float64
+    # If the type is already coerced, the operation is a no-op
+    @test TargetedEstimation.make_float(dataset.C₁) === dataset.C₁
+    @test TargetedEstimation.make_categorical(dataset.T₁, true) === dataset.T₁
 end
 
 end;
diff --git a/tmle.jl b/tmle.jl
new file mode 100644
index 0000000..592b78f
--- /dev/null
+++ b/tmle.jl
@@ -0,0 +1 @@
+using TargetedEstimation; main()
\ No newline at end of file
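
A minimal usage sketch of the functionality exercised by the tests above (illustrative only; it assumes the internal functions keep the signatures tested in test/utils.jl and uses the bundled data/sample_dataset.csv, whose columns are T, W_1, W_2, Y):

    using TargetedEstimation
    using CSV, DataFrames

    # Default estimator specification: GLMNet-based TMLE,
    # as checked in "Test load_tmle_spec"
    estimators = TargetedEstimation.load_tmle_spec()

    # Generate the factorial ATE estimands from the dataset's treatment (T),
    # outcome (Y) and confounder columns, cf. "Test factorialATE"
    dataset = CSV.read("data/sample_dataset.csv", DataFrame)
    estimands = TargetedEstimation.build_estimands_list("factorialATE", dataset)

The new tmle.jl script simply loads the package and calls main(), which presumably drives this same estimation pipeline from the command line.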