forked from serrano-pozo-lab/glia-ihc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
partition-condition.Rmd
157 lines (109 loc) · 3.95 KB
/
partition-condition.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
---
title: "Partition by Condition"
description: |
  This R script partitions the data into training, test, and validation sets using stratified random sampling by condition (i.e., CTRL vs. AD).
author:
  - first_name: "Ayush"
    last_name: "Noori"
    url: https://www.github.com/ayushnoori
    affiliation: Massachusetts General Hospital
    affiliation_url: https://www.serranopozolab.org
    orcid_id: 0000-0003-1420-1236
output:
  distill::distill_article:
    toc: true
---
```{r setup, include = FALSE}
# set eval = FALSE as the default chunk option, so chunks are shown but not run when knitting
knitr::opts_chunk$set(eval = FALSE)
```
# Dependencies
Load requisite packages and define directories. Note that this script uses my personal utilities package `brainstorm`, which can be downloaded via `devtools::install_github("ayushnoori/brainstorm")`.
```{r load-packages, message=FALSE, warning=FALSE}
# data manipulation
library(data.table)
library(purrr)
library(magrittr)
# fast file system operations
library(fs)
# partition data
library(caret)
# utility functions
library(brainstorm)
```
Note that directories are relative to the R project path.
```{r define-directories}
# set directories
# ddir: ROI images, read per condition; pdir: root of the partitioned copies;
# dir1: where the partition table is saved
ddir = file.path("Data", "3 - ROIs")
pdir = file.path("Data", "4 - Condition Partition")
dir1 = file.path("Results", "CNN", "1.1 - Condition Partition")

# create file structure
# each vector is named by its own values, so purrr maps over them return
# lists named by cell type / partition / condition
celltypes = purrr::set_names(c("Astrocyte", "Microglia", "Vessel"))
grp = purrr::set_names(c("Train", "Test", "Validation"))
pheno = purrr::set_names(c("CTRL", "AD"))

# one output directory per cell type x partition x condition combination;
# expand.grid() converts character vectors to factors by default, so
# stringsAsFactors = FALSE is passed to hand plain strings to file.path()
dirs = pmap_chr(
  expand.grid(pdir, celltypes, grp, pheno, stringsAsFactors = FALSE),
  file.path
)

# remove prior directories/files if they exist
# check_dir: reset a directory to an empty state.
#   fname: path of the directory to (re)create.
#   Deletes the directory and its contents when it already exists, then
#   creates it again; returns the value of fs::dir_create().
check_dir = function(fname) {
  if (fs::dir_exists(fname)) {
    fs::dir_delete(fname)
  }
  fs::dir_create(fname)
}
walk(dirs, check_dir)
```
# Retrieve ROI Paths
Write function to retrieve ROI paths.
```{r retrieve-paths}
# retrieve_paths: list the ROI TIFF files of one crop directory.
#   fname: path of a crop directory containing "<celltype> ROIs" subdirectories.
#   Returns a list named by cell type; each element is a character vector of
#   full paths to the files ending in ".tif" found in that subdirectory.
retrieve_paths = function(fname) {
  # build the "<fname>/<celltype> ROIs" directory for every cell type
  roi_dirs = map(celltypes, function(ct) file.path(fname, paste(ct, "ROIs")))
  # list the TIFF files inside each of those directories
  map(roi_dirs, list.files, pattern = "\\.tif$", full.names = TRUE)
}
```
Then, map the function over the list of crops.
```{r get-paths}
# crop directories found directly under each condition folder of ddir
crops = list.files(file.path(ddir, pheno), full.names = TRUE)
# for every crop, a list of TIFF paths with one element per cell type
tiffs = map(crops, retrieve_paths)
# pool the paths across crops: one unnamed character vector per cell type
tiffs = map(celltypes, function(ct) unlist(map(tiffs, ct), use.names = FALSE))
```
# Partition ROIs
Define function to partition ROIs into training, test, and validation sets.
```{r partition-rois}
# partition_rois: split the ROIs of one cell type into training, test, and
# validation sets, stratified by condition, and copy the files accordingly.
#   flist: character vector of ROI file paths laid out as
#          <root>/<condition>/<crop>/<celltype> ROIs/<file>.tif
#   lab:   label of the cell type; stored in the Group column and used as the
#          first folder level below pdir.
#   Returns a data.table with one row per ROI and the columns Path, Name,
#   Group, Condition, Sample, Batch, Partition, and Output.
#   Side effects: copies every file to
#   <pdir>/<Group>/<Partition>/<Condition>/<Name> and prints summary counts.
partition_rois = function(flist, lab) {

  # fail early with a clear message instead of an obscure downstream error
  if (length(flist) == 0) {
    stop("No ROI files supplied for '", lab, "'.", call. = FALSE)
  }

  # samples labelled as batch 1; every other sample is labelled batch 2
  batch1_samples = c("1190", "1301", "2148", "2157", "2191", "2207")

  # parse metadata from file path: walk up from the file rather than counting
  # components from the left, so the result does not depend on how many
  # folders make up the data root
  crop_dir = dirname(dirname(flist))
  crop_name = basename(crop_dir)
  cond_name = basename(dirname(crop_dir))

  # construct data table
  dat = data.table(Path = flist)
  dat[, Name := basename(Path)]
  dat[, Group := lab]
  dat[, Condition := cond_name]
  # the sample ID is the part of the crop folder name before the first "_"
  dat[, Sample := map_chr(strsplit(crop_name, "_"), 1)]
  dat[, Batch := ifelse(Sample %in% batch1_samples, 1, 2)]

  # partition into test, training, and validation sets by row index, so that
  # ROIs from different crops sharing a file name cannot be confused:
  # 60% of each condition goes to training, and the remainder is split in half
  # between test and validation
  train_idx = as.integer(
    createDataPartition(dat$Condition, p = 0.6, list = FALSE)
  )
  rest_idx = setdiff(seq_len(nrow(dat)), train_idx)
  test_idx = rest_idx[as.integer(
    createDataPartition(dat$Condition[rest_idx], p = 0.5, list = FALSE)
  )]

  # create partition variable; rows in neither index set stay "Validation"
  dat[, Partition := "Validation"]
  dat[train_idx, Partition := "Train"]
  dat[test_idx, Partition := "Test"]

  # construct output path
  dat[, Output := file.path(pdir, Group, Partition, Condition, Name)]

  # copy TIFF files to appropriate output folder, creating any folder that is
  # missing first (a no-op when the folders already exist)
  fs::dir_create(unique(dirname(dat$Output)))
  fs::file_copy(dat$Path, dat$Output)

  # print results
  cat(paste("\n", lab, "ROIs:\n"))
  walk(dat[, .(Condition, Partition, Sample)], ~print(summary(factor(.x))))

  # return data table
  dat
}
```
Map function over TIFF file paths.
```{r apply-partition}
# partition ROIs: imap() passes each vector of paths together with its name
# (the cell type) to partition_rois()
all = imap(tiffs, partition_rois)
# make sure the results directory exists, since saveRDS() does not create
# missing directories and would otherwise fail after all files were copied
fs::dir_create(dir1)
# save partition result
saveRDS(all, file.path(dir1, "ROI Partition by Condition.rds"))
```