Commit

pranavanba committed Feb 28, 2024
1 parent d0c3344 commit 6e5a680
Showing 10 changed files with 99 additions and 10 deletions.
1 change: 1 addition & 0 deletions scripts/egress/egress.R
@@ -1,5 +1,6 @@
cat("Beginning egress: storing output concepts, input concept map, and input variable list in Synapse\n")

# Write the following to Synapse: 1) the final output concepts data, 2) the input data used in this pipeline
latest_commit <-
gh::gh(
endpoint = "/repos/:owner/:repo/commits/main",
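
The hunk above is truncated, but the intent is stated in the comment: record which code produced the outputs when storing them. A minimal sketch, not part of this commit, of how the fetched commit SHA could be attached as provenance; the owner/repo values, file path, and Synapse parent ID below are all hypothetical:

latest_commit <-
  gh::gh(
    endpoint = "/repos/:owner/:repo/commits/main",
    owner = "example-owner",  # hypothetical; the real values are not visible in this diff
    repo = "example-repo"
  )
# Attach the commit URL as "executed" provenance on the stored file
synapser::synStore(
  synapser::File("output_concepts.csv", parent = "syn00000000"),  # hypothetical path/parent
  executed = paste0("https://github.com/example-owner/example-repo/commit/", latest_commit$sha)
)
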
10 changes: 9 additions & 1 deletion scripts/fetch-data/fetch_data.R
@@ -4,6 +4,7 @@ library(dplyr)

cat("Fetching data\n")

# Get config variables
config::get(
file = "config/config.yml",
config = "prod"
@@ -12,6 +13,7 @@ config::get(

synLogin()

# Get input files from Synapse
concept_map <-
syn_file_to_df(ontologyFileID, "CONCEPT_CD") %>%
filter(CONCEPT_CD!="<null>")
@@ -21,7 +23,11 @@ selected_vars <-
mutate(Lower_Bound = suppressWarnings(as.numeric(Lower_Bound)),
Upper_Bound = suppressWarnings(as.numeric(Upper_Bound)))

- dataset_name_filter <- selected_vars %>% dplyr::pull(Export) %>% unique()
+ # Get list of which datasets to use
+ dataset_name_filter <-
+   selected_vars %>%
+   dplyr::pull(Export) %>%
+   unique()

# Sync S3 bucket to local
token <- synapser::synGetStsStorageToken(
@@ -43,11 +49,13 @@ if (deleteExistingDir==TRUE) {
unlink(downloadLocation, recursive = T, force = T)
}

# Only sync the bucket folders containing the datasets we need
inclusions <- paste0("--include \"*",dataset_name_filter,"*\"", collapse = " ")
sync_cmd <- glue::glue('aws s3 sync {base_s3_uri} {downloadLocation} --exclude "*" {inclusions}')
system(sync_cmd)
rm(sync_cmd)
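
To make the exclude/include pattern concrete, here is what those three lines generate for a hypothetical pair of dataset names (the real names come from the Export column of selected_vars):

dataset_name_filter <- c("fitbitactivitylogs", "fitbitdailydata")  # hypothetical
inclusions <- paste0("--include \"*", dataset_name_filter, "*\"", collapse = " ")
# inclusions: --include "*fitbitactivitylogs*" --include "*fitbitdailydata*"
sync_cmd <- glue::glue('aws s3 sync {base_s3_uri} {downloadLocation} --exclude "*" {inclusions}')
# aws s3 sync <bucket/prefix> <local dir> --exclude "*" --include "*fitbitactivitylogs*" --include "*fitbitdailydata*"
# --exclude "*" drops every key first; each --include re-adds the matching folders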

# For use in process-data steps
concept_replacements_reversed <- vec_reverse(concept_replacements)
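
vec_reverse() is not defined in this diff; assuming it swaps the names and values of a named vector, so that concept renames applied on ingest can be undone on egress, an equivalent would be:

# Hypothetical equivalent of vec_reverse(): swap names and values
vec_reverse <- function(x) stats::setNames(names(x), unname(x))
vec_reverse(c(steps = "walking_steps"))
# -> c(walking_steps = "steps")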

if (!dir.exists(outputConceptsDir)) {
9 changes: 9 additions & 0 deletions scripts/process-data/fitbitactivitylogs.R
@@ -4,18 +4,21 @@ dataset <- "fitbitactivitylogs"

cat(glue::glue("Transforming data for {dataset}"),"\n")

# Get variables for this dataset
vars <-
selected_vars %>%
filter(grepl(dataset, Export, ignore.case = TRUE)) %>%
pull(Variable)

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
select(all_of(vars)) %>%
collect()
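
arrow::open_dataset() is lazy: select() builds a query plan against the files on disk and nothing is read until collect(), so only the requested columns ever reach memory. A standalone illustration, with a hypothetical path and hypothetical column names:

ds <- arrow::open_dataset("data/dataset_fitbitactivitylogs")  # hypothetical path
df_small <- ds %>%
  dplyr::select(dplyr::all_of(c("ParticipantIdentifier", "Steps"))) %>%  # hypothetical columns
  dplyr::collect()  # materializes just these columns as an R data frame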

colnames(df) <- tolower(colnames(df))

# Create lists for ID variables and i2b2 concept variables
excluded_concepts <- c("participantidentifier", "startdate", "enddate")

approved_concepts_summarized <-
@@ -26,6 +29,7 @@ approved_concepts_summarized <-

df[approved_concepts_summarized] <- lapply(df[approved_concepts_summarized], as.numeric)

# Get QA/QC ranges for variables and exclude values outside the ranges
bounds <-
selected_vars %>%
filter(grepl(dataset, Export, ignore.case = TRUE),
@@ -46,6 +50,7 @@ for (col_name in names(df_filtered)) {
}
}
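
The body of this loop is collapsed in the diff view, but the visible pieces (numeric Lower_Bound/Upper_Bound columns and a loop over the concept columns) suggest a standard range check. A hedged sketch of that logic, with the bounds-table column names assumed:

for (col_name in names(df_filtered)) {
  match_idx <- match(col_name, tolower(bounds$Variable))  # df columns were lowercased above
  if (!is.na(match_idx)) {
    lo <- bounds$Lower_Bound[match_idx]
    hi <- bounds$Upper_Bound[match_idx]
    out_of_range <- !is.na(df_filtered[[col_name]]) &
      (df_filtered[[col_name]] < lo | df_filtered[[col_name]] > hi)
    df_filtered[[col_name]][out_of_range] <- NA  # exclude the value, keep the row
  }
}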

# Pivot data frame from wide to long
df_melted_filtered <-
df_filtered %>%
recoverSummarizeR::melt_df(excluded_concepts = excluded_concepts) %>%
@@ -57,13 +62,15 @@ df_melted_filtered <-
mutate(value = as.numeric(value))
cat("recoverSummarizeR::melt_df() completed.\n")

# Generate i2b2 summaries
df_summarized <-
df_melted_filtered %>%
select(all_of(c("participantidentifier", "startdate", "enddate", "concept", "value"))) %>%
recoverSummarizeR::stat_summarize() %>%
distinct()
cat("recoverSummarizeR::stat_summarize() completed.\n")

# Add i2b2 columns from concept map (ontology file) and clean the output
output_concepts <-
process_df(df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>%
dplyr::mutate(nval_num = signif(nval_num, 9)) %>%
@@ -73,12 +80,14 @@ output_concepts <-
dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
cat("recoverSummarizeR::process_df() completed.\n")

# Write the output
output_concepts %>%
write.csv(file.path(outputConceptsDir, glue::glue("{dataset}.csv")), row.names = F)
cat(glue::glue("output_concepts written to {file.path(outputConceptsDir, paste0(dataset, '.csv'))}"),"\n")

cat(glue::glue("Finished transforming data for {dataset}"),"\n\n")

# Remove objects created here from the global environment
rm(dataset,
vars,
df,
16 changes: 14 additions & 2 deletions scripts/process-data/fitbitdailydata.R
@@ -4,23 +4,25 @@ dataset <- "fitbitdailydata"

cat(glue::glue("Transforming data for {dataset}"),"\n")

# Get variables for this dataset
vars <-
selected_vars %>%
filter(grepl(dataset, Export, ignore.case = TRUE)) %>%
pull(Variable)

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
mutate(Tracker_Steps = as.numeric(Tracker_Steps),
HeartRateIntradayMinuteCount = as.numeric(HeartRateIntradayMinuteCount)) %>%
# filter(Tracker_Steps > 0, HeartRateIntradayMinuteCount > 0) %>%
filter(Tracker_Steps != 0,
HeartRateIntradayMinuteCount != 0 | !is.na(HeartRateIntradayMinuteCount)) %>%
select(all_of(vars)) %>%
collect()

colnames(df) <- tolower(colnames(df))

# Create lists for ID variables and i2b2 concept variables
excluded_concepts <- c("participantidentifier", "date")

approved_concepts_summarized <-
@@ -31,6 +33,7 @@ approved_concepts_summarized <-

df[approved_concepts_summarized] <- lapply(df[approved_concepts_summarized], as.numeric)

# Get QA/QC ranges for variables and exclude values outside the ranges
bounds <-
selected_vars %>%
filter(grepl(dataset, Export, ignore.case = TRUE),
@@ -51,6 +54,7 @@ for (col_name in names(df_filtered)) {
}
}

# Pivot data frame from wide to long
df_melted_filtered <-
df_filtered %>%
recoverSummarizeR::melt_df(excluded_concepts = excluded_concepts) %>%
@@ -62,6 +66,7 @@ df_melted_filtered <-
mutate(value = as.numeric(value))
cat("recoverSummarizeR::melt_df() completed.\n")

# Generate i2b2 summaries
df_summarized <-
df_melted_filtered %>%
rename(startdate = dplyr::any_of(c("date", "datetime"))) %>%
@@ -71,21 +76,28 @@ df_summarized <-
distinct()
cat("recoverSummarizeR::stat_summarize() completed.\n")

# Add i2b2 columns from concept map (ontology file) and clean the output
output_concepts <-
-   process_df(df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>%
+   process_df(df_summarized,
+              concept_map,
+              concept_replacements_reversed,
+              concept_map_concepts = "CONCEPT_CD",
+              concept_map_units = "UNITS_CD") %>%
dplyr::mutate(nval_num = signif(nval_num, 9)) %>%
dplyr::arrange(concept) %>%
dplyr::mutate(dplyr::across(.cols = dplyr::everything(), .fns = as.character)) %>%
replace(is.na(.), "<null>") %>%
dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
cat("recoverSummarizeR::process_df() completed.\n")

# Write the output
output_concepts %>%
write.csv(file.path(outputConceptsDir, glue::glue("{dataset}.csv")), row.names = F)
cat(glue::glue("output_concepts written to {file.path(outputConceptsDir, paste0(dataset, '.csv'))}"), "\n")

cat(glue::glue("Finished transforming data for {dataset}"),"\n\n")

# Remove objects created here from the global environment
rm(dataset,
vars,
df,
9 changes: 9 additions & 0 deletions scripts/process-data/fitbitintradaycombined.R
@@ -4,11 +4,13 @@ dataset <- "fitbitintradaycombined"

cat(glue::glue("Transforming data for {dataset}"),"\n")

# Get variables for this dataset
vars <-
selected_vars %>%
filter(grepl(dataset, Export, ignore.case = TRUE)) %>%
pull(Variable)

# Load the desired subset of this dataset in memory
df <-
arrow::open_dataset(file.path(downloadLocation, glue::glue("dataset_{dataset}"))) %>%
select(all_of(vars)) %>%
@@ -39,6 +41,7 @@

colnames(df) <- tolower(colnames(df))

# Create lists for ID variables and i2b2 concept variables
excluded_concepts <- c("participantidentifier", "datetime")

approved_concepts_summarized <-
@@ -49,6 +52,7 @@ approved_concepts_summarized <-

df[approved_concepts_summarized] <- lapply(df[approved_concepts_summarized], as.numeric)

# Get QA/QC ranges for variables and exclude values outside the ranges
bounds <-
selected_vars %>%
filter(grepl(dataset, Export, ignore.case = TRUE),
@@ -69,6 +73,7 @@ for (col_name in names(df_filtered)) {
}
}

# Pivot data frame from wide to long
df_melted_filtered <-
df_filtered %>%
recoverSummarizeR::melt_df(excluded_concepts = excluded_concepts) %>%
@@ -80,6 +85,7 @@ df_melted_filtered <-
mutate(value = as.numeric(value))
cat("recoverSummarizeR::melt_df() completed.\n")

# Generate i2b2 summaries
df_summarized <-
df_melted_filtered %>%
rename(startdate = dplyr::any_of(c("date", "datetime"))) %>%
@@ -89,6 +95,7 @@ df_summarized <-
distinct()
cat("recoverSummarizeR::stat_summarize() completed.\n")

# Add i2b2 columns from concept map (ontology file) and clean the output
output_concepts <-
process_df(df_summarized, concept_map, concept_replacements_reversed, concept_map_concepts = "CONCEPT_CD", concept_map_units = "UNITS_CD") %>%
dplyr::mutate(nval_num = signif(nval_num, 9)) %>%
@@ -98,12 +105,14 @@ output_concepts <-
dplyr::filter(nval_num != "<null>" | tval_char != "<null>")
cat("recoverSummarizeR::process_df() completed.\n")

# Write the output
output_concepts %>%
write.csv(file.path(outputConceptsDir, glue::glue("{dataset}.csv")), row.names = F)
cat(glue::glue("output_concepts written to {file.path(outputConceptsDir, paste0(dataset, '.csv'))}"), "\n")

cat(glue::glue("Finished transforming data for {dataset}"),"\n\n")

# Remove objects created here from the global environment
rm(dataset,
vars,
df,