From 2e0ccf3d40b85c275f21701cfe39cd7720abeaba Mon Sep 17 00:00:00 2001 From: yun Date: Mon, 6 Nov 2023 19:09:03 +0800 Subject: [PATCH] improve performance --- R/dedup.R | 69 ++++++++++++++++++++++--------- tests/testthat/test_standardize.R | 13 ++++++ 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/R/dedup.R b/R/dedup.R index 7241ff4..cf0cf1a 100644 --- a/R/dedup.R +++ b/R/dedup.R @@ -143,14 +143,22 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) { data$demo[ order(-year, -quarter, -fda_dt, i_f_code, -event_dt) ], - by = "primaryid" + by = "primaryid", cols = c( + "year", "quarter", "caseid", "caseversion", + "fda_dt", "i_f_code", "age_in_years", "gender", + "country_code", "event_dt" + ) ) } else { out <- unique( data$demo[!caseid %in% deleted_cases][ order(-year, -quarter, -fda_dt, i_f_code, -event_dt) ], - by = "primaryid" + by = "primaryid", cols = c( + "year", "quarter", "caseid", "caseversion", + "fda_dt", "i_f_code", "age_in_years", "gender", + "country_code", "event_dt" + ) ) } @@ -169,29 +177,45 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) { # match drug, indi, and ther data. common_keys <- c("year", "quarter", "primaryid") cli::cli_alert("merging `drug`, `indi`, `ther`, and `reac` data") - out <- data$drug[order(drug_seq), - list(aligned_drugs = paste0(drugname, collapse = "/")), - by = common_keys - ][out, on = common_keys] + out[ + data$drug[order(drug_seq), + list(aligned_drugs = paste0(drugname, collapse = "/")), + by = common_keys + ], + aligned_drugs := i.aligned_drugs, + on = common_keys + ] # should we remove unknown indications or just translate unknown indications # into NA ? # meddra_code: indi_pt # pt: 10070592 Product used for unknown indication # llt: 10057097 Drug use for unknown indication - out <- data$indi[order(indi_drug_seq, meddra_code), - list(aligned_indi = paste0(meddra_code, collapse = "/")), - by = common_keys - ][out, on = common_keys] - out <- data$ther[order(dsg_drug_seq, start_dt), - list(aligned_start_dt = paste0(start_dt, collapse = "/")), - by = common_keys - ][out, on = common_keys] + out[ + data$indi[order(indi_drug_seq, meddra_code), + list(aligned_indi = paste0(meddra_code, collapse = "/")), + by = common_keys + ], + aligned_indi := i.aligned_indi, + on = common_keys + ] + out[ + data$ther[order(dsg_drug_seq, start_dt), + list(aligned_start_dt = paste0(start_dt, collapse = "/")), + by = common_keys + ], + aligned_start_dt := i.aligned_start_dt, + on = common_keys + ] # meddra_code: pt - out <- data$reac[order(meddra_code), - list(aligned_reac = paste0(meddra_code, collapse = "/")), - by = common_keys - ][out, on = common_keys] + out[ + data$reac[order(meddra_code), + list(aligned_reac = paste0(meddra_code, collapse = "/")), + by = common_keys + ], + aligned_reac := i.aligned_reac, + on = common_keys + ] # consider two cases to be the same if they had a complete match of the # eight criteria which are gender, age, reporting country, event date, start @@ -278,5 +302,12 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) { utils::globalVariables(c( "drug_seq", "drugname", "indi_meddra_code", "start_dt", "indi_drug_seq", "dsg_drug_seq", "primaryid", "caseversion", "fda_dt", "i_f_code", "event_dt", "year", "caseid", "age_in_years_round", - "meddra_code" + "meddra_code", + paste0(c("", "i."), rep( + c( + "aligned_drugs", "aligned_indi", "aligned_start_dt", + "aligned_reac" + ), + each = 2L + )) )) diff --git a/tests/testthat/test_standardize.R b/tests/testthat/test_standardize.R index 99890dc..f511d66 100644 --- a/tests/testthat/test_standardize.R +++ b/tests/testthat/test_standardize.R @@ -37,9 +37,22 @@ testthat::test_that("standardize FAERS ascii data works well", { testthat::test_that("de-duplicating FAERS ascii data works well", { testthat::expect_error(faers_dedup(data)) + # internal don't modify data by reference and drug_seq match well + raw_demo <- data.table::copy(data_std$demo) + raw_drug <- data.table::copy(data_std$drug) + raw_indi <- data.table::copy(data_std$indi) + raw_ther <- data.table::copy(data_std$ther) + raw_reac <- data.table::copy(data_std$reac) testthat::expect_no_error(data_dedup <- faers_dedup(data_std)) testthat::expect_true(data_dedup@deduplication) testthat::expect_equal(anyDuplicated(faers_primaryid(data_dedup)), 0L) + + + testthat::expect_identical(data_dedup$demo, raw_demo) + testthat::expect_identical(data_dedup$drug, raw_drug) + testthat::expect_identical(data_dedup$indi, raw_indi) + testthat::expect_identical(data_dedup$ther, raw_ther) + testthat::expect_identical(data_dedup$reac, raw_reac) # don't introduce absent primaryid testthat::expect_in(data_dedup$indi$primaryid, data_std$indi$primaryid) testthat::expect_in(data_dedup$ther$primaryid, data_std$ther$primaryid)