From 2e0ccf3d40b85c275f21701cfe39cd7720abeaba Mon Sep 17 00:00:00 2001
From: yun <yunyunpp96@outlook.com>
Date: Mon, 6 Nov 2023 19:09:03 +0800
Subject: [PATCH] improve performance

---
 R/dedup.R                         | 69 ++++++++++++++++++++++---------
 tests/testthat/test_standardize.R | 13 ++++++
 2 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/R/dedup.R b/R/dedup.R
index 7241ff4..cf0cf1a 100644
--- a/R/dedup.R
+++ b/R/dedup.R
@@ -143,14 +143,22 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
             data$demo[
                 order(-year, -quarter, -fda_dt, i_f_code, -event_dt)
             ],
-            by = "primaryid"
+            by = "primaryid", cols = c(
+                "year", "quarter", "caseid", "caseversion",
+                "fda_dt", "i_f_code", "age_in_years", "gender",
+                "country_code", "event_dt"
+            )
         )
     } else {
         out <- unique(
             data$demo[!caseid %in% deleted_cases][
                 order(-year, -quarter, -fda_dt, i_f_code, -event_dt)
             ],
-            by = "primaryid"
+            by = "primaryid", cols = c(
+                "year", "quarter", "caseid", "caseversion",
+                "fda_dt", "i_f_code", "age_in_years", "gender",
+                "country_code", "event_dt"
+            )
         )
     }
 
@@ -169,29 +177,45 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
     # match drug, indi, and ther data.
     common_keys <- c("year", "quarter", "primaryid")
     cli::cli_alert("merging `drug`, `indi`, `ther`, and `reac` data")
-    out <- data$drug[order(drug_seq),
-        list(aligned_drugs = paste0(drugname, collapse = "/")),
-        by = common_keys
-    ][out, on = common_keys]
+    out[
+        data$drug[order(drug_seq),
+            list(aligned_drugs = paste0(drugname, collapse = "/")),
+            by = common_keys
+        ],
+        aligned_drugs := i.aligned_drugs,
+        on = common_keys
+    ]
 
     # should we remove unknown indications or just translate unknown indications
     # into NA ?
     # meddra_code: indi_pt
     # pt: 10070592 Product used for unknown indication
     # llt: 10057097 Drug use for unknown indication
-    out <- data$indi[order(indi_drug_seq, meddra_code),
-        list(aligned_indi = paste0(meddra_code, collapse = "/")),
-        by = common_keys
-    ][out, on = common_keys]
-    out <- data$ther[order(dsg_drug_seq, start_dt),
-        list(aligned_start_dt = paste0(start_dt, collapse = "/")),
-        by = common_keys
-    ][out, on = common_keys]
+    out[
+        data$indi[order(indi_drug_seq, meddra_code),
+            list(aligned_indi = paste0(meddra_code, collapse = "/")),
+            by = common_keys
+        ],
+        aligned_indi := i.aligned_indi,
+        on = common_keys
+    ]
+    out[
+        data$ther[order(dsg_drug_seq, start_dt),
+            list(aligned_start_dt = paste0(start_dt, collapse = "/")),
+            by = common_keys
+        ],
+        aligned_start_dt := i.aligned_start_dt,
+        on = common_keys
+    ]
     # meddra_code: pt
-    out <- data$reac[order(meddra_code),
-        list(aligned_reac = paste0(meddra_code, collapse = "/")),
-        by = common_keys
-    ][out, on = common_keys]
+    out[
+        data$reac[order(meddra_code),
+            list(aligned_reac = paste0(meddra_code, collapse = "/")),
+            by = common_keys
+        ],
+        aligned_reac := i.aligned_reac,
+        on = common_keys
+    ]
 
     # consider two cases to be the same if they had a complete match of the
     # eight criteria which are gender, age, reporting country, event date, start
@@ -278,5 +302,12 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
 utils::globalVariables(c(
     "drug_seq", "drugname", "indi_meddra_code", "start_dt",
     "indi_drug_seq", "dsg_drug_seq", "primaryid", "caseversion", "fda_dt", "i_f_code", "event_dt", "year", "caseid", "age_in_years_round",
-    "meddra_code"
+    "meddra_code",
+    paste0(c("", "i."), rep(
+        c(
+            "aligned_drugs", "aligned_indi", "aligned_start_dt",
+            "aligned_reac"
+        ),
+        each = 2L
+    ))
 ))
diff --git a/tests/testthat/test_standardize.R b/tests/testthat/test_standardize.R
index 99890dc..f511d66 100644
--- a/tests/testthat/test_standardize.R
+++ b/tests/testthat/test_standardize.R
@@ -37,9 +37,22 @@ testthat::test_that("standardize FAERS ascii data works well", {
 
 testthat::test_that("de-duplicating FAERS ascii data works well", {
     testthat::expect_error(faers_dedup(data))
+    # internal don't modify data by reference and drug_seq match well
+    raw_demo <- data.table::copy(data_std$demo)
+    raw_drug <- data.table::copy(data_std$drug)
+    raw_indi <- data.table::copy(data_std$indi)
+    raw_ther <- data.table::copy(data_std$ther)
+    raw_reac <- data.table::copy(data_std$reac)
     testthat::expect_no_error(data_dedup <- faers_dedup(data_std))
     testthat::expect_true(data_dedup@deduplication)
     testthat::expect_equal(anyDuplicated(faers_primaryid(data_dedup)), 0L)
+
+
+    testthat::expect_identical(data_dedup$demo, raw_demo)
+    testthat::expect_identical(data_dedup$drug, raw_drug)
+    testthat::expect_identical(data_dedup$indi, raw_indi)
+    testthat::expect_identical(data_dedup$ther, raw_ther)
+    testthat::expect_identical(data_dedup$reac, raw_reac)
     # don't introduce absent primaryid
     testthat::expect_in(data_dedup$indi$primaryid, data_std$indi$primaryid)
     testthat::expect_in(data_dedup$ther$primaryid, data_std$ther$primaryid)