remove rows without matched primaryid

Yunuuuu · Oct 27, 2023 · eff40ae · eff40ae
1 parent fc931c7
commit eff40ae
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 9 deletions.
diff --git a/R/dedup.R b/R/dedup.R
@@ -36,7 +36,10 @@ methods::setMethod("faers_dedup", "FAERSascii", function(object, remove_deleted_
         )
     )
     object@data <- lapply(object@data, function(x) {
-        x[deduplicated_data, on = c("year", "quarter", "primaryid")]
+        x[deduplicated_data,
+            on = c("year", "quarter", "primaryid"),
+            nomatch = NULL
+        ]
     })
     object@deduplication <- TRUE
     # ..__matched_ids__.. <- unique(deduplicated_data$primaryid)
@@ -160,9 +163,9 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
         by = common_keys
     ][out, on = common_keys]
 
-    # meddra_code: indi_pt 
     # should we remove unknown indications or just translate unknown indications
     # into NA ?
+    # meddra_code: indi_pt
     # pt: 10070592 Product used for unknown indication
     # llt: 10057097 Drug use for unknown indication
     out <- data$indi[order(indi_drug_seq, meddra_code),
@@ -193,19 +196,19 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
     # round age_in_years to prevent minimal differences in age
     cli::cli_alert("deduplication from multiple sources by matching gender, age, reporting country, event date, start date, drug indications, drugs administered, and adverse reactions")
     out[, age_in_years_round := round(age_in_years, 2L)]
-    out[, c("event_dt", "gender", "country_code", "age_in_years_round") := lapply(.SD, function(x) {
-        idx <- is.na(x) | x == "NA"
-        if (any(idx)) {
-            x[idx] <- paste0("..__na_null__..", seq_len(sum(idx)))
-        }
-        x
-    }), .SDcols = c("event_dt", "gender", "country_code", "age_in_years_round")]
     can_be_ignored_columns <- c(
         "event_dt", "gender", "age_in_years_round", "country_code",
         "aligned_start_dt", "aligned_indi"
     )
     must_matched_columns <- c("aligned_drugs", "aligned_reac")
     all_columns <- c(must_matched_columns, can_be_ignored_columns)
+    out[, (all_columns) := lapply(.SD, function(x) {
+        idx <- is.na(x) | x == "NA"
+        if (any(idx)) {
+            x[idx] <- paste0("..__na_null__..", seq_len(sum(idx)))
+        }
+        x
+    }), .SDcols = all_columns]
     for (i in seq_along(can_be_ignored_columns)) {
         data.table::setorderv(out,
             cols = c("primaryid", "year", "quarter"),

diff --git a/tests/testthat/test_standardize.R b/tests/testthat/test_standardize.R
@@ -33,6 +33,14 @@ testthat::test_that("de-duplicating FAERS ascii data works well", {
     testthat::expect_no_error(data_dedup <- faers_dedup(data_std))
     testthat::expect_true(data_dedup@deduplication)
     testthat::expect_equal(anyDuplicated(faers_primaryid(data_dedup)), 0L)
+    # don't introduce absent primaryid
+    testthat::expect_in(data_dedup$indi$primaryid, data_std$indi$primaryid)
+    testthat::expect_in(data_dedup$ther$primaryid, data_std$ther$primaryid)
+    testthat::expect_in(data_dedup$drug$primaryid, data_std$drug$primaryid)
+    testthat::expect_in(data_dedup$demo$primaryid, data_std$demo$primaryid)
+    testthat::expect_in(data_dedup$reac$primaryid, data_std$reac$primaryid)
+    testthat::expect_in(data_dedup$rpsr$primaryid, data_std$rpsr$primaryid)
+    testthat::expect_in(data_dedup$outc$primaryid, data_std$outc$primaryid)
 })
 
 testthat::test_that("`faers_get` for standardizated data works well", {