From eff40aebbb86a073ad65f497fe707ce745ab3cb2 Mon Sep 17 00:00:00 2001
From: yun <yunyunpp96@outlook.com>
Date: Fri, 27 Oct 2023 16:51:23 +0800
Subject: [PATCH] remove rows without matched primaryid

---
 R/dedup.R                         | 21 ++++++++++++---------
 tests/testthat/test_standardize.R |  8 ++++++++
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/R/dedup.R b/R/dedup.R
index df0f0f8..73dd590 100644
--- a/R/dedup.R
+++ b/R/dedup.R
@@ -36,7 +36,10 @@ methods::setMethod("faers_dedup", "FAERSascii", function(object, remove_deleted_
         )
     )
     object@data <- lapply(object@data, function(x) {
-        x[deduplicated_data, on = c("year", "quarter", "primaryid")]
+        x[deduplicated_data,
+            on = c("year", "quarter", "primaryid"),
+            nomatch = NULL
+        ]
     })
     object@deduplication <- TRUE
     # ..__matched_ids__.. <- unique(deduplicated_data$primaryid)
@@ -160,9 +163,9 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
         by = common_keys
     ][out, on = common_keys]
 
-    # meddra_code: indi_pt 
     # should we remove unknown indications or just translate unknown indications
     # into NA ?
+    # meddra_code: indi_pt
     # pt: 10070592 Product used for unknown indication
     # llt: 10057097 Drug use for unknown indication
     out <- data$indi[order(indi_drug_seq, meddra_code),
@@ -193,19 +196,19 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
     # round age_in_years to prevent minimal differences in age
     cli::cli_alert("deduplication from multiple sources by matching gender, age, reporting country, event date, start date, drug indications, drugs administered, and adverse reactions")
     out[, age_in_years_round := round(age_in_years, 2L)]
-    out[, c("event_dt", "gender", "country_code", "age_in_years_round") := lapply(.SD, function(x) {
-        idx <- is.na(x) | x == "NA"
-        if (any(idx)) {
-            x[idx] <- paste0("..__na_null__..", seq_len(sum(idx)))
-        }
-        x
-    }), .SDcols = c("event_dt", "gender", "country_code", "age_in_years_round")]
     can_be_ignored_columns <- c(
         "event_dt", "gender", "age_in_years_round", "country_code",
         "aligned_start_dt", "aligned_indi"
     )
     must_matched_columns <- c("aligned_drugs", "aligned_reac")
     all_columns <- c(must_matched_columns, can_be_ignored_columns)
+    out[, (all_columns) := lapply(.SD, function(x) {
+        idx <- is.na(x) | x == "NA"
+        if (any(idx)) {
+            x[idx] <- paste0("..__na_null__..", seq_len(sum(idx)))
+        }
+        x
+    }), .SDcols = all_columns]
     for (i in seq_along(can_be_ignored_columns)) {
         data.table::setorderv(out,
             cols = c("primaryid", "year", "quarter"),
diff --git a/tests/testthat/test_standardize.R b/tests/testthat/test_standardize.R
index 5e34be9..7688336 100644
--- a/tests/testthat/test_standardize.R
+++ b/tests/testthat/test_standardize.R
@@ -33,6 +33,14 @@ testthat::test_that("de-duplicating FAERS ascii data works well", {
     testthat::expect_no_error(data_dedup <- faers_dedup(data_std))
     testthat::expect_true(data_dedup@deduplication)
     testthat::expect_equal(anyDuplicated(faers_primaryid(data_dedup)), 0L)
+    # don't introduce absent primaryid
+    testthat::expect_in(data_dedup$indi$primaryid, data_std$indi$primaryid)
+    testthat::expect_in(data_dedup$ther$primaryid, data_std$ther$primaryid)
+    testthat::expect_in(data_dedup$drug$primaryid, data_std$drug$primaryid)
+    testthat::expect_in(data_dedup$demo$primaryid, data_std$demo$primaryid)
+    testthat::expect_in(data_dedup$reac$primaryid, data_std$reac$primaryid)
+    testthat::expect_in(data_dedup$rpsr$primaryid, data_std$rpsr$primaryid)
+    testthat::expect_in(data_dedup$outc$primaryid, data_std$outc$primaryid)
 })
 
 testthat::test_that("`faers_get` for standardizated data works well", {