From eff40aebbb86a073ad65f497fe707ce745ab3cb2 Mon Sep 17 00:00:00 2001 From: yun Date: Fri, 27 Oct 2023 16:51:23 +0800 Subject: [PATCH] remove rows without matched primaryid --- R/dedup.R | 21 ++++++++++++--------- tests/testthat/test_standardize.R | 8 ++++++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/R/dedup.R b/R/dedup.R index df0f0f8..73dd590 100644 --- a/R/dedup.R +++ b/R/dedup.R @@ -36,7 +36,10 @@ methods::setMethod("faers_dedup", "FAERSascii", function(object, remove_deleted_ ) ) object@data <- lapply(object@data, function(x) { - x[deduplicated_data, on = c("year", "quarter", "primaryid")] + x[deduplicated_data, + on = c("year", "quarter", "primaryid"), + nomatch = NULL + ] }) object@deduplication <- TRUE # ..__matched_ids__.. <- unique(deduplicated_data$primaryid) @@ -160,9 +163,9 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) { by = common_keys ][out, on = common_keys] - # meddra_code: indi_pt # should we remove unknown indications or just translate unknown indications # into NA ? + # meddra_code: indi_pt # pt: 10070592 Product used for unknown indication # llt: 10057097 Drug use for unknown indication out <- data$indi[order(indi_drug_seq, meddra_code), @@ -193,19 +196,19 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) { # round age_in_years to prevent minimal differences in age cli::cli_alert("deduplication from multiple sources by matching gender, age, reporting country, event date, start date, drug indications, drugs administered, and adverse reactions") out[, age_in_years_round := round(age_in_years, 2L)] - out[, c("event_dt", "gender", "country_code", "age_in_years_round") := lapply(.SD, function(x) { - idx <- is.na(x) | x == "NA" - if (any(idx)) { - x[idx] <- paste0("..__na_null__..", seq_len(sum(idx))) - } - x - }), .SDcols = c("event_dt", "gender", "country_code", "age_in_years_round")] can_be_ignored_columns <- c( "event_dt", "gender", "age_in_years_round", "country_code", "aligned_start_dt", "aligned_indi" ) must_matched_columns <- c("aligned_drugs", "aligned_reac") all_columns <- c(must_matched_columns, can_be_ignored_columns) + out[, (all_columns) := lapply(.SD, function(x) { + idx <- is.na(x) | x == "NA" + if (any(idx)) { + x[idx] <- paste0("..__na_null__..", seq_len(sum(idx))) + } + x + }), .SDcols = all_columns] for (i in seq_along(can_be_ignored_columns)) { data.table::setorderv(out, cols = c("primaryid", "year", "quarter"), diff --git a/tests/testthat/test_standardize.R b/tests/testthat/test_standardize.R index 5e34be9..7688336 100644 --- a/tests/testthat/test_standardize.R +++ b/tests/testthat/test_standardize.R @@ -33,6 +33,14 @@ testthat::test_that("de-duplicating FAERS ascii data works well", { testthat::expect_no_error(data_dedup <- faers_dedup(data_std)) testthat::expect_true(data_dedup@deduplication) testthat::expect_equal(anyDuplicated(faers_primaryid(data_dedup)), 0L) + # don't introduce absent primaryid + testthat::expect_in(data_dedup$indi$primaryid, data_std$indi$primaryid) + testthat::expect_in(data_dedup$ther$primaryid, data_std$ther$primaryid) + testthat::expect_in(data_dedup$drug$primaryid, data_std$drug$primaryid) + testthat::expect_in(data_dedup$demo$primaryid, data_std$demo$primaryid) + testthat::expect_in(data_dedup$reac$primaryid, data_std$reac$primaryid) + testthat::expect_in(data_dedup$rpsr$primaryid, data_std$rpsr$primaryid) + testthat::expect_in(data_dedup$outc$primaryid, data_std$outc$primaryid) }) testthat::test_that("`faers_get` for standardizated data works well", {