Skip to content

Commit

Permalink
remove rows without matched primaryid
Browse files Browse the repository at this point in the history
  • Loading branch information
Yunuuuu committed Oct 27, 2023
1 parent fc931c7 commit eff40ae
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 9 deletions.
21 changes: 12 additions & 9 deletions R/dedup.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ methods::setMethod("faers_dedup", "FAERSascii", function(object, remove_deleted_
)
)
object@data <- lapply(object@data, function(x) {
x[deduplicated_data, on = c("year", "quarter", "primaryid")]
x[deduplicated_data,
on = c("year", "quarter", "primaryid"),
nomatch = NULL
]
})
object@deduplication <- TRUE
# ..__matched_ids__.. <- unique(deduplicated_data$primaryid)
Expand Down Expand Up @@ -160,9 +163,9 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
by = common_keys
][out, on = common_keys]

# meddra_code: indi_pt
# should we remove unknown indications or just translate unknown indications
# into NA ?
# meddra_code: indi_pt
# pt: 10070592 Product used for unknown indication
# llt: 10057097 Drug use for unknown indication
out <- data$indi[order(indi_drug_seq, meddra_code),
Expand Down Expand Up @@ -193,19 +196,19 @@ dedup_faers_ascii <- function(data, deleted_cases = NULL) {
# round age_in_years to prevent minimal differences in age
cli::cli_alert("deduplication from multiple sources by matching gender, age, reporting country, event date, start date, drug indications, drugs administered, and adverse reactions")
out[, age_in_years_round := round(age_in_years, 2L)]
out[, c("event_dt", "gender", "country_code", "age_in_years_round") := lapply(.SD, function(x) {
idx <- is.na(x) | x == "NA"
if (any(idx)) {
x[idx] <- paste0("..__na_null__..", seq_len(sum(idx)))
}
x
}), .SDcols = c("event_dt", "gender", "country_code", "age_in_years_round")]
can_be_ignored_columns <- c(
"event_dt", "gender", "age_in_years_round", "country_code",
"aligned_start_dt", "aligned_indi"
)
must_matched_columns <- c("aligned_drugs", "aligned_reac")
all_columns <- c(must_matched_columns, can_be_ignored_columns)
out[, (all_columns) := lapply(.SD, function(x) {
idx <- is.na(x) | x == "NA"
if (any(idx)) {
x[idx] <- paste0("..__na_null__..", seq_len(sum(idx)))
}
x
}), .SDcols = all_columns]
for (i in seq_along(can_be_ignored_columns)) {
data.table::setorderv(out,
cols = c("primaryid", "year", "quarter"),
Expand Down
8 changes: 8 additions & 0 deletions tests/testthat/test_standardize.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ testthat::test_that("de-duplicating FAERS ascii data works well", {
testthat::expect_no_error(data_dedup <- faers_dedup(data_std))
testthat::expect_true(data_dedup@deduplication)
testthat::expect_equal(anyDuplicated(faers_primaryid(data_dedup)), 0L)
# don't introduce absent primaryid
testthat::expect_in(data_dedup$indi$primaryid, data_std$indi$primaryid)
testthat::expect_in(data_dedup$ther$primaryid, data_std$ther$primaryid)
testthat::expect_in(data_dedup$drug$primaryid, data_std$drug$primaryid)
testthat::expect_in(data_dedup$demo$primaryid, data_std$demo$primaryid)
testthat::expect_in(data_dedup$reac$primaryid, data_std$reac$primaryid)
testthat::expect_in(data_dedup$rpsr$primaryid, data_std$rpsr$primaryid)
testthat::expect_in(data_dedup$outc$primaryid, data_std$outc$primaryid)
})

testthat::test_that("`faers_get` for standardizated data works well", {
Expand Down

0 comments on commit eff40ae

Please sign in to comment.