initial approach to detect overlapping annotations (#42)

PolMine · Mar 19, 2024 · d48a449 · d48a449
1 parent 77a4abf
commit d48a449
Show file tree

Hide file tree

Showing 8 changed files with 313 additions and 6 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: dbpedia
 Type: Package
 Title: R Wrapper for DBpedia Spotlight
-Version: 0.1.2.9002
-Date: 2024-03-12
+Version: 0.1.2.9003
+Date: 2024-03-19
 Authors@R: c(
   person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
   person("Christoph", "Leonhardt", role = "aut")
@@ -30,7 +30,8 @@ Suggests:
     WikidataQueryServiceR,
     GermaParl2,
     quanteda,
-    testthat
+    testthat,
+    withr
 Remotes:
   polmineR=github::PolMine/polmineR@dev
 AdditionalRepositories: https://polmine.github.io/drat/
@@ -50,6 +51,7 @@ Collate:
     'get_annotation_table.R'
     'xml.R'
     'entity_types.R'
+    'overlaps.R'
     'zzz.R'
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.3

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,7 @@ export(as_annotation)
 export(as_subcorpus)
 export(dbpedia_get_wikidata_uris)
 export(dbpedia_spotlight_status)
+export(detect_overlap)
 export(namespaced_xpath)
 export(sparql_query)
 export(wikidata_query)
@@ -18,6 +19,7 @@ importFrom(NLP,Annotation)
 importFrom(RcppCWB,cl_cpos2struc)
 importFrom(RcppCWB,cl_struc2str)
 importFrom(RcppCWB,get_region_matrix)
+importFrom(cli,cli_abort)
 importFrom(cli,cli_alert_danger)
 importFrom(cli,cli_alert_info)
 importFrom(cli,cli_alert_warning)
@@ -33,7 +35,9 @@ importFrom(cli,style_bold)
 importFrom(data.table,`:=`)
 importFrom(data.table,as.data.table)
 importFrom(data.table,data.table)
+importFrom(data.table,foverlaps)
 importFrom(data.table,is.data.table)
+importFrom(data.table,melt)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setDT)
 importFrom(data.table,setcolorder)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+## dbpedia v0.1.2.9003
+* started introducing functionality to detect and resolve overlaps (see issue #42) with `detect_overlap()`
+* introduced tests for `detect_overlap()`
+* modified test suite to follow recommendations of "R Packages" (2nd edition) by Wickham and Bryan, in particular by using `withr` for self-contained tests
+
 ## dbpedia v0.1.2.9002
 * `expand_to_token` of `get_dbpedia_uris()` also expands spans to the left now (#44)
 * added `end` to data.table grouping in `get_dbpedia_uris()` for subcorpora to address issue #43. This avoids processing multiple entities at the same time.

diff --git a/R/overlaps.R b/R/overlaps.R
@@ -0,0 +1,138 @@
+#' Detect overlapping entities
+#'
+#' The function detects overlapping regions of annotations in a `data.table`
+#' object and assigns an overlap id to each group of overlapping entities.
+#'
+#' @param x the input `data.table` containing a start and an end position of
+#'   regions to be compared.
+#' @param start_col a `character vector` of length 1, referring to the name of
+#'   the column containing the start positions of the regions to be compared.
+#' @param end_col a `character vector` of length 1, referring to the name of the
+#'   column containing the end positions of the regions to be compared.
+#' @param verbose a `logical value` of whether to print messages or not.
+#' @return The input `data.table` is modified by reference. The column "ovl_id"
+#'   (overlap ID) is added. Each group of overlapping entities is annotated with
+#'   a unique ID. The functions `classify_overlap()` and `resolve_overlap()` can
+#'   be used thereafter.
+#' @importFrom data.table foverlaps melt
+#' @importFrom cli cli_abort
+#' @export
+#' @examples
+#' doc <- "Der Deutsche Bundestag tagt in Berlin."
+#'
+#' get_dbpedia_uris(
+#'   x = doc,
+#'   language = getOption("dbpedia.lang"),
+#'   max_len = 5600L,
+#'   confidence = 0.35,
+#'   api = getOption("dbpedia.endpoint"),
+#'   types = character(),
+#'   support = 20,
+#'   types_src = c("DBpedia", "Wikidata"),
+#'   verbose = TRUE
+#' ) |>
+#'   detect_overlap(start = "start", verbose = TRUE)
+detect_overlap <- function(x, start_col, end_col = NULL, verbose = TRUE) {
+
+  # Preparation: Add column "end" if necessary
+  if (is.null(end_col)) {
+    # check if there is already a column called "end".
+    if ("end" %in% colnames(x)) {
+      cli_abort(
+        c("{.var end_col} is NULL but there is already a column named {.val end}.",
+          "x" = "Cannot add new column {.val end} to the input data.table.")
+      ) 
+    }
+
+    cli_alert_warning(
+      "Argument {.var end} is NULL. Setting {.var end} to {.var start_col + nchar}.
+      This can be wrong in case of CWB corpora."
+    )
+    x[, end := get(start_col) + nchar(text)]
+    end_col <- "end"
+  }
+
+  if ("doc" %in% colnames(x)) {
+    x[, ovl_id := detect_overlap_aux(.SD,
+                                     group_id = .GRP,
+                                     start_col = start_col,
+                                     end_col = end_col,
+                                     verbose = verbose),
+      by = doc]
+  } else {
+    x[, ovl_id := detect_overlap_aux(x,
+                                     group_id = NULL,
+                                     start_col = start_col,
+                                     end_col = end_col,
+                                     verbose = verbose)]
+  }
+  return(x)
+}
+
+detect_overlap_aux <- function(input_dt, group_id, start_col, end_col, verbose = TRUE) {
+
+  # create subset of data.table
+  ovl_dt <- input_dt[, which(colnames(input_dt) %in% c("doc", "start", "end",
+                                                       "text", start_col, end_col)),
+                     with = FALSE]
+
+  # add temporary row idx for later join
+  ovl_dt[, row_idx := 1:nrow(ovl_dt)]
+
+  # set keys for the following foverlaps. Should be start and end.
+  setkeyv(ovl_dt, c(start_col, end_col))
+
+  # foverlaps returns a data.table with row numbers in x and y (both x). These
+  # pairs are redundant in the sense that there is both "1-2" and "2-1" (first
+  # row in x overlaps with the second row in y and second row in y overlaps with
+  # the first row in x).
+
+  # To get only relevant overlaps (i.e. not the same entity in both
+  # data.tables), subset by those in which the row idx in x is smaller than in
+  # y.
+
+  overlaps_out <- foverlaps(ovl_dt, ovl_dt, type = "any", which = TRUE)[xid < yid]
+
+  if (nrow(overlaps_out) == 0) {
+
+    if (isTRUE(verbose)) {
+      cli_alert_info("No overlaps found. Returning NA", wrap = TRUE) 
+    }
+
+    retval <- NA_character_
+
+  } else {
+
+    # add an ID for individual overlaps
+    if (!is.null(group_id)) {
+      overlaps_out[, overlap_id := sprintf("ovl_%s_%s",
+                                           group_id,
+                                           1:nrow(overlaps_out))]
+    } else {
+      overlaps_out[, overlap_id := sprintf("ovl_%s", 1:nrow(overlaps_out))] 
+    }
+
+    # and make from wide to long table for join
+    overlaps_out_long <- melt(overlaps_out,
+                              id.vars = "overlap_id",
+                              measure.vars = c("xid", "yid"),
+                              value.name = "row_idx")
+
+    # remove variable name
+    overlaps_out_long[, variable := NULL]
+
+    # print number of overlapping entities and rows
+    if (isTRUE(verbose)) {
+      cli_alert_info(
+        "Found {.val {nrow(overlaps_out)}} overlaps concerning {.val {nrow(overlaps_out_long)}} entities.",
+        wrap = TRUE) 
+    }
+
+    # merge to input
+    ovl_dt[overlaps_out_long, on = "row_idx", ovl_id := i.overlap_id]
+
+    retval <- ovl_dt[["ovl_id"]]
+  }
+
+  return(retval)
+}
diff --git a/man/detect_overlap.Rd b/man/detect_overlap.Rd
diff --git a/tests/testthat/test-dbpedia.R b/tests/testthat/test-dbpedia.R
@@ -1,8 +1,9 @@
 test_that(
   "get_dbpedia_uris() for quanteda corpus",
   {
-    library(quanteda)
-
+
+    withr::local_package("quanteda")
+
     uritab <- quanteda::data_char_ukimmig2010[1:3] %>%
       corpus() %>%
       get_dbpedia_uris(

diff --git a/tests/testthat/test-expand_to_token.R b/tests/testthat/test-expand_to_token.R
@@ -1,7 +1,7 @@
 test_that(
   "argument expand_to_tokens results in additional matches",
   {
-    library(polmineR)
+    withr::local_package("polmineR")
     use("RcppCWB")
 
     reuters_newswire <- corpus("REUTERS") |>

diff --git a/tests/testthat/test-overlaps.R b/tests/testthat/test-overlaps.R
@@ -0,0 +1,111 @@
+test_that(
+  "detect overlap detects overlap for characters",
+  {
+
+    doc <- "Vice President Gore in Washington D.C."
+
+    withr::local_package("data.table") # for copy
+
+    x <- get_dbpedia_uris(
+      x = doc,
+      max_len = 5600L,
+      confidence = 0.35,
+      api = "http://api.dbpedia-spotlight.org/en/annotate",
+      language = "en",
+      types = character(),
+      support = 20,
+      types_src = c("DBpedia", "Wikidata"),
+      verbose = TRUE
+    )
+
+    y <- copy(x)
+
+    # modify x by reference
+    detect_overlap(x = x, start_col = "start", verbose = TRUE)
+
+    # Test 1: This should add the columns "end" and "ovl_id" to x
+    additional_cols <- setdiff(colnames(x), colnames(y))
+    expect_equal(additional_cols, c("end", "ovl_id"))
+
+    # Test 2: This should also keep all rows
+    expect_equal(nrow(x), nrow(y))
+
+    # Test 3: This should return one unique non-na overlap ID
+    length_non_na_unique <- length(unique(x[["ovl_id"]][!is.na(x[["ovl_id"]])]))
+    expect_equal(length_non_na_unique, 1L)
+
+    # Test 4: This should return two rows with non-na overlap IDs
+    length_non_na <- length(x[["ovl_id"]][!is.na(x[["ovl_id"]])])
+    expect_equal(length_non_na, 2L)
+
+    # Test 5: Running detect_overlap() again like above should result in an
+    # error.
+    expect_error(
+      detect_overlap(x = x, start_col = "start", verbose = TRUE),
+      'Cannot add new column'
+    )
+
+  }
+)
+
+test_that(
+  "detect overlap detects overlap for CWB subcorpora",
+  {
+
+    withr::local_package("polmineR")
+    use("RcppCWB")
+
+    reuters_anno <- corpus("REUTERS") |>
+      subset(id == "353") |>
+      get_dbpedia_uris(
+        max_len = 5600L,
+        confidence = 0.35,
+        api = "http://api.dbpedia-spotlight.org/en/annotate",
+        language = "en",
+        types = character(),
+        support = 20,
+        verbose = TRUE
+      ) |>
+      detect_overlap(start_col = "cpos_left", end_col = "cpos_right", verbose = TRUE)
+
+    # there should be two rows with overlapping entity spans in this example
+    testthat::expect_equal(nrow(reuters_anno[!is.na(ovl_id)]), 2L)
+
+    # and there should be only one unique ID
+    testthat::expect_equal(
+      unique(reuters_anno[!is.na(ovl_id)][["ovl_id"]]), "ovl_1"
+    )
+  }
+)
+
+test_that(
+  "detect overlap detects overlap for quanteda corpora (with multiple documents)",
+  {
+
+    withr::local_package("quanteda")
+
+    x <- quanteda::data_corpus_inaugural |>
+      corpus_subset(President == "Bush" & Year > 2000) |>
+      corpus_reshape(to = "paragraphs") |>
+      _[2:3] |>
+      get_dbpedia_uris(
+        max_len = 5000L,
+        confidence = 0.35,
+        api = "http://api.dbpedia-spotlight.org/en/annotate",
+        language = "en",
+        types = character(),
+        support = 20,
+        verbose = FALSE,
+        progress = TRUE
+      )
+
+    # modify x by reference
+    y <- detect_overlap(x, start_col = "start", verbose = TRUE)
+
+    # there should be two rows with overlapping entity spans in this example
+    testthat::expect_equal(nrow(x[!is.na(ovl_id)]), 2L)
+
+    # and there should be only one unique ID
+    testthat::expect_equal(unique(x[!is.na(ovl_id)][["ovl_id"]]), "ovl_2_1")
+  }
+)