Skip to content

Commit

Permalink
initial approach to detect overlapping annotations (#42)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristophLeonhardt committed Mar 19, 2024
1 parent 77a4abf commit d48a449
Show file tree
Hide file tree
Showing 8 changed files with 313 additions and 6 deletions.
8 changes: 5 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: dbpedia
Type: Package
Title: R Wrapper for DBpedia Spotlight
Version: 0.1.2.9002
Date: 2024-03-12
Version: 0.1.2.9003
Date: 2024-03-19
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
person("Christoph", "Leonhardt", role = "aut")
Expand Down Expand Up @@ -30,7 +30,8 @@ Suggests:
WikidataQueryServiceR,
GermaParl2,
quanteda,
testthat
testthat,
withr
Remotes:
polmineR=github::PolMine/polmineR@dev
AdditionalRepositories: https://polmine.github.io/drat/
Expand All @@ -50,6 +51,7 @@ Collate:
'get_annotation_table.R'
'xml.R'
'entity_types.R'
'overlaps.R'
'zzz.R'
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export(as_annotation)
export(as_subcorpus)
export(dbpedia_get_wikidata_uris)
export(dbpedia_spotlight_status)
export(detect_overlap)
export(namespaced_xpath)
export(sparql_query)
export(wikidata_query)
Expand All @@ -18,6 +19,7 @@ importFrom(NLP,Annotation)
importFrom(RcppCWB,cl_cpos2struc)
importFrom(RcppCWB,cl_struc2str)
importFrom(RcppCWB,get_region_matrix)
importFrom(cli,cli_abort)
importFrom(cli,cli_alert_danger)
importFrom(cli,cli_alert_info)
importFrom(cli,cli_alert_warning)
Expand All @@ -33,7 +35,9 @@ importFrom(cli,style_bold)
importFrom(data.table,`:=`)
importFrom(data.table,as.data.table)
importFrom(data.table,data.table)
importFrom(data.table,foverlaps)
importFrom(data.table,is.data.table)
importFrom(data.table,melt)
importFrom(data.table,rbindlist)
importFrom(data.table,setDT)
importFrom(data.table,setcolorder)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## dbpedia v0.1.2.9003
* started introducing functionality to detect and resolve overlaps (see issue #42) with `detect_overlap()`
* introduced tests for `detect_overlap()`
* modified test suite to follow recommendations of "R Packages" (2nd edition) by Wickham and Bryan, in particular by using `withr` for self-contained tests

## dbpedia v0.1.2.9002
* `expand_to_token` of `get_dbpedia_uris()` also expands spans to the left now (#44)
* added `end` to data.table grouping in `get_dbpedia_uris()` for subcorpora to address issue #43. This avoids processing multiple entities at the same time.
Expand Down
138 changes: 138 additions & 0 deletions R/overlaps.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#' Detect overlapping entities
#'
#' The function detects overlapping regions of annotations in a `data.table`
#' object and assigns an overlap id to each group of overlapping entities.
#'
#' @param x the input `data.table` containing a start and an end position of
#' regions to be compared.
#' @param start_col a `character vector` of length 1, referring to the name of
#' the column containing the start positions of the regions to be compared.
#' @param end_col a `character vector` of length 1, referring to the name of the
#' column containing the end positions of the regions to be compared.
#' @param verbose a `logical value` of whether to print messages or not.
#' @return The input `data.table` is modified by reference. The column "ovl_id"
#' (overlap ID) is added. Each group of overlapping entities is annotated with
#' a unique ID. The functions `classify_overlap()` and `resolve_overlap()` can
#' be used thereafter.
#' @importFrom data.table foverlaps melt
#' @importFrom cli cli_abort
#' @export
#' @examples
#' doc <- "Der Deutsche Bundestag tagt in Berlin."
#'
#' get_dbpedia_uris(
#' x = doc,
#' language = getOption("dbpedia.lang"),
#' max_len = 5600L,
#' confidence = 0.35,
#' api = getOption("dbpedia.endpoint"),
#' types = character(),
#' support = 20,
#' types_src = c("DBpedia", "Wikidata"),
#' verbose = TRUE
#' ) |>
#' detect_overlap(start = "start", verbose = TRUE)
detect_overlap <- function(x, start_col, end_col = NULL, verbose = TRUE) {

# Preparation: Add column "end" if necessary
if (is.null(end_col)) {
# check if there is already a column called "end".
if ("end" %in% colnames(x)) {
cli_abort(
c("{.var end_col} is NULL but there is already a column named {.val end}.",
"x" = "Cannot add new column {.val end} to the input data.table.")
)
}

cli_alert_warning(
"Argument {.var end} is NULL. Setting {.var end} to {.var start_col + nchar}.
This can be wrong in case of CWB corpora."
)
x[, end := get(start_col) + nchar(text)]
end_col <- "end"
}

if ("doc" %in% colnames(x)) {
x[, ovl_id := detect_overlap_aux(.SD,
group_id = .GRP,
start_col = start_col,
end_col = end_col,
verbose = verbose),
by = doc]
} else {
x[, ovl_id := detect_overlap_aux(x,
group_id = NULL,
start_col = start_col,
end_col = end_col,
verbose = verbose)]
}
return(x)
}

detect_overlap_aux <- function(input_dt, group_id, start_col, end_col, verbose = TRUE) {

# create subset of data.table
ovl_dt <- input_dt[, which(colnames(input_dt) %in% c("doc", "start", "end",
"text", start_col, end_col)),
with = FALSE]

# add temporary row idx for later join
ovl_dt[, row_idx := 1:nrow(ovl_dt)]

# set keys for the following foverlaps. Should be start and end.
setkeyv(ovl_dt, c(start_col, end_col))

# foverlaps returns a data.table with row numbers in x and y (both x). These
# pairs are redundant in the sense that there is both "1-2" and "2-1" (first
# row in x overlaps with the second row in y and second row in y overlaps with
# the first row in x).

# To get only relevant overlaps (i.e. not the same entity in both
# data.tables), subset by those in which the row idx in x is smaller than in
# y.

overlaps_out <- foverlaps(ovl_dt, ovl_dt, type = "any", which = TRUE)[xid < yid]

if (nrow(overlaps_out) == 0) {

if (isTRUE(verbose)) {
cli_alert_info("No overlaps found. Returning NA", wrap = TRUE)
}

retval <- NA_character_

} else {

# add an ID for individual overlaps
if (!is.null(group_id)) {
overlaps_out[, overlap_id := sprintf("ovl_%s_%s",
group_id,
1:nrow(overlaps_out))]
} else {
overlaps_out[, overlap_id := sprintf("ovl_%s", 1:nrow(overlaps_out))]
}

# and make from wide to long table for join
overlaps_out_long <- melt(overlaps_out,
id.vars = "overlap_id",
measure.vars = c("xid", "yid"),
value.name = "row_idx")

# remove variable name
overlaps_out_long[, variable := NULL]

# print number of overlapping entities and rows
if (isTRUE(verbose)) {
cli_alert_info(
"Found {.val {nrow(overlaps_out)}} overlaps concerning {.val {nrow(overlaps_out_long)}} entities.",
wrap = TRUE)
}

# merge to input
ovl_dt[overlaps_out_long, on = "row_idx", ovl_id := i.overlap_id]

retval <- ovl_dt[["ovl_id"]]
}

return(retval)
}
46 changes: 46 additions & 0 deletions man/detect_overlap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions tests/testthat/test-dbpedia.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
test_that(
"get_dbpedia_uris() for quanteda corpus",
{
library(quanteda)


withr::local_package("quanteda")

uritab <- quanteda::data_char_ukimmig2010[1:3] %>%
corpus() %>%
get_dbpedia_uris(
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-expand_to_token.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
test_that(
"argument expand_to_tokens results in additional matches",
{
library(polmineR)
withr::local_package("polmineR")
use("RcppCWB")

reuters_newswire <- corpus("REUTERS") |>
Expand Down
111 changes: 111 additions & 0 deletions tests/testthat/test-overlaps.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
test_that(
"detect overlap detects overlap for characters",
{

doc <- "Vice President Gore in Washington D.C."

withr::local_package("data.table") # for copy

x <- get_dbpedia_uris(
x = doc,
max_len = 5600L,
confidence = 0.35,
api = "http://api.dbpedia-spotlight.org/en/annotate",
language = "en",
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),
verbose = TRUE
)

y <- copy(x)

# modify x by reference
detect_overlap(x = x, start_col = "start", verbose = TRUE)

# Test 1: This should add the columns "end" and "ovl_id" to x
additional_cols <- setdiff(colnames(x), colnames(y))
expect_equal(additional_cols, c("end", "ovl_id"))

# Test 2: This should also keep all rows
expect_equal(nrow(x), nrow(y))

# Test 3: This should return one unique non-na overlap ID
length_non_na_unique <- length(unique(x[["ovl_id"]][!is.na(x[["ovl_id"]])]))
expect_equal(length_non_na_unique, 1L)

# Test 4: This should return two rows with non-na overlap IDs
length_non_na <- length(x[["ovl_id"]][!is.na(x[["ovl_id"]])])
expect_equal(length_non_na, 2L)

# Test 5: Running detect_overlap() again like above should result in an
# error.
expect_error(
detect_overlap(x = x, start_col = "start", verbose = TRUE),
'Cannot add new column'
)

}
)

test_that(
"detect overlap detects overlap for CWB subcorpora",
{

withr::local_package("polmineR")
use("RcppCWB")

reuters_anno <- corpus("REUTERS") |>
subset(id == "353") |>
get_dbpedia_uris(
max_len = 5600L,
confidence = 0.35,
api = "http://api.dbpedia-spotlight.org/en/annotate",
language = "en",
types = character(),
support = 20,
verbose = TRUE
) |>
detect_overlap(start_col = "cpos_left", end_col = "cpos_right", verbose = TRUE)

# there should be two rows with overlapping entity spans in this example
testthat::expect_equal(nrow(reuters_anno[!is.na(ovl_id)]), 2L)

# and there should be only one unique ID
testthat::expect_equal(
unique(reuters_anno[!is.na(ovl_id)][["ovl_id"]]), "ovl_1"
)
}
)

test_that(
"detect overlap detects overlap for quanteda corpora (with multiple documents)",
{

withr::local_package("quanteda")

x <- quanteda::data_corpus_inaugural |>
corpus_subset(President == "Bush" & Year > 2000) |>
corpus_reshape(to = "paragraphs") |>
_[2:3] |>
get_dbpedia_uris(
max_len = 5000L,
confidence = 0.35,
api = "http://api.dbpedia-spotlight.org/en/annotate",
language = "en",
types = character(),
support = 20,
verbose = FALSE,
progress = TRUE
)

# modify x by reference
y <- detect_overlap(x, start_col = "start", verbose = TRUE)

# there should be two rows with overlapping entity spans in this example
testthat::expect_equal(nrow(x[!is.na(ovl_id)]), 2L)

# and there should be only one unique ID
testthat::expect_equal(unique(x[!is.na(ovl_id)][["ovl_id"]]), "ovl_2_1")
}
)

0 comments on commit d48a449

Please sign in to comment.