Skip to content

Commit

Permalink
entity spans expand to start of token (#44)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristophLeonhardt committed Mar 12, 2024
1 parent 4a8fd3c commit 2d88fc9
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 12 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: dbpedia
Type: Package
Title: R Wrapper for DBpedia Spotlight
Version: 0.1.2.9001
Date: 2024-03-06
Version: 0.1.2.9002
Date: 2024-03-12
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
person("Christoph", "Leonhardt", role = "aut")
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## dbpedia v0.1.2.9002
* The `expand_to_token` argument of `get_dbpedia_uris()` now also expands entity spans to the left, i.e. to the start of the token in which the span begins (#44)

## dbpedia v0.1.2.9001
* `entity_types_map()` now creates assignments again (#40) and returns them as character vectors
* `entity_types_map()` also passes all arguments when used with data.table objects
Expand Down
30 changes: 20 additions & 10 deletions R/dbpedia.R
Original file line number Diff line number Diff line change
Expand Up @@ -604,12 +604,22 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
)

# prepare function to assign cpos_left or cpos_right, depending on direction, value and arguments
expand_fun = function(.SD) {
cpos_right <- dt[.SD[["end"]] == dt[["end"]]][["id"]]
if (length(cpos_right) == 0 & isTRUE(expand_to_token)) {
cpos_right <- dt[["id"]][which(dt[["end"]] > .SD[["end"]])[1]]
expand_fun = function(.SD, direction) {
if (direction == "right") {
cpos_right <- dt[.SD[["end"]] == dt[["end"]]][["id"]]
if (length(cpos_right) == 0 & isTRUE(expand_to_token)) {
cpos_right <- dt[["id"]][which(dt[["end"]] > .SD[["end"]])[1]]
} else {
cpos_right
}
} else {
cpos_right
cpos_left <- dt[.SD[["start"]] == dt[["start"]]][["id"]]
if (length(cpos_left) == 0 & isTRUE(expand_to_token)) {
cpos_vec <- which(dt[["start"]] < .SD[["start"]])
cpos_left <- dt[["id"]][cpos_vec[length(cpos_vec)]]
} else {
cpos_left
}
}
}

Expand All @@ -618,8 +628,8 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
links[, "end" := links[["start"]] + nchar(links[["text"]]) - 1L]
tab <- links[,
list(
cpos_left = dt[.SD[["start"]] == dt[["start"]]][["id"]],
cpos_right = expand_fun(.SD),
cpos_left = expand_fun(.SD, direction = "left"),
cpos_right = expand_fun(.SD, direction = "right"),
dbpedia_uri = .SD[["dbpedia_uri"]],
text = .SD[["text"]],
types = .SD[["types"]]
Expand Down Expand Up @@ -689,9 +699,9 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
}

# drop entities which cannot be mapped exactly to the tokenstream from the
# output (see issue #26).
if (isTRUE(drop_inexact_annotations) & any(is.na(tab[["cpos_right"]]))) {
missing_cpos_idx <- which(is.na(tab[["cpos_right"]]))
# output (see issues #26, #44).
if (isTRUE(drop_inexact_annotations) & (any(is.na(tab[["cpos_right"]])) | any(is.na(tab[["cpos_left"]])))) {
missing_cpos_idx <- unique(c(which(is.na(tab[["cpos_right"]])), which(is.na(tab[["cpos_left"]]))))
cli_alert_warning("Cannot map {length(missing_cpos_idx)} entit{?y/ies} exactly to tokenstream. Dropping {?it/them} from the annotation.")
tab <- tab[-missing_cpos_idx, ]
}
Expand Down

0 comments on commit 2d88fc9

Please sign in to comment.