Skip to content

Commit

Permalink
fix sentence splitter offset
Browse files (browse the repository at this point in the history)
  • Loading branch information
LittlePea13 committed Aug 2, 2024
1 parent fa4d1e0 commit 4ec34ed
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions relik/inference/data/window/manager.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,6 +5,7 @@
from relik.inference.data.splitters.blank_sentence_splitter import BlankSentenceSplitter
from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
from relik.inference.data.splitters.window_based_splitter import WindowSentenceSplitter
+ from relik.inference.data.splitters.spacy_sentence_splitter import SpacySentenceSplitter
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
from relik.reader.data.relik_reader_sample import RelikReaderSample
from relik.inference.data.objects import AnnotationType, TaskType
Expand Down Expand Up @@ -227,10 +228,10 @@ def _merge_doc_windows(self, windows: List[RelikReaderSample]) -> RelikReaderSam
if len(windows) == 1:
return self._normalize_single_window(windows[0])

- if not isinstance(self.splitter, WindowSentenceSplitter):
- # here we don't really need to merge windows, just normalize them
- # TODO: check if we need to merge windows in this case
- return [self._normalize_single_window(w) for w in windows]
+ # if not isinstance(self.splitter, WindowSentenceSplitter):
+ # # here we don't really need to merge windows, just normalize them
+ # # TODO: check if we need to merge windows in this case
+ # return [self._normalize_single_window(w) for w in windows]

if len(windows) > 0 and getattr(windows[0], "offset", None) is not None:
windows = sorted(windows, key=(lambda x: x.offset))
Expand Down Expand Up @@ -507,7 +508,10 @@ def _merge_window_pair(
) = self._merge_predictions(window1, window2)

# merge text, take into account overlapping chars
- m_text = window1.text[: window2.offset] + window2.text
+ if isinstance(self.splitter, SpacySentenceSplitter):
+ m_text = window1.text[: window2.offset] + " " + window2.text
+ else:
+ m_text = window1.text[: window2.offset] + window2.text

merging_output.update(
dict(
Expand Down

0 comments on commit 4ec34ed

Please sign in to comment.