Fix Jupyter notebooks
ddaspit committed Oct 15, 2024
1 parent e0e3f08 commit b7c06c8
Showing 13 changed files with 178 additions and 118 deletions.
README.md: 6 changes (3 additions, 3 deletions)

@@ -14,6 +14,6 @@ pip install sil-machine
 
 If you would like to find out more about how to use Machine, check out the tutorial Jupyter notebooks:
 
-- [Tokenization](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
-- [Text Corpora](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
-- [Word Alignment](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
+- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
+- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
+- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
machine/corpora/parallel_text_corpus.py: 30 changes (30 additions, 0 deletions)

@@ -310,6 +310,11 @@ def filter(self, predicate: Callable[[ParallelTextRow], bool]) -> ParallelTextCorpus:
     def filter_by_index(self, predicate: Callable[[ParallelTextRow, int], bool]) -> ParallelTextCorpus:
         return _FilterParallelTextCorpus(self, predicate)
 
+    def filter_texts(self, text_ids: Optional[Iterable[str]]) -> ParallelTextCorpus:
+        if text_ids is None:
+            return self
+        return _FilterTextsParallelTextCorpus(self, text_ids)
+
     def take(self, count: int) -> ParallelTextCorpus:
         return _TakeParallelTextCorpus(self, count)

@@ -553,6 +558,31 @@ def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelTextRow, None, None]:
         yield from islice(rows, self._count)
 
 
+class _FilterTextsParallelTextCorpus(ParallelTextCorpus):
+    def __init__(self, corpus: ParallelTextCorpus, text_ids: Iterable[str]) -> None:
+        self._corpus = corpus
+        self._text_ids = set(text_ids)
+
+    @property
+    def is_source_tokenized(self) -> bool:
+        return self._corpus.is_source_tokenized
+
+    @property
+    def is_target_tokenized(self) -> bool:
+        return self._corpus.is_target_tokenized
+
+    def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(
+            self._text_ids if text_ids is None else self._text_ids.intersection(text_ids)
+        ) as rows:
+            yield from rows
+
+    def count(self, include_empty: bool = True, text_ids: Optional[Iterable[str]] = None) -> int:
+        return self._corpus.count(
+            include_empty, self._text_ids if text_ids is None else self._text_ids.intersection(text_ids)
+        )
+
+
 class _PandasParallelTextCorpus(ParallelTextCorpus):
     def __init__(
         self,
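The new `filter_texts` method restricts a parallel corpus to a named set of texts; passing `None` is a no-op, and any `text_ids` later given to `get_rows` or `count` is intersected with the retained set. A runnable sketch of that intersection semantics, using plain Python sets with illustrative book ids rather than a constructed corpus:

```python
# Sketch of _FilterTextsParallelTextCorpus semantics using plain sets.
# In real code this would be: filtered = corpus.filter_texts(["MAT", "MRK"]).
retained = {"MAT", "MRK"}  # set(text_ids) captured in __init__

def effective_ids(requested):
    # Mirrors the expression used by _get_rows and count above.
    return retained if requested is None else retained.intersection(requested)

print(effective_ids(None))            # {'MAT', 'MRK'} - whole retained set
print(effective_ids(["MRK", "LUK"]))  # {'MRK'} - only ids inside the filter
```

Since `filter_texts(None)` simply returns `self`, callers can thread an optional id list through without branching.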
machine/corpora/scripture_ref.py: 5 changes (4 additions, 1 deletion)

@@ -123,7 +123,10 @@ def __hash__(self) -> int:
         return hash((self.verse_ref, tuple(self.path)))
 
     def __repr__(self) -> str:
-        return f"{self.verse_ref}/{'/'.join(str(se) for se in self.path)}"
+        result = str(self.verse_ref)
+        if len(self.path) > 0:
+            result += "/" + "/".join(str(se) for se in self.path)
+        return result
 
 
 EMPTY_SCRIPTURE_REF = ScriptureRef()
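The old `__repr__` always emitted the joining slash, so a reference with an empty path printed with a trailing `/`. A runnable before/after sketch using plain strings (the verse ref and path elements are illustrative stand-ins for the real `VerseRef` and path values):

```python
verse_ref, path = "MAT 1:1", []  # illustrative stand-ins

old = f"{verse_ref}/{'/'.join(str(se) for se in path)}"
print(old)  # 'MAT 1:1/' - dangling slash when the path is empty

new = str(verse_ref)
if len(path) > 0:
    new += "/" + "/".join(str(se) for se in path)
print(new)  # 'MAT 1:1'

path = ["fig", "cap"]  # non-empty paths still render with slashes
print(verse_ref + "/" + "/".join(str(se) for se in path))  # 'MAT 1:1/fig/cap'
```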
machine/corpora/usfm_parser.py: 12 changes (1 addition, 11 deletions)

@@ -256,7 +256,7 @@ def process_token(self) -> bool:
                 and pub_verse_num_token.text is not None
                 and pub_verse_end_token.marker == "vp*"
             ):
-                pub_chapter = pub_verse_num_token.text.strip()
+                pub_verse = pub_verse_num_token.text.strip()
                 self.state.special_token_count += 3
 
             assert token.data is not None

@@ -425,16 +425,6 @@ def _close_all(self) -> None:
         while len(self.state.stack) > 0:
             self._close_element()
 
-    def _is_study_bible_item_closed(self, start_marker: str, ending_marker: str) -> bool:
-        for i in range(self.state.index + 1, len(self.state.tokens)):
-            token = self.state.tokens[i]
-            if token.marker == ending_marker:
-                return True
-
-            if token.marker == start_marker or token.type in {UsfmTokenType.BOOK, UsfmTokenType.CHAPTER}:
-                return False
-        return False
-
     def _determine_unknown_token_type(self) -> UsfmTokenType:
         if any(e.type == UsfmElementType.NOTE for e in self.state.stack):
             return UsfmTokenType.CHARACTER
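Two changes here: the published verse number taken from a `\vp ...\vp*` run was being assigned to `pub_chapter` instead of `pub_verse`, and the unused `_is_study_bible_item_closed` helper is dropped. A minimal runnable sketch of the assignment fix (the token text is illustrative):

```python
from typing import Optional

# Minimal sketch of the \vp fix; " 1b " stands in for pub_verse_num_token.text.
pub_chapter: Optional[str] = None  # presumably set only from published-chapter markers
pub_verse: Optional[str] = None    # set from \vp markers

token_text = " 1b "             # text between \vp and \vp*
pub_verse = token_text.strip()  # was wrongly: pub_chapter = token_text.strip()
print(pub_chapter, pub_verse)   # None 1b
```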
machine/corpora/usfm_stylesheet.py: 14 changes (7 additions, 7 deletions)

@@ -4,7 +4,7 @@
 import regex as re
 
 from ..utils.file_utils import detect_encoding
-from ..utils.string_utils import parse_integer
+from ..utils.string_utils import parse_float, parse_integer
 from ..utils.typeshed import StrPath
 from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
 

@@ -248,16 +248,16 @@ def _parse_tag_entry(tag: UsfmTag, entries: List[Tuple[str, str]], entry_index:
         if space_after is not None and space_after >= 0:
             tag.space_after = space_after
     elif entry_marker == "leftmargin":
-        left_margin = parse_integer(entry_text)
-        if left_margin is not None and left_margin >= 0:
+        left_margin = parse_float(entry_text)
+        if left_margin is not None:
             tag.left_margin = left_margin
     elif entry_marker == "rightmargin":
-        right_margin = parse_integer(entry_text)
-        if right_margin is not None and right_margin >= 0:
+        right_margin = parse_float(entry_text)
+        if right_margin is not None:
             tag.right_margin = right_margin
     elif entry_marker == "firstlineindent":
-        first_line_indent = parse_integer(entry_text)
-        if first_line_indent is not None and first_line_indent >= 0:
+        first_line_indent = parse_float(entry_text)
+        if first_line_indent is not None:
             tag.first_line_indent = first_line_indent
     elif entry_marker == "rank":
         if entry_text == "-":
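Stylesheet margins and indents can be fractional, so `parse_integer` was silently rejecting valid values; dropping the `>= 0` guard likewise admits negative first-line (hanging) indents. A quick check using the `parse_float` helper this commit adds (the sample values are illustrative of what a `.sty` file may contain):

```python
from machine.utils.string_utils import parse_float, parse_integer

print(parse_integer("0.125"))  # None - fractional value, previously dropped
print(parse_float("0.125"))    # 0.125
print(parse_float("-0.25"))    # -0.25 - kept now that the >= 0 guard is gone
```

The matching `UsfmTag` fields below switch from `int` to `float` so the parsed values are representable.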
machine/corpora/usfm_tag.py: 6 changes (3 additions, 3 deletions)

@@ -66,18 +66,18 @@ def __init__(self, marker: str) -> None:
         self.description: Optional[str] = None
         self.encoding: Optional[str] = None
         self.end_marker: Optional[str] = None
-        self.first_line_indent: int = 0
+        self.first_line_indent: float = 0
         self.font_name: Optional[str] = None
         self.font_size: int = 0
         self.italic: bool = False
         self.justification: UsfmJustification = UsfmJustification.LEFT
-        self.left_margin: int = 0
+        self.left_margin: float = 0
         self.line_spacing: int = 0
         self.name: Optional[str] = None
         self.not_repeatable: bool = False
         self._occurs_under: Set[str] = set()
         self.rank: int = 0
-        self.right_margin: int = 0
+        self.right_margin: float = 0
         self.small_caps: bool = False
         self.space_after: int = 0
         self.space_before: int = 0
machine/translation/corpus_ops.py: 23 changes (12 additions, 11 deletions)

@@ -1,5 +1,6 @@
-from typing import Callable, Generator, Optional, Union
+from typing import Callable, Generator, Iterable, Optional, Union
 
+from ..corpora.corpora_utils import batch
 from ..corpora.parallel_text_corpus import ParallelTextCorpus
 from ..corpora.parallel_text_row import ParallelTextRow
 from ..utils.progress_status import ProgressStatus

@@ -48,11 +49,11 @@ def is_source_tokenized(self) -> bool:
     def is_target_tokenized(self) -> bool:
         return self._corpus.is_target_tokenized
 
-    def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
-        with self._corpus.batch(self._batch_size) as batches:
-            for batch in batches:
-                alignments = self._aligner.align_batch(batch)
-                for row, alignment in zip(batch, alignments):
+    def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(text_ids) as rows:
+            for row_batch in batch(rows, self._batch_size):
+                alignments = self._aligner.align_batch(row_batch)
+                for row, alignment in zip(row_batch, alignments):
                     known_alignment = WordAlignmentMatrix.from_parallel_text_row(row)
                     if known_alignment is not None:
                         known_alignment.priority_symmetrize_with(alignment)

@@ -78,12 +79,12 @@ def is_source_tokenized(self) -> bool:
     def is_target_tokenized(self) -> bool:
         return self._corpus.is_target_tokenized
 
-    def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
-        with self._corpus.batch(self._batch_size) as batches:
-            for batch in batches:
+    def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(text_ids) as rows:
+            for row_batch in batch(rows, self._batch_size):
                 translations = self._translation_engine.translate_batch(
-                    [r.source_segment if self.is_source_tokenized else r.source_text for r in batch]
+                    [r.source_segment if self.is_source_tokenized else r.source_text for r in row_batch]
                 )
-                for row, translation in zip(batch, translations):
+                for row, translation in zip(row_batch, translations):
                     row.target_segment = translation.target_tokens
                     yield row
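Both `_get_rows` overrides now take an optional `text_ids` argument, matching the updated base-class signature, and batching moves from a corpus method to the `batch` utility imported from `corpora_utils`. An illustrative reimplementation of the grouping behavior that utility provides (a sketch, not the library's actual code):

```python
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def batch(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
    # Yield consecutive lists of up to `size` items; sketch of the behavior
    # relied on above (the real helper lives in machine.corpora.corpora_utils).
    it = iter(iterable)
    while group := list(islice(it, size)):
        yield group

for row_batch in batch(range(7), 3):
    print(row_batch)  # [0, 1, 2] then [3, 4, 5] then [6]
```

Renaming the loop variable from `batch` to `row_batch` also avoids shadowing the newly imported helper, which the old `for batch in batches` pattern would have done.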
machine/utils/string_utils.py: 7 changes (7 additions, 0 deletions)

@@ -72,6 +72,13 @@ def parse_integer(s: str) -> Optional[int]:
     return None
 
 
+def parse_float(s: str) -> Optional[float]:
+    try:
+        return float(s)
+    except ValueError:
+        return None
+
+
 def has_sentence_ending(s: str) -> bool:
     s = s.strip()
     for c in reversed(s):
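`parse_float` follows the parse-or-None convention of the neighboring `parse_integer` rather than raising. A small usage sketch:

```python
from machine.utils.string_utils import parse_float

assert parse_float("3.5") == 3.5
assert parse_float("-0.25") == -0.25
assert parse_float("banana") is None  # ValueError swallowed, None returned
```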
(5 more changed files not shown)
