Fix Jupyter notebooks
ddaspit committed Oct 15, 2024
1 parent e0e3f08 commit b7c06c8
Showing 13 changed files with 178 additions and 118 deletions.
README.md: 6 changes (3 additions, 3 deletions)

@@ -14,6 +14,6 @@ pip install sil-machine
 
 If you would like to find out more about how to use Machine, check out the tutorial Jupyter notebooks:
 
-- [Tokenization](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
-- [Text Corpora](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
-- [Word Alignment](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
+- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
+- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
+- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
machine/corpora/parallel_text_corpus.py: 30 changes (30 additions, 0 deletions)

@@ -310,6 +310,11 @@ def filter(self, predicate: Callable[[ParallelTextRow], bool]) -> ParallelTextCorpus:
     def filter_by_index(self, predicate: Callable[[ParallelTextRow, int], bool]) -> ParallelTextCorpus:
         return _FilterParallelTextCorpus(self, predicate)
 
+    def filter_texts(self, text_ids: Optional[Iterable[str]]) -> ParallelTextCorpus:
+        if text_ids is None:
+            return self
+        return _FilterTextsParallelTextCorpus(self, text_ids)
+
     def take(self, count: int) -> ParallelTextCorpus:
         return _TakeParallelTextCorpus(self, count)

@@ -553,6 +558,31 @@ def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelTextRow, None, None]:
         yield from islice(rows, self._count)
 
 
+class _FilterTextsParallelTextCorpus(ParallelTextCorpus):
+    def __init__(self, corpus: ParallelTextCorpus, text_ids: Iterable[str]) -> None:
+        self._corpus = corpus
+        self._text_ids = set(text_ids)
+
+    @property
+    def is_source_tokenized(self) -> bool:
+        return self._corpus.is_source_tokenized
+
+    @property
+    def is_target_tokenized(self) -> bool:
+        return self._corpus.is_target_tokenized
+
+    def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(
+            self._text_ids if text_ids is None else self._text_ids.intersection(text_ids)
+        ) as rows:
+            yield from rows
+
+    def count(self, include_empty: bool = True, text_ids: Optional[Iterable[str]] = None) -> int:
+        return self._corpus.count(
+            include_empty, self._text_ids if text_ids is None else self._text_ids.intersection(text_ids)
+        )
+
+
 class _PandasParallelTextCorpus(ParallelTextCorpus):
     def __init__(
         self,
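The new `filter_texts` method restricts a parallel corpus to a named set of texts; passing `None` is a no-op, and any `text_ids` later given to `get_rows` or `count` is intersected with the retained set. A runnable sketch of that intersection semantics, using plain Python sets with illustrative book ids rather than a constructed corpus:

```python
# Sketch of _FilterTextsParallelTextCorpus semantics using plain sets.
# In real code this would be: filtered = corpus.filter_texts(["MAT", "MRK"]).
retained = {"MAT", "MRK"}  # set(text_ids) captured in __init__

def effective_ids(requested):
    # Mirrors the expression used by _get_rows and count above.
    return retained if requested is None else retained.intersection(requested)

print(effective_ids(None))            # {'MAT', 'MRK'} - whole retained set
print(effective_ids(["MRK", "LUK"]))  # {'MRK'} - only ids inside the filter
```

Since `filter_texts(None)` simply returns `self`, callers can thread an optional id list through without branching.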
machine/corpora/scripture_ref.py: 5 changes (4 additions, 1 deletion)

@@ -123,7 +123,10 @@ def __hash__(self) -> int:
         return hash((self.verse_ref, tuple(self.path)))
 
     def __repr__(self) -> str:
-        return f"{self.verse_ref}/{'/'.join(str(se) for se in self.path)}"
+        result = str(self.verse_ref)
+        if len(self.path) > 0:
+            result += "/" + "/".join(str(se) for se in self.path)
+        return result
 
 
 EMPTY_SCRIPTURE_REF = ScriptureRef()
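The old `__repr__` always emitted the joining slash, so a reference with an empty path printed with a trailing `/`. A runnable before/after sketch using plain strings (the verse ref and path elements are illustrative stand-ins for the real `VerseRef` and path values):

```python
verse_ref, path = "MAT 1:1", []  # illustrative stand-ins

old = f"{verse_ref}/{'/'.join(str(se) for se in path)}"
print(old)  # 'MAT 1:1/' - dangling slash when the path is empty

new = str(verse_ref)
if len(path) > 0:
    new += "/" + "/".join(str(se) for se in path)
print(new)  # 'MAT 1:1'

path = ["fig", "cap"]  # non-empty paths still render with slashes
print(verse_ref + "/" + "/".join(str(se) for se in path))  # 'MAT 1:1/fig/cap'
```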
machine/corpora/usfm_parser.py: 12 changes (1 addition, 11 deletions)

@@ -256,7 +256,7 @@ def process_token(self) -> bool:
                 and pub_verse_num_token.text is not None
                 and pub_verse_end_token.marker == "vp*"
             ):
-                pub_chapter = pub_verse_num_token.text.strip()
+                pub_verse = pub_verse_num_token.text.strip()
                 self.state.special_token_count += 3
 
             assert token.data is not None

@@ -425,16 +425,6 @@ def _close_all(self) -> None:
         while len(self.state.stack) > 0:
             self._close_element()
 
-    def _is_study_bible_item_closed(self, start_marker: str, ending_marker: str) -> bool:
-        for i in range(self.state.index + 1, len(self.state.tokens)):
-            token = self.state.tokens[i]
-            if token.marker == ending_marker:
-                return True
-
-            if token.marker == start_marker or token.type in {UsfmTokenType.BOOK, UsfmTokenType.CHAPTER}:
-                return False
-        return False
-
     def _determine_unknown_token_type(self) -> UsfmTokenType:
         if any(e.type == UsfmElementType.NOTE for e in self.state.stack):
             return UsfmTokenType.CHARACTER
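Two changes here: the published verse number taken from a `\vp ...\vp*` run was being assigned to `pub_chapter` instead of `pub_verse`, and the unused `_is_study_bible_item_closed` helper is dropped. A minimal runnable sketch of the assignment fix (the token text is illustrative):

```python
from typing import Optional

# Minimal sketch of the \vp fix; " 1b " stands in for pub_verse_num_token.text.
pub_chapter: Optional[str] = None  # presumably set only from published-chapter markers
pub_verse: Optional[str] = None    # set from \vp markers

token_text = " 1b "             # text between \vp and \vp*
pub_verse = token_text.strip()  # was wrongly: pub_chapter = token_text.strip()
print(pub_chapter, pub_verse)   # None 1b
```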
machine/corpora/usfm_stylesheet.py: 14 changes (7 additions, 7 deletions)

@@ -4,7 +4,7 @@
 import regex as re
 
 from ..utils.file_utils import detect_encoding
-from ..utils.string_utils import parse_integer
+from ..utils.string_utils import parse_float, parse_integer
 from ..utils.typeshed import StrPath
 from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
 

@@ -248,16 +248,16 @@ def _parse_tag_entry(tag: UsfmTag, entries: List[Tuple[str, str]], entry_index:
         if space_after is not None and space_after >= 0:
             tag.space_after = space_after
     elif entry_marker == "leftmargin":
-        left_margin = parse_integer(entry_text)
-        if left_margin is not None and left_margin >= 0:
+        left_margin = parse_float(entry_text)
+        if left_margin is not None:
             tag.left_margin = left_margin
     elif entry_marker == "rightmargin":
-        right_margin = parse_integer(entry_text)
-        if right_margin is not None and right_margin >= 0:
+        right_margin = parse_float(entry_text)
+        if right_margin is not None:
             tag.right_margin = right_margin
     elif entry_marker == "firstlineindent":
-        first_line_indent = parse_integer(entry_text)
-        if first_line_indent is not None and first_line_indent >= 0:
+        first_line_indent = parse_float(entry_text)
+        if first_line_indent is not None:
             tag.first_line_indent = first_line_indent
     elif entry_marker == "rank":
         if entry_text == "-":
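Stylesheet margins and indents can be fractional, so `parse_integer` was silently rejecting valid values; dropping the `>= 0` guard likewise admits negative first-line (hanging) indents. A quick check using the `parse_float` helper this commit adds (the sample values are illustrative of what a `.sty` file may contain):

```python
from machine.utils.string_utils import parse_float, parse_integer

print(parse_integer("0.125"))  # None - fractional value, previously dropped
print(parse_float("0.125"))    # 0.125
print(parse_float("-0.25"))    # -0.25 - kept now that the >= 0 guard is gone
```

The matching `UsfmTag` fields below switch from `int` to `float` so the parsed values are representable.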
machine/corpora/usfm_tag.py: 6 changes (3 additions, 3 deletions)

@@ -66,18 +66,18 @@ def __init__(self, marker: str) -> None:
         self.description: Optional[str] = None
         self.encoding: Optional[str] = None
         self.end_marker: Optional[str] = None
-        self.first_line_indent: int = 0
+        self.first_line_indent: float = 0
         self.font_name: Optional[str] = None
         self.font_size: int = 0
         self.italic: bool = False
         self.justification: UsfmJustification = UsfmJustification.LEFT
-        self.left_margin: int = 0
+        self.left_margin: float = 0
         self.line_spacing: int = 0
         self.name: Optional[str] = None
         self.not_repeatable: bool = False
         self._occurs_under: Set[str] = set()
         self.rank: int = 0
-        self.right_margin: int = 0
+        self.right_margin: float = 0
         self.small_caps: bool = False
         self.space_after: int = 0
         self.space_before: int = 0
machine/translation/corpus_ops.py: 23 changes (12 additions, 11 deletions)

@@ -1,5 +1,6 @@
-from typing import Callable, Generator, Optional, Union
+from typing import Callable, Generator, Iterable, Optional, Union
 
+from ..corpora.corpora_utils import batch
 from ..corpora.parallel_text_corpus import ParallelTextCorpus
 from ..corpora.parallel_text_row import ParallelTextRow
 from ..utils.progress_status import ProgressStatus

@@ -48,11 +49,11 @@ def is_source_tokenized(self) -> bool:
     def is_target_tokenized(self) -> bool:
         return self._corpus.is_target_tokenized
 
-    def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
-        with self._corpus.batch(self._batch_size) as batches:
-            for batch in batches:
-                alignments = self._aligner.align_batch(batch)
-                for row, alignment in zip(batch, alignments):
+    def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(text_ids) as rows:
+            for row_batch in batch(rows, self._batch_size):
+                alignments = self._aligner.align_batch(row_batch)
+                for row, alignment in zip(row_batch, alignments):
                     known_alignment = WordAlignmentMatrix.from_parallel_text_row(row)
                     if known_alignment is not None:
                         known_alignment.priority_symmetrize_with(alignment)

@@ -78,12 +79,12 @@ def is_source_tokenized(self) -> bool:
     def is_target_tokenized(self) -> bool:
         return self._corpus.is_target_tokenized
 
-    def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
-        with self._corpus.batch(self._batch_size) as batches:
-            for batch in batches:
+    def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(text_ids) as rows:
+            for row_batch in batch(rows, self._batch_size):
                 translations = self._translation_engine.translate_batch(
-                    [r.source_segment if self.is_source_tokenized else r.source_text for r in batch]
+                    [r.source_segment if self.is_source_tokenized else r.source_text for r in row_batch]
                 )
-                for row, translation in zip(batch, translations):
+                for row, translation in zip(row_batch, translations):
                     row.target_segment = translation.target_tokens
                     yield row
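Both `_get_rows` overrides now take an optional `text_ids` argument, matching the updated base-class signature, and batching moves from a corpus method to the `batch` utility imported from `corpora_utils`. An illustrative reimplementation of the grouping behavior that utility provides (a sketch, not the library's actual code):

```python
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def batch(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
    # Yield consecutive lists of up to `size` items; sketch of the behavior
    # relied on above (the real helper lives in machine.corpora.corpora_utils).
    it = iter(iterable)
    while group := list(islice(it, size)):
        yield group

for row_batch in batch(range(7), 3):
    print(row_batch)  # [0, 1, 2] then [3, 4, 5] then [6]
```

Renaming the loop variable from `batch` to `row_batch` also avoids shadowing the newly imported helper, which the old `for batch in batches` pattern would have done.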
machine/utils/string_utils.py: 7 changes (7 additions, 0 deletions)

@@ -72,6 +72,13 @@ def parse_integer(s: str) -> Optional[int]:
     return None
 
 
+def parse_float(s: str) -> Optional[float]:
+    try:
+        return float(s)
+    except ValueError:
+        return None
+
+
 def has_sentence_ending(s: str) -> bool:
     s = s.strip()
     for c in reversed(s):
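`parse_float` follows the parse-or-None convention of the neighboring `parse_integer` rather than raising. A small usage sketch:

```python
from machine.utils.string_utils import parse_float

assert parse_float("3.5") == 3.5
assert parse_float("-0.25") == -0.25
assert parse_float("banana") is None  # ValueError swallowed, None returned
```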
(5 more changed files not shown)
