Skip to content

Commit

Permalink
Add Paratext/USFM processing tutorial (#130)
Browse files Browse the repository at this point in the history
- replace "strip_all_text" and "prefer_existing_text" parameters with a single enum parameter
  • Loading branch information
ddaspit authored Oct 17, 2024
1 parent b7c06c8 commit cec61de
Show file tree
Hide file tree
Showing 7 changed files with 462 additions and 22 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ If you would like to find out more about how to use Machine, check out the tutor
- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)
3 changes: 2 additions & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
normalize,
unescape_spaces,
)
from .update_usfm_parser_handler import UpdateUsfmParserHandler
from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
from .usfm_file_text import UsfmFileText
from .usfm_file_text_corpus import UsfmFileTextCorpus
from .usfm_memory_text import UsfmMemoryText
Expand Down Expand Up @@ -125,6 +125,7 @@
"TextRow",
"TextRowFlags",
"unescape_spaces",
"UpdateUsfmBehavior",
"UpdateUsfmParserHandler",
"UsfmAttribute",
"UsfmElementType",
Expand Down
9 changes: 3 additions & 6 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .scripture_ref import ScriptureRef
from .update_usfm_parser_handler import UpdateUsfmParserHandler
from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
from .usfm_parser import parse_usfm


Expand All @@ -21,17 +21,14 @@ def update_usfm(
book_id: str,
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
full_name: Optional[str] = None,
strip_all_text: bool = False,
prefer_existing_text: bool = True,
behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
) -> Optional[str]:
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
return None
with self._open(file_name) as sfm_file:
usfm: str = sfm_file.read().decode(self._settings.encoding)
handler = UpdateUsfmParserHandler(
rows, None if full_name is None else f"- {full_name}", strip_all_text, prefer_existing_text
)
handler = UpdateUsfmParserHandler(rows, None if full_name is None else f"- {full_name}", behavior)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
return handler.get_usfm(self._settings.stylesheet)
Expand Down
17 changes: 12 additions & 5 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from enum import Enum, auto
from typing import List, Optional, Sequence, Tuple, Union

from .scripture_ref import ScriptureRef
Expand All @@ -8,21 +9,25 @@
from .usfm_tokenizer import UsfmTokenizer


class UpdateUsfmBehavior(Enum):
    """Policy for choosing between text already in the USFM and newly supplied text.

    The handler consults this value when deciding whether a segment's
    existing tokens are replaced by the newly generated tokens.
    """

    # Use the new text only where the segment has no existing text.
    PREFER_EXISTING = auto()
    # Use the new text whenever new text is available for the segment.
    PREFER_NEW = auto()
    # Always replace the existing tokens with the new tokens, even when
    # no new text is available (the existing text is dropped).
    STRIP_EXISTING = auto()


class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
def __init__(
self,
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
id_text: Optional[str] = None,
strip_all_text: bool = False,
prefer_existing_text: bool = False,
behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
) -> None:
super().__init__()
self._rows = rows or []
self._tokens: List[UsfmToken] = []
self._new_tokens: List[UsfmToken] = []
self._id_text = id_text
self._strip_all_text = strip_all_text
self._prefer_existing_text = prefer_existing_text
self._behavior = behavior
self._replace_stack: List[bool] = []
self._row_index: int = 0
self._token_index: int = 0
Expand Down Expand Up @@ -283,7 +288,9 @@ def _replace_with_new_tokens(self, state: UsfmParserState) -> bool:
existing_text = True
break
use_new_tokens: bool = (
self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text)
self._behavior is UpdateUsfmBehavior.STRIP_EXISTING
or (new_text and not existing_text)
or (new_text and self._behavior is UpdateUsfmBehavior.PREFER_NEW)
)
if use_new_tokens:
self._tokens.extend(self._new_tokens)
Expand Down
Loading

0 comments on commit cec61de

Please sign in to comment.