Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix USX corpus issues #138

Merged
merged 2 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
from .usx_file_text import UsxFileText
from .usx_file_text_corpus import UsxFileTextCorpus
from .usx_memory_text import UsxMemoryText
from .usx_zip_text import UsxZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
Expand Down Expand Up @@ -150,6 +151,7 @@
"UsxFileAlignmentCorpus",
"UsxFileText",
"UsxFileTextCorpus",
"UsxMemoryText",
"UsxZipText",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/usx_file_alignment_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,14 @@ def _get_links(word_tokenizer: RangeTokenizer[str, int, str], tokens: Sequence[U
text = ""
link_strs: List[Tuple[Range[int], str]] = []
for token in tokens:
if token.para_element != prev_para_elem and len(text) > 0:
if token.parent_element != prev_para_elem and len(text) > 0:
text += " "

start = len(text)
text += str(token)
if token.element is not None and token.element.tag == "wg":
link_strs.append((Range.create(start, len(text)), token.element.get("target_links", "")))
prev_para_elem = token.para_element
prev_para_elem = token.parent_element
text = text.strip()

i = 0
Expand Down
15 changes: 15 additions & 0 deletions machine/corpora/usx_memory_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

from ..scripture.verse_ref import Versification
from .memory_stream_container import MemoryStreamContainer
from .stream_container import StreamContainer
from .usx_text_base import UsxTextBase


class UsxMemoryText(UsxTextBase):
    """USX text backed by a USX string supplied directly by the caller.

    The string is kept on the instance and wrapped in a
    ``MemoryStreamContainer`` whenever a stream container is requested.
    """

    def __init__(
        self,
        id: str,
        usx: str,
        versification: Optional[Versification] = None,
    ) -> None:
        """Initialise the text.

        Args:
            id: Identifier forwarded to ``UsxTextBase``.
            usx: The USX markup to serve from memory.
            versification: Optional versification forwarded to ``UsxTextBase``.
        """
        super().__init__(id, versification)
        self._usx = usx

    def _create_stream_container(self) -> StreamContainer:
        """Return a new ``MemoryStreamContainer`` built from the stored USX string."""
        container = MemoryStreamContainer(self._usx)
        return container
2 changes: 1 addition & 1 deletion machine/corpora/usx_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

@dataclass(frozen=True)
class UsxToken:
para_element: ElementTree.Element
parent_element: ElementTree.Element
text: str
element: Optional[ElementTree.Element]

Expand Down
2 changes: 1 addition & 1 deletion machine/corpora/usx_verse.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self, chapter: str, verse: str, is_sentence_start: bool, tokens: It

if (
prev_token is not None
and token.para_element != prev_token.para_element
and token.parent_element != prev_token.parent_element
and len(text) > 0
and not ends_with_space
):
Expand Down
73 changes: 53 additions & 20 deletions machine/corpora/usx_verse_parser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from __future__ import annotations

import string
from dataclasses import dataclass, field
from typing import BinaryIO, Iterable, List, Optional
from xml.etree import ElementTree

from ..scripture.verse_ref import are_overlapping_verse_ranges
from ..utils.string_utils import has_sentence_ending, is_integer
from ..utils.string_utils import has_sentence_ending
from .corpora_utils import merge_verse_ranges
from .usx_token import UsxToken
from .usx_verse import UsxVerse
Expand All @@ -22,6 +23,7 @@ def parse(self, stream: BinaryIO) -> Iterable[UsxVerse]:
if root_elem is None:
raise RuntimeError("USX does not contain a book element.")
assert root_elem is not None
ctxt.parent_element = root_elem
for verse in self._parse_element(root_elem, ctxt):
yield verse

Expand All @@ -42,7 +44,7 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
if not _is_verse_para(e):
ctxt.is_sentence_start = True
continue
ctxt.para_element = e
ctxt.parent_element = e
for evt in self._parse_element(e, ctxt):
yield evt
elif e.tag == "verse":
Expand Down Expand Up @@ -81,43 +83,74 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
elif e.tag == "figure":
if ctxt.chapter is not None and ctxt.verse is not None:
ctxt.add_token("", e)
elif e.tag == "table":
for evt in self._parse_element(e, ctxt):
yield evt
elif e.tag == "row":
for evt in self._parse_element(e, ctxt):
yield evt
elif e.tag == "cell":
ctxt.parent_element = e
for evt in self._parse_element(e, ctxt):
yield evt

if e.tail is not None and ctxt.chapter is not None and ctxt.verse is not None:
ctxt.add_token(e.tail)


_NONVERSE_PARA_STYLES = {"ms", "mr", "s", "sr", "r", "d", "sp", "rem", "restore", "cl"}


def _is_numbered_style(style_prefix: str, style: str) -> bool:
return style.startswith(style_prefix) and is_integer(style[len(style_prefix) :])
_VERSE_PARA_STYLES = {
# Paragraphs
"p",
"m",
"po",
"pr",
"cls",
"pmo",
"pm",
"pmc",
"pmr",
"pi",
"pc",
"mi",
"nb",
# Poetry
"q",
"qc",
"qr",
"qm",
"qd",
"b",
"d",
# Lists
"lh",
"li",
"lf",
"lim",
# Deprecated
"ph",
"phi",
"ps",
"psi",
}


def _is_verse_para(para_elem: ElementTree.Element) -> bool:
style = para_elem.get("style", "")
if style in _NONVERSE_PARA_STYLES:
return False

if _is_numbered_style("ms", style):
return False

if _is_numbered_style("s", style):
return False

return True
style = style.rstrip(string.digits)
return style in _VERSE_PARA_STYLES


@dataclass
class _ParseContext:
chapter: Optional[str] = None
verse: Optional[str] = None
is_sentence_start: bool = True
para_element: Optional[ElementTree.Element] = None
parent_element: Optional[ElementTree.Element] = None
_verse_tokens: List[UsxToken] = field(default_factory=list)

def add_token(self, text: str, elem: Optional[ElementTree.Element] = None) -> None:
assert self.para_element is not None
self._verse_tokens.append(UsxToken(self.para_element, text, elem))
assert self.parent_element is not None
self._verse_tokens.append(UsxToken(self.parent_element, text, elem))

def create_verse(self) -> UsxVerse:
assert self.chapter is not None and self.verse is not None
Expand Down
59 changes: 59 additions & 0 deletions tests/corpora/test_usx_memory_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import List

from testutils.corpora_test_helpers import scripture_ref

from machine.corpora import ScriptureRef, TextRow, UsxMemoryText


def test_get_rows_descriptive_title() -> None:
    """A verse that opens inside a ``d`` paragraph is returned as a row."""
    usx = r"""<usx version="3.0">
<book code="MAT" style="id">- Test</book>
<chapter number="1" style="c" />
<para style="d">
<verse number="1" style="v" sid="MAT 1:1" />Descriptive title</para>
<para style="p">
The rest of verse one.<verse eid="MAT 1:1" />
<verse number="2" style="v" />This is verse two.</para>
</usx>
"""
    rows = get_rows(usx)

    assert len(rows) == 2

    first_row = rows[0]
    # The failure messages list every row so a mismatch shows what was actually parsed.
    assert scripture_ref(first_row) == ScriptureRef.parse("MAT 1:1"), ",".join(str(tr.ref) for tr in rows)
    assert first_row.text == "Descriptive title", ",".join(tr.text for tr in rows)


def test_get_rows_table() -> None:
    """Verses placed inside table cells are returned as rows with the cell text joined."""
    usx = r"""<usx version="3.0">
<book code="MAT" style="id">- Test</book>
<chapter number="1" style="c" />
<table>
<row style="tr">
<cell style="tc1" align="start"><verse number="1" style="v" />Chapter</cell>
<cell style="tcr2" align="end">1</cell>
<cell style="tc3" align="start">verse</cell>
<cell style="tcr4" align="end">1</cell>
</row>
<row style="tr">
<cell style="tc1" colspan="2" align="start"><verse number="2" style="v" /></cell>
<cell style="tc3" colspan="2" align="start">Chapter 1 verse 2</cell>
</row>
</table>
</usx>
"""
    rows = get_rows(usx)

    assert len(rows) == 2

    first_row, second_row = rows[0], rows[1]

    assert scripture_ref(first_row) == ScriptureRef.parse("MAT 1:1")
    assert first_row.text == "Chapter 1 verse 1"

    assert scripture_ref(second_row) == ScriptureRef.parse("MAT 1:2")
    assert second_row.text == "Chapter 1 verse 2"


def get_rows(usx: str) -> List[TextRow]:
    """Wrap ``usx`` in a ``UsxMemoryText`` with id ``"MAT"`` and return all of its rows as a list."""
    return list(UsxMemoryText("MAT", usx).get_rows())
Loading