Skip to content

Commit

Permalink
Add support for tables to USX corpora
Browse files: browse the repository at this point in the history
  • Loading branch information
ddaspit committed Nov 5, 2024
1 parent 6d73765 commit 0bde8db
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 8 deletions.
4 changes: 2 additions & 2 deletions machine/corpora/usx_file_alignment_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,14 @@ def _get_links(word_tokenizer: RangeTokenizer[str, int, str], tokens: Sequence[U
text = ""
link_strs: List[Tuple[Range[int], str]] = []
for token in tokens:
if token.para_element != prev_para_elem and len(text) > 0:
if token.parent_element != prev_para_elem and len(text) > 0:
text += " "

start = len(text)
text += str(token)
if token.element is not None and token.element.tag == "wg":
link_strs.append((Range.create(start, len(text)), token.element.get("target_links", "")))
prev_para_elem = token.para_element
prev_para_elem = token.parent_element
text = text.strip()

i = 0
Expand Down
2 changes: 1 addition & 1 deletion machine/corpora/usx_token.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

@dataclass(frozen=True)
class UsxToken:
para_element: ElementTree.Element
parent_element: ElementTree.Element
text: str
element: Optional[ElementTree.Element]

Expand Down
2 changes: 1 addition & 1 deletion machine/corpora/usx_verse.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(self, chapter: str, verse: str, is_sentence_start: bool, tokens: It

if (
prev_token is not None
and token.para_element != prev_token.para_element
and token.parent_element != prev_token.parent_element
and len(text) > 0
and not ends_with_space
):
Expand Down
19 changes: 15 additions & 4 deletions machine/corpora/usx_verse_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def parse(self, stream: BinaryIO) -> Iterable[UsxVerse]:
if root_elem is None:
raise RuntimeError("USX does not contain a book element.")
assert root_elem is not None
ctxt.parent_element = root_elem
for verse in self._parse_element(root_elem, ctxt):
yield verse

Expand All @@ -43,7 +44,7 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
if not _is_verse_para(e):
ctxt.is_sentence_start = True
continue
ctxt.para_element = e
ctxt.parent_element = e
for evt in self._parse_element(e, ctxt):
yield evt
elif e.tag == "verse":
Expand Down Expand Up @@ -82,6 +83,16 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
elif e.tag == "figure":
if ctxt.chapter is not None and ctxt.verse is not None:
ctxt.add_token("", e)
elif e.tag == "table":
for evt in self._parse_element(e, ctxt):
yield evt
elif e.tag == "row":
for evt in self._parse_element(e, ctxt):
yield evt
elif e.tag == "cell":
ctxt.parent_element = e
for evt in self._parse_element(e, ctxt):
yield evt

if e.tail is not None and ctxt.chapter is not None and ctxt.verse is not None:
ctxt.add_token(e.tail)
Expand Down Expand Up @@ -134,12 +145,12 @@ class _ParseContext:
chapter: Optional[str] = None
verse: Optional[str] = None
is_sentence_start: bool = True
para_element: Optional[ElementTree.Element] = None
parent_element: Optional[ElementTree.Element] = None
_verse_tokens: List[UsxToken] = field(default_factory=list)

def add_token(self, text: str, elem: Optional[ElementTree.Element] = None) -> None:
    """Append one token to ``_verse_tokens``.

    The token is built from the context's current ``parent_element``, the
    given *text*, and the optional *elem*.

    Args:
        text: Text carried by the token.
        elem: Element to attach to the token, or ``None`` if there is none.
    """
    # A parent element must already be set on the context; this assert also
    # narrows the Optional type before it is handed to UsxToken.
    assert self.parent_element is not None
    # Record the token exactly once, using the renamed ``parent_element`` field.
    self._verse_tokens.append(UsxToken(self.parent_element, text, elem))

def create_verse(self) -> UsxVerse:
assert self.chapter is not None and self.verse is not None
Expand Down
30 changes: 30 additions & 0 deletions tests/corpora/test_usx_memory_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,36 @@ def test_get_rows_descriptive_title() -> None:
assert rows[0].text == "Descriptive title", str.join(",", [tr.text for tr in rows])


def test_get_rows_table() -> None:
    """Verse text spread over table cells should yield one row per verse.

    The first verse is split across four cells of one table row; the second
    has its verse marker and its text in separate cells. Each is expected to
    come back as a single row whose text is the cell contents joined by
    single spaces.
    """
    usx = r"""<usx version="3.0">
<book code="MAT" style="id">- Test</book>
<chapter number="1" style="c" />
<table>
<row style="tr">
<cell style="tc1" align="start"><verse number="1" style="v" />Chapter</cell>
<cell style="tcr2" align="end">1</cell>
<cell style="tc3" align="start">verse</cell>
<cell style="tcr4" align="end">1</cell>
</row>
<row style="tr">
<cell style="tc1" colspan="2" align="start"><verse number="2" style="v" /></cell>
<cell style="tc3" colspan="2" align="start">Chapter 1 verse 2</cell>
</row>
</table>
</usx>
"""
    rows = get_rows(usx)

    assert len(rows) == 2

    first_row = rows[0]
    assert scripture_ref(first_row) == ScriptureRef.parse("MAT 1:1")
    assert first_row.text == "Chapter 1 verse 1"

    second_row = rows[1]
    assert scripture_ref(second_row) == ScriptureRef.parse("MAT 1:2")
    assert second_row.text == "Chapter 1 verse 2"


def get_rows(usx: str) -> List[TextRow]:
    """Return every row of a ``UsxMemoryText`` built from ``"MAT"`` and *usx*, as a list."""
    return list(UsxMemoryText("MAT", usx).get_rows())

0 comments on commit 0bde8db

Please sign in to comment.