diff --git a/.gitignore b/.gitignore
index 053774f6..26316716 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,6 +35,7 @@ build/Release
# Dependency directories
*/node_modules/
*/ENV/
+*/ENV*/
jspm_packages/
# Typescript v1 declaration files
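
# --- Editor's note (sketch, not part of the patch): "*/ENV*/" extends the
# existing "*/ENV/" rule to ignore any first-level directory whose name merely
# starts with ENV, which covers the notebook's new virtualenv:
#
#   python-usfm-parser/ENV/       <- already ignored
#   python-usfm-parser/ENV3.10/   <- now ignored too
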
diff --git a/python-usfm-parser/API guide for python usfm_grammar.ipynb b/python-usfm-parser/API guide for python usfm_grammar.ipynb
index 5a643496..799796ee 100644
--- a/python-usfm-parser/API guide for python usfm_grammar.ipynb
+++ b/python-usfm-parser/API guide for python usfm_grammar.ipynb
@@ -8,7 +8,7 @@
"outputs": [],
"source": [
"import sys\n",
- "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV/lib/python3.8/site-packages')\n"
+ "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV3.10/lib/python3.10/site-packages')\n"
]
},
{
@@ -23,7 +23,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "b3d034a2",
"metadata": {},
"outputs": [],
@@ -89,7 +89,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_parser.toDict()"
+ "my_parser.to_dict()"
]
},
{
@@ -99,7 +99,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_parser.toDict(Filter.ALL.value)"
+ "my_parser.to_dict(Filter.ALL.value)"
]
},
{
@@ -109,7 +109,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_parser.toDict(Filter.NOTES.value)"
+ "my_parser.to_dict(Filter.NOTES.value)"
]
},
{
@@ -118,7 +118,9 @@
"id": "e4f49981",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "my_parser.to_dict(Filter.NOTES_TEXT.value)"
+ ]
},
{
"cell_type": "code",
@@ -127,7 +129,7 @@
"metadata": {},
"outputs": [],
"source": [
- "table_output = my_parser.toTable()\n",
+ "table_output = my_parser.to_list()\n",
"table_output\n"
]
},
@@ -156,7 +158,7 @@
"metadata": {},
"outputs": [],
"source": [
- "table_output = my_parser.toTable(Filter.NOTES.value)\n",
+ "table_output = my_parser.to_list(Filter.NOTES.value)\n",
"print(\"\\n\".join([\"\\t\".join(row) for row in table_output]))\n"
]
},
@@ -175,7 +177,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_parser.toDict(Filter.SCRIPTURE_PARAGRAPHS.value)"
+ "my_parser.to_dict(Filter.SCRIPTURE_PARAGRAPHS.value)"
]
},
{
@@ -185,7 +187,7 @@
"metadata": {},
"outputs": [],
"source": [
- "table_output = my_parser.toTable(Filter.SCRIPTURE_PARAGRAPHS.value)\n",
+ "table_output = my_parser.to_list(Filter.SCRIPTURE_PARAGRAPHS.value)\n",
"print(\"\\n\".join([\"\\t\".join(row) for row in table_output]))\n"
]
},
@@ -206,7 +208,7 @@
"source": [
"import xml.etree.ElementTree as ET\n",
"\n",
- "usx_elem = my_parser.toUSX()\n",
+ "usx_elem = my_parser.to_usx()\n",
"usx_str = ET.tostring(usx_elem, encoding=\"unicode\")"
]
},
@@ -226,9 +228,7 @@
"id": "295dae47",
"metadata": {},
"outputs": [],
- "source": [
- "!pip install lxml"
- ]
+ "source": []
},
{
"cell_type": "code",
@@ -236,30 +236,17 @@
"id": "583efddc",
"metadata": {},
"outputs": [],
- "source": [
- "!pip install rnc2rng"
- ]
+ "source": []
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"id": "2bd40ba2",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "''"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"import sys\n",
- "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV/lib/python3.8/site-packages')\n",
+ "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV3.10/lib/python3.10/site-packages')\n",
"\n",
"\n",
"from usfm_grammar import USFMParser, Filter\n",
@@ -268,7 +255,7 @@
"input_usfm_str = open(\"origin.usfm\",\"r\", encoding='utf8').read()\n",
"my_parser = USFMParser(input_usfm_str)\n",
"\n",
- "usx_elem = my_parser.toUSX()\n",
+ "usx_elem = my_parser.to_usx()\n",
"usx_str = ET.tostring(usx_elem, encoding=\"unicode\")\n",
"\n",
"usx_str"
@@ -276,7 +263,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": null,
"id": "a680a0b6",
"metadata": {},
"outputs": [],
@@ -289,18 +276,10 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": null,
"id": "0fac8a56",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "valid\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"\n",
"\n",
@@ -323,40 +302,11 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": null,
"id": "1ea6bb28",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "valid\n"
- ]
- }
- ],
- "source": [
- "origin_usx_str = '''\n",
- " \n",
- " MARK\n",
- " \n",
- " \n",
- " \n",
- " verse one \n",
- " \n",
- " \n",
- " verse two\n",
- " \n",
- " \n",
- " \n",
- "'''\n",
- "usx_f = StringIO(origin_usx_str)\n",
- "doc = etree.parse(usx_f)\n",
- "if relaxng.validate(doc):\n",
- " print(\"valid\")\n",
- "else:\n",
- " relaxng.assertValid(doc)"
- ]
+ "outputs": [],
+ "source": []
},
{
"cell_type": "code",
@@ -368,56 +318,20 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": null,
"id": "8d12593b",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "valid\n"
- ]
- }
- ],
- "source": [
- "empty_usx_str = '''\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " verse one\n",
- " \n",
- " \n",
- "\n",
- "'''\n",
- "usx_f = StringIO(empty_usx_str)\n",
- "doc = etree.parse(usx_f)\n",
- "if relaxng.validate(doc):\n",
- " print(\"valid\")\n",
- "else:\n",
- " relaxng.assertValid(doc)"
- ]
+ "outputs": [],
+ "source": []
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "818e36d9",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'(File (book (id (bookcode) (description))) (mtBlock (mt (numberedLevelMax4) (text))) (chapter (c (chapterNumber)) (paragraph (p))))'"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "my_parser.toAST()"
+ "my_parser.to_syntax_tree()"
]
},
{
@@ -445,7 +359,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.10"
+ "version": "3.10.6"
}
},
"nbformat": 4,
diff --git a/python-usfm-parser/requirements.txt b/python-usfm-parser/requirements.txt
index 899b528b..79b39f15 100644
--- a/python-usfm-parser/requirements.txt
+++ b/python-usfm-parser/requirements.txt
@@ -1,2 +1,4 @@
tree-sitter==0.20.0
-jupyterlab==3.3.2
+jupyterlab==3.4.4
+rnc2rng==2.6.6
+lxml==4.9.1
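
# --- Editor's sketch (not part of the patch): the two new pins support the USX
# validation flow the notebook performs: rnc2rng converts a RelaxNG compact
# schema to XML syntax, and lxml validates the generated USX against it. The
# schema filename "usx.rnc" below is hypothetical; point it at the real schema.
from io import StringIO

import rnc2rng                  # compact RelaxNG (.rnc) -> XML RelaxNG (.rng)
from lxml import etree
from usfm_grammar import USFMParser

rng_str = rnc2rng.dumps(rnc2rng.load("usx.rnc"))    # hypothetical schema path
relaxng = etree.RelaxNG(etree.parse(StringIO(rng_str)))

usx_elem = USFMParser(open("origin.usfm", encoding="utf8").read()).to_usx()
doc = etree.ElementTree(usx_elem)

if relaxng.validate(doc):
    print("valid")
else:
    relaxng.assertValid(doc)    # raises DocumentInvalid with the details
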
diff --git a/python-usfm-parser/usfm_grammar.py b/python-usfm-parser/usfm_grammar.py
index 9c3dc1db..be4ea843 100644
--- a/python-usfm-parser/usfm_grammar.py
+++ b/python-usfm-parser/usfm_grammar.py
@@ -2,8 +2,7 @@
import json
from enum import Enum
from tree_sitter import Language, Parser
-import xml.etree.ElementTree as ET
-from xml.dom import minidom
+from lxml import etree
class Filter(str, Enum):
ALL = "all"
@@ -15,7 +14,7 @@ class Filter(str, Enum):
class Format(str, Enum):
JSON = "json"
CSV = "table"
- AST = "syntax-tree"
+ ST = "syntax-tree"
USX = "usx"
MD = "markdown"
@@ -58,7 +57,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
code = usfm_bytes[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
elif tupl[1] == 'desc':
desc = usfm_bytes[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
- book_xml_node = ET.SubElement(parent_xml_node, "book")
+ book_xml_node = etree.SubElement(parent_xml_node, "book")
book_xml_node.set("code", code)
book_xml_node.set("style", "id")
if desc is not None and desc.strip() != "":
@@ -67,7 +66,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
chap_cap = USFM_LANGUAGE.query('''(c (chapterNumber) @chap-num)''').captures(node)[0]
chap_num = usfm_bytes[chap_cap[0].start_byte:chap_cap[0].end_byte].decode('utf-8')
ref = parent_xml_node.find("book").attrib['code']+" "+chap_num
- chap_xml_node = ET.SubElement(parent_xml_node, "chapter")
+ chap_xml_node = etree.SubElement(parent_xml_node, "chapter")
chap_xml_node.set("number", chap_num)
chap_xml_node.set("style", "c")
chap_xml_node.set("sid", ref)
@@ -77,19 +76,19 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
prev_verses = xml_root_node.findall(".//verse")
if len(prev_verses)>0:
if "sid" in prev_verses[-1].attrib:
- v_end_xml_node = ET.SubElement(parent_xml_node, "verse")
+ v_end_xml_node = etree.SubElement(parent_xml_node, "verse")
v_end_xml_node.set('eid', prev_verses[-1].get('sid'))
- chap_end_xml_node = ET.SubElement(parent_xml_node, "chapter")
+ chap_end_xml_node = etree.SubElement(parent_xml_node, "chapter")
chap_end_xml_node.set("eid", ref)
elif node.type == "v":
prev_verses = xml_root_node.findall(".//verse")
if len(prev_verses)>0:
if "sid" in prev_verses[-1].attrib:
- v_end_xml_node = ET.SubElement(parent_xml_node, "verse")
+ v_end_xml_node = etree.SubElement(parent_xml_node, "verse")
v_end_xml_node.set('eid', prev_verses[-1].get('sid'))
verse_num_cap = USFM_LANGUAGE.query("(v (verseNumber) @vnum)").captures(node)[0]
verse_num = usfm_bytes[verse_num_cap[0].start_byte:verse_num_cap[0].end_byte].decode('utf-8')
- v_xml_node = ET.SubElement(parent_xml_node, "verse")
+ v_xml_node = etree.SubElement(parent_xml_node, "verse")
ref = xml_root_node.findall('.//chapter')[-1].get('sid')+ ":"+ verse_num
v_xml_node.set('number', verse_num.strip())
v_xml_node.set('sid', ref.strip())
@@ -99,14 +98,14 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
elif node.type == 'paragraph':
para_tag_cap = USFM_LANGUAGE.query("(paragraph (_) @para-marker)").captures(node)[0]
para_marker = para_tag_cap[0].type
- para_xml_node = ET.SubElement(parent_xml_node, "para")
+ para_xml_node = etree.SubElement(parent_xml_node, "para")
para_xml_node.set("style", para_marker)
for child in para_tag_cap[0].children[1:]:
node_2_usx(child, usfm_bytes, para_xml_node, xml_root_node)
elif node.type in NOTE_MARKERS:
tag_node = node.children[0]
caller_node = node.children[1]
- note_xml_node = ET.SubElement(parent_xml_node, "note")
+ note_xml_node = etree.SubElement(parent_xml_node, "note")
note_xml_node.set("style",
usfm_bytes[tag_node.start_byte:tag_node.end_byte].decode('utf-8')
.replace("\\","").strip())
@@ -121,7 +120,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
if node.children[-1].type.startswith('\\'):
closing_node = node.children[-1]
children_range = children_range-1
- char_xml_node = ET.SubElement(parent_xml_node, "char")
+ char_xml_node = etree.SubElement(parent_xml_node, "char")
char_xml_node.set("style",
usfm_bytes[tag_node.start_byte:tag_node.end_byte].decode('utf-8')
.replace("\\","").strip())
@@ -150,11 +149,11 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
else:
parent_xml_node.text = text_val
elif node.type == "table":
- table_xml_node = ET.SubElement(parent_xml_node, "table")
+ table_xml_node = etree.SubElement(parent_xml_node, "table")
for child in node.children:
node_2_usx(child, usfm_bytes, table_xml_node, xml_root_node)
elif node.type == "tr":
- row_xml_node = ET.SubElement(parent_xml_node, "row")
+ row_xml_node = etree.SubElement(parent_xml_node, "row")
row_xml_node.set("style", "tr")
for child in node.children[1:]:
node_2_usx(child, usfm_bytes, row_xml_node, xml_root_node)
@@ -162,7 +161,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
tag_node = node.children[0]
style = usfm_bytes[tag_node.start_byte:tag_node.end_byte].decode('utf-8')\
.replace("\\","").strip()
- cell_xml_node = ET.SubElement(parent_xml_node, "cell")
+ cell_xml_node = etree.SubElement(parent_xml_node, "cell")
cell_xml_node.set("style", style)
if "r" in style:
cell_xml_node.set("align", "end")
@@ -179,14 +178,14 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
] @ms-name)''').captures(node)[0]
style = usfm_bytes[ms_name_cap[0].start_byte:ms_name_cap[0].end_byte].decode('utf-8')\
.replace("\\","").strip()
- ms_xml_node = ET.SubElement(parent_xml_node, "ms")
+ ms_xml_node = etree.SubElement(parent_xml_node, "ms")
ms_xml_node.set('style', style)
for child in node.children:
if child.type.endswith("Attribute"):
node_2_usx(child, usfm_bytes, ms_xml_node, xml_root_node)
elif node.type == "esb":
style = "esb"
- sidebar_xml_node = ET.SubElement(parent_xml_node, "sidebar")
+ sidebar_xml_node = etree.SubElement(parent_xml_node, "sidebar")
sidebar_xml_node.set("style", style)
for child in node.children[1:-1]:
node_2_usx(child, usfm_bytes, sidebar_xml_node, xml_root_node)
@@ -195,12 +194,12 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
category = usfm_bytes[cat_cap[0].start_byte:cat_cap[0].end_byte].decode('utf-8').strip()
parent_xml_node.set('category', category)
elif node.type == 'fig':
- fig_xml_node = ET.SubElement(parent_xml_node, "figure")
+ fig_xml_node = etree.SubElement(parent_xml_node, "figure")
fig_xml_node.set("style", 'fig')
for child in node.children[1:-1]:
node_2_usx(child, usfm_bytes, fig_xml_node, xml_root_node)
elif node.type == 'b':
- break_xml_node = ET.SubElement(parent_xml_node, "optbreak")
+ break_xml_node = etree.SubElement(parent_xml_node, "optbreak")
elif (node.type in PARA_STYLE_MARKERS or
node.type.replace("\\","").strip() in PARA_STYLE_MARKERS):
tag_node = node.children[0]
@@ -215,7 +214,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
num = usfm_bytes[num_node.start_byte:num_node.end_byte].decode('utf-8')
style += num
children_range_start = 2
- para_xml_node = ET.SubElement(parent_xml_node, "para")
+ para_xml_node = etree.SubElement(parent_xml_node, "para")
para_xml_node.set("style", style)
# caps = USFM_LANGUAGE.query('((text) @inner-text)').captures(node)
# para_xml_node.text = " ".join([usfm_bytes[txt_cap[0].start_byte:txt_cap[0].end_byte].decode('utf-8').strip()
@@ -236,7 +235,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
def node_2_dict(node, usfm_bytes):
- '''recursive function converting an AST node and its children to dictionary'''
+    '''recursive function converting a syntax tree node and its children to a dictionary'''
if len(node.children)>0:
item = []
for child in node.children:
@@ -280,184 +279,189 @@ def node_2_dict(node, usfm_bytes):
title_query = USFM_LANGUAGE.query("""(title) @title""")
class USFMParser():
- """Parser class with usfmstring, AST and methods for JSON convertions"""
+ """Parser class with usfmstring, syntax_tree and methods for JSON convertions"""
def __init__(self, usfm_string):
# super(USFMParser, self).__init__()
- self.USFM = usfm_string
- self.USFMbytes = None
- self.AST = None
+ self.usfm = usfm_string
+ self.usfm_bytes = None
+ self.syntax_tree = None
self.errors = None
- self.USFMbytes = bytes(self.USFM, "utf8")
- tree = parser.parse(self.USFMbytes)
- self.AST = tree.root_node
+ self.usfm_bytes = bytes(self.usfm, "utf8")
+ tree = parser.parse(self.usfm_bytes)
+ self.syntax_tree = tree.root_node
# check for errors in the parse tree and raise them
- errors = error_query.captures(self.AST)
+ errors = error_query.captures(self.syntax_tree)
if len(errors) > 0:
- self.errors = [(f"At {err[0].start_point}", self.USFMbytes[err[0].start_byte:err[0].end_byte].decode('utf-8'))
+ self.errors = [(f"At {err[0].start_point}", self.usfm_bytes[err[0].start_byte:err[0].end_byte].decode('utf-8'))
for err in errors]
- def toAST(self):
- return self.AST.sexp()
+ def to_syntax_tree(self):
+ return self.syntax_tree.sexp()
- def toDict(self, filt=Filter.SCRIPTURE_BCV.value):
+ def to_dict(self, filt=Filter.SCRIPTURE_BCV.value):
if filt in [Filter.SCRIPTURE_BCV.value, Filter.NOTES.value, Filter.NOTES_TEXT.value,
Filter.SCRIPTURE_PARAGRAPHS.value, None]:
dict_output = {}
- captures = bookcode_query.captures(self.AST)
+ captures = bookcode_query.captures(self.syntax_tree)
cap = captures[0]
- dict_output['book'] = {'bookcode': self.USFMbytes[cap[0].start_byte:cap[0].end_byte].decode('utf-8')}
+ dict_output['book'] = {'bookcode': self.usfm_bytes[cap[0].start_byte:cap[0].end_byte].decode('utf-8')}
dict_output['book']['chapters'] = []
- captures = chapter_query.captures(self.AST)
+ captures = chapter_query.captures(self.syntax_tree)
for cap in captures:
chap_captures = chapternum_query.captures(cap[0])
ccap= chap_captures[0]
dict_output['book']['chapters'].append({"chapterNumber":
- self.USFMbytes[ccap[0].start_byte:ccap[0].end_byte].decode('utf-8'),
+ self.usfm_bytes[ccap[0].start_byte:ccap[0].end_byte].decode('utf-8'),
"contents":[]})
- if filt in [Filter.SCRIPTURE_BCV.value, None]:
- '''query for just the chapter, verse and text nodes from the AST'''
- versenum_captures = versenum_query.captures(cap[0])
- versetext_captures = versetext_query.captures(cap[0])
- combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures}
- sorted_combined = [combined[i] for i in sorted(combined)]
- for vcap in sorted_combined:
- if vcap[1] == "verse":
- dict_output['book']['chapters'][-1]["contents"].append(
- {"verseNumber":self.USFMbytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(),
- "verseText":""})
- elif vcap[1] == "verse-text":
- text_captures = text_query.captures(vcap[0])
- text_val = "".join([self.USFMbytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ")
- for tcap in text_captures])
- dict_output['book']['chapters'][-1]['contents'][-1]['verseText'] += text_val
- elif filt in [Filter.NOTES.value, Filter.NOTES_TEXT.value]:
- '''query for just the chapter, verse and text nodes from the AST'''
- versenum_captures = versenum_query.captures(cap[0])
- notes_captures = notes_query.captures(cap[0])
- if len(notes_captures) == 0:
- continue
- combined = {item[0].start_byte: item for item in versenum_captures+notes_captures}
- sorted_combined = [combined[i] for i in sorted(combined)]
- for index,vcap in enumerate(sorted_combined):
- if vcap[1] == "verse" and \
- index+1 !=len(sorted_combined) and sorted_combined[index+1][1] =="note":
- '''need to add a verse only if it has notes'''
- dict_output['book']['chapters'][-1]["contents"].append(
- {"verseNumber":self.USFMbytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(),
- "notes":[]})
- elif vcap[1] == "note":
- note_type = vcap[0].type
- if filt == Filter.NOTES.value:
- note_details = node_2_dict(vcap[0], self.USFMbytes)
- elif filt == Filter.NOTES_TEXT.value:
- notetext_captures = notestext_query.captures(vcap[0])
- note_details = "|".join([self.USFMbytes[ncap[0].start_byte:ncap[0].end_byte].decode('utf-8').strip().replace("\n","") for ncap in notetext_captures])
- dict_output['book']['chapters'][-1]['contents'][-1]['notes'].append({note_type: note_details})
- elif filt in [Filter.SCRIPTURE_PARAGRAPHS.value]:
- '''titles and section information, paragraph breaks
- and also structuring like lists and tables
- along with verse text and versenumber details at the lowest level'''
- title_captures = title_query.captures(cap[0])
- para_captures = para_query.captures(cap[0])
- combined_tit_paras = {item[0].start_byte: item for item in title_captures+para_captures}
- sorted_tit_paras = [combined_tit_paras[i] for i in sorted(combined_tit_paras)]
- for comp in sorted_tit_paras:
- if comp[1] == "title":
- text_captures = text_query.captures(comp[0])
- title_texts = []
- for tcap in text_captures:
- title_texts.append(self.USFMbytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8'))
- dict_output['book']['chapters'][-1]['contents'].append(
- {"title":" ".join(title_texts).strip()})
- elif comp[1] == "para":
- comp_type = comp[0].type
- versenum_captures = versenum_query.captures(comp[0])
- versetext_captures = versetext_query.captures(comp[0])
- combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures}
- sorted_combined = [combined[i] for i in sorted(combined)]
- inner_contents = []
- for vcap in sorted_combined:
- if vcap[1] == "verse":
- inner_contents.append(
- {"verseNumber":self.USFMbytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(),
+ match filt:
+ case Filter.SCRIPTURE_BCV.value | None:
+ '''query for just the chapter, verse and text nodes from the syntax_tree'''
+ versenum_captures = versenum_query.captures(cap[0])
+ versetext_captures = versetext_query.captures(cap[0])
+ combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures}
+ sorted_combined = [combined[i] for i in sorted(combined)]
+ for vcap in sorted_combined:
+ match vcap:
+ case (vnode, "verse"):
+ dict_output['book']['chapters'][-1]["contents"].append(
+ {"verseNumber":self.usfm_bytes[vnode.start_byte:vnode.end_byte].decode('utf-8').strip(),
"verseText":""})
- elif vcap[1] == "verse-text":
- text_captures = text_query.captures(vcap[0])
- text_val = "".join([self.USFMbytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ")
+ case (vnode, "verse-text"):
+ text_captures = text_query.captures(vnode)
+ text_val = "".join([self.usfm_bytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ")
for tcap in text_captures])
- if len(inner_contents) == 0:
- inner_contents.append({"verseText":""})
- inner_contents[-1]['verseText'] += text_val
-
- dict_output['book']['chapters'][-1]["contents"].append({comp_type:inner_contents})
+ dict_output['book']['chapters'][-1]['contents'][-1]['verseText'] += text_val
+ case Filter.NOTES.value | Filter.NOTES_TEXT.value:
+                        '''query for just the chapter, verse and note nodes from the syntax_tree'''
+ versenum_captures = versenum_query.captures(cap[0])
+ notes_captures = notes_query.captures(cap[0])
+ if len(notes_captures) == 0:
+ continue
+ combined = {item[0].start_byte: item for item in versenum_captures+notes_captures}
+ sorted_combined = [combined[i] for i in sorted(combined)]
+ for index,vcap in enumerate(sorted_combined):
+ if vcap[1] == "verse" and \
+ index+1 !=len(sorted_combined) and sorted_combined[index+1][1] =="note":
+ '''need to add a verse only if it has notes'''
+ dict_output['book']['chapters'][-1]["contents"].append(
+ {"verseNumber":self.usfm_bytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(),
+ "notes":[]})
+ elif vcap[1] == "note":
+ note_type = vcap[0].type
+ if filt == Filter.NOTES.value:
+ note_details = node_2_dict(vcap[0], self.usfm_bytes)
+ elif filt == Filter.NOTES_TEXT.value:
+ notetext_captures = notestext_query.captures(vcap[0])
+ note_details = "|".join([self.usfm_bytes[ncap[0].start_byte:ncap[0].end_byte].decode('utf-8').strip().replace("\n","") for ncap in notetext_captures])
+ dict_output['book']['chapters'][-1]['contents'][-1]['notes'].append({note_type: note_details})
+ case Filter.SCRIPTURE_PARAGRAPHS.value:
+                        '''titles and section information, paragraph breaks,
+                        and structuring like lists and tables,
+                        along with verse text and verse number details at the lowest level'''
+ title_captures = title_query.captures(cap[0])
+ para_captures = para_query.captures(cap[0])
+ combined_tit_paras = {item[0].start_byte: item for item in title_captures+para_captures}
+ sorted_tit_paras = [combined_tit_paras[i] for i in sorted(combined_tit_paras)]
+ for comp in sorted_tit_paras:
+ match comp:
+ case (comp_node, "title"):
+ text_captures = text_query.captures(comp_node)
+ title_texts = []
+ for tcap in text_captures:
+ title_texts.append(self.usfm_bytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8'))
+ dict_output['book']['chapters'][-1]['contents'].append(
+ {"title":" ".join(title_texts).strip()})
+ case (comp_node, "para"):
+ comp_type = comp_node.type
+ versenum_captures = versenum_query.captures(comp_node)
+ versetext_captures = versetext_query.captures(comp_node)
+ combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures}
+ sorted_combined = [combined[i] for i in sorted(combined)]
+ inner_contents = []
+ for vcap in sorted_combined:
+ match vcap:
+ case (vnode, "verse"):
+ inner_contents.append(
+ {"verseNumber":self.usfm_bytes[vnode.start_byte:vnode.end_byte].decode('utf-8').strip(),
+ "verseText":""})
+ case (vnode, "verse-text"):
+ text_captures = text_query.captures(vnode)
+ text_val = "".join([self.usfm_bytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ")
+ for tcap in text_captures])
+ if len(inner_contents) == 0:
+ inner_contents.append({"verseText":""})
+ inner_contents[-1]['verseText'] += text_val
+
+ dict_output['book']['chapters'][-1]["contents"].append({comp_type:inner_contents})
return dict_output
elif filt == Filter.ALL.value:
- '''directly converts the AST to JSON/dict'''
- return node_2_dict(self.AST, self.USFMbytes)
+ '''directly converts the syntax_tree to JSON/dict'''
+ return node_2_dict(self.syntax_tree, self.usfm_bytes)
else:
raise Exception(f"This filter option, {filt}, is yet to be implemeneted")
- def toTable(self, filt=Filter.SCRIPTURE_BCV.value):
+ def to_list(self, filt=Filter.SCRIPTURE_BCV.value):
'''uses the toJSON function and converts JSON to CSV'''
- if filt == Filter.SCRIPTURE_BCV.value or filt is None:
- scripture_json = self.toDict(Filter.SCRIPTURE_BCV.value)
- table_output = [["Book","Chapter","Verse","Text"]]
- book = scripture_json['book']['bookcode']
- for chap in scripture_json['book']['chapters']:
- chapter = chap['chapterNumber']
- for verse in chap['contents']:
- row = [book, chapter, verse['verseNumber'], '"'+verse['verseText']+'"']
- table_output.append(row)
- return table_output
- elif filt == Filter.NOTES.value:
- notes_json = self.toDict(Filter.NOTES_TEXT.value)
- table_output = [["Book","Chapter","Verse","Type", "Note"]]
- book = notes_json['book']['bookcode']
- for chap in notes_json['book']['chapters']:
- chapter = chap['chapterNumber']
- for verse in chap['contents']:
- v_num = verse['verseNumber']
- for note in verse['notes']:
- typ = list(note)[0]
- row = [book, chapter, v_num, typ, '"'+note[typ]+'"']
- table_output.append(row)
- return table_output
- elif filt == Filter.SCRIPTURE_PARAGRAPHS.value:
- notes_json = self.toDict(Filter.SCRIPTURE_PARAGRAPHS.value)
- table_output = [["Book","Chapter","Type", "Contents"]]
- book = notes_json['book']['bookcode']
- for chap in notes_json['book']['chapters']:
- chapter = chap['chapterNumber']
- for comp in chap['contents']:
- typ = list(comp)[0]
- if typ == "title":
- cont = comp[typ]
- else:
- inner_cont = []
- for inner_comp in comp[typ]:
- inner_cont += list(inner_comp.values())
- cont = ' '.join(inner_cont)
- row = [book, chapter, typ, cont]
- table_output.append(row)
- return table_output
+ match filt:
+ case Filter.SCRIPTURE_BCV.value | None:
+ scripture_json = self.to_dict(Filter.SCRIPTURE_BCV.value)
+ table_output = [["Book","Chapter","Verse","Text"]]
+ book = scripture_json['book']['bookcode']
+ for chap in scripture_json['book']['chapters']:
+ chapter = chap['chapterNumber']
+ for verse in chap['contents']:
+ row = [book, chapter, verse['verseNumber'], '"'+verse['verseText']+'"']
+ table_output.append(row)
+ return table_output
+ case Filter.NOTES.value:
+ notes_json = self.to_dict(Filter.NOTES_TEXT.value)
+ table_output = [["Book","Chapter","Verse","Type", "Note"]]
+ book = notes_json['book']['bookcode']
+ for chap in notes_json['book']['chapters']:
+ chapter = chap['chapterNumber']
+ for verse in chap['contents']:
+ v_num = verse['verseNumber']
+ for note in verse['notes']:
+ typ = list(note)[0]
+ row = [book, chapter, v_num, typ, '"'+note[typ]+'"']
+ table_output.append(row)
+ return table_output
+ case Filter.SCRIPTURE_PARAGRAPHS.value:
+ notes_json = self.to_dict(Filter.SCRIPTURE_PARAGRAPHS.value)
+ table_output = [["Book","Chapter","Type", "Contents"]]
+ book = notes_json['book']['bookcode']
+ for chap in notes_json['book']['chapters']:
+ chapter = chap['chapterNumber']
+ for comp in chap['contents']:
+ typ = list(comp)[0]
+ if typ == "title":
+ cont = comp[typ]
+ else:
+ inner_cont = []
+ for inner_comp in comp[typ]:
+ inner_cont += list(inner_comp.values())
+ cont = ' '.join(inner_cont)
+ row = [book, chapter, typ, cont]
+ table_output.append(row)
+ return table_output
- else:
- raise Exception(f"This filter option, {filt}, is yet to be implemeneted")
+ case _:
+ raise Exception(f"This filter option, {filt}, is yet to be implemeneted")
- def toMarkDown(self, filt=Filter.SCRIPTURE_PARAGRAPHS.value):
+ def to_markdown(self, filt=Filter.SCRIPTURE_PARAGRAPHS.value):
'''query for chapter, paragraph, text structure'''
return "yet to be implemeneted"
- def toUSX(self, filt=Filter.ALL):
- '''convert the AST to the XML format USX'''
- usx_root = ET.Element("usx")
+ def to_usx(self, filt=Filter.ALL):
+ '''convert the syntax_tree to the XML format USX'''
+ usx_root = etree.Element("usx")
usx_root.set("version", "3.0")
- node_2_usx(self.AST, self.USFMbytes, usx_root, usx_root)
+ node_2_usx(self.syntax_tree, self.usfm_bytes, usx_root, usx_root)
return usx_root
if __name__ == '__main__':
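
# --- Editor's note (sketch, not part of the patch): the refactor above trades
# if/elif chains for Python 3.10+ structural pattern matching, which is also
# why the notebook environment moved from 3.8 to 3.10. tree-sitter captures
# are (node, capture_name) tuples, so tuple patterns bind the node directly:
def describe(capture):
    match capture:
        case (node, "verse"):
            return f"verse number node: {node}"
        case (node, "verse-text"):
            return f"verse text node: {node}"
        case _:
            return "unhandled capture"

print(describe(("node@12:4", "verse")))   # -> verse number node: node@12:4
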
@@ -466,10 +470,11 @@ def toUSX(self, filt=Filter.ALL):
arg_parser.add_argument('infile', type=str, help='input usfm file')
arg_parser.add_argument('--format', type=str, help='output format',
choices=[Format.JSON.value, Format.CSV.value, Format.USX.value,
- Format.MD.value, Format.AST.value],
+ Format.MD.value, Format.ST.value],
default=Format.JSON.value)
arg_parser.add_argument('--filter', type=str, help='the type of contents to be included',
- choices=[Filter.SCRIPTURE_BCV.value, Filter.NOTES.value, Filter.SCRIPTURE_PARAGRAPHS.value])
+ choices=[Filter.SCRIPTURE_BCV.value, Filter.NOTES.value,
+ Filter.SCRIPTURE_PARAGRAPHS.value, Filter.ALL.value])
arg_parser.add_argument('--csv_col_sep', type=str, help="column separator or delimiter. Only useful with format=table.",
default="\t")
arg_parser.add_argument('--csv_row_sep', type=str, help="row separator or delimiter. Only useful with format=table.",
@@ -490,20 +495,20 @@ def toUSX(self, filt=Filter.ALL):
err_str = "\n\t".join(my_parser.errors)
print(f"Errors at:{err_str}")
- if output_format == Format.JSON:
- dict_output = my_parser.toDict(filt = output_filter)
- print(json.dumps(dict_output, indent=4, ensure_ascii=False))
- elif output_format == Format.CSV:
- table_output = my_parser.toTable(filt = output_filter)
- print(csv_row_sep.join([csv_col_sep.join(row) for row in table_output]))
- elif output_format == Format.USX:
- xmlstr = ET.tostring(my_parser.toUSX(filt = output_filter),encoding="unicode")
- print(minidom.parseString(xmlstr).toprettyxml(indent=" "))
- elif output_format == Format.MD:
- print(my_parser.toMarkDown(filt = output_filter))
- elif output_format == Format.AST:
- print(my_parser.toAST())
- else:
- raise Exception(f"Un-recognized output format:{output_format}!")
-
-
+ match output_format:
+ case Format.JSON:
+ dict_output = my_parser.to_dict(filt = output_filter)
+ print(json.dumps(dict_output, indent=4, ensure_ascii=False))
+ case Format.CSV:
+ table_output = my_parser.to_list(filt = output_filter)
+ print(csv_row_sep.join([csv_col_sep.join(row) for row in table_output]))
+ case Format.USX:
+ xmlstr = etree.tostring(my_parser.to_usx(filt=output_filter),
+ encoding='unicode', pretty_print=True)
+ print(xmlstr)
+ case Format.MD:
+ print(my_parser.to_markdown(filt = output_filter))
+ case Format.ST:
+ print(my_parser.to_syntax_tree())
+ case _:
+ raise Exception(f"Un-recognized output format:{output_format}!")