From 990f8b11057e619560fe568ae9f0baf13d7093ea Mon Sep 17 00:00:00 2001 From: kavitharaju Date: Wed, 10 Aug 2022 23:12:02 +0530 Subject: [PATCH] Python module refactor (#158) * switch to python3.10 * change Python API names - Enum Format.ST = "syntax-tree" - Class data member USFMParser.syntax_tree - Class data member USFMParser.USFM_bytes - Class member function USFMParser.to_syntax_tree() - Class member function USFMParser.to_dict() - Class member function USFMParser.to_list() - Class member function USFMParser.to_markdown() - Class member function USFMParser.to_usx() * use match-case in place of if-else when useful * update the API guide jupyter notebook with new names * use lxml library instead of xml * keep class members all in lowercase: usfm, usfm_bytes --- .gitignore | 1 + .../API guide for python usfm_grammar.ipynb | 150 ++----- python-usfm-parser/requirements.txt | 4 +- python-usfm-parser/usfm_grammar.py | 367 +++++++++--------- 4 files changed, 222 insertions(+), 300 deletions(-) diff --git a/.gitignore b/.gitignore index 053774f6..26316716 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ build/Release # Dependency directories */node_modules/ */ENV/ +*/ENV*/ jspm_packages/ # Typescript v1 declaration files diff --git a/python-usfm-parser/API guide for python usfm_grammar.ipynb b/python-usfm-parser/API guide for python usfm_grammar.ipynb index 5a643496..799796ee 100644 --- a/python-usfm-parser/API guide for python usfm_grammar.ipynb +++ b/python-usfm-parser/API guide for python usfm_grammar.ipynb @@ -8,7 +8,7 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV/lib/python3.8/site-packages')\n" + "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV3.10/lib/python3.10/site-packages')\n" ] }, { @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "b3d034a2", "metadata": {}, "outputs": [], @@ -89,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_parser.toDict()" + "my_parser.to_dict()" ] }, { @@ -99,7 +99,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_parser.toDict(Filter.ALL.value)" + "my_parser.to_dict(Filter.ALL.value)" ] }, { @@ -109,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_parser.toDict(Filter.NOTES.value)" + "my_parser.to_dict(Filter.NOTES.value)" ] }, { @@ -118,7 +118,9 @@ "id": "e4f49981", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "my_parser.to_dict(Filter.NOTES_TEXT.value)" + ] }, { "cell_type": "code", @@ -127,7 +129,7 @@ "metadata": {}, "outputs": [], "source": [ - "table_output = my_parser.toTable()\n", + "table_output = my_parser.to_list()\n", "table_output\n" ] }, @@ -156,7 +158,7 @@ "metadata": {}, "outputs": [], "source": [ - "table_output = my_parser.toTable(Filter.NOTES.value)\n", + "table_output = my_parser.to_list(Filter.NOTES.value)\n", "print(\"\\n\".join([\"\\t\".join(row) for row in table_output]))\n" ] }, @@ -175,7 +177,7 @@ "metadata": {}, "outputs": [], "source": [ - "my_parser.toDict(Filter.SCRIPTURE_PARAGRAPHS.value)" + "my_parser.to_dict(Filter.SCRIPTURE_PARAGRAPHS.value)" ] }, { @@ -185,7 +187,7 @@ "metadata": {}, "outputs": [], "source": [ - "table_output = my_parser.toTable(Filter.SCRIPTURE_PARAGRAPHS.value)\n", + "table_output = my_parser.to_list(Filter.SCRIPTURE_PARAGRAPHS.value)\n", "print(\"\\n\".join([\"\\t\".join(row) for row in table_output]))\n" ] }, @@ -206,7 +208,7 @@ 
"source": [ "import xml.etree.ElementTree as ET\n", "\n", - "usx_elem = my_parser.toUSX()\n", + "usx_elem = my_parser.to_usx()\n", "usx_str = ET.tostring(usx_elem, encoding=\"unicode\")" ] }, @@ -226,9 +228,7 @@ "id": "295dae47", "metadata": {}, "outputs": [], - "source": [ - "!pip install lxml" - ] + "source": [] }, { "cell_type": "code", @@ -236,30 +236,17 @@ "id": "583efddc", "metadata": {}, "outputs": [], - "source": [ - "!pip install rnc2rng" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "2bd40ba2", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "''" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import sys\n", - "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV/lib/python3.8/site-packages')\n", + "sys.path.append('/home/kavitha/Documents/PEG JS and USFM/usfm-grammar-v3/usfm-grammar/python-usfm-parser/ENV3.10/lib/python3.10/site-packages')\n", "\n", "\n", "from usfm_grammar import USFMParser, Filter\n", @@ -268,7 +255,7 @@ "input_usfm_str = open(\"origin.usfm\",\"r\", encoding='utf8').read()\n", "my_parser = USFMParser(input_usfm_str)\n", "\n", - "usx_elem = my_parser.toUSX()\n", + "usx_elem = my_parser.to_usx()\n", "usx_str = ET.tostring(usx_elem, encoding=\"unicode\")\n", "\n", "usx_str" @@ -276,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "a680a0b6", "metadata": {}, "outputs": [], @@ -289,18 +276,10 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "0fac8a56", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "valid\n" - ] - } - ], + "outputs": [], "source": [ "\n", "\n", @@ -323,40 +302,11 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "1ea6bb28", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "valid\n" - ] - } - ], - "source": [ - "origin_usx_str = '''\n", - " \n", - " MARK\n", - " \n", - " \n", - " \n", - " verse one \n", - " \n", - " \n", - " verse two\n", - " \n", - " \n", - " \n", - "'''\n", - "usx_f = StringIO(origin_usx_str)\n", - "doc = etree.parse(usx_f)\n", - "if relaxng.validate(doc):\n", - " print(\"valid\")\n", - "else:\n", - " relaxng.assertValid(doc)" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", @@ -368,56 +318,20 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "8d12593b", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "valid\n" - ] - } - ], - "source": [ - "empty_usx_str = '''\n", - " \n", - " \n", - " \n", - " \n", - " verse one\n", - " \n", - " \n", - "\n", - "'''\n", - "usx_f = StringIO(empty_usx_str)\n", - "doc = etree.parse(usx_f)\n", - "if relaxng.validate(doc):\n", - " print(\"valid\")\n", - "else:\n", - " relaxng.assertValid(doc)" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "818e36d9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'(File (book (id (bookcode) (description))) (mtBlock (mt (numberedLevelMax4) (text))) (chapter (c (chapterNumber)) (paragraph (p))))'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "my_parser.toAST()" + 
"my_parser.to_syntax_tree()" ] }, { @@ -445,7 +359,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/python-usfm-parser/requirements.txt b/python-usfm-parser/requirements.txt index 899b528b..79b39f15 100644 --- a/python-usfm-parser/requirements.txt +++ b/python-usfm-parser/requirements.txt @@ -1,2 +1,4 @@ tree-sitter==0.20.0 -jupyterlab==3.3.2 +jupyterlab==3.4.4 +rnc2rng==2.6.6 +lxml==4.9.1 diff --git a/python-usfm-parser/usfm_grammar.py b/python-usfm-parser/usfm_grammar.py index 9c3dc1db..be4ea843 100644 --- a/python-usfm-parser/usfm_grammar.py +++ b/python-usfm-parser/usfm_grammar.py @@ -2,8 +2,7 @@ import json from enum import Enum from tree_sitter import Language, Parser -import xml.etree.ElementTree as ET -from xml.dom import minidom +from lxml import etree class Filter(str, Enum): ALL = "all" @@ -15,7 +14,7 @@ class Filter(str, Enum): class Format(str, Enum): JSON = "json" CSV = "table" - AST = "syntax-tree" + ST = "syntax-tree" USX = "usx" MD = "markdown" @@ -58,7 +57,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): code = usfm_bytes[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8') elif tupl[1] == 'desc': desc = usfm_bytes[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8') - book_xml_node = ET.SubElement(parent_xml_node, "book") + book_xml_node = etree.SubElement(parent_xml_node, "book") book_xml_node.set("code", code) book_xml_node.set("style", "id") if desc is not None and desc.strip() != "": @@ -67,7 +66,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): chap_cap = USFM_LANGUAGE.query('''(c (chapterNumber) @chap-num)''').captures(node)[0] chap_num = usfm_bytes[chap_cap[0].start_byte:chap_cap[0].end_byte].decode('utf-8') ref = parent_xml_node.find("book").attrib['code']+" "+chap_num - chap_xml_node = ET.SubElement(parent_xml_node, "chapter") + chap_xml_node = etree.SubElement(parent_xml_node, "chapter") chap_xml_node.set("number", chap_num) chap_xml_node.set("style", "c") chap_xml_node.set("sid", ref) @@ -77,19 +76,19 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): prev_verses = xml_root_node.findall(".//verse") if len(prev_verses)>0: if "sid" in prev_verses[-1].attrib: - v_end_xml_node = ET.SubElement(parent_xml_node, "verse") + v_end_xml_node = etree.SubElement(parent_xml_node, "verse") v_end_xml_node.set('eid', prev_verses[-1].get('sid')) - chap_end_xml_node = ET.SubElement(parent_xml_node, "chapter") + chap_end_xml_node = etree.SubElement(parent_xml_node, "chapter") chap_end_xml_node.set("eid", ref) elif node.type == "v": prev_verses = xml_root_node.findall(".//verse") if len(prev_verses)>0: if "sid" in prev_verses[-1].attrib: - v_end_xml_node = ET.SubElement(parent_xml_node, "verse") + v_end_xml_node = etree.SubElement(parent_xml_node, "verse") v_end_xml_node.set('eid', prev_verses[-1].get('sid')) verse_num_cap = USFM_LANGUAGE.query("(v (verseNumber) @vnum)").captures(node)[0] verse_num = usfm_bytes[verse_num_cap[0].start_byte:verse_num_cap[0].end_byte].decode('utf-8') - v_xml_node = ET.SubElement(parent_xml_node, "verse") + v_xml_node = etree.SubElement(parent_xml_node, "verse") ref = xml_root_node.findall('.//chapter')[-1].get('sid')+ ":"+ verse_num v_xml_node.set('number', verse_num.strip()) v_xml_node.set('sid', ref.strip()) @@ -99,14 +98,14 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): elif node.type == 'paragraph': para_tag_cap = USFM_LANGUAGE.query("(paragraph (_) 
@para-marker)").captures(node)[0] para_marker = para_tag_cap[0].type - para_xml_node = ET.SubElement(parent_xml_node, "para") + para_xml_node = etree.SubElement(parent_xml_node, "para") para_xml_node.set("style", para_marker) for child in para_tag_cap[0].children[1:]: node_2_usx(child, usfm_bytes, para_xml_node, xml_root_node) elif node.type in NOTE_MARKERS: tag_node = node.children[0] caller_node = node.children[1] - note_xml_node = ET.SubElement(parent_xml_node, "note") + note_xml_node = etree.SubElement(parent_xml_node, "note") note_xml_node.set("style", usfm_bytes[tag_node.start_byte:tag_node.end_byte].decode('utf-8') .replace("\\","").strip()) @@ -121,7 +120,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): if node.children[-1].type.startswith('\\'): closing_node = node.children[-1] children_range = children_range-1 - char_xml_node = ET.SubElement(parent_xml_node, "char") + char_xml_node = etree.SubElement(parent_xml_node, "char") char_xml_node.set("style", usfm_bytes[tag_node.start_byte:tag_node.end_byte].decode('utf-8') .replace("\\","").strip()) @@ -150,11 +149,11 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): else: parent_xml_node.text = text_val elif node.type == "table": - table_xml_node = ET.SubElement(parent_xml_node, "table") + table_xml_node = etree.SubElement(parent_xml_node, "table") for child in node.children: node_2_usx(child, usfm_bytes, table_xml_node, xml_root_node) elif node.type == "tr": - row_xml_node = ET.SubElement(parent_xml_node, "row") + row_xml_node = etree.SubElement(parent_xml_node, "row") row_xml_node.set("style", "tr") for child in node.children[1:]: node_2_usx(child, usfm_bytes, row_xml_node, xml_root_node) @@ -162,7 +161,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): tag_node = node.children[0] style = usfm_bytes[tag_node.start_byte:tag_node.end_byte].decode('utf-8')\ .replace("\\","").strip() - cell_xml_node = ET.SubElement(parent_xml_node, "cell") + cell_xml_node = etree.SubElement(parent_xml_node, "cell") cell_xml_node.set("style", style) if "r" in style: cell_xml_node.set("align", "end") @@ -179,14 +178,14 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): ] @ms-name)''').captures(node)[0] style = usfm_bytes[ms_name_cap[0].start_byte:ms_name_cap[0].end_byte].decode('utf-8')\ .replace("\\","").strip() - ms_xml_node = ET.SubElement(parent_xml_node, "ms") + ms_xml_node = etree.SubElement(parent_xml_node, "ms") ms_xml_node.set('style', style) for child in node.children: if child.type.endswith("Attribute"): node_2_usx(child, usfm_bytes, ms_xml_node, xml_root_node) elif node.type == "esb": style = "esb" - sidebar_xml_node = ET.SubElement(parent_xml_node, "sidebar") + sidebar_xml_node = etree.SubElement(parent_xml_node, "sidebar") sidebar_xml_node.set("style", style) for child in node.children[1:-1]: node_2_usx(child, usfm_bytes, sidebar_xml_node, xml_root_node) @@ -195,12 +194,12 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node): category = usfm_bytes[cat_cap[0].start_byte:cat_cap[0].end_byte].decode('utf-8').strip() parent_xml_node.set('category', category) elif node.type == 'fig': - fig_xml_node = ET.SubElement(parent_xml_node, "figure") + fig_xml_node = etree.SubElement(parent_xml_node, "figure") fig_xml_node.set("style", 'fig') for child in node.children[1:-1]: node_2_usx(child, usfm_bytes, fig_xml_node, xml_root_node) elif node.type == 'b': - break_xml_node = ET.SubElement(parent_xml_node, "optbreak") + break_xml_node = 
etree.SubElement(parent_xml_node, "optbreak")
     elif (node.type in PARA_STYLE_MARKERS or
         node.type.replace("\\","").strip() in PARA_STYLE_MARKERS):
         tag_node = node.children[0]
@@ -215,7 +214,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
                 num = usfm_bytes[num_node.start_byte:num_node.end_byte].decode('utf-8')
                 style += num
                 children_range_start = 2
-        para_xml_node = ET.SubElement(parent_xml_node, "para")
+        para_xml_node = etree.SubElement(parent_xml_node, "para")
         para_xml_node.set("style", style)
         # caps = USFM_LANGUAGE.query('((text) @inner-text)').captures(node)
         # para_xml_node.text = " ".join([usfm_bytes[txt_cap[0].start_byte:txt_cap[0].end_byte].decode('utf-8').strip()
@@ -236,7 +235,7 @@ def node_2_usx(node, usfm_bytes, parent_xml_node, xml_root_node):
 
 def node_2_dict(node, usfm_bytes):
-    '''recursive function converting an AST node and its children to dictionary'''
+    '''recursive function converting a syntax tree node and its children to dictionary'''
     if len(node.children)>0:
         item = []
         for child in node.children:
@@ -280,184 +279,189 @@ def node_2_dict(node, usfm_bytes):
 
 title_query = USFM_LANGUAGE.query("""(title) @title""")
 
 class USFMParser():
-    """Parser class with usfmstring, AST and methods for JSON convertions"""
+    """Parser class with usfmstring, syntax_tree and methods for JSON conversions"""
     def __init__(self, usfm_string):
         # super(USFMParser, self).__init__()
-        self.USFM = usfm_string
-        self.USFMbytes = None
-        self.AST = None
+        self.usfm = usfm_string
+        self.usfm_bytes = None
+        self.syntax_tree = None
         self.errors = None
 
-        self.USFMbytes = bytes(self.USFM, "utf8")
-        tree = parser.parse(self.USFMbytes)
-        self.AST = tree.root_node
+        self.usfm_bytes = bytes(self.usfm, "utf8")
+        tree = parser.parse(self.usfm_bytes)
+        self.syntax_tree = tree.root_node
         # check for errors in the parse tree and raise them
-        errors = error_query.captures(self.AST)
+        errors = error_query.captures(self.syntax_tree)
         if len(errors) > 0:
-            self.errors = [(f"At {err[0].start_point}", self.USFMbytes[err[0].start_byte:err[0].end_byte].decode('utf-8'))
+            self.errors = [(f"At {err[0].start_point}", self.usfm_bytes[err[0].start_byte:err[0].end_byte].decode('utf-8'))
                             for err in errors]
 
-    def toAST(self):
-        return self.AST.sexp()
+    def to_syntax_tree(self):
+        return self.syntax_tree.sexp()
 
-    def toDict(self, filt=Filter.SCRIPTURE_BCV.value):
+    def to_dict(self, filt=Filter.SCRIPTURE_BCV.value):
         if filt in [Filter.SCRIPTURE_BCV.value, Filter.NOTES.value, Filter.NOTES_TEXT.value,
             Filter.SCRIPTURE_PARAGRAPHS.value, None]:
             dict_output = {}
-            captures = bookcode_query.captures(self.AST)
+            captures = bookcode_query.captures(self.syntax_tree)
             cap = captures[0]
-            dict_output['book'] = {'bookcode': self.USFMbytes[cap[0].start_byte:cap[0].end_byte].decode('utf-8')}
+            dict_output['book'] = {'bookcode': self.usfm_bytes[cap[0].start_byte:cap[0].end_byte].decode('utf-8')}
             dict_output['book']['chapters'] = []
-            captures = chapter_query.captures(self.AST)
+            captures = chapter_query.captures(self.syntax_tree)
             for cap in captures:
                 chap_captures = chapternum_query.captures(cap[0])
                 ccap= chap_captures[0]
                 dict_output['book']['chapters'].append({"chapterNumber":
-                    self.USFMbytes[ccap[0].start_byte:ccap[0].end_byte].decode('utf-8'),
+                    self.usfm_bytes[ccap[0].start_byte:ccap[0].end_byte].decode('utf-8'),
                     "contents":[]})
-                if filt in [Filter.SCRIPTURE_BCV.value, None]:
-                    '''query for just the chapter, verse and text nodes from the AST'''
-                    versenum_captures = versenum_query.captures(cap[0])
-                    versetext_captures =
versetext_query.captures(cap[0]) - combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures} - sorted_combined = [combined[i] for i in sorted(combined)] - for vcap in sorted_combined: - if vcap[1] == "verse": - dict_output['book']['chapters'][-1]["contents"].append( - {"verseNumber":self.USFMbytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(), - "verseText":""}) - elif vcap[1] == "verse-text": - text_captures = text_query.captures(vcap[0]) - text_val = "".join([self.USFMbytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ") - for tcap in text_captures]) - dict_output['book']['chapters'][-1]['contents'][-1]['verseText'] += text_val - elif filt in [Filter.NOTES.value, Filter.NOTES_TEXT.value]: - '''query for just the chapter, verse and text nodes from the AST''' - versenum_captures = versenum_query.captures(cap[0]) - notes_captures = notes_query.captures(cap[0]) - if len(notes_captures) == 0: - continue - combined = {item[0].start_byte: item for item in versenum_captures+notes_captures} - sorted_combined = [combined[i] for i in sorted(combined)] - for index,vcap in enumerate(sorted_combined): - if vcap[1] == "verse" and \ - index+1 !=len(sorted_combined) and sorted_combined[index+1][1] =="note": - '''need to add a verse only if it has notes''' - dict_output['book']['chapters'][-1]["contents"].append( - {"verseNumber":self.USFMbytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(), - "notes":[]}) - elif vcap[1] == "note": - note_type = vcap[0].type - if filt == Filter.NOTES.value: - note_details = node_2_dict(vcap[0], self.USFMbytes) - elif filt == Filter.NOTES_TEXT.value: - notetext_captures = notestext_query.captures(vcap[0]) - note_details = "|".join([self.USFMbytes[ncap[0].start_byte:ncap[0].end_byte].decode('utf-8').strip().replace("\n","") for ncap in notetext_captures]) - dict_output['book']['chapters'][-1]['contents'][-1]['notes'].append({note_type: note_details}) - elif filt in [Filter.SCRIPTURE_PARAGRAPHS.value]: - '''titles and section information, paragraph breaks - and also structuring like lists and tables - along with verse text and versenumber details at the lowest level''' - title_captures = title_query.captures(cap[0]) - para_captures = para_query.captures(cap[0]) - combined_tit_paras = {item[0].start_byte: item for item in title_captures+para_captures} - sorted_tit_paras = [combined_tit_paras[i] for i in sorted(combined_tit_paras)] - for comp in sorted_tit_paras: - if comp[1] == "title": - text_captures = text_query.captures(comp[0]) - title_texts = [] - for tcap in text_captures: - title_texts.append(self.USFMbytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8')) - dict_output['book']['chapters'][-1]['contents'].append( - {"title":" ".join(title_texts).strip()}) - elif comp[1] == "para": - comp_type = comp[0].type - versenum_captures = versenum_query.captures(comp[0]) - versetext_captures = versetext_query.captures(comp[0]) - combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures} - sorted_combined = [combined[i] for i in sorted(combined)] - inner_contents = [] - for vcap in sorted_combined: - if vcap[1] == "verse": - inner_contents.append( - {"verseNumber":self.USFMbytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(), + match filt: + case Filter.SCRIPTURE_BCV.value | None: + '''query for just the chapter, verse and text nodes from the syntax_tree''' + versenum_captures = versenum_query.captures(cap[0]) + versetext_captures = 
versetext_query.captures(cap[0]) + combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures} + sorted_combined = [combined[i] for i in sorted(combined)] + for vcap in sorted_combined: + match vcap: + case (vnode, "verse"): + dict_output['book']['chapters'][-1]["contents"].append( + {"verseNumber":self.usfm_bytes[vnode.start_byte:vnode.end_byte].decode('utf-8').strip(), "verseText":""}) - elif vcap[1] == "verse-text": - text_captures = text_query.captures(vcap[0]) - text_val = "".join([self.USFMbytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ") + case (vnode, "verse-text"): + text_captures = text_query.captures(vnode) + text_val = "".join([self.usfm_bytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ") for tcap in text_captures]) - if len(inner_contents) == 0: - inner_contents.append({"verseText":""}) - inner_contents[-1]['verseText'] += text_val - - dict_output['book']['chapters'][-1]["contents"].append({comp_type:inner_contents}) + dict_output['book']['chapters'][-1]['contents'][-1]['verseText'] += text_val + case Filter.NOTES.value | Filter.NOTES_TEXT.value: + '''query for just the chapter, verse and text nodes from the syntax_tree''' + versenum_captures = versenum_query.captures(cap[0]) + notes_captures = notes_query.captures(cap[0]) + if len(notes_captures) == 0: + continue + combined = {item[0].start_byte: item for item in versenum_captures+notes_captures} + sorted_combined = [combined[i] for i in sorted(combined)] + for index,vcap in enumerate(sorted_combined): + if vcap[1] == "verse" and \ + index+1 !=len(sorted_combined) and sorted_combined[index+1][1] =="note": + '''need to add a verse only if it has notes''' + dict_output['book']['chapters'][-1]["contents"].append( + {"verseNumber":self.usfm_bytes[vcap[0].start_byte:vcap[0].end_byte].decode('utf-8').strip(), + "notes":[]}) + elif vcap[1] == "note": + note_type = vcap[0].type + if filt == Filter.NOTES.value: + note_details = node_2_dict(vcap[0], self.usfm_bytes) + elif filt == Filter.NOTES_TEXT.value: + notetext_captures = notestext_query.captures(vcap[0]) + note_details = "|".join([self.usfm_bytes[ncap[0].start_byte:ncap[0].end_byte].decode('utf-8').strip().replace("\n","") for ncap in notetext_captures]) + dict_output['book']['chapters'][-1]['contents'][-1]['notes'].append({note_type: note_details}) + case Filter.SCRIPTURE_PARAGRAPHS.value: + '''titles and section information, paragraph breaks + and also structuring like lists and tables + along with verse text and versenumber details at the lowest level''' + title_captures = title_query.captures(cap[0]) + para_captures = para_query.captures(cap[0]) + combined_tit_paras = {item[0].start_byte: item for item in title_captures+para_captures} + sorted_tit_paras = [combined_tit_paras[i] for i in sorted(combined_tit_paras)] + for comp in sorted_tit_paras: + match comp: + case (comp_node, "title"): + text_captures = text_query.captures(comp_node) + title_texts = [] + for tcap in text_captures: + title_texts.append(self.usfm_bytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8')) + dict_output['book']['chapters'][-1]['contents'].append( + {"title":" ".join(title_texts).strip()}) + case (comp_node, "para"): + comp_type = comp_node.type + versenum_captures = versenum_query.captures(comp_node) + versetext_captures = versetext_query.captures(comp_node) + combined = {item[0].start_byte: item for item in versenum_captures+versetext_captures} + sorted_combined = [combined[i] for i in sorted(combined)] + 
inner_contents = []
+                            for vcap in sorted_combined:
+                                match vcap:
+                                    case (vnode, "verse"):
+                                        inner_contents.append(
+                                            {"verseNumber":self.usfm_bytes[vnode.start_byte:vnode.end_byte].decode('utf-8').strip(),
+                                            "verseText":""})
+                                    case (vnode, "verse-text"):
+                                        text_captures = text_query.captures(vnode)
+                                        text_val = "".join([self.usfm_bytes[tcap[0].start_byte:tcap[0].end_byte].decode('utf-8').replace("\n", " ")
+                                                                    for tcap in text_captures])
+                                        if len(inner_contents) == 0:
+                                            inner_contents.append({"verseText":""})
+                                        inner_contents[-1]['verseText'] += text_val
+
+                    dict_output['book']['chapters'][-1]["contents"].append({comp_type:inner_contents})
         return dict_output
     elif filt == Filter.ALL.value:
-            '''directly converts the AST to JSON/dict'''
-            return node_2_dict(self.AST, self.USFMbytes)
+            '''directly converts the syntax_tree to JSON/dict'''
+            return node_2_dict(self.syntax_tree, self.usfm_bytes)
     else:
         raise Exception(f"This filter option, {filt}, is yet to be implemented")
 
-    def toTable(self, filt=Filter.SCRIPTURE_BCV.value):
+    def to_list(self, filt=Filter.SCRIPTURE_BCV.value):
        '''uses the to_dict function and converts the JSON to a table (list of rows)'''
        match filt:
            case Filter.SCRIPTURE_BCV.value | None:
                scripture_json = self.to_dict(Filter.SCRIPTURE_BCV.value)
                table_output = [["Book","Chapter","Verse","Text"]]
                book = scripture_json['book']['bookcode']
                for chap in scripture_json['book']['chapters']:
                    chapter = chap['chapterNumber']
                    for verse in chap['contents']:
                        row = [book, chapter, verse['verseNumber'], '"'+verse['verseText']+'"']
                        table_output.append(row)
                return table_output
            case Filter.NOTES.value:
                notes_json = self.to_dict(Filter.NOTES_TEXT.value)
                table_output = [["Book","Chapter","Verse","Type", "Note"]]
                book = notes_json['book']['bookcode']
                for chap in notes_json['book']['chapters']:
                    chapter = chap['chapterNumber']
                    for verse in chap['contents']:
                        v_num = verse['verseNumber']
                        for note in verse['notes']:
                            typ = list(note)[0]
                            row = [book, chapter, v_num, typ,
'"'+note[typ]+'"'] + table_output.append(row) + return table_output + case Filter.SCRIPTURE_PARAGRAPHS.value: + notes_json = self.to_dict(Filter.SCRIPTURE_PARAGRAPHS.value) + table_output = [["Book","Chapter","Type", "Contents"]] + book = notes_json['book']['bookcode'] + for chap in notes_json['book']['chapters']: + chapter = chap['chapterNumber'] + for comp in chap['contents']: + typ = list(comp)[0] + if typ == "title": + cont = comp[typ] + else: + inner_cont = [] + for inner_comp in comp[typ]: + inner_cont += list(inner_comp.values()) + cont = ' '.join(inner_cont) + row = [book, chapter, typ, cont] + table_output.append(row) + return table_output - else: - raise Exception(f"This filter option, {filt}, is yet to be implemeneted") + case _: + raise Exception(f"This filter option, {filt}, is yet to be implemeneted") - def toMarkDown(self, filt=Filter.SCRIPTURE_PARAGRAPHS.value): + def to_markdown(self, filt=Filter.SCRIPTURE_PARAGRAPHS.value): '''query for chapter, paragraph, text structure''' return "yet to be implemeneted" - def toUSX(self, filt=Filter.ALL): - '''convert the AST to the XML format USX''' - usx_root = ET.Element("usx") + def to_usx(self, filt=Filter.ALL): + '''convert the syntax_tree to the XML format USX''' + usx_root = etree.Element("usx") usx_root.set("version", "3.0") - node_2_usx(self.AST, self.USFMbytes, usx_root, usx_root) + node_2_usx(self.syntax_tree, self.usfm_bytes, usx_root, usx_root) return usx_root if __name__ == '__main__': @@ -466,10 +470,11 @@ def toUSX(self, filt=Filter.ALL): arg_parser.add_argument('infile', type=str, help='input usfm file') arg_parser.add_argument('--format', type=str, help='output format', choices=[Format.JSON.value, Format.CSV.value, Format.USX.value, - Format.MD.value, Format.AST.value], + Format.MD.value, Format.ST.value], default=Format.JSON.value) arg_parser.add_argument('--filter', type=str, help='the type of contents to be included', - choices=[Filter.SCRIPTURE_BCV.value, Filter.NOTES.value, Filter.SCRIPTURE_PARAGRAPHS.value]) + choices=[Filter.SCRIPTURE_BCV.value, Filter.NOTES.value, + Filter.SCRIPTURE_PARAGRAPHS.value, Filter.ALL.value]) arg_parser.add_argument('--csv_col_sep', type=str, help="column separator or delimiter. Only useful with format=table.", default="\t") arg_parser.add_argument('--csv_row_sep', type=str, help="row separator or delimiter. 
Only useful with format=table.",
@@ -490,20 +495,20 @@ def toUSX(self, filt=Filter.ALL):
         err_str = "\n\t".join(my_parser.errors)
         print(f"Errors at:{err_str}")
 
-    if output_format == Format.JSON:
-        dict_output = my_parser.toDict(filt = output_filter)
-        print(json.dumps(dict_output, indent=4, ensure_ascii=False))
-    elif output_format == Format.CSV:
-        table_output = my_parser.toTable(filt = output_filter)
-        print(csv_row_sep.join([csv_col_sep.join(row) for row in table_output]))
-    elif output_format == Format.USX:
-        xmlstr = ET.tostring(my_parser.toUSX(filt = output_filter),encoding="unicode")
-        print(minidom.parseString(xmlstr).toprettyxml(indent=" "))
-    elif output_format == Format.MD:
-        print(my_parser.toMarkDown(filt = output_filter))
-    elif output_format == Format.AST:
-        print(my_parser.toAST())
-    else:
-        raise Exception(f"Un-recognized output format:{output_format}!")
-
-
+    match output_format:
+        case Format.JSON:
+            dict_output = my_parser.to_dict(filt = output_filter)
+            print(json.dumps(dict_output, indent=4, ensure_ascii=False))
+        case Format.CSV:
+            table_output = my_parser.to_list(filt = output_filter)
+            print(csv_row_sep.join([csv_col_sep.join(row) for row in table_output]))
+        case Format.USX:
+            xmlstr = etree.tostring(my_parser.to_usx(filt=output_filter),
+                encoding='unicode', pretty_print=True)
+            print(xmlstr)
+        case Format.MD:
+            print(my_parser.to_markdown(filt = output_filter))
+        case Format.ST:
+            print(my_parser.to_syntax_tree())
+        case _:
+            raise Exception(f"Unrecognized output format: {output_format}!")
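
Usage sketch for review: the snippet below exercises the renamed Python API end to end. The input file name sample.usfm is an assumed placeholder and not part of the patch; USFMParser, Filter, the errors attribute, and the to_* methods are taken from the diff above, and the lxml serialization mirrors the new __main__ block.

    # A minimal sketch, assuming usfm_grammar.py is importable and a
    # sample.usfm file exists (both are assumptions, not part of the patch).
    from lxml import etree
    from usfm_grammar import USFMParser, Filter

    with open("sample.usfm", "r", encoding="utf8") as usfm_file:
        my_parser = USFMParser(usfm_file.read())

    if my_parser.errors:                           # parse errors are collected at init
        print(my_parser.errors)

    print(my_parser.to_syntax_tree())              # formerly toAST()
    print(my_parser.to_dict(Filter.NOTES.value))   # formerly toDict()
    table = my_parser.to_list()                    # formerly toTable(); returns a list of rows
    print("\n".join("\t".join(row) for row in table))

    usx_elem = my_parser.to_usx()                  # formerly toUSX(); now an lxml element
    print(etree.tostring(usx_elem, encoding="unicode", pretty_print=True))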
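The reworked notebook also checks to_usx() output against the USX RelaxNG schema with lxml. A hedged sketch of that validation follows; the schema file name usx.rng is an assumption (rnc2rng, pinned in requirements.txt, can generate it from the .rnc schema published for USX, e.g. rnc2rng usx.rnc usx.rng), and sample.usfm is the same assumed input as above.

    # A sketch under the assumption that a usx.rng schema file exists locally.
    from lxml import etree
    from usfm_grammar import USFMParser

    usx_elem = USFMParser(open("sample.usfm", encoding="utf8").read()).to_usx()

    relaxng = etree.RelaxNG(etree.parse("usx.rng"))   # schema converted from .rnc
    if relaxng.validate(usx_elem):
        print("valid")
    else:
        relaxng.assertValid(usx_elem)                 # raises DocumentInvalid with details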