diff --git a/pyproject.toml b/pyproject.toml index 49b2f37d..fd42a271 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "subprocess.run == 0.0.8", "python-docx >= 1.1.2", "bo-sent-tokenizer @ git+https://github.com/OpenPecha/bo_sent_tokenizer.git", - "fast_antx @ git+https://github.com/OpenPecha/fast-antx.git" + "fast_antx @ git+https://github.com/OpenPecha/fast-antx.git", + "pecha_org_tools @ git+https://github.com/OpenPecha/pecha_org_tools.git" ] diff --git a/src/openpecha/pecha/parsers/google_doc.py b/src/openpecha/pecha/parsers/google_doc.py index 5678f870..924c80fe 100644 --- a/src/openpecha/pecha/parsers/google_doc.py +++ b/src/openpecha/pecha/parsers/google_doc.py @@ -245,7 +245,7 @@ def add_sapche_ann(self, doc: Dict[str, Any], char_count: int): sapche_anns: List[Dict[str, Any]] = [] for doc_style in doc["styles"]: for idx in range(len(doc_style["texts"])): - if doc_style["styles"][idx].color.rgb == RGBColor(0xFF, 0x00, 0xFF): + if doc_style["styles"][idx].color.rgb == RGBColor(0xFF, 0x00, 0x00): match = re.match(r"([\d\.]+)\s", doc_style["texts"][idx]) if match: # Extract sapche number and store the char length to update the previous ann spans @@ -263,17 +263,18 @@ def add_sapche_ann(self, doc: Dict[str, Any], char_count: int): "sapche_number": sapche_number, } ) - else: - start = char_count + inner_char_count - end = start + len(doc_style["texts"][idx]) - sapche_anns.append( - { - LayerEnum.sapche.value: { - "start": start, - "end": end, - } - } - ) + # If the sapche number is not needed, use the following code in future + # else: + # start = char_count + inner_char_count + # end = start + len(doc_style["texts"][idx]) + # sapche_anns.append( + # { + # LayerEnum.sapche.value: { + # "start": start, + # "end": end, + # } + # } + # ) inner_char_count += len(doc_style["texts"][idx]) inner_char_count += 1 # for newline diff --git a/src/openpecha/pecha/serializers/commentary.py b/src/openpecha/pecha/serializers/commentary.py index f64b6c6f..a1e03ca5 100644 --- a/src/openpecha/pecha/serializers/commentary.py +++ b/src/openpecha/pecha/serializers/commentary.py @@ -1,6 +1,9 @@ from pathlib import Path from typing import Any, Dict, Union +from pecha_org_tools.enums import TextType +from pecha_org_tools.extract import CategoryExtractor + from openpecha.pecha import Pecha from openpecha.pecha.layer import LayerEnum from openpecha.utils import get_text_direction_with_lang @@ -8,10 +11,11 @@ class CommentarySerializer: def __init__(self): - self.category = [] - self.book = [] - self.book_content = {} - self.required_metadata = {} + self.source_category = {} + self.target_category = {} + self.source_book = [] + self.target_book = [] + self.sapche_anns = [] self.meaning_segment_anns = [] self.formatted_sapche_anns = {} @@ -25,37 +29,86 @@ def extract_metadata(self): """ assert self.pecha is not None, "Pecha object is not set" pecha_metadata = self.pecha.metadata - title = pecha_metadata.title - lang = pecha_metadata.language - title = title if lang in ["bo", "en"] else f"{title}[{lang}]" - self.required_metadata = { - "title": title, - "language": pecha_metadata.language, + source_title = pecha_metadata.title["en"] + target_title = pecha_metadata.title["bo"] + + source_metadata = { + "title": source_title, + "language": "en", + "versionSource": pecha_metadata.source if pecha_metadata.source else "", + "direction": get_text_direction_with_lang("en"), + "completestatus": "done", + } + + target_metadata = { + "title": target_title, + "language": pecha_metadata.language.value, "versionSource": pecha_metadata.source if pecha_metadata.source else "", "direction": get_text_direction_with_lang(pecha_metadata.language), "completestatus": "done", } - return self.required_metadata + + return source_metadata, target_metadata def set_metadata_to_json(self): """ Set extracted metadata to json format """ - self.extract_metadata() - self.book.append(self.required_metadata) + source_metadata, target_metadata = self.extract_metadata() + self.source_book.append(source_metadata) + self.target_book.append(target_metadata) - def get_category(self, title: str): + def get_category(self, category_name: str): """ Input: title: Title of the pecha commentary which will be used to get the category format Process: Get the category format from the pecha.org categorizer package """ - pass + assert self.pecha is not None, "Pecha object is not set" + + if isinstance(self.pecha.metadata.title, dict): + bo_title = self.pecha.metadata.title.get("bo", "") + en_title = self.pecha.metadata.title.get("en", "") + + elif isinstance(self.pecha.metadata.title, list): + bo_title = self.pecha.metadata.title[0] + en_title = self.pecha.metadata.title[1] + + else: + bo_title = self.pecha.metadata.title + en_title = self.pecha.metadata.title + + heDesc = self.pecha.metadata.source_metadata.get("heDesc", "") + heShortDesc = self.pecha.metadata.source_metadata.get("heShortDesc", "") + + enDesc = self.pecha.metadata.source_metadata.get("enDesc", "") + enShortDesc = self.pecha.metadata.source_metadata.get("enShortDesc", "") + + pecha_metadata = { + "bo": { + "title": bo_title, + "heDesc": heDesc, + "heShortDesc": heShortDesc, + }, + "en": { + "title": en_title, + "enDesc": enDesc, + "enShortDesc": enShortDesc, + }, + } + + categorizer = CategoryExtractor() + category_json = categorizer.get_category( + category_name, pecha_metadata, TextType.COMMENTARY + ) + return category_json - def set_category_to_json(self): + def set_category_to_json(self, category_name: str): """ Set the category format to self.category attribute """ - self.get_category(self.required_metadata["title"]) + category_json = self.get_category(category_name) + self.source_category = category_json["en"] + self.target_category = category_json["bo"] pass def get_sapche_anns(self): @@ -218,7 +271,7 @@ def get_text_related_to_sapche(self): ) sapche_ann["meaning_segments"].append(formatted_meaning_segment_ann) - def serialize(self, pecha_path: Path, title: str): + def serialize(self, pecha_path: Path, category_name: str): """ Serialize the commentary pecha to json format """ @@ -226,6 +279,14 @@ def serialize(self, pecha_path: Path, title: str): self.pecha = Pecha.from_path(pecha_path) self.set_metadata_to_json() - self.set_category_to_json() + self.set_category_to_json(category_name) formatted_sapche_ann = self.format_sapche_anns() - return formatted_sapche_ann + + self.source_book[0]["content"] = {} + self.target_book[0]["content"] = formatted_sapche_ann + + serialized_json = { + "source": {"categories": self.source_category, "book": self.source_book}, + "target": {"categories": self.target_category, "book": self.target_book}, + } + return serialized_json diff --git a/test.py b/test.py deleted file mode 100644 index dcce6059..00000000 --- a/test.py +++ /dev/null @@ -1,14 +0,0 @@ -from stam import AnnotationStore - -store = AnnotationStore( - file="tests/pecha/serializers/pecha_db/commentary/data/IC3797777/layers/0301/Sapche-C111.json" -) - - -for ann in store: - ann_data = {} - print(str(ann)) - for data in ann: - ann_data[data.key().id()] = str(data.value()) - - print(ann_data) diff --git "a/tests/pecha/parser/google_doc/data/commentary_with_sapche/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213commentary.docx" "b/tests/pecha/parser/google_doc/data/commentary_with_sapche/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213_commentary.docx" similarity index 99% rename from "tests/pecha/parser/google_doc/data/commentary_with_sapche/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213commentary.docx" rename to "tests/pecha/parser/google_doc/data/commentary_with_sapche/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213_commentary.docx" index 1ee68921..3204c3cb 100644 Binary files "a/tests/pecha/parser/google_doc/data/commentary_with_sapche/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213commentary.docx" and "b/tests/pecha/parser/google_doc/data/commentary_with_sapche/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213_commentary.docx" differ diff --git a/tests/pecha/parser/google_doc/test_google_doc.py b/tests/pecha/parser/google_doc/test_google_doc.py index 036dd344..23f62db2 100644 --- a/tests/pecha/parser/google_doc/test_google_doc.py +++ b/tests/pecha/parser/google_doc/test_google_doc.py @@ -73,7 +73,7 @@ def test_parser_on_commentary_text(): def test_parser_on_commentary_with_sapche(): data = Path(__file__).parent / "data" - input = data / "commentary_with_sapche/རྡོ་རྗེ་གཅོད་པ་commentary.docx" + input = data / "commentary_with_sapche/རྡོ་རྗེ་གཅོད་པ་_commentary.docx" metadata = read_json(data / "commentary_with_sapche/metadata.json") parser = GoogleDocParser( @@ -83,11 +83,11 @@ def test_parser_on_commentary_with_sapche(): output_path.mkdir(parents=True, exist_ok=True) parser.parse(input, metadata, output_path) expected_sapche_anns = [ - {"Sapche": {"start": 101, "end": 123}}, - {"Sapche": {"start": 124, "end": 165}, "sapche_number": "1.1."}, - {"Sapche": {"start": 251, "end": 268}}, - {"Sapche": {"start": 269, "end": 309}, "sapche_number": "2.1."}, - {"Sapche": {"start": 474, "end": 552}}, + {"Sapche": {"start": 102, "end": 124}, "sapche_number": "1."}, + {"Sapche": {"start": 126, "end": 166}, "sapche_number": "1.1."}, + {"Sapche": {"start": 252, "end": 283}, "sapche_number": "1.2."}, + {"Sapche": {"start": 541, "end": 558}, "sapche_number": "2."}, + {"Sapche": {"start": 560, "end": 560}, "sapche_number": "2.1."}, ] assert parser.sapche_anns == expected_sapche_anns diff --git a/tests/pecha/serializers/pecha_db/commentary/data/IC3797777/metadata.json b/tests/pecha/serializers/pecha_db/commentary/data/IC3797777/metadata.json index c62f97a2..adde0bbd 100644 --- a/tests/pecha/serializers/pecha_db/commentary/data/IC3797777/metadata.json +++ b/tests/pecha/serializers/pecha_db/commentary/data/IC3797777/metadata.json @@ -1,14 +1,21 @@ { - "id": "IC3797777", - "title": "Dummy Title", - "author": "Dummy Author", - "imported": "2024-11-21T15:37:01.183088", - "source": "Dummy Source", + "id": "I15BEA1DE", + "title": { + "bo": "རྡོ་རྗེ་གཅོད་པ།", + "en": "Vajra Cutter" + }, + "author": "སྟོན་པ་བཅོམ་ལྡན་འདས།", + "imported": "2024-11-28T10:42:25.426868", + "source": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747", "toolkit_version": "0.0.1", "parser": "GoogleDocParser", "initial_creation_type": "ebook", "language": "bo", "source_metadata": { + "heDesc": "", + "heShortDesc": "", + "enDesc": "", + "enShortDesc": "", "root_path": "opf_id/layers/basename/layer_file.json" }, "bases": [], diff --git a/tests/pecha/serializers/pecha_db/commentary/data/commentary_serialized.json b/tests/pecha/serializers/pecha_db/commentary/data/commentary_serialized.json new file mode 100644 index 00000000..ad2d8bd5 --- /dev/null +++ b/tests/pecha/serializers/pecha_db/commentary/data/commentary_serialized.json @@ -0,0 +1,88 @@ +{ + "source": { + "categories": [ + { + "name": "The Buddha's Teachings", + "enDesc": "", + "enShortDesc": "" + }, + { + "name": "Vajra Cutter", + "enDesc": "", + "enShortDesc": "" + }, + { + "name": "Commentaries", + "enDesc": "", + "enShortDesc": "" + }, + { + "name": "Vajra Cutter", + "enDesc": "", + "enShortDesc": "" + } + ], + "book": [ + { + "title": "Vajra Cutter", + "language": "en", + "versionSource": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747", + "direction": "ltr", + "completestatus": "done", + "content": {} + } + ] + }, + "target": { + "categories": [ + { + "name": "སངས་རྒྱས་ཀྱི་བཀའ།", + "heDesc": "", + "heShortDesc": "" + }, + { + "name": "རྡོ་རྗེ་གཅོད་པ།", + "heDesc": "", + "heShortDesc": "" + }, + { + "name": "འགྲེལ་པ།", + "heDesc": "", + "heShortDesc": "" + }, + { + "name": "རྡོ་རྗེ་གཅོད་པ།", + "heDesc": "", + "heShortDesc": "" + } + ], + "book": [ + { + "title": "རྡོ་རྗེ་གཅོད་པ།", + "language": "bo", + "versionSource": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747", + "direction": "ltr", + "completestatus": "done", + "content": { + "མདོའི་ལུས་ཀྱི་འགྲེལ་པ།": { + "data": [], + "སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": { + "data": [ + "<1><1>རྒྱ་གར་སྐད་དུ། །ཨཱརྱ་བྷ་ག་བ་ཏཱི་པྲཛྙཱ་པཱ་ར་མི་ཏཱ་བཛྲ་ཙྪེ་དི་ཀཱ་ཡཱཿསཔྟ་དཱརྠ་ཊཱི་ཀཱ།" + ] + } + }, + "ཚིག་གི་དོན་བཤད་པ།": { + "data": [], + "སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": { + "data": [ + "ཚིག་གི་དོན་བཤད་པ།\nསངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།", + "མཚུངས་མེད་སངས་རྒྱས་ཆོས་རྣམས་སྐྱེད་མཛད་ལ། །\nགང་ཞིག་ཆོས་དབྱིངས་གསོ་བའི་མ་མ་སྟེ། །\nདབྱེ་དཀའི་རྡོ་རྗེ་གཞན་དོན་གྲུབ་གང་ཡིན། །\nགང་ཞིག་བཟུང་བས་འཕགས་ཀུན་སྐྱེད་པའམ་ཡིན། །" + ] + } + } + } + } + ] + } +} \ No newline at end of file diff --git a/tests/pecha/serializers/pecha_db/commentary/data/metadata.json b/tests/pecha/serializers/pecha_db/commentary/data/metadata.json new file mode 100644 index 00000000..fc9ff10d --- /dev/null +++ b/tests/pecha/serializers/pecha_db/commentary/data/metadata.json @@ -0,0 +1,11 @@ +{ + "title": {"bo": "རྡོ་རྗེ་གཅོད་པ།", "en": "Vajra Cutter"}, + "author": "སྟོན་པ་བཅོམ་ལྡན་འདས།", + "source": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747", + "initial_creation_type": "ebook", + "language": "bo", + "heDesc": "", + "heShortDesc": "", + "enDesc": "", + "enShortDesc": "" +} diff --git "a/tests/pecha/serializers/pecha_db/commentary/data/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213commentary.docx" "b/tests/pecha/serializers/pecha_db/commentary/data/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213commentary.docx" new file mode 100644 index 00000000..a27fd1a3 Binary files /dev/null and "b/tests/pecha/serializers/pecha_db/commentary/data/\340\275\242\340\276\241\340\275\274\340\274\213\340\275\242\340\276\227\340\275\272\340\274\213\340\275\202\340\275\205\340\275\274\340\275\221\340\274\213\340\275\224\340\274\213commentary.docx" differ diff --git a/tests/pecha/serializers/pecha_db/commentary/test_commentary_serializer.py b/tests/pecha/serializers/pecha_db/commentary/test_commentary_serializer.py index af48d27c..14cc79d5 100644 --- a/tests/pecha/serializers/pecha_db/commentary/test_commentary_serializer.py +++ b/tests/pecha/serializers/pecha_db/commentary/test_commentary_serializer.py @@ -1,31 +1,35 @@ from pathlib import Path +from unittest.mock import patch from openpecha.pecha.serializers.commentary import CommentarySerializer +from openpecha.utils import read_json def test_commentary_serializer(): DATA_DIR = Path(__file__).parent / "data" pecha_path = DATA_DIR / "IC3797777" - serializer = CommentarySerializer() - formatted_sapche_anns = serializer.serialize(pecha_path, title="test") - expected_formatted_sapche_anns = { - "མདོའི་ལུས་ཀྱི་འགྲེལ་པ།": { - "data": [], - "སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": { - "data": [ - "<1><1>རྒྱ་གར་སྐད་དུ། །ཨཱརྱ་བྷ་ག་བ་ཏཱི་པྲཛྙཱ་པཱ་ར་མི་ཏཱ་བཛྲ་ཙྪེ་དི་ཀཱ་ཡཱཿསཔྟ་དཱརྠ་ཊཱི་ཀཱ།" - ] - }, - }, - "ཚིག་གི་དོན་བཤད་པ།": { - "data": [], - "སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": { - "data": [ - "ཚིག་གི་དོན་བཤད་པ།\nསངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།", - "མཚུངས་མེད་སངས་རྒྱས་ཆོས་རྣམས་སྐྱེད་མཛད་ལ། །\nགང་ཞིག་ཆོས་དབྱིངས་གསོ་བའི་མ་མ་སྟེ། །\nདབྱེ་དཀའི་རྡོ་རྗེ་གཞན་དོན་གྲུབ་གང་ཡིན། །\nགང་ཞིག་བཟུང་བས་འཕགས་ཀུན་སྐྱེད་པའམ་ཡིན། །", - ] - }, - }, - } - assert formatted_sapche_anns == expected_formatted_sapche_anns + # Patch the `get_category` method in `CategoryExtractor` to return a custom value + with patch( + "pecha_org_tools.extract.CategoryExtractor.get_category" + ) as mock_get_category: + mock_get_category.return_value = { + "bo": [ + {"name": "སངས་རྒྱས་ཀྱི་བཀའ།", "heDesc": "", "heShortDesc": ""}, + {"name": "རྡོ་རྗེ་གཅོད་པ།", "heDesc": "", "heShortDesc": ""}, + {"name": "འགྲེལ་པ།", "heDesc": "", "heShortDesc": ""}, + {"name": "རྡོ་རྗེ་གཅོད་པ།", "heDesc": "", "heShortDesc": ""}, + ], + "en": [ + {"name": "The Buddha's Teachings", "enDesc": "", "enShortDesc": ""}, + {"name": "Vajra Cutter", "enDesc": "", "enShortDesc": ""}, + {"name": "Commentaries", "enDesc": "", "enShortDesc": ""}, + {"name": "Vajra Cutter", "enDesc": "", "enShortDesc": ""}, + ], + } + + serializer = CommentarySerializer() + serialized_json = serializer.serialize(pecha_path, "རྡོ་རྗེ་གཅོད་པ།") + + expected_serialized_json = read_json(DATA_DIR / "commentary_serialized.json") + assert serialized_json == expected_serialized_json