OpenPecha · ta4tsering · Dec 2, 2024 · Nov 28, 2024 · Nov 28, 2024 · Nov 28, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,8 @@ dependencies = [
   "subprocess.run == 0.0.8",
   "python-docx >= 1.1.2",
   "bo-sent-tokenizer @ git+https://github.com/OpenPecha/bo_sent_tokenizer.git",
-  "fast_antx @ git+https://github.com/OpenPecha/fast-antx.git"
+  "fast_antx @ git+https://github.com/OpenPecha/fast-antx.git",
+  "pecha_org_tools @ git+https://github.com/OpenPecha/pecha_org_tools.git"
 
 ]
 

diff --git a/src/openpecha/pecha/parsers/google_doc.py b/src/openpecha/pecha/parsers/google_doc.py
@@ -245,7 +245,7 @@ def add_sapche_ann(self, doc: Dict[str, Any], char_count: int):
         sapche_anns: List[Dict[str, Any]] = []
         for doc_style in doc["styles"]:
             for idx in range(len(doc_style["texts"])):
-                if doc_style["styles"][idx].color.rgb == RGBColor(0xFF, 0x00, 0xFF):
+                if doc_style["styles"][idx].color.rgb == RGBColor(0xFF, 0x00, 0x00):
                     match = re.match(r"([\d\.]+)\s", doc_style["texts"][idx])
                     if match:
                         # Extract sapche number and store the char length to update the previous ann spans
@@ -263,17 +263,18 @@ def add_sapche_ann(self, doc: Dict[str, Any], char_count: int):
                                 "sapche_number": sapche_number,
                             }
                         )
-                    else:
-                        start = char_count + inner_char_count
-                        end = start + len(doc_style["texts"][idx])
-                        sapche_anns.append(
-                            {
-                                LayerEnum.sapche.value: {
-                                    "start": start,
-                                    "end": end,
-                                }
-                            }
-                        )
+                    # NOTE: red text without a leading sapche number is currently skipped; restore the branch below if un-numbered sapche headings must be annotated
+                    # else:
+                    #     start = char_count + inner_char_count
+                    #     end = start + len(doc_style["texts"][idx])
+                    #     sapche_anns.append(
+                    #         {
+                    #             LayerEnum.sapche.value: {
+                    #                 "start": start,
+                    #                 "end": end,
+                    #             }
+                    #         }
+                    #     )
                 inner_char_count += len(doc_style["texts"][idx])
             inner_char_count += 1  # for newline
 

diff --git a/src/openpecha/pecha/serializers/commentary.py b/src/openpecha/pecha/serializers/commentary.py
@@ -1,17 +1,21 @@
 from pathlib import Path
 from typing import Any, Dict, Union
 
+from pecha_org_tools.enums import TextType
+from pecha_org_tools.extract import CategoryExtractor
+
 from openpecha.pecha import Pecha
 from openpecha.pecha.layer import LayerEnum
 from openpecha.utils import get_text_direction_with_lang
 
 
 class CommentarySerializer:
     def __init__(self):
-        self.category = []
-        self.book = []
-        self.book_content = {}
-        self.required_metadata = {}
+        self.source_category = {}
+        self.target_category = {}
+        self.source_book = []
+        self.target_book = []
+
         self.sapche_anns = []
         self.meaning_segment_anns = []
         self.formatted_sapche_anns = {}
@@ -25,37 +29,86 @@ def extract_metadata(self):
         """
         assert self.pecha is not None, "Pecha object is not set"
         pecha_metadata = self.pecha.metadata
-        title = pecha_metadata.title
-        lang = pecha_metadata.language
-        title = title if lang in ["bo", "en"] else f"{title}[{lang}]"
-        self.required_metadata = {
-            "title": title,
-            "language": pecha_metadata.language,
+        source_title = pecha_metadata.title["en"]
+        target_title = pecha_metadata.title["bo"]
+
+        source_metadata = {
+            "title": source_title,
+            "language": "en",
+            "versionSource": pecha_metadata.source if pecha_metadata.source else "",
+            "direction": get_text_direction_with_lang("en"),
+            "completestatus": "done",
+        }
+
+        target_metadata = {
+            "title": target_title,
+            "language": pecha_metadata.language.value,
             "versionSource": pecha_metadata.source if pecha_metadata.source else "",
             "direction": get_text_direction_with_lang(pecha_metadata.language),
             "completestatus": "done",
         }
-        return self.required_metadata
+
+        return source_metadata, target_metadata
 
     def set_metadata_to_json(self):
         """
         Set extracted metadata to json format
         """
-        self.extract_metadata()
-        self.book.append(self.required_metadata)
+        source_metadata, target_metadata = self.extract_metadata()
+        self.source_book.append(source_metadata)
+        self.target_book.append(target_metadata)
 
-    def get_category(self, title: str):
+    def get_category(self, category_name: str):
         """
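-        Input: title: Title of the pecha commentary which will be used to get the category format
+        Input: category_name: Name of the category passed to the pecha.org categorizer to get the category format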
         Process: Get the category format from the pecha.org categorizer package
         """
-        pass
+        assert self.pecha is not None, "Pecha object is not set"
+
+        if isinstance(self.pecha.metadata.title, dict):
+            bo_title = self.pecha.metadata.title.get("bo", "")
+            en_title = self.pecha.metadata.title.get("en", "")
+
+        elif isinstance(self.pecha.metadata.title, list):
+            bo_title = self.pecha.metadata.title[0]
+            en_title = self.pecha.metadata.title[1]
+
+        else:
+            bo_title = self.pecha.metadata.title
+            en_title = self.pecha.metadata.title
+
+        heDesc = self.pecha.metadata.source_metadata.get("heDesc", "")
+        heShortDesc = self.pecha.metadata.source_metadata.get("heShortDesc", "")
+
+        enDesc = self.pecha.metadata.source_metadata.get("enDesc", "")
+        enShortDesc = self.pecha.metadata.source_metadata.get("enShortDesc", "")
+
+        pecha_metadata = {
+            "bo": {
+                "title": bo_title,
+                "heDesc": heDesc,
+                "heShortDesc": heShortDesc,
+            },
+            "en": {
+                "title": en_title,
+                "enDesc": enDesc,
+                "enShortDesc": enShortDesc,
+            },
+        }
+
+        categorizer = CategoryExtractor()
+        category_json = categorizer.get_category(
+            category_name, pecha_metadata, TextType.COMMENTARY
+        )
+        return category_json
 
-    def set_category_to_json(self):
+    def set_category_to_json(self, category_name: str):
         """
-        Set the category format to self.category attribute
+        Set the "en" and "bo" category formats to self.source_category and self.target_category
         """
-        self.get_category(self.required_metadata["title"])
+        category_json = self.get_category(category_name)
+        self.source_category = category_json["en"]
+        self.target_category = category_json["bo"]
         pass
 
     def get_sapche_anns(self):
@@ -218,14 +271,22 @@ def get_text_related_to_sapche(self):
                     )
                     sapche_ann["meaning_segments"].append(formatted_meaning_segment_ann)
 
-    def serialize(self, pecha_path: Path, title: str):
+    def serialize(self, pecha_path: Path, category_name: str):
         """
         Serialize the commentary pecha to json format
         """
         self.pecha_path = pecha_path
         self.pecha = Pecha.from_path(pecha_path)
 
         self.set_metadata_to_json()
-        self.set_category_to_json()
+        self.set_category_to_json(category_name)
         formatted_sapche_ann = self.format_sapche_anns()
-        return formatted_sapche_ann
+
+        self.source_book[0]["content"] = {}
+        self.target_book[0]["content"] = formatted_sapche_ann
+
+        serialized_json = {
+            "source": {"categories": self.source_category, "book": self.source_book},
+            "target": {"categories": self.target_category, "book": self.target_book},
+        }
+        return serialized_json
diff --git a/test.py b/test.py
diff --git a/...ith_sapche/རྡོ་རྗེ་གཅོད་པ་commentary.docx → ...th_sapche/རྡོ་རྗེ་གཅོད་པ་_commentary.docx b/...ith_sapche/རྡོ་རྗེ་གཅོད་པ་commentary.docx → ...th_sapche/རྡོ་རྗེ་གཅོད་པ་_commentary.docx
diff --git a/tests/pecha/parser/google_doc/test_google_doc.py b/tests/pecha/parser/google_doc/test_google_doc.py
@@ -73,7 +73,7 @@ def test_parser_on_commentary_text():
 
 def test_parser_on_commentary_with_sapche():
     data = Path(__file__).parent / "data"
-    input = data / "commentary_with_sapche/རྡོ་རྗེ་གཅོད་པ་commentary.docx"
+    input = data / "commentary_with_sapche/རྡོ་རྗེ་གཅོད་པ་_commentary.docx"
     metadata = read_json(data / "commentary_with_sapche/metadata.json")
 
     parser = GoogleDocParser(
@@ -83,11 +83,11 @@ def test_parser_on_commentary_with_sapche():
     output_path.mkdir(parents=True, exist_ok=True)
     parser.parse(input, metadata, output_path)
     expected_sapche_anns = [
-        {"Sapche": {"start": 101, "end": 123}},
-        {"Sapche": {"start": 124, "end": 165}, "sapche_number": "1.1."},
-        {"Sapche": {"start": 251, "end": 268}},
-        {"Sapche": {"start": 269, "end": 309}, "sapche_number": "2.1."},
-        {"Sapche": {"start": 474, "end": 552}},
+        {"Sapche": {"start": 102, "end": 124}, "sapche_number": "1."},
+        {"Sapche": {"start": 126, "end": 166}, "sapche_number": "1.1."},
+        {"Sapche": {"start": 252, "end": 283}, "sapche_number": "1.2."},
+        {"Sapche": {"start": 541, "end": 558}, "sapche_number": "2."},
+        {"Sapche": {"start": 560, "end": 560}, "sapche_number": "2.1."},
     ]
 
     assert parser.sapche_anns == expected_sapche_anns

diff --git a/tests/pecha/serializers/pecha_db/commentary/data/IC3797777/metadata.json b/tests/pecha/serializers/pecha_db/commentary/data/IC3797777/metadata.json
@@ -1,14 +1,21 @@
 {
-  "id": "IC3797777",
-  "title": "Dummy Title",
-  "author": "Dummy Author",
-  "imported": "2024-11-21T15:37:01.183088",
-  "source": "Dummy Source",
+  "id": "I15BEA1DE",
+  "title": {
+    "bo": "རྡོ་རྗེ་གཅོད་པ།",
+    "en": "Vajra Cutter"
+  },
+  "author": "སྟོན་པ་བཅོམ་ལྡན་འདས།",
+  "imported": "2024-11-28T10:42:25.426868",
+  "source": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
   "toolkit_version": "0.0.1",
   "parser": "GoogleDocParser",
   "initial_creation_type": "ebook",
   "language": "bo",
   "source_metadata": {
+    "heDesc": "",
+    "heShortDesc": "",
+    "enDesc": "",
+    "enShortDesc": "",
     "root_path": "opf_id/layers/basename/layer_file.json"
   },
   "bases": [],

diff --git a/tests/pecha/serializers/pecha_db/commentary/data/commentary_serialized.json b/tests/pecha/serializers/pecha_db/commentary/data/commentary_serialized.json
@@ -0,0 +1,88 @@
+{
+  "source": {
+    "categories": [
+      {
+        "name": "The Buddha's Teachings",
+        "enDesc": "",
+        "enShortDesc": ""
+      },
+      {
+        "name": "Vajra Cutter",
+        "enDesc": "",
+        "enShortDesc": ""
+      },
+      {
+        "name": "Commentaries",
+        "enDesc": "",
+        "enShortDesc": ""
+      },
+      {
+        "name": "Vajra Cutter",
+        "enDesc": "",
+        "enShortDesc": ""
+      }
+    ],
+    "book": [
+      {
+        "title": "Vajra Cutter",
+        "language": "en",
+        "versionSource": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
+        "direction": "ltr",
+        "completestatus": "done",
+        "content": {}
+      }
+    ]
+  },
+  "target": {
+    "categories": [
+      {
+        "name": "སངས་རྒྱས་ཀྱི་བཀའ།",
+        "heDesc": "",
+        "heShortDesc": ""
+      },
+      {
+        "name": "རྡོ་རྗེ་གཅོད་པ།",
+        "heDesc": "",
+        "heShortDesc": ""
+      },
+      {
+        "name": "འགྲེལ་པ།",
+        "heDesc": "",
+        "heShortDesc": ""
+      },
+      {
+        "name": "རྡོ་རྗེ་གཅོད་པ།",
+        "heDesc": "",
+        "heShortDesc": ""
+      }
+    ],
+    "book": [
+      {
+        "title": "རྡོ་རྗེ་གཅོད་པ།",
+        "language": "bo",
+        "versionSource": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
+        "direction": "ltr",
+        "completestatus": "done",
+        "content": {
+          "མདོའི་ལུས་ཀྱི་འགྲེལ་པ།": {
+            "data": [],
+            "སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": {
+              "data": [
+                "<1><1>རྒྱ་གར་སྐད་དུ། །ཨཱརྱ་བྷ་ག་བ་ཏཱི་པྲཛྙཱ་པཱ་ར་མི་ཏཱ་བཛྲ་ཙྪེ་དི་ཀཱ་ཡཱཿསཔྟ་དཱརྠ་ཊཱི་ཀཱ།"
+              ]
+            }
+          },
+          "ཚིག་གི་དོན་བཤད་པ།": {
+            "data": [],
+            "སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": {
+              "data": [
+                "ཚིག་གི་དོན་བཤད་པ།\nསངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།",
+                "མཚུངས་མེད་སངས་རྒྱས་ཆོས་རྣམས་སྐྱེད་མཛད་ལ། །\nགང་ཞིག་ཆོས་དབྱིངས་གསོ་བའི་མ་མ་སྟེ། །\nདབྱེ་དཀའི་རྡོ་རྗེ་གཞན་དོན་གྲུབ་གང་ཡིན། །\nགང་ཞིག་བཟུང་བས་འཕགས་ཀུན་སྐྱེད་པའམ་ཡིན། །"
+              ]
+            }
+          }
+        }
+      }
+    ]
+  }
+}
diff --git a/tests/pecha/serializers/pecha_db/commentary/data/metadata.json b/tests/pecha/serializers/pecha_db/commentary/data/metadata.json
@@ -0,0 +1,11 @@
+{
+    "title": {"bo": "རྡོ་རྗེ་གཅོད་པ།", "en": "Vajra Cutter"},
+    "author": "སྟོན་པ་བཅོམ་ལྡན་འདས།",
+    "source": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
+    "initial_creation_type": "ebook",
+    "language": "bo",
+    "heDesc": "",
+    "heShortDesc": "",
+    "enDesc": "",
+    "enShortDesc": ""
+}
diff --git a/tests/pecha/serializers/pecha_db/commentary/data/རྡོ་རྗེ་གཅོད་པ་commentary.docx b/tests/pecha/serializers/pecha_db/commentary/data/རྡོ་རྗེ་གཅོད་པ་commentary.docx