Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify sapche color to red #85

Merged
merged 7 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ dependencies = [
"subprocess.run == 0.0.8",
"python-docx >= 1.1.2",
"bo-sent-tokenizer @ git+https://github.com/OpenPecha/bo_sent_tokenizer.git",
"fast_antx @ git+https://github.com/OpenPecha/fast-antx.git"
"fast_antx @ git+https://github.com/OpenPecha/fast-antx.git",
"pecha_org_tools @ git+https://github.com/OpenPecha/pecha_org_tools.git"

]

Expand Down
25 changes: 13 additions & 12 deletions src/openpecha/pecha/parsers/google_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def add_sapche_ann(self, doc: Dict[str, Any], char_count: int):
sapche_anns: List[Dict[str, Any]] = []
for doc_style in doc["styles"]:
for idx in range(len(doc_style["texts"])):
if doc_style["styles"][idx].color.rgb == RGBColor(0xFF, 0x00, 0xFF):
if doc_style["styles"][idx].color.rgb == RGBColor(0xFF, 0x00, 0x00):
match = re.match(r"([\d\.]+)\s", doc_style["texts"][idx])
if match:
# Extract sapche number and store the char length to update the previous ann spans
Expand All @@ -263,17 +263,18 @@ def add_sapche_ann(self, doc: Dict[str, Any], char_count: int):
"sapche_number": sapche_number,
}
)
else:
start = char_count + inner_char_count
end = start + len(doc_style["texts"][idx])
sapche_anns.append(
{
LayerEnum.sapche.value: {
"start": start,
"end": end,
}
}
)
# NOTE: sapche lines without a leading number are currently not annotated.
# If they should be annotated again in the future, re-enable the following code.
# else:
# start = char_count + inner_char_count
# end = start + len(doc_style["texts"][idx])
# sapche_anns.append(
# {
# LayerEnum.sapche.value: {
# "start": start,
# "end": end,
# }
# }
# )
inner_char_count += len(doc_style["texts"][idx])
inner_char_count += 1 # for newline

Expand Down
101 changes: 81 additions & 20 deletions src/openpecha/pecha/serializers/commentary.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
from pathlib import Path
from typing import Any, Dict, Union

from pecha_org_tools.enums import TextType
from pecha_org_tools.extract import CategoryExtractor

from openpecha.pecha import Pecha
from openpecha.pecha.layer import LayerEnum
from openpecha.utils import get_text_direction_with_lang


class CommentarySerializer:
def __init__(self):
self.category = []
self.book = []
self.book_content = {}
self.required_metadata = {}
self.source_category = {}
self.target_category = {}
self.source_book = []
self.target_book = []

self.sapche_anns = []
self.meaning_segment_anns = []
self.formatted_sapche_anns = {}
Expand All @@ -25,37 +29,86 @@ def extract_metadata(self):
"""
assert self.pecha is not None, "Pecha object is not set"
pecha_metadata = self.pecha.metadata
title = pecha_metadata.title
lang = pecha_metadata.language
title = title if lang in ["bo", "en"] else f"{title}[{lang}]"
self.required_metadata = {
"title": title,
"language": pecha_metadata.language,
source_title = pecha_metadata.title["en"]
target_title = pecha_metadata.title["bo"]

source_metadata = {
"title": source_title,
"language": "en",
"versionSource": pecha_metadata.source if pecha_metadata.source else "",
"direction": get_text_direction_with_lang("en"),
"completestatus": "done",
}

target_metadata = {
"title": target_title,
"language": pecha_metadata.language.value,
"versionSource": pecha_metadata.source if pecha_metadata.source else "",
"direction": get_text_direction_with_lang(pecha_metadata.language),
"completestatus": "done",
}
return self.required_metadata

return source_metadata, target_metadata

def set_metadata_to_json(self):
"""
Set extracted metadata to json format
"""
self.extract_metadata()
self.book.append(self.required_metadata)
source_metadata, target_metadata = self.extract_metadata()
self.source_book.append(source_metadata)
self.target_book.append(target_metadata)

def get_category(self, title: str):
def get_category(self, category_name: str):
"""
Input: category_name: Name of the category which will be used to get the category format
Process: Get the category format from the pecha_org_tools package (CategoryExtractor)
"""
pass
assert self.pecha is not None, "Pecha object is not set"

if isinstance(self.pecha.metadata.title, dict):
bo_title = self.pecha.metadata.title.get("bo", "")
en_title = self.pecha.metadata.title.get("en", "")

elif isinstance(self.pecha.metadata.title, list):
bo_title = self.pecha.metadata.title[0]
en_title = self.pecha.metadata.title[1]

else:
bo_title = self.pecha.metadata.title
en_title = self.pecha.metadata.title

heDesc = self.pecha.metadata.source_metadata.get("heDesc", "")
heShortDesc = self.pecha.metadata.source_metadata.get("heShortDesc", "")

enDesc = self.pecha.metadata.source_metadata.get("enDesc", "")
enShortDesc = self.pecha.metadata.source_metadata.get("enShortDesc", "")

pecha_metadata = {
"bo": {
"title": bo_title,
"heDesc": heDesc,
"heShortDesc": heShortDesc,
},
"en": {
"title": en_title,
"enDesc": enDesc,
"enShortDesc": enShortDesc,
},
}

categorizer = CategoryExtractor()
category_json = categorizer.get_category(
category_name, pecha_metadata, TextType.COMMENTARY
)
return category_json

def set_category_to_json(self):
def set_category_to_json(self, category_name: str):
"""
Set the category format to the self.source_category and self.target_category attributes
"""
self.get_category(self.required_metadata["title"])
category_json = self.get_category(category_name)
self.source_category = category_json["en"]
self.target_category = category_json["bo"]
pass

def get_sapche_anns(self):
Expand Down Expand Up @@ -218,14 +271,22 @@ def get_text_related_to_sapche(self):
)
sapche_ann["meaning_segments"].append(formatted_meaning_segment_ann)

def serialize(self, pecha_path: Path, title: str):
def serialize(self, pecha_path: Path, category_name: str):
"""
Serialize the commentary pecha to json format
"""
self.pecha_path = pecha_path
self.pecha = Pecha.from_path(pecha_path)

self.set_metadata_to_json()
self.set_category_to_json()
self.set_category_to_json(category_name)
formatted_sapche_ann = self.format_sapche_anns()
return formatted_sapche_ann

self.source_book[0]["content"] = {}
self.target_book[0]["content"] = formatted_sapche_ann

serialized_json = {
"source": {"categories": self.source_category, "book": self.source_book},
"target": {"categories": self.target_category, "book": self.target_book},
}
return serialized_json
14 changes: 0 additions & 14 deletions test.py

This file was deleted.

12 changes: 6 additions & 6 deletions tests/pecha/parser/google_doc/test_google_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_parser_on_commentary_text():

def test_parser_on_commentary_with_sapche():
data = Path(__file__).parent / "data"
input = data / "commentary_with_sapche/རྡོ་རྗེ་གཅོད་པ་commentary.docx"
input = data / "commentary_with_sapche/རྡོ་རྗེ་གཅོད་པ་_commentary.docx"
metadata = read_json(data / "commentary_with_sapche/metadata.json")

parser = GoogleDocParser(
Expand All @@ -83,11 +83,11 @@ def test_parser_on_commentary_with_sapche():
output_path.mkdir(parents=True, exist_ok=True)
parser.parse(input, metadata, output_path)
expected_sapche_anns = [
{"Sapche": {"start": 101, "end": 123}},
{"Sapche": {"start": 124, "end": 165}, "sapche_number": "1.1."},
{"Sapche": {"start": 251, "end": 268}},
{"Sapche": {"start": 269, "end": 309}, "sapche_number": "2.1."},
{"Sapche": {"start": 474, "end": 552}},
{"Sapche": {"start": 102, "end": 124}, "sapche_number": "1."},
{"Sapche": {"start": 126, "end": 166}, "sapche_number": "1.1."},
{"Sapche": {"start": 252, "end": 283}, "sapche_number": "1.2."},
{"Sapche": {"start": 541, "end": 558}, "sapche_number": "2."},
{"Sapche": {"start": 560, "end": 560}, "sapche_number": "2.1."},
]

assert parser.sapche_anns == expected_sapche_anns
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
{
"id": "IC3797777",
"title": "Dummy Title",
"author": "Dummy Author",
"imported": "2024-11-21T15:37:01.183088",
"source": "Dummy Source",
"id": "I15BEA1DE",
"title": {
"bo": "རྡོ་རྗེ་གཅོད་པ།",
"en": "Vajra Cutter"
},
"author": "སྟོན་པ་བཅོམ་ལྡན་འདས།",
"imported": "2024-11-28T10:42:25.426868",
"source": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
"toolkit_version": "0.0.1",
"parser": "GoogleDocParser",
"initial_creation_type": "ebook",
"language": "bo",
"source_metadata": {
"heDesc": "",
"heShortDesc": "",
"enDesc": "",
"enShortDesc": "",
"root_path": "opf_id/layers/basename/layer_file.json"
},
"bases": [],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"source": {
"categories": [
{
"name": "The Buddha's Teachings",
"enDesc": "",
"enShortDesc": ""
},
{
"name": "Vajra Cutter",
"enDesc": "",
"enShortDesc": ""
},
{
"name": "Commentaries",
"enDesc": "",
"enShortDesc": ""
},
{
"name": "Vajra Cutter",
"enDesc": "",
"enShortDesc": ""
}
],
"book": [
{
"title": "Vajra Cutter",
"language": "en",
"versionSource": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
"direction": "ltr",
"completestatus": "done",
"content": {}
}
]
},
"target": {
"categories": [
{
"name": "སངས་རྒྱས་ཀྱི་བཀའ།",
"heDesc": "",
"heShortDesc": ""
},
{
"name": "རྡོ་རྗེ་གཅོད་པ།",
"heDesc": "",
"heShortDesc": ""
},
{
"name": "འགྲེལ་པ།",
"heDesc": "",
"heShortDesc": ""
},
{
"name": "རྡོ་རྗེ་གཅོད་པ།",
"heDesc": "",
"heShortDesc": ""
}
],
"book": [
{
"title": "རྡོ་རྗེ་གཅོད་པ།",
"language": "bo",
"versionSource": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
"direction": "ltr",
"completestatus": "done",
"content": {
"མདོའི་ལུས་ཀྱི་འགྲེལ་པ།": {
"data": [],
"སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": {
"data": [
"<1><1>རྒྱ་གར་སྐད་དུ། །ཨཱརྱ་བྷ་ག་བ་ཏཱི་པྲཛྙཱ་པཱ་ར་མི་ཏཱ་བཛྲ་ཙྪེ་དི་ཀཱ་ཡཱཿསཔྟ་དཱརྠ་ཊཱི་ཀཱ།"
]
}
},
"ཚིག་གི་དོན་བཤད་པ།": {
"data": [],
"སངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།": {
"data": [
"ཚིག་གི་དོན་བཤད་པ།\nསངས་རྒྱས་ཀྱི་གདུང་རྒྱུན་མི་འཆད་པ་བསྟན་པ།",
"མཚུངས་མེད་སངས་རྒྱས་ཆོས་རྣམས་སྐྱེད་མཛད་ལ། །\nགང་ཞིག་ཆོས་དབྱིངས་གསོ་བའི་མ་མ་སྟེ། །\nདབྱེ་དཀའི་རྡོ་རྗེ་གཞན་དོན་གྲུབ་གང་ཡིན། །\nགང་ཞིག་བཟུང་བས་འཕགས་ཀུན་སྐྱེད་པའམ་ཡིན། །"
]
}
}
}
}
]
}
}
11 changes: 11 additions & 0 deletions tests/pecha/serializers/pecha_db/commentary/data/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"title": {"bo": "རྡོ་རྗེ་གཅོད་པ།", "en": "Vajra Cutter"},
"author": "སྟོན་པ་བཅོམ་ལྡན་འདས།",
"source": "https://library.bdrc.io/show/bdr:WA1KG12670?tabs=bdr:MW3JT13747,bdr:W3JT13747",
"initial_creation_type": "ebook",
"language": "bo",
"heDesc": "",
"heShortDesc": "",
"enDesc": "",
"enShortDesc": ""
}
Loading