Skip to content

Commit

Permalink
Merge pull request #1 from Esukhia/text-saving
Browse files Browse the repository at this point in the history
Text saving
  • Loading branch information
kaldan007 authored May 9, 2021
2 parents 88063b8 + 002e247 commit 135210e
Show file tree
Hide file tree
Showing 18 changed files with 662 additions and 27 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,5 @@ dmypy.json


.env
.vscode
.vscode
.github
1 change: 1 addition & 0 deletions pedurma/pecha.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class NotesPage(PageBase):

class Text(BaseModel):
    """One text of a pecha: its body pages plus, optionally, its note pages."""

    # Identifier of the text; construct_text_obj fills it from text_meta["text_uuid"].
    id: str
    # Volume ids the text spans, in the order they were read (e.g. "v001");
    # serialize_text_obj uses these as the keys of its per-volume output.
    vol_span: List[str]
    # Body pages of the text.
    pages: List[Page]
    # Note pages of the text; may be None.
    notes: Optional[List[NotesPage]]

Expand Down
99 changes: 99 additions & 0 deletions pedurma/save_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from openpecha.blupdate import *
from pedurma.pecha import *
from pedurma.texts import serialize_text_obj

def get_old_vol(pecha_opf_path, pecha_id, text_obj):
    """Read the current base text of every volume listed in ``text_obj.vol_span``.

    Args:
        pecha_opf_path: Path object of the directory holding ``{pecha_id}.opf``.
        pecha_id: Pecha identifier used to build the ``.opf`` directory name.
        text_obj: Object exposing ``vol_span``, an iterable of volume ids.

    Returns:
        dict mapping each volume id to the UTF-8 content of
        ``{pecha_id}.opf/base/{vol_id}.txt``.
    """
    return {
        vol_id: (pecha_opf_path / f"{pecha_id}.opf/base/{vol_id}.txt").read_text(encoding='utf-8')
        for vol_id in text_obj.vol_span
    }

def get_old_text_base(pecha_idx, old_vol_base, text_id, text_vol_num):
    """Return the slice of a volume's base text that belongs to one text.

    Args:
        pecha_idx: Index mapping; ``pecha_idx['annotations'][text_id]['span']``
            is a list of dicts with ``vol``, ``start`` and ``end`` keys.
        old_vol_base: Full base text of the volume.
        text_id: Key of the text inside ``pecha_idx['annotations']``.
        text_vol_num: Volume number to look for among the text's spans.

    Returns:
        ``old_vol_base[start:end]`` for the first span whose ``vol`` equals
        ``text_vol_num``, or ``''`` when the text has no span in that volume.
    """
    spans = pecha_idx['annotations'][text_id]['span']
    matching = next((span for span in spans if span['vol'] == text_vol_num), None)
    if matching is None:
        return ''
    return old_vol_base[matching['start']:matching['end']]

def get_new_vol(old_vols, pecha_idx, text_obj):
    """Build the updated base text of every volume covered by ``text_obj``.

    The text is serialized per volume with ``serialize_text_obj`` and spliced
    into the old volume base at the offsets recorded for it in the index.

    Args:
        old_vols: dict mapping volume id to that volume's current base text.
        pecha_idx: Index mapping; ``pecha_idx['annotations'][text_obj.id]['span']``
            is a list of dicts with ``vol``, ``start`` and ``end`` keys.
        text_obj: Text object accepted by ``serialize_text_obj``; its ``id`` is
            the key used in the index.

    Returns:
        dict mapping volume id to the new base text, with the same keys and
        key order as the mapping returned by ``serialize_text_obj``.
    """
    new_vols = {}
    new_text = serialize_text_obj(text_obj)
    for vol_id, new_text_base in new_text.items():
        vol_num = int(vol_id[1:])
        old_vol_base = old_vols[vol_id]
        # startswith() rather than [0] so an empty serialization cannot raise IndexError.
        if new_text_base.startswith("\n"):
            new_text_base = new_text_base[1:]
        text_spans = pecha_idx['annotations'][text_obj.id]['span']
        vol_span = next((span for span in text_spans if span['vol'] == vol_num), None)
        if vol_span is None:
            # No span recorded for this volume: there is nothing to replace.
            # (str.replace with an empty needle would insert the new text
            # between every character of the volume.)
            new_vols[vol_id] = old_vol_base
            continue
        # Splice at the indexed offsets instead of str.replace(), which would
        # also rewrite any other passage identical to the old text.
        new_vols[vol_id] = (
            old_vol_base[:vol_span['start']]
            + new_text_base
            + old_vol_base[vol_span['end']:]
        )
    return new_vols

def update_base(pecha_opf_path, pecha_id, text_obj, pecha_idx = None):
    """Write the edited text back into the base file of each volume it spans.

    Args:
        pecha_opf_path: Path object of the directory holding ``{pecha_id}.opf``.
        pecha_id: Pecha identifier used to build the ``.opf`` directory name.
        text_obj: Text object carrying the edited content.
        pecha_idx: Already-loaded index; when falsy it is read from
            ``{pecha_id}.opf/index.yml``.
    """
    if not pecha_idx:
        index_file = pecha_opf_path / f"{pecha_id}.opf/index.yml"
        pecha_idx = yaml.safe_load(index_file.read_text(encoding='utf-8'))
    current_vols = get_old_vol(pecha_opf_path, pecha_id, text_obj)
    updated_vols = get_new_vol(current_vols, pecha_idx, text_obj)
    for vol_id in updated_vols:
        target = pecha_opf_path / f"{pecha_id}.opf/base/{vol_id}.txt"
        target.write_text(updated_vols[vol_id], encoding='utf-8')
        print(f'INFO: {vol_id} base updated..')

def get_old_layers(pecha_opf_path, pecha_id, vol_id):
    """Load every layer file of one volume.

    Args:
        pecha_opf_path: Path object of the directory holding ``{pecha_id}.opf``.
        pecha_id: Pecha identifier used to build the ``.opf`` directory name.
        vol_id: Volume id naming the sub-directory of ``layers``.

    Returns:
        dict mapping each file's stem (its layer name) to the YAML-parsed
        content of that file, for every entry of
        ``{pecha_id}.opf/layers/{vol_id}``.
    """
    layers_dir = pecha_opf_path / f"{pecha_id}.opf/layers/{vol_id}"
    return {
        layer_file.stem: yaml.safe_load(layer_file.read_text(encoding='utf-8'))
        for layer_file in layers_dir.iterdir()
    }

def update_layer(pecha_opf_path, pecha_id, vol_id, old_layers, updater):
    """Update each layer of a volume with ``updater`` and write it back.

    Args:
        pecha_opf_path: Path object of the directory holding ``{pecha_id}.opf``.
        pecha_id: Pecha identifier used to build the ``.opf`` directory name.
        vol_id: Volume id naming the sub-directory of ``layers``.
        old_layers: dict mapping layer name to its loaded content.
        updater: Object handed to ``update_ann_layer`` together with each layer.
    """
    for layer_name in old_layers:
        layer = old_layers[layer_name]
        # The return value is not used; the layer passed in is what gets dumped,
        # so update_ann_layer presumably modifies it in place.
        update_ann_layer(layer, updater)
        dumped = yaml.safe_dump(layer, sort_keys=False)
        layer_file = pecha_opf_path / f"{pecha_id}.opf/layers/{vol_id}/{layer_name}.yml"
        layer_file.write_text(dumped, encoding='utf-8')
        print(f'INFO: {vol_id} {layer_name} has been updated...')

def update_old_layers(pecha_opf_path, pecha_id, text_obj, pecha_idx = None):
    """Re-align the layers of every volume touched by an edited text.

    For each volume a ``Blupdate`` is built from the old and the new base text
    and applied to all layers found on disk for that volume.

    Args:
        pecha_opf_path: Path object of the directory holding ``{pecha_id}.opf``.
        pecha_id: Pecha identifier used to build the ``.opf`` directory name.
        text_obj: Text object carrying the edited content.
        pecha_idx: Already-loaded index; when falsy it is read from
            ``{pecha_id}.opf/index.yml``.
    """
    if not pecha_idx:
        index_file = pecha_opf_path / f"{pecha_id}.opf/index.yml"
        pecha_idx = yaml.safe_load(index_file.read_text(encoding='utf-8'))
    old_vols = get_old_vol(pecha_opf_path, pecha_id, text_obj)
    new_vols = get_new_vol(old_vols, pecha_idx, text_obj)
    # Old and new volumes are paired by position, not by key.
    for vol_id, old_base, new_base in zip(old_vols, old_vols.values(), new_vols.values()):
        updater = Blupdate(old_base, new_base)
        vol_layers = get_old_layers(pecha_opf_path, pecha_id, vol_id)
        update_layer(pecha_opf_path, pecha_id, vol_id, vol_layers, updater)

def update_other_text_index(pecha_idx, text_id, cur_vol_offset, vol_num):
    """Shift the spans of the texts listed after ``text_id`` in one volume.

    Entries of ``pecha_idx['annotations']`` are walked in mapping order. Every
    entry that comes after ``text_id`` gets ``cur_vol_offset`` added to the
    ``start`` and ``end`` of its spans in volume ``vol_num``. The walk stops at
    the first span that belongs to a later volume.

    Args:
        pecha_idx: Index mapping; modified in place.
        text_id: Key of the edited text; it is not shifted itself.
        cur_vol_offset: Amount added to the affected offsets.
        vol_num: Volume number whose spans are shifted.

    Returns:
        The same ``pecha_idx`` object.
    """
    past_edited_text = False
    for text_uuid, text in pecha_idx['annotations'].items():
        if not past_edited_text:
            # Nothing is shifted up to and including the edited text.
            past_edited_text = text_uuid == text_id
            continue
        for span in text['span']:
            if span['vol'] == vol_num:
                span['start'] += cur_vol_offset
                span['end'] += cur_vol_offset
            elif span['vol'] > vol_num:
                return pecha_idx
    return pecha_idx

def update_index(pecha_opf_path, pecha_id, text_obj, pecha_idx=None):
    """Recompute index spans after a text's content has changed length.

    For every volume of the text, the length difference between the new and
    the old volume base is added to the ``end`` of the text's span in that
    volume. Unless the text continues into a later volume, the texts listed
    after it are shifted by the same amount.

    Args:
        pecha_opf_path: Path object of the directory holding ``{pecha_id}.opf``.
        pecha_id: Pecha identifier used to build the ``.opf`` directory name.
        text_obj: Text object carrying the edited content.
        pecha_idx: Already-loaded index; when falsy it is read from
            ``{pecha_id}.opf/index.yml``.

    Returns:
        The updated index mapping.
    """
    if not pecha_idx:
        index_file = pecha_opf_path / f"{pecha_id}.opf/index.yml"
        pecha_idx = yaml.safe_load(index_file.read_text(encoding='utf-8'))
    old_vols = get_old_vol(pecha_opf_path, pecha_id, text_obj)
    new_vols = get_new_vol(old_vols, pecha_idx, text_obj)
    # Old and new volumes are paired by position, not by key.
    for vol_id, old_base, new_base in zip(old_vols, old_vols.values(), new_vols.values()):
        vol_num = int(vol_id[1:])
        cur_vol_offset = len(new_base) - len(old_base)
        if cur_vol_offset == 0:
            # Same length: no offset in this volume needs to move.
            continue
        text_ends_in_vol = True
        for span in pecha_idx["annotations"][text_obj.id]['span']:
            if span['vol'] == vol_num:
                span['end'] += cur_vol_offset
            elif span['vol'] > vol_num:
                # The text runs on into a later volume, so the following
                # texts are not shifted for this volume.
                text_ends_in_vol = False
                break
        if text_ends_in_vol:
            pecha_idx = update_other_text_index(pecha_idx, text_obj.id, cur_vol_offset, vol_num)
    return pecha_idx

43 changes: 28 additions & 15 deletions pedurma/texts.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,17 @@ def get_meta_data(pecha_id, text_uuid, meta_data):
meta = {}

meta = {
"work_id": meta_data["work_id"],
"img_grp_offset": meta_data["img_grp_offset"],
"pref": meta_data["pref"],
"work_id": meta_data.get("work_id", ""),
"img_grp_offset": meta_data.get("img_grp_offset",""),
"pref": meta_data.get("pref", ""),
"pecha_id": pecha_id,
"text_uuid": text_uuid,
}
return meta


def get_hfml_text(opf_path, text_id, index=None):
    """Serialize one text of an OPF to HFML with its Pagination and Durchen layers.

    Args:
        opf_path: Location of the ``.opf`` directory, passed to ``HFMLSerializer``.
        text_id: Identifier of the text to serialize.
        index: Optional index layer forwarded as ``index_layer``.

    Returns:
        Whatever ``HFMLSerializer.get_result()`` returns after
        ``apply_layers()`` has run.
    """
    # Build the serializer once, restricted to the two layers needed here;
    # an earlier unrestricted construction was created and then discarded.
    serializer = HFMLSerializer(
        opf_path,
        text_id=text_id,
        index_layer=index,
        layers=['Pagination', 'Durchen'],
    )
    serializer.apply_layers()
    return serializer.get_result()
Expand Down Expand Up @@ -131,19 +131,27 @@ def get_note_ref(pagination):


def get_clean_page(page):
    """Remove HFML annotation markers from a page and return the bare text.

    Args:
        page: Page content as a string, possibly containing page, topic,
            sub-topic and durchen markers.

    Returns:
        ``page`` with every match of the patterns below deleted.
    """
    # Each pattern allows one optional leading character from the range
    # [𰵀-󴉱] directly after the opening bracket.
    pat_list = {
        "page_pattern": r"\[([𰵀-󴉱])?[0-9]+[a-z]{1}\]",
        "topic_pattern": r"\{([𰵀-󴉱])?\w+\}",
        "start_durchen_pattern": r"\<([𰵀-󴉱])?d",
        "end_durchen_pattern": r"d\>",
        "sub_topic_pattern": r"\{([𰵀-󴉱])?\w+\-\w+\}",
    }
    base_page = page
    # Only the patterns are needed; the keys just name them for the reader.
    for ann_pat in pat_list.values():
        base_page = re.sub(ann_pat, "", base_page)
    return base_page


def get_page_obj(page, text_meta, tag, pagination_layer):
page_idx = re.search(r"\[([𰵀-󴉱])?([0-9]+[a-z]{1})\]", page).group(2)
page_id, pagination = get_page_id(page_idx, pagination_layer)
page_content = page
page_content = get_clean_page(page)
pg_num = get_page_num(page_idx)
page_link = get_link(pg_num, text_meta)
note_ref = get_note_ref(pagination)
if get_clean_page(page_content) == "\n":
if page_content == "\n":
page_obj = None
else:
if tag == "note":
Expand Down Expand Up @@ -182,7 +190,9 @@ def get_page_obj_list(text, text_meta, pagination_layer, tag="text"):
def construct_text_obj(hfmls, text_meta, opf_path):
pages = []
notes = []
vol_span = []
for vol_num, hfml_text in hfmls.items():
vol_span.append(vol_num)
text_meta["vol"] = int(vol_num[1:])
pagination_layer = from_yaml(
Path(
Expand All @@ -196,18 +206,20 @@ def construct_text_obj(hfmls, text_meta, opf_path):
pages += get_page_obj_list(body_text, text_meta, pagination_layer, tag="text")
if durchen:
notes += get_page_obj_list(durchen, text_meta, pagination_layer, tag="note")
text_obj = Text(id=text_meta["text_uuid"], pages=pages, notes=notes)
text_obj = Text(id=text_meta["text_uuid"], vol_span=vol_span, pages=pages, notes=notes)
return text_obj


def serialize_text_obj(text):
    """Join page and note contents back into one string per volume.

    Args:
        text: Object with ``vol_span`` (volume ids such as ``"v001"``),
            ``pages`` and ``notes``; each page/note has ``vol`` and ``content``.

    Returns:
        dict mapping every id in ``text.vol_span`` to the concatenation of the
        contents of that volume's pages followed by its notes. A volume with
        no pages or notes maps to ``""``.

    Raises:
        KeyError: if a page or note belongs to a volume that is not listed in
            ``text.vol_span``.
    """
    parts_by_vol = {vol_id: [] for vol_id in text.vol_span}
    # notes is Optional on the Text model, so a missing list counts as empty.
    for item in (*text.pages, *(text.notes or [])):
        parts_by_vol[f"v{int(item.vol):03}"].append(item.content)
    return {vol_id: "".join(parts) for vol_id, parts in parts_by_vol.items()}

def get_derge_google_text_obj(text_id):
Expand All @@ -227,8 +239,9 @@ def get_derge_google_text_obj(text_id):
dg_text = construct_text_obj(dg_hfmls, google_text_meta, google_pecha_path)
return dg_text

def get_text_obj(pecha_id, text_id):
pecha_path = download_pecha(pecha_id, needs_update=False)
def get_text_obj(pecha_id, text_id, pecha_path = None):
if not pecha_path:
pecha_path = download_pecha(pecha_id, needs_update=False)
meta_data = from_yaml(Path(f"{pecha_path}/{pecha_id}.opf/meta.yml"))
index = from_yaml(Path(f"{pecha_path}/{pecha_id}.opf/index.yml"))
hfmls = get_hfml_text(f"{pecha_path}/{pecha_id}.opf/", text_id, index)
Expand Down
21 changes: 21 additions & 0 deletions tests/data/save_text/expected_index.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
id: 559d95c999ba4b56b704539f48c88019
annotation_type: index
revision: '00001'
annotations:
259260e8e3544fc1a9a27d7dffc72df6:
parts: []
span:
- vol: 1
start: 0
end: 179
- vol: 2
start: 0
end: 218
work_id: D1115
cf52cbae1a7640b688b24135fe566920:
parts: []
span:
- vol: 2
start: 219
end: 384
work_id: D1116
15 changes: 15 additions & 0 deletions tests/data/save_text/expected_layers/Durchen.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
id: 9b28dc8b0d6549929106c66d4d02f784
annotation_type: Durchen
revision: '00001'
annotations:
d724dd7a79704a4088a0a625717d7fa6:
span:
start: 169
end: 215
97ca36f7b601415a8187a46e23fa9db2:
span:
start: 339
end: 384
local_ids:
d724dd7a79704a4088a0a625717d7fa6: 200000
97ca36f7b601415a8187a46e23fa9db2: 200001
66 changes: 66 additions & 0 deletions tests/data/save_text/expected_layers/Pagination.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
id: b745a20831cc4ab5a38a46a8738294a6
annotation_type: Pagination
revision: '00001'
annotations:
c11d8db649854c5d89ca3df22047d07b:
page_index: 1a
page_info: ''
reference: null
span:
start: 0
end: 57
note_ref: 05d117045b0c4ea5aee3aeba558e94bd
21671cb910d9486c8ba4793305c00d58:
page_index: 1b
page_info: ''
reference: null
span:
start: 60
end: 116
note_ref: 05d117045b0c4ea5aee3aeba558e94bd
671dc26715434318b3d641521d4e9292:
page_index: 2a
page_info: ''
reference: null
span:
start: 119
end: 166
note_ref: 05d117045b0c4ea5aee3aeba558e94bd
05d117045b0c4ea5aee3aeba558e94bd:
page_index: 2b
page_info: ''
reference: null
span:
start: 169
end: 215
3373e79434004aaeb8b2e69649243d2a:
page_index: 3a
page_info: ''
reference: null
span:
start: 218
end: 281
note_ref: 9efa117a2b9444ac8cb09c198d21cdd8
71dff610d4c841c58e9c815582bf8508:
page_index: 3b
page_info: ''
reference: null
span:
start: 284
end: 336
note_ref: 9efa117a2b9444ac8cb09c198d21cdd8
9efa117a2b9444ac8cb09c198d21cdd8:
page_index: 4a
page_info: ''
reference: null
span:
start: 339
end: 384
local_ids:
c11d8db649854c5d89ca3df22047d07b: 200000
21671cb910d9486c8ba4793305c00d58: 200001
671dc26715434318b3d641521d4e9292: 200002
05d117045b0c4ea5aee3aeba558e94bd: 200003
3373e79434004aaeb8b2e69649243d2a: 200004
71dff610d4c841c58e9c815582bf8508: 200005
9efa117a2b9444ac8cb09c198d21cdd8: 200006
27 changes: 27 additions & 0 deletions tests/data/save_text/expected_v002.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
་་༄ལོ། །རྒྱ་གར་སྐད་དུ།
དབྱིངས་སུ་བསྟོད་པ།
འཚལ་ལོ། །གང་ཞིག་

མཐོང་ངོ་། །ཕྱོགས་
དེ་དང་དེ་ཡི་ཕྱོགས་
ཏིང་འཛིན་རྡོ་རྗེ་ཡིས

རིམ་གྱིས་སྦྱངས་
མེད་ཉི་མ་ཟླ་བ་ཡང་།
་རྡུལ་ལ་སོགས།

འབྱོར་ཆེན་པོ་དེར་
སྡུག་བསྔལ་གྱིས་
དེ་ཡི་སྐུ་ལས་

ངོས་ལྗོན་ཤིང་
ལེན་པ་པོ་ཕུན་སུམ་ཚོགས་པའོ།
འདི་དག་གིས་ནི་སྦྱིན་པར་

མངའ་དབང་མཛད་པ་
འདི་དག་གིས་ནི་དེའི་
གིས་ནི་སྐྱེ་kkབ་ལ་

དེ་ལ་ནམ་མཁའི་
བ་ཡང་དག་པར་
གིས་ནི་ཆོས་སྟོན་པའི་
15 changes: 15 additions & 0 deletions tests/data/save_text/old_opf/P000002.opf/base/v001.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
ཉ༄ཚོ། །རྒྱ་གར་སྐད་དུ།
སྟ་བ་ནཱ་མ། བོད་སྐད་དུ།
པར་འོས་པ་བསྔགས་

གཏམ་འདི་ཙམ
འདི་ཉིད་སྨྲ་བར་
དང་-། །ཁྱོད་མ

འདོད་གང་དག་
སྐྱབས་འགྲོ་བ།
སྟོང་གིས་ཀྱང་།

རྒྱ་གར་གྱི་
༢༦༤ ༧པེ་〉〉་
བཞུགས་གོ།
27 changes: 27 additions & 0 deletions tests/data/save_text/old_opf/P000002.opf/base/v002.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
་་༄ལོ། །རྒྱ་གར་སྐད་དུ།
དབྱིངས་སུ་བསྟོད་པ།
འཚལ་ལོ། །གང་ཞིག་

མཐོང་ངོ་། །ཕྱོགས་
དེ་དང་དེ་ཡི་ཕྱོགས་
ཏིང་འཛིན་རྡོ་རྗེ་ཡིས

རིམ་གྱིས་སྦྱངས་
མེད་ཉི་མ་ཟླ་བ་ཡང་།
་རྡུལ་ལ་སོགས།

འབྱོར་ཆེན་པོ་དེར་
སྡུག་བསྔལ་གྱིས་
དེ་ཡི་སྐུ་ལས་

ངོས་ལྗོན་ཤིང་
ལེན་པ་པོ་ཕུན་སུམ་ཚོགས་པའོ།
འདི་དག་གིས་ནི་སྦྱིན་པར་

མངའ་དབང་མཛད་པ་
འདི་དག་གིས་ནི་དེའི་
གིས་ནི་སྐྱེ་བ་ལ་

དེ་ལ་ནམ་མཁའི་
བ་ཡང་དག་པར་
གིས་ནི་ཆོས་སྟོན་པའི་
Loading

0 comments on commit 135210e

Please sign in to comment.