diff --git a/pedurma/reconstruction.py b/pedurma/reconstruction.py index 42e2172..b552791 100644 --- a/pedurma/reconstruction.py +++ b/pedurma/reconstruction.py @@ -20,9 +20,8 @@ from docx import Document from pedurma.exceptions import PageNumMissing -from pedurma.pecha import NotesPage from pedurma.preprocess import preprocess_google_notes, preprocess_namsel_notes -from pedurma.preview_note_layer import update_hybird_pecha_note_layer +from pedurma.text_report import get_text_report from pedurma.texts import ( get_body_text_from_last_page, get_page_ann, @@ -1036,7 +1035,7 @@ def pecha_path_2_id(pecha_path): return pecha_path_obj.stem -def get_preview_text(text_id, pecha_paths=None): +def get_reconstructed_text(text_id, pecha_paths=None): if pecha_paths is None: pecha_paths = get_pecha_paths(text_id) pedurmatext = get_pedurma_text_obj(text_id, pecha_paths) @@ -1063,9 +1062,6 @@ def get_preview_text(text_id, pecha_paths=None): cur_vol_preview = get_vol_preview( dg_body, namsel_body, dg_note_text, namsel_note_text, vol_num ) - update_hybird_pecha_note_layer( - cur_vol_preview, pecha_paths["google"], int(vol_num) - ) preview_text[f"v{int(vol_num):03}"] = cur_vol_preview dg_body = "" namsel_body = "" @@ -1101,16 +1097,34 @@ def create_docx(text_id, chunks, path): return output_path -def get_docx_text(text_id, pecha_paths=None, output_path=None): +def get_docx_text(text_id, preview_text, output_path=None): if not output_path: (Path.home() / ".collation_docx").mkdir(parents=True, exist_ok=True) output_path = Path.home() / ".collation_docx" collation_text = "" - preview_text, google_pecha_id = get_preview_text(text_id, pecha_paths) for vol_id, text in preview_text.items(): collation_text += f"{text}\n\n" collation_text = collation_text.replace("\n", "") - collation_text = re.sub(r"(༺.+?༻)", r"\n\g<1>\n", collation_text) + collation_text = re.sub(r"(\d+-\d+)", r"\n\g<1>\n", collation_text) chunks = split_text(collation_text) docx_path = create_docx(text_id, chunks, output_path) return docx_path + + +def get_preview_text(text_id, docx_output_path, pecha_paths=None): + preview_text_info = { + "preview_text": None, + "google_pecha_id": None, + "docx_output_path": None, + "text_report": None, + } + preview_text, google_pecha_id = get_reconstructed_text(text_id, pecha_paths) + preview_text_info["preview_text"] = preview_text + preview_text_info["google_pecha_id"] = google_pecha_id + preview_text_info["docx_output_path"] = get_docx_text( + text_id, preview_text, docx_output_path + ) + preview_text_info["text_report"] = get_text_report( + text_id, pecha_paths, preview_text + ) + return preview_text_info diff --git a/pedurma/text_report.py b/pedurma/text_report.py index 1406ca6..774d263 100644 --- a/pedurma/text_report.py +++ b/pedurma/text_report.py @@ -4,6 +4,8 @@ from openpecha.utils import load_yaml +from pedurma.texts import get_pecha_paths + def get_metadata(pecha_path): pecha_id = Path(pecha_path).stem @@ -39,6 +41,8 @@ def get_text_report(text_id, pecha_paths, preview_text): "total_number_of_footnotes": None, "download_date": None, } + if pecha_paths is None: + pecha_paths = get_pecha_paths(text_id) text_report["title"] = get_text_title(pecha_paths["google"]) number_of_pages = 0 number_of_footnotes = 0 diff --git a/tests/preview/test_get_preview.py b/tests/preview/test_get_preview.py index 2fb7c4f..f882e18 100644 --- a/tests/preview/test_get_preview.py +++ b/tests/preview/test_get_preview.py @@ -5,7 +5,11 @@ from pedurma.exceptions import PageNumMissing from pedurma.pecha import NotesPage, Page -from pedurma.reconstruction import get_docx_text, get_preview_page, get_preview_text +from pedurma.reconstruction import ( + get_docx_text, + get_preview_page, + get_reconstructed_text, +) from pedurma.utils import from_yaml @@ -14,16 +18,7 @@ def get_dummy_preview(): namsel_pecha_path = str(Path(__file__).parent / "data" / "P973") text_id = "D1119" pecha_paths = {"namsel": namsel_pecha_path, "google": dg_pecha_path} - preview_text = get_preview_text(text_id, pecha_paths) - ( - Path(__file__).parent - / "data" - / "P972" - / "P972.opf" - / "layers" - / "v001" - / "PedurmaNote.yml" - ).unlink() + preview_text = get_reconstructed_text(text_id, pecha_paths) return preview_text @@ -151,19 +146,8 @@ def test_get_preview_text(): def test_get_docx_text(): text_id = "D1119" output_path = Path.home() - dg_pecha_path = str(Path(__file__).parent / "data" / "P972") - namsel_pecha_path = str(Path(__file__).parent / "data" / "P973") - pecha_paths = {"namsel": namsel_pecha_path, "google": dg_pecha_path} - docx_path = get_docx_text(text_id, pecha_paths, output_path) + preview_text, google_pecha_id = get_dummy_preview() + docx_path = get_docx_text(text_id, preview_text, output_path) expected_path = Path.home() / "D1119.docx" assert docx_path == expected_path expected_path.unlink() - ( - Path(__file__).parent - / "data" - / "P972" - / "P972.opf" - / "layers" - / "v001" - / "PedurmaNote.yml" - ).unlink()