diff --git a/Makefile b/Makefile index 778a29b0..026c8c6d 100755 --- a/Makefile +++ b/Makefile @@ -119,7 +119,7 @@ run-tests-utils: run-tests-broker: export $(shell sed 's/=.*//' ./tests/.env) - pytest tests/tests_broker/test_*.py -v + pytest tests/tests_broker/test_*.py -s -v run-tests-harvester: export $(shell sed 's/=.*//' ./tests/.env) diff --git a/src/broker/operandi_broker/worker.py b/src/broker/operandi_broker/worker.py index 181614ea..5c709628 100644 --- a/src/broker/operandi_broker/worker.py +++ b/src/broker/operandi_broker/worker.py @@ -5,6 +5,7 @@ from os.path import join from pathlib import Path from sys import exit +from typing import List from operandi_utils import reconfigure_all_loggers, get_log_file_path_prefix from operandi_utils.constants import LOG_LEVEL_WORKER, StateJob, StateWorkspace @@ -111,6 +112,7 @@ def __callback(self, ch, method, properties, body): workflow_script_path = Path(workflow_db.workflow_script_path) nf_uses_mets_server = workflow_db.uses_mets_server + nf_executable_steps = workflow_db.executable_steps workspace_dir = Path(workspace_db.workspace_dir) mets_basename = workspace_db.mets_basename ws_pages_amount = workspace_db.pages_amount @@ -132,8 +134,8 @@ def __callback(self, ch, method, properties, body): workspace_dir=workspace_dir, workspace_base_mets=mets_basename, workflow_script_path=workflow_script_path, input_file_grp=input_file_grp, nf_process_forks=nf_process_forks, ws_pages_amount=ws_pages_amount, use_mets_server=nf_uses_mets_server, - file_groups_to_remove=remove_file_grps, cpus=slurm_job_cpus, ram=slurm_job_ram, - partition=slurm_job_partition + nf_executable_steps=nf_executable_steps, file_groups_to_remove=remove_file_grps, cpus=slurm_job_cpus, + ram=slurm_job_ram, partition=slurm_job_partition ) self.log.info(f"The HPC slurm job was successfully submitted") except Exception as error: @@ -200,7 +202,8 @@ def signal_handler(self, sig, frame): def prepare_and_trigger_slurm_job( self, workflow_job_id: str, workspace_id: str, workspace_dir: Path, workspace_base_mets: str, workflow_script_path: Path, input_file_grp: str, nf_process_forks: int, ws_pages_amount: int, - use_mets_server: bool, file_groups_to_remove: str, cpus: int, ram: int, partition: str + use_mets_server: bool, nf_executable_steps: List[str], file_groups_to_remove: str, cpus: int, ram: int, + partition: str ) -> str: if self.test_sbatch: job_deadline_time = HPC_JOB_DEADLINE_TIME_TEST @@ -232,8 +235,9 @@ def prepare_and_trigger_slurm_job( workflow_job_id=workflow_job_id, nextflow_script_path=workflow_script_path, workspace_id=workspace_id, mets_basename=workspace_base_mets, input_file_grp=input_file_grp, nf_process_forks=nf_process_forks, ws_pages_amount=ws_pages_amount, - use_mets_server=use_mets_server, file_groups_to_remove=file_groups_to_remove, cpus=cpus, ram=ram, - job_deadline_time=job_deadline_time, partition=partition, qos=qos) + use_mets_server=use_mets_server, nf_executable_steps=nf_executable_steps, + file_groups_to_remove=file_groups_to_remove, cpus=cpus, ram=ram, job_deadline_time=job_deadline_time, + partition=partition, qos=qos) except Exception as error: db_stats = sync_db_increase_processing_stats( find_user_id=self.current_message_user_id, pages_failed=ws_pages_amount) diff --git a/src/server/operandi_server/routers/admin_panel.py b/src/server/operandi_server/routers/admin_panel.py index 0c8bdf87..9af5130b 100644 --- a/src/server/operandi_server/routers/admin_panel.py +++ b/src/server/operandi_server/routers/admin_panel.py @@ -51,6 +51,7 @@ def __init__(self): endpoint=self.user_workflows, methods=["GET"], status_code=status.HTTP_200_OK, summary="Get all workflows submitted by the user identified by user_id" ) + async def push_to_ola_hd(self, workspace_id: str, auth: HTTPBasicCredentials = Depends(HTTPBasic())): py_user_action = await self.user_authenticator.user_login(auth) if py_user_action.account_type != AccountType.ADMIN: @@ -86,7 +87,7 @@ async def get_users(self, auth: HTTPBasicCredentials = Depends(HTTPBasic())): message = "Admin privileges required for the endpoint" self.logger.error(message) raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=message) - + users = await db_get_all_user_accounts() return [PYUserInfo.from_db_user_account(user) for user in users] diff --git a/src/server/operandi_server/routers/workflow.py b/src/server/operandi_server/routers/workflow.py index c5194a50..5bf43374 100644 --- a/src/server/operandi_server/routers/workflow.py +++ b/src/server/operandi_server/routers/workflow.py @@ -12,11 +12,12 @@ from fastapi.security import HTTPBasic, HTTPBasicCredentials from starlette.status import HTTP_404_NOT_FOUND -from operandi_utils import get_nf_workflows_dir +from operandi_utils import get_nf_wfs_dir, get_ocrd_process_wfs_dir from operandi_utils.constants import AccountType, ServerApiTag, StateJob, StateWorkspace from operandi_utils.database import ( db_create_workflow, db_create_workflow_job, db_get_hpc_slurm_job, db_get_workflow, db_update_workspace, db_increase_processing_stats_with_handling) +from operandi_utils.oton import OTONConverter from operandi_utils.rabbitmq import ( get_connection_publisher, RABBITMQ_QUEUE_JOB_STATUSES, RABBITMQ_QUEUE_HARVESTER, RABBITMQ_QUEUE_USERS) from operandi_server.constants import ( @@ -30,7 +31,7 @@ get_db_workflow_job_with_handling, get_db_workflow_with_handling, nf_script_uses_mets_server_with_handling, - validate_oton_with_handling + validate_oton_with_handling, nf_script_executable_steps_with_handling ) from .workspace_utils import check_if_file_group_exists_with_handling, get_db_workspace_with_handling from .user import RouterUser @@ -133,10 +134,35 @@ async def _push_status_request_to_rabbitmq(self, job_id: str): self.logger.error(f"{message}, error: {error}") raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=message) - async def insert_production_workflows(self, production_workflows_dir: Path = get_nf_workflows_dir()): + async def produce_production_workflows( + self, + ocrd_process_wf_dir: Path = get_ocrd_process_wfs_dir(), + production_nf_wfs_dir: Path = get_nf_wfs_dir() + ): + oton_converter = OTONConverter() + for path in ocrd_process_wf_dir.iterdir(): + if not path.is_file(): + self.logger.info(f"Skipping non-file path: {path}") + continue + if path.suffix != '.txt': + self.logger.info(f"Skipping non .txt extension file path: {path}") + continue + # path.stem -> file_name + # path.name -> file_name.ext + self.logger.info(f"Converting to Nextflow workflow the ocrd process workflow: {path}") + output_path = Path(production_nf_wfs_dir, f"{path.stem}.nf") + oton_converter.convert_oton( + input_path=path, output_path=str(output_path), environment="apptainer", with_mets_server=False) + self.logger.info(f"Converted to a Nextflow file without a mets server: {output_path}") + output_path = Path(production_nf_wfs_dir, f"{path.stem}_with_MS.nf") + oton_converter.convert_oton( + input_path=path, output_path=str(output_path), environment="apptainer", with_mets_server=True) + self.logger.info(f"Converted to a Nextflow file with a mets server: {output_path}") + + async def insert_production_workflows(self, production_nf_wfs_dir: Path = get_nf_wfs_dir()): wf_detail = "Workflow provided by the Operandi Server" - self.logger.info(f"Inserting production workflows for Operandi from: {production_workflows_dir}") - for path in production_workflows_dir.iterdir(): + self.logger.info(f"Inserting production workflows for Operandi from: {production_nf_wfs_dir}") + for path in production_nf_wfs_dir.iterdir(): if not path.is_file(): self.logger.info(f"Skipping non-file path: {path}") continue @@ -150,11 +176,14 @@ async def insert_production_workflows(self, production_workflows_dir: Path = get nf_script_dest = join(workflow_dir, path.name) copyfile(src=path, dst=nf_script_dest) uses_mets_server = await nf_script_uses_mets_server_with_handling(self.logger, nf_script_dest) - self.logger.info(f"Inserting: {workflow_id}, uses_mets_server: {uses_mets_server}, script path: {nf_script_dest}") + executable_steps = await nf_script_executable_steps_with_handling(self.logger, nf_script_dest) + self.logger.info( + f"Inserting: {workflow_id}, uses_mets_server: {uses_mets_server}, script path: {nf_script_dest}") await db_create_workflow( user_id="Operandi Server", workflow_id=workflow_id, workflow_dir=workflow_dir, workflow_script_path=nf_script_dest, - workflow_script_base=path.name, uses_mets_server=uses_mets_server, details=wf_detail) + workflow_script_base=path.name, uses_mets_server=uses_mets_server, executable_steps=executable_steps, + details=wf_detail) self.production_workflows.append(workflow_id) async def list_workflows(self, auth: HTTPBasicCredentials = Depends(HTTPBasic())) -> List[WorkflowRsrc]: @@ -204,10 +233,11 @@ async def upload_workflow_script( self.logger.error(f"{message}, error: {error}") raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message) uses_mets_server = await nf_script_uses_mets_server_with_handling(self.logger, nf_script_dest) + executable_steps = await nf_script_executable_steps_with_handling(self.logger, nf_script_dest) db_workflow = await db_create_workflow( user_id=py_user_action.user_id, workflow_id=workflow_id, workflow_dir=workflow_dir, workflow_script_path=nf_script_dest, workflow_script_base=nextflow_script.filename, - uses_mets_server=uses_mets_server, details=details) + uses_mets_server=uses_mets_server, executable_steps=executable_steps, details=details) return WorkflowRsrc.from_db_workflow(db_workflow) async def update_workflow_script( @@ -239,10 +269,11 @@ async def update_workflow_script( self.logger.error(f"{message}, error: {error}") raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message) uses_mets_server = await nf_script_uses_mets_server_with_handling(self.logger, nf_script_dest) + executable_steps = await nf_script_executable_steps_with_handling(self.logger, nf_script_dest) db_workflow = await db_create_workflow( user_id=py_user_action.user_id, workflow_id=workflow_id, workflow_dir=workflow_dir, workflow_script_path=nf_script_dest, workflow_script_base=nextflow_script.filename, - uses_mets_server=uses_mets_server, details=details) + uses_mets_server=uses_mets_server, executable_steps=executable_steps, details=details) return WorkflowRsrc.from_db_workflow(db_workflow) async def get_workflow_job_status( @@ -442,7 +473,8 @@ def _push_job_to_rabbitmq( # Added by Faizan async def convert_txt_to_nextflow( - self, txt_file: UploadFile, environment: str, auth: HTTPBasicCredentials = Depends(HTTPBasic()) + self, txt_file: UploadFile, environment: str, with_mets_server: bool = True, + auth: HTTPBasicCredentials = Depends(HTTPBasic()) ): # Authenticate the user await self.user_authenticator.user_login(auth) @@ -459,5 +491,5 @@ async def convert_txt_to_nextflow( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message) await validate_oton_with_handling(self.logger, ocrd_process_txt) - await convert_oton_with_handling(self.logger, environment, ocrd_process_txt, nf_script_dest) - return FileResponse(nf_script_dest, filename=f'{oton_id}.nf') + await convert_oton_with_handling(self.logger, ocrd_process_txt, nf_script_dest, environment, with_mets_server) + return FileResponse(nf_script_dest, filename=f'{oton_id}.nf', media_type="application/txt-file") diff --git a/src/server/operandi_server/routers/workflow_utils.py b/src/server/operandi_server/routers/workflow_utils.py index 7401881b..591798b3 100644 --- a/src/server/operandi_server/routers/workflow_utils.py +++ b/src/server/operandi_server/routers/workflow_utils.py @@ -1,9 +1,11 @@ from fastapi import HTTPException, status from pathlib import Path +from typing import List from operandi_utils.database import db_get_workflow, db_get_workflow_job from operandi_utils.database.models import DBWorkflow, DBWorkflowJob from operandi_utils.oton import OTONConverter, OCRDValidator +from operandi_utils.oton.constants import PARAMS_KEY_METS_SOCKET_PATH async def get_db_workflow_with_handling( @@ -39,9 +41,8 @@ async def get_db_workflow_job_with_handling(logger, job_id: str, check_local_exi raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=message) return db_workflow_job - async def nf_script_uses_mets_server_with_handling( - logger, nf_script_path: str, search_string: str = "params.mets_socket" + logger, nf_script_path: str, search_string: str = PARAMS_KEY_METS_SOCKET_PATH ) -> bool: try: with open(nf_script_path) as nf_file: @@ -56,6 +57,35 @@ async def nf_script_uses_mets_server_with_handling( logger.error(f"{message}, error: {error}") raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=message) +async def nf_script_executable_steps_with_handling(logger, nf_script_path: str) -> List[str]: + processor_executables: List[str] = [] + try: + with open(nf_script_path) as nf_file: + line = nf_file.readline() + while line: + for word in line.split(' '): + if "ocrd-" in word: + processor_executables.append(word) + break + line = nf_file.readline() + except Exception as error: + message = "Failed to identify processor executables in the provided Nextflow workflow." + logger.error(f"{message}, error: {error}") + raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=message) + + """ + apptainer_images: List[str] = [] + try: + for executable in processor_executables: + apptainer_images.append(OCRD_PROCESSOR_EXECUTABLE_TO_IMAGE[executable]) + except Exception as error: + message = "Failed to produce apptainer image names from the processor executables list" + logger.error(f"{message}, error: {error}") + raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=message) + return apptainer_images + """ + logger.info(f"Found processor executables: {processor_executables}") + return processor_executables async def validate_oton_with_handling(logger, ocrd_process_txt_path: str): try: @@ -67,7 +97,9 @@ async def validate_oton_with_handling(logger, ocrd_process_txt_path: str): logger.error(f"{message}, error: {error}") raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message) -async def convert_oton_with_handling(logger, environment: str, ocrd_process_txt_path: str, nf_script_dest_path: str): +async def convert_oton_with_handling( + logger, ocrd_process_txt_path: str, nf_script_dest_path: str, environment: str, with_mets_server: bool +): environments = ["local", "docker", "apptainer"] if environment not in environments: message = f"Unknown environment value: {environment}. Must be one of: {environments}" @@ -75,12 +107,7 @@ async def convert_oton_with_handling(logger, environment: str, ocrd_process_txt_ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message) try: converter = OTONConverter() - if environment == "local": - converter.convert_oton_env_local(str(ocrd_process_txt_path), str(nf_script_dest_path)) - elif environment == "docker": - converter.convert_oton_env_docker(str(ocrd_process_txt_path), str(nf_script_dest_path)) - elif environment == "apptainer": - converter.convert_oton_env_apptainer(str(ocrd_process_txt_path), str(nf_script_dest_path)) + converter.convert_oton(str(ocrd_process_txt_path), str(nf_script_dest_path), environment, with_mets_server) except ValueError as error: message = "Failed to convert ocrd process workflow to nextflow workflow" logger.error(f"{message}, error: {error}") diff --git a/src/server/operandi_server/server.py b/src/server/operandi_server/server.py index c413cf2f..6c8eb873 100644 --- a/src/server/operandi_server/server.py +++ b/src/server/operandi_server/server.py @@ -114,6 +114,7 @@ async def include_webapi_routers(self): self.include_router(RouterDiscovery().router) self.include_router(RouterUser().router) workflow_router = RouterWorkflow() + await workflow_router.produce_production_workflows() await workflow_router.insert_production_workflows() self.include_router(workflow_router.router) self.include_router(RouterWorkspace().router) diff --git a/src/utils/operandi_utils/__init__.py b/src/utils/operandi_utils/__init__.py index 6cbc04c8..0fd74c2f 100644 --- a/src/utils/operandi_utils/__init__.py +++ b/src/utils/operandi_utils/__init__.py @@ -4,7 +4,8 @@ "is_url_responsive", "generate_id", "get_log_file_path_prefix", - "get_nf_workflows_dir", + "get_nf_wfs_dir", + "get_ocrd_process_wfs_dir", "make_zip_archive", "receive_file", "reconfigure_all_loggers", @@ -25,7 +26,8 @@ download_mets_file, is_url_responsive, generate_id, - get_nf_workflows_dir, + get_nf_wfs_dir, + get_ocrd_process_wfs_dir, receive_file, make_zip_archive, unpack_zip_archive, diff --git a/src/utils/operandi_utils/constants.py b/src/utils/operandi_utils/constants.py index 542fcdaf..c38f33cb 100644 --- a/src/utils/operandi_utils/constants.py +++ b/src/utils/operandi_utils/constants.py @@ -18,6 +18,7 @@ "LOG_LEVEL_SERVER", "LOG_LEVEL_WORKER", "MODULE_TYPES", + "OCRD_PROCESSOR_EXECUTABLE_TO_IMAGE", "OLA_HD_BAG_ENDPOINT", "OLA_HD_USER", "OLA_HD_PASSWORD", @@ -165,3 +166,78 @@ class StateWorkspace(str, Enum): TRANSFERRING_TO_HPC = "TRANSFERRING_TO_HPC" TRANSFERRING_FROM_HPC = "TRANSFERRING_FROM_HPC" UNSET = "UNSET" + +# TODO: Find a more optimal way of achieving this dynamically +OCRD_PROCESSOR_EXECUTABLE_TO_IMAGE = { + "ocrd": "ocrd_core.sif", + "ocrd-tesserocr-crop": "ocrd_tesserocr.sif", + "ocrd-tesserocr-deskew": "ocrd_tesserocr.sif", + "ocrd-tesserocr-recognize": "ocrd_tesserocr.sif", + "ocrd-tesserocr-segment": "ocrd_tesserocr.sif", + "ocrd-tesserocr-segment-line": "ocrd_tesserocr.sif", + "ocrd-tesserocr-segment-region": "ocrd_tesserocr.sif", + "ocrd-tesserocr-segment-table": "ocrd_tesserocr.sif", + "ocrd-tesserocr-segment-word": "ocrd_tesserocr.sif", + "ocrd-tesserocr-fontshape": "ocrd_tesserocr.sif", + "ocrd-tesserocr-binarize": "ocrd_tesserocr.sif", + "ocrd-cis-ocropy-binarize": "ocrd_cis.sif", + "ocrd-cis-ocropy-denoise": "ocrd_cis.sif", + "ocrd-cis-ocropy-deskew": "ocrd_cis.sif", + "ocrd-cis-ocropy-dewarp": "ocrd_cis.sif", + "ocrd-cis-ocropy-segment": "ocrd_cis.sif", + "ocrd-cis-ocropy-resegment": "ocrd_cis.sif", + "ocrd-cis-ocropy-clip": "ocrd_cis.sif", + "ocrd-cis-ocropy-recognize": "ocrd_cis.sif", + "ocrd-cis-ocropy-train": "ocrd_cis.sif", + "ocrd-cis-align": "ocrd_cis.sif", + "ocrd-cis-postcorrect": "ocrd_cis.sif", + "ocrd-kraken-recognize": "ocrd_kraken.sif", + "ocrd-kraken-segment": "ocrd_kraken.sif", + "ocrd-kraken-binarize": "ocrd_kraken.sif", + "ocrd-preprocess-image": "ocrd_wrap.sif", + "ocrd-skimage-normalize": "ocrd_wrap.sif", + "ocrd-skimage-binarize": "ocrd_wrap.sif", + "ocrd-skimage-denoise": "ocrd_wrap.sif", + "ocrd-skimage-denoise-raw": "ocrd_wrap.sif", + "ocrd-calamari-recognize": "ocrd_calamari.sif", + "ocrd-olena-binarize": "ocrd_olena.sif", + "ocrd-dinglehopper": "ocrd_dinglehopper.sif", + "ocrd-eynollah-segment": "ocrd_eynollah.sif", + "ocrd-fileformat-transform": "ocrd_fileformat.sif", + "ocrd-nmalign-merge": "ocrd_nmalign.sif", + "ocrd-segment-extract-glyphs": "ocrd_segment.sif", + "ocrd-segment-extract-lines": "ocrd_segment.sif", + "ocrd-segment-extract-pages": "ocrd_segment.sif", + "ocrd-segment-extract-regions": "ocrd_segment.sif", + "ocrd-segment-extract-words": "ocrd_segment.sif", + "ocrd-segment-from-coco": "ocrd_segment.sif", + "ocrd-segment-from-masks": "ocrd_segment.sif", + "ocrd-segment-project": "ocrd_segment.sif", + "ocrd-segment-repair": "ocrd_segment.sif", + "ocrd-segment-replace-original": "ocrd_segment.sif", + "ocrd-segment-replace-page": "ocrd_segment.sif", + "ocrd-segment-replace-text": "ocrd_segment.sif", + "ocrd-segment-evaluate": "ocrd_segment.sif", + "ocrd-anybaseocr-dewarp": "ocrd_anybaseocr.sif", + "ocrd-anybaseocr-crop": "ocrd_anybaseocr.sif", + "ocrd-anybaseocr-binarize": "ocrd_anybaseocr.sif", + "ocrd-anybaseocr-layout-analysis": "ocrd_anybaseocr.sif", + "ocrd-anybaseocr-textline": "ocrd_anybaseocr.sif", + "ocrd-anybaseocr-tiseg": "ocrd_anybaseocr.sif", + "ocrd-anybaseocr-block-segmentation": "ocrd_anybaseocr.sif", + "ocrd-anybaseocr-deskew": "ocrd_anybaseocr.sif", + "ocrd-sbb-binarize": "ocrd_sbb_binarization.sif", + "ocrd-detectron2-segment": "ocrd_detectron2.sif", + "ocrd-froc": "ocrd_froc.sif", + "ocrd-pagetopdf": "ocrd_pagetopdf.sif", + "ocrd-keraslm-rate": "ocrd_keraslm.sif", + "ocrd-docstruct": "ocrd_docstruct.sif", + "ocrd-doxa-binarize": "ocrd_doxa.sif", + "ocrd-im6convert": "ocrd_im6convert.sif", + "ocrd-olahd-client": "ocrd_olahd-client.sif", + "ocrd-cor-asv-ann-mark": "ocrd_cor-asv-ann.sif", + "ocrd-cor-asv-ann-align": "ocrd_cor-asv-ann.sif", + "ocrd-cor-asv-ann-evaluate": "ocrd_cor-asv-ann.sif", + "ocrd-cor-asv-ann-join": "ocrd_cor-asv-ann.sif", + "ocrd-cor-asv-ann-process": "ocrd_cor-asv-ann.sif" +} diff --git a/src/utils/operandi_utils/database/db_workflow.py b/src/utils/operandi_utils/database/db_workflow.py index 91b3da8b..8fb4c0b2 100644 --- a/src/utils/operandi_utils/database/db_workflow.py +++ b/src/utils/operandi_utils/database/db_workflow.py @@ -7,7 +7,7 @@ # TODO: This also updates to satisfy the PUT method in the Workflow Manager - fix this async def db_create_workflow( user_id: str, workflow_id: str, workflow_dir: str, workflow_script_base: str, workflow_script_path: str, - uses_mets_server: bool, details: str = "Workflow" + uses_mets_server: bool, executable_steps: List[str], details: str = "Workflow" ) -> DBWorkflow: try: db_workflow = await db_get_workflow(workflow_id) @@ -19,6 +19,7 @@ async def db_create_workflow( workflow_script_base=workflow_script_base, workflow_script_path=workflow_script_path, uses_mets_server=uses_mets_server, + executable_steps=executable_steps, datetime=datetime.now(), details=details ) @@ -29,6 +30,7 @@ async def db_create_workflow( db_workflow.workflow_script_base = workflow_script_base db_workflow.workflow_script_path = workflow_script_path db_workflow.uses_mets_server = uses_mets_server + db_workflow.executable_steps = executable_steps db_workflow.details = details await db_workflow.save() return db_workflow @@ -37,10 +39,11 @@ async def db_create_workflow( @call_sync async def sync_db_create_workflow( user_id: str, workflow_id: str, workflow_dir: str, workflow_script_base: str, workflow_script_path: str, - uses_mets_server: bool, details: str = "Workflow" + uses_mets_server: bool, executable_steps: List[str], details: str = "Workflow" ) -> DBWorkflow: return await db_create_workflow( - user_id, workflow_id, workflow_dir, workflow_script_base, workflow_script_path, uses_mets_server, details) + user_id, workflow_id, workflow_dir, workflow_script_base, workflow_script_path, uses_mets_server, + executable_steps, details) async def db_get_workflow(workflow_id: str) -> DBWorkflow: @@ -87,6 +90,8 @@ async def db_update_workflow(find_workflow_id: str, **kwargs) -> DBWorkflow: db_workflow.workflow_script_path = value elif key == "uses_mets_server": db_workflow.uses_mets_server = value + elif key == "executable_steps": + db_workflow.executable_steps = value elif key == "deleted": db_workflow.deleted = value elif key == "details": @@ -103,4 +108,4 @@ async def sync_db_update_workflow(find_workflow_id: str, **kwargs) -> DBWorkflow @call_sync async def sync_db_get_all_workflows_by_user(user_id: str, start_date: Optional[datetime] = None, end_date: Optional[datetime] = None) -> List[DBWorkflow]: - return await db_get_all_workflows_by_user(user_id, start_date, end_date) \ No newline at end of file + return await db_get_all_workflows_by_user(user_id, start_date, end_date) diff --git a/src/utils/operandi_utils/database/models.py b/src/utils/operandi_utils/database/models.py index d587e589..d885df0e 100644 --- a/src/utils/operandi_utils/database/models.py +++ b/src/utils/operandi_utils/database/models.py @@ -102,6 +102,7 @@ class DBWorkflow(Document): workflow_script_base The name of the nextflow script file workflow_script_path Nextflow workflow file full path on the server uses_mets_server Whether the NF script forwards requests to a workspace mets server + executable_steps A list of ocrd_processor executables deleted Whether the entry has been deleted locally from the server datetime Shows the created date time of the entry details Extra user specified details about this entry @@ -112,6 +113,7 @@ class DBWorkflow(Document): workflow_script_base: str workflow_script_path: str uses_mets_server: bool + executable_steps: List[str] deleted: bool = False datetime = datetime.now() details: Optional[str] diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh old mode 100644 new mode 100755 index 24466582..35372adc --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh @@ -1,6 +1,6 @@ #!/bin/bash -#SBATCH --partition standard96:shared -#SBATCH --time 00:05:00 +#SBATCH --partition standard96s:shared +#SBATCH --time 00:20:00 #SBATCH --qos 2h #SBATCH --output check_ocrd_all_version_job-%J.txt #SBATCH --cpus-per-task 1 @@ -9,12 +9,96 @@ set -e hostname -/opt/slurm/etc/scripts/misc/slurm_resources +# /opt/slurm/etc/scripts/misc/slurm_resources module purge module load apptainer -SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_all_maximum_image.sif" +SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_processor_sifs/ocrd_all_maximum_image.sif" -apptainer exec "$SIF_PATH" ocrd --version apptainer exec "$SIF_PATH" ocrd-tesserocr-recognize --dump-module-dir apptainer exec "$SIF_PATH" ls -la /models + +ocrd_processors=( +"ocrd-anybaseocr-binarize" +"ocrd-anybaseocr-block-segmentation" +"ocrd-anybaseocr-crop" +"ocrd-anybaseocr-deskew" +"ocrd-anybaseocr-dewarp" +"ocrd-anybaseocr-layout-analysis" +"ocrd-anybaseocr-textline" +"ocrd-anybaseocr-tiseg" +"ocrd-calamari-recognize" +"ocrd-cis-align" +"ocrd-cis-data" +"ocrd-cis-ocropy-binarize" +"ocrd-cis-ocropy-clip" +"ocrd-cis-ocropy-denoise" +"ocrd-cis-ocropy-deskew" +"ocrd-cis-ocropy-dewarp" +"ocrd-cis-ocropy-recognize" +"ocrd-cis-ocropy-resegment" +"ocrd-cis-ocropy-segment" +"ocrd-cis-ocropy-train" +"ocrd-cis-postcorrect" +"ocrd-cor-asv-ann-align" +"ocrd-cor-asv-ann-evaluate" +"ocrd-cor-asv-ann-join" +"ocrd-cor-asv-ann-mark" +"ocrd-cor-asv-ann-process" +"ocrd-detectron2-segment" +"ocrd-dinglehopper" +"ocrd-docstruct" +"ocrd-doxa-binarize" +"ocrd-dummy" +"ocrd-eynollah-segment" +"ocrd-fileformat-transform" +"ocrd-froc-recognize" +"ocrd-im6convert" +"ocrd-import" +"ocrd-keraslm-rate" +"ocrd-kraken-binarize" +"ocrd-kraken-recognize" +"ocrd-kraken-segment" +"ocrd-make" +"ocrd-nmalign-merge" +"ocrd-olahd-client" +"ocrd-olena-binarize" +"ocrd-page2alto-transform" +"ocrd-pagetopdf" +"ocrd-page-transform" +"ocrd-preprocess-image" +"ocrd-repair-inconsistencies" +"ocrd-sbb-binarize" +"ocrd-segment-evaluate" +"ocrd-segment-extract-glyphs" +"ocrd-segment-extract-lines" +"ocrd-segment-extract-pages" +"ocrd-segment-extract-regions" +"ocrd-segment-extract-words" +"ocrd-segment-from-coco" +"ocrd-segment-from-masks" +"ocrd-segment-project" +"ocrd-segment-repair" +"ocrd-segment-replace-original" +"ocrd-segment-replace-page" +"ocrd-segment-replace-text" +"ocrd-skimage-binarize" +"ocrd-skimage-denoise" +"ocrd-skimage-denoise-raw" +"ocrd-skimage-normalize" +"ocrd-tesserocr-binarize" +"ocrd-tesserocr-crop" +"ocrd-tesserocr-deskew" +"ocrd-tesserocr-fontshape" +"ocrd-tesserocr-recognize" +"ocrd-tesserocr-segment" +"ocrd-tesserocr-segment-line" +"ocrd-tesserocr-segment-region" +"ocrd-tesserocr-segment-table" +"ocrd-tesserocr-segment-word" +) + +for ocrd_processor in "${ocrd_processors[@]}" +do + echo -n "$ocrd_processor " & apptainer exec "$SIF_PATH" "$ocrd_processor" --version || true +done diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh index 96500594..d5e68d31 100755 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --partition standard96:shared +#SBATCH --partition standard96s:shared #SBATCH --time 2:00:00 #SBATCH --output create_ocrd_all_sif_job-%J.txt #SBATCH --cpus-per-task 16 @@ -11,7 +11,7 @@ module purge module load apptainer hostname -/opt/slurm/etc/scripts/misc/slurm_resources +# /opt/slurm/etc/scripts/misc/slurm_resources APPTAINER_TMPDIR="$LOCAL_TMPDIR" APPTAINER_CACHE_DIR="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr" diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_slim_sif_images.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_slim_sif_images.sh new file mode 100755 index 00000000..711b264e --- /dev/null +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_slim_sif_images.sh @@ -0,0 +1,67 @@ +#!/bin/bash +#SBATCH --partition standard96s:shared +#SBATCH --time 4:00:00 +#SBATCH --output create_ocrd_slim_sif_images_job-%J.txt +#SBATCH --cpus-per-task 16 +#SBATCH --mem 64G + +set -e + +module purge +module load apptainer + +hostname +# /opt/slurm/etc/scripts/misc/slurm_resources + +APPTAINER_TMPDIR="$LOCAL_TMPDIR" +APPTAINER_CACHE_DIR="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_processor_sifs_tmp" + +if [ ! -d "${APPTAINER_CACHE_DIR}" ]; then + echo "Creating non-existing APPTAINER_CACHE_DIR folder" + mkdir -p "${APPTAINER_CACHE_DIR}" +fi + +cd "${APPTAINER_CACHE_DIR}" || exit +# apptainer build --disable-cache "ocrd_all_maximum_image_new.sif" "docker://ocrd/all:latest" +# apptainer exec "ocrd_all_maximum_image_new.sif" ocrd --version + +declare -a images=( +"core" +"tesserocr" +"cis" +"kraken" +"wrap" +"calamari" +"olena" +"dinglehopper" +"eynollah" +"fileformat" +"nmalign" +"segment" +"anybaseocr" +"sbb_binarization" +"froc" +"pagetopdf" +"keraslm" +"docstruct" +"doxa" +"im6convert" +"olahd-client" +"detectron2" +"cor-asv-ann" +) + +for image in "${images[@]}" +do + if [ -f "$APPTAINER_CACHE_DIR/ocrd_$image.sif" ]; then + echo "Already exists, skipping: $APPTAINER_CACHE_DIR/ocrd_$image.sif" + continue + fi + echo "Building SIF of $image" + apptainer build --disable-cache "ocrd_$image.sif" "docker://ocrd/$image:latest" + case $? in + 0) echo "Building complete: $APPTAINER_CACHE_DIR/ocrd_$image.sif" ;; + *) echo "Building failed, error code: $?" >&2 ;; + esac + echo "" +done diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh old mode 100644 new mode 100755 index 05e543ab..ed2b7a88 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --partition standard96:shared +#SBATCH --partition standard96s:shared #SBATCH --time 6:00:00 #SBATCH --output download_all_ocrd_models_job-%J.txt #SBATCH --cpus-per-task 16 @@ -11,10 +11,10 @@ module purge module load apptainer hostname -/opt/slurm/etc/scripts/misc/slurm_resources +# /opt/slurm/etc/scripts/misc/slurm_resources # This sif file is generated with another batch script -SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_all_maximum_image.sif" +SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_processor_sifs/ocrd_all_maximum_image.sif" OCRD_MODELS_DIR="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_models" OCRD_MODELS_DIR_IN_DOCKER="/usr/local/share" diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh index 04ebe84c..cc2dcafa 100755 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh @@ -2,123 +2,166 @@ set -e -# Parameters are as follows: -# S0 - This batch script -# S1 - The scratch base for slurm workspaces -# $2 - Workflow job id -# $3 - Nextflow script id -# $4 - Entry input file group -# $5 - Workspace id -# $6 - Mets basename - default "mets.xml" -# $7 - CPUs for the Nextflow processes -# $8 - RAM for the Nextflow processes -# $9 - Amount of forks per OCR-D processor in the NF script -# $10 - Amount of pages in the workspace -# $11 - Boolean flag showing whether a mets server is utilized or not -# $12 - File groups to be removed from the workspace after the processing - -SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_all_maximum_image.sif" -SIF_PATH_IN_NODE="${TMP_LOCAL}/ocrd_all_maximum_image.sif" -OCRD_MODELS_DIR="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_models" -OCRD_MODELS_DIR_IN_NODE="${TMP_LOCAL}/ocrd_models" -OCRD_MODELS_DIR_IN_DOCKER="/usr/local/share/ocrd-resources" -BIND_OCRD_MODELS="${OCRD_MODELS_DIR_IN_NODE}/ocrd-resources:${OCRD_MODELS_DIR_IN_DOCKER}" - -SCRATCH_BASE=$1 -WORKFLOW_JOB_ID=$2 -NEXTFLOW_SCRIPT_ID=$3 -IN_FILE_GRP=$4 -WORKSPACE_ID=$5 -METS_BASENAME=$6 -CPUS=$7 -RAM=$8 -FORKS=$9 -PAGES=${10} -USE_METS_SERVER=${11} -FILE_GROUPS_TO_REMOVE=${12} - -WORKFLOW_JOB_DIR="${SCRATCH_BASE}/${WORKFLOW_JOB_ID}" -NF_SCRIPT_PATH="${WORKFLOW_JOB_DIR}/${NEXTFLOW_SCRIPT_ID}" -WORKSPACE_DIR="${WORKFLOW_JOB_DIR}/${WORKSPACE_ID}" -WORKSPACE_DIR_IN_DOCKER="/ws_data" -BIND_WORKSPACE_DIR="${WORKSPACE_DIR}:${WORKSPACE_DIR_IN_DOCKER}" -BIND_METS_FILE_PATH="${WORKSPACE_DIR_IN_DOCKER}/${METS_BASENAME}" -METS_SOCKET_BASENAME="mets_server.sock" -BIND_METS_SOCKET_PATH="${WORKSPACE_DIR_IN_DOCKER}/${METS_SOCKET_BASENAME}" - -hostname -/opt/slurm/etc/scripts/misc/slurm_resources - module purge +module load jq module load apptainer module load nextflow # module load spack-user; eval "$(spack load --sh curl%gcc@10.2.0)" -echo "ocrd all SIF path: $SIF_PATH" -echo "ocrd all SIF path node local: $SIF_PATH_IN_NODE" -echo "Workspace dir: $WORKSPACE_DIR" -echo "Nextflow script path: $NF_SCRIPT_PATH" -echo "Use mets server: $USE_METS_SERVER" -echo "Used file group: $IN_FILE_GRP" -echo "Pages: $PAGES" +hostname +# /opt/slurm/etc/scripts/misc/slurm_resource # To submit separate jobs for each process in the NF script # export NXF_EXECUTOR=slurm +# TODO: Use the -r (or --raw-output) option to emit raw strings as output: +json_args="$1" + +ocrd_processor_images=() +mapfile -t ocrd_processor_images < <(echo "$json_args" | jq .ocrd_processor_images | tr -d '"' | tr "," "\n") +echo "Ocrd total images in request: ${#ocrd_processor_images[@]}" +echo "Ocrd images: " +for ocrd_image in "${ocrd_processor_images[@]}" +do + echo -n "${ocrd_image} " +done + +PROJECT_BASE_DIR=$(echo "$json_args" | jq .project_base_dir | tr -d '"') +SCRATCH_BASE=$(echo "$json_args" | jq .scratch_base_dir | tr -d '"') +WORKFLOW_JOB_ID=$(echo "$json_args" | jq .workflow_job_id | tr -d '"') +WORKSPACE_ID=$(echo "$json_args" | jq .workspace_id | tr -d '"') +USE_METS_SERVER=$(echo "$json_args" | jq .use_mets_server_bash_flag | tr -d '"') +FILE_GROUPS_TO_REMOVE=$(echo "$json_args" | jq .file_groups_to_remove | tr -d '"') + +WORKFLOW_JOB_DIR=$(echo "$json_args" | jq .hpc_workflow_job_dir | tr -d '"') +WORKSPACE_DIR=$(echo "$json_args" | jq .hpc_workspace_dir | tr -d '"') +NF_RUN_COMMAND=$(echo "$json_args" | jq .nf_run_command | tr -d '"') +PRINT_OCRD_VERSION_COMMAND=$(echo "$json_args" | jq .print_ocrd_version_command | tr -d '"') +START_METS_SERVER_COMMAND=$(echo "$json_args" | jq .start_mets_server_command | tr -d '"') +STOP_METS_SERVER_COMMAND=$(echo "$json_args" | jq .stop_mets_server_command | tr -d '"') +LIST_FILE_GROUPS_COMMAND=$(echo "$json_args" | jq .list_file_groups_command | tr -d '"') +REMOVE_FILE_GROUP_COMMAND=$(echo "$json_args" | jq .remove_file_group_command | tr -d '"') + +PROJECT_DIR_OCRD_MODELS="${PROJECT_BASE_DIR}/ocrd_models" +PROJECT_DIR_PROCESSOR_SIFS="${PROJECT_BASE_DIR}/ocrd_processor_sifs" + +NODE_DIR_OCRD_MODELS="${TMP_LOCAL}/ocrd_models" +NODE_DIR_PROCESSOR_SIFS="${TMP_LOCAL}/ocrd_processor_sifs" + +echo "" +echo "Project dir ocrd models: $PROJECT_DIR_OCRD_MODELS" +echo "Project dir processor sifs: $PROJECT_DIR_PROCESSOR_SIFS" +echo "Node dir ocrd models: $NODE_DIR_OCRD_MODELS" +echo "Node dir processor sifs: $NODE_DIR_PROCESSOR_SIFS" +echo "" -# Define functions to be used -check_existence_of_paths() { - # The SIF file of the OCR-D All docker image must be previously created - if [ ! -f "${SIF_PATH}" ]; then - echo "Required ocrd_all_image sif file not found at: ${SIF_PATH}" - exit 1 - fi - echo "Required ocrd_all_image sif file found at: ${SIF_PATH}" - - # Models directory must be previously filled with OCR-D models - if [ ! -d "${OCRD_MODELS_DIR}" ]; then - echo "Ocrd models directory not found at: ${OCRD_MODELS_DIR}" - exit 1 - fi - echo "Ocrd models directory found at: ${OCRD_MODELS_DIR}" +echo "Workspace dir: $WORKSPACE_DIR" +echo "Use mets server: $USE_METS_SERVER" +echo "" +echo "Nf run command with Node placeholders: $NF_RUN_COMMAND" +NF_RUN_COMMAND="${NF_RUN_COMMAND//PH_NODE_DIR_OCRD_MODELS/$NODE_DIR_OCRD_MODELS}" +NF_RUN_COMMAND="${NF_RUN_COMMAND//PH_CMD_WRAPPER/\'}" +NF_RUN_COMMAND="${NF_RUN_COMMAND//PH_NODE_DIR_PROCESSOR_SIFS/$NODE_DIR_PROCESSOR_SIFS}" +echo "" +echo "Nf run command without placeholders: $NF_RUN_COMMAND" +echo "" + +echo "Replacing ocrd core NODE_DIR_PROCESSOR_SIFS" +PRINT_OCRD_VERSION_COMMAND="${PRINT_OCRD_VERSION_COMMAND//PH_NODE_DIR_PROCESSOR_SIFS/$NODE_DIR_PROCESSOR_SIFS}" +START_METS_SERVER_COMMAND="${START_METS_SERVER_COMMAND//PH_NODE_DIR_PROCESSOR_SIFS/$NODE_DIR_PROCESSOR_SIFS}" +STOP_METS_SERVER_COMMAND="${STOP_METS_SERVER_COMMAND//PH_NODE_DIR_PROCESSOR_SIFS/$NODE_DIR_PROCESSOR_SIFS}" +LIST_FILE_GROUPS_COMMAND="${LIST_FILE_GROUPS_COMMAND//PH_NODE_DIR_PROCESSOR_SIFS/$NODE_DIR_PROCESSOR_SIFS}" +REMOVE_FILE_GROUP_COMMAND="${REMOVE_FILE_GROUP_COMMAND//PH_NODE_DIR_PROCESSOR_SIFS/$NODE_DIR_PROCESSOR_SIFS}" +echo "" + +check_existence_of_dir_scratch_base(){ if [ ! -d "${SCRATCH_BASE}" ]; then + echo "Creating non-existing SCRATCH_BASE folder" mkdir -p "${SCRATCH_BASE}" fi - if [ ! -d "${SCRATCH_BASE}" ]; then - echo "Required scratch base dir was not created: ${SCRATCH_BASE}" + echo "Required scratch base dir was not found: ${SCRATCH_BASE}" exit 1 fi echo "Scratch base found/created at: ${SCRATCH_BASE}" } +check_existence_of_dir_ocrd_models(){ + # Models directory must be previously filled with OCR-D models + if [ ! -d "${PROJECT_DIR_OCRD_MODELS}" ]; then + echo "Ocrd models directory not found at: ${PROJECT_DIR_OCRD_MODELS}" + exit 1 + fi + echo "Ocrd models directory found at: ${PROJECT_DIR_OCRD_MODELS}" +} + +check_existence_of_ocrd_processor_images_to_be_used(){ + for ocrd_image in "${ocrd_processor_images[@]}" + do + image_path="${PROJECT_DIR_PROCESSOR_SIFS}/${ocrd_image}" + if [ ! -f "$image_path" ]; then + echo "Expected ocrd processor image not found at: $image_path" + exit 1 + fi + done +} + +check_existence_of_paths() { + check_existence_of_dir_scratch_base + check_existence_of_dir_ocrd_models + check_existence_of_ocrd_processor_images_to_be_used +} + clear_data_from_computing_node() { - echo "If existing, removing the SIF from the computing node, path: ${SIF_PATH_IN_NODE}" - rm -f "${SIF_PATH_IN_NODE}" - echo "If existing, removing the OCR-D models from the computing node, path: ${OCRD_MODELS_DIR_IN_NODE}" - rm -rf "${OCRD_MODELS_DIR_IN_NODE}" + echo "" + echo "Removing the OCR-D models directory from the computing node, path: ${NODE_DIR_OCRD_MODELS}" + rm -rf "${NODE_DIR_OCRD_MODELS}" + echo "Removing the OCR-D processor images (SIF) directory from the computing node, path: ${NODE_DIR_PROCESSOR_SIFS}" + rm -rf "${NODE_DIR_PROCESSOR_SIFS}" } -transfer_requirements_to_node_storage() { - cp "${SIF_PATH}" "${SIF_PATH_IN_NODE}" - # Check if transfer successful - if [ ! -f "${SIF_PATH_IN_NODE}" ]; then - echo "Required ocrd_all_image sif file not found at node local storage: ${SIF_PATH_IN_NODE}" +transfer_to_node_storage_processor_models(){ + cp -R "${PROJECT_DIR_OCRD_MODELS}" "${NODE_DIR_OCRD_MODELS}" + if [ ! -d "${NODE_DIR_OCRD_MODELS}" ]; then + echo "Ocrd models directory not found at node local storage: ${NODE_DIR_OCRD_MODELS}" + clear_data_from_computing_node exit 1 else - echo "Successfully transferred SIF to node local storage" - apptainer exec "$SIF_PATH_IN_NODE" ocrd --version + echo "Successfully transferred ocrd models to node local storage" fi +} - cp -R "${OCRD_MODELS_DIR}" "${OCRD_MODELS_DIR_IN_NODE}" - if [ ! -d "${OCRD_MODELS_DIR_IN_NODE}" ]; then - echo "Ocrd models directory not found at node local storage: ${OCRD_MODELS_DIR_IN_NODE}" - clear_data_from_computing_node +transfer_to_node_storage_processor_images(){ + if [ ! -d "${NODE_DIR_PROCESSOR_SIFS}" ]; then + echo "Creating non-existing processor sif images dir: $NODE_DIR_PROCESSOR_SIFS" + mkdir -p "${NODE_DIR_PROCESSOR_SIFS}" + fi + if [ ! -d "${NODE_DIR_PROCESSOR_SIFS}" ]; then + echo "Required node processor sif images dir was not found: ${NODE_DIR_PROCESSOR_SIFS}" exit 1 - else - echo "Successfully transferred ocrd models to node local storage" fi + + for ocrd_image in "${ocrd_processor_images[@]}" + do + ocrd_image_path="${PROJECT_DIR_PROCESSOR_SIFS}/${ocrd_image}" + node_ocrd_image_path="${NODE_DIR_PROCESSOR_SIFS}/${ocrd_image}" + if [ ! -f "$ocrd_image_path" ]; then + echo "Expected ocrd processor image not found at: $ocrd_image_path" + exit 1 + else + echo "Transferring ocrd processor image to the compute node: ${ocrd_image}" + cp "${ocrd_image_path}" "${node_ocrd_image_path}" + echo "Ocrd processor image was transferred to: ${node_ocrd_image_path}" + if [ ! -f "${node_ocrd_image_path}" ]; then + echo "Expected ocrd processor image was copied but not found locally at: ${node_ocrd_image_path}" + exit 1 + fi + fi + done + echo "" + eval "$PRINT_OCRD_VERSION_COMMAND" + echo "" } unzip_workflow_job_dir() { @@ -142,70 +185,27 @@ unzip_workflow_job_dir() { } start_mets_server() { - # TODO: Would be better to start the mets server as an instance, but this is still broken - # apptainer instance start \ - # --bind "${BIND_WORKSPACE_DIR}" \ - # "${SIF_PATH_IN_NODE}" \ - # instance_mets_server \ - # ocrd workspace -U "${BIND_METS_SOCKET_PATH}" -d "${WORKSPACE_DIR_IN_DOCKER}" server start - if [ "$1" == "true" ] ; then echo "Starting the mets server for the specific workspace in the background" - apptainer exec \ - --bind "${BIND_WORKSPACE_DIR}" \ - "${SIF_PATH_IN_NODE}" \ - ocrd workspace -U "${BIND_METS_SOCKET_PATH}" -d "${WORKSPACE_DIR_IN_DOCKER}" server start \ - > "${WORKSPACE_DIR}/mets_server.log" 2>&1 & + eval "$START_METS_SERVER_COMMAND" + sleep 10 fi - sleep 10 } stop_mets_server() { - # Not supported in the HPC (the version there is <7.40) - # curl -X DELETE --unix-socket "${WORKSPACE_DIR}/${METS_SOCKET_BASENAME}" "http://localhost/" - - # TODO Stop the instance here - # singularity instance stop instance_mets_server - if [ "$1" == "true" ] ; then echo "Stopping the mets server" - apptainer exec \ - --bind "${BIND_WORKSPACE_DIR}" \ - "${SIF_PATH_IN_NODE}" \ - ocrd workspace -U "${BIND_METS_SOCKET_PATH}" -d "${WORKSPACE_DIR_IN_DOCKER}" server stop + eval "$STOP_METS_SERVER_COMMAND" fi } execute_nextflow_workflow() { - local APPTAINER_CMD="apptainer exec --bind ${BIND_WORKSPACE_DIR} --bind ${BIND_OCRD_MODELS} --env OCRD_METS_CACHING=false ${SIF_PATH_IN_NODE}" if [ "$1" == "true" ] ; then echo "Executing the nextflow workflow with mets server" - nextflow run "${NF_SCRIPT_PATH}" \ - -ansi-log false \ - -with-report \ - --input_file_group "${IN_FILE_GRP}" \ - --mets "${BIND_METS_FILE_PATH}" \ - --mets_socket "${BIND_METS_SOCKET_PATH}" \ - --workspace_dir "${WORKSPACE_DIR_IN_DOCKER}" \ - --pages "${PAGES}" \ - --singularity_wrapper "${APPTAINER_CMD}" \ - --cpus "${CPUS}" \ - --ram "${RAM}" \ - --forks "${FORKS}" else echo "Executing the nextflow workflow without mets server" - nextflow run "${NF_SCRIPT_PATH}" \ - -ansi-log false \ - -with-report \ - --input_file_group "${IN_FILE_GRP}" \ - --mets "${BIND_METS_FILE_PATH}" \ - --workspace_dir "${WORKSPACE_DIR_IN_DOCKER}" \ - --pages "${PAGES}" \ - --singularity_wrapper "${APPTAINER_CMD}" \ - --cpus "${CPUS}" \ - --ram "${RAM}" \ - --forks "${FORKS}" fi + eval "$NF_RUN_COMMAND" case $? in 0) echo "The nextflow workflow execution has finished successfully" ;; @@ -215,7 +215,7 @@ execute_nextflow_workflow() { list_file_groups_from_workspace() { all_file_groups=() - mapfile -t all_file_groups < <(apptainer exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" ocrd workspace -d "${WORKSPACE_DIR_IN_DOCKER}" list-group) + mapfile -t all_file_groups < <($LIST_FILE_GROUPS_COMMAND) file_groups_length=${#all_file_groups[@]} echo -n "File groups: " for file_group in "${all_file_groups[@]}" @@ -228,9 +228,8 @@ list_file_groups_from_workspace() { remove_file_group_from_workspace() { echo "Removing file group: $1" - apptainer exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" \ - ocrd workspace -d "${WORKSPACE_DIR_IN_DOCKER}" remove-group -r -f "$1" \ - > "${WORKSPACE_DIR}/remove_file_groups.log" 2>&1 + REMOVE_FILE_GROUP_COMMAND="${REMOVE_FILE_GROUP_COMMAND//FILE_GROUP_PLACEHOLDER/$1}" + eval "$REMOVE_FILE_GROUP_COMMAND" } remove_file_groups_from_workspace() { @@ -271,7 +270,9 @@ zip_results() { # Main loop for workflow job execution check_existence_of_paths unzip_workflow_job_dir -transfer_requirements_to_node_storage +echo "" +transfer_to_node_storage_processor_models +transfer_to_node_storage_processor_images start_mets_server "$USE_METS_SERVER" execute_nextflow_workflow "$USE_METS_SERVER" stop_mets_server "$USE_METS_SERVER" diff --git a/src/utils/operandi_utils/hpc/batch_scripts/wrapper_check_workflow_job_status.sh b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_check_workflow_job_status.sh old mode 100644 new mode 100755 diff --git a/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh old mode 100644 new mode 100755 index a3f6255a..b662459b --- a/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh @@ -1,33 +1,21 @@ #!/bin/bash -# $0 - This bash script -# $1 - Slurm parameter - partition -# $2 - Slurm parameter - time -# $3 - Slurm parameter - output -# $4 - Slurm parameter - cpus-per-task -# $5 - Slurm parameter - mem -# $6 - Slurm parameter - qos +module purge +module load jq -# $7 - The batch script path to execute -# $8 - The scratch base for slurm workspaces -# $9 - Workflow job id -# $10 - Nextflow script id -# $11 - Entry input file group -# $12 - Workspace id -# $13 - Mets basename - default "mets.xml" -# $14 - CPUs for the Nextflow processes -# $15 - RAM for the Nextflow processes -# $16 - Amount of forks per OCR-D processor in the NF script -# $17 - Amount of pages in the workspace -# $18 - Boolean flag showing whether a mets server is utilized or not -# $19 - File groups to be removed from the workspace after the processing +sbatch_args="$1" +partition=$(echo "$sbatch_args" | jq .partition | tr -d '"') +deadline_time=$(echo "$sbatch_args" | jq .job_deadline_time | tr -d '"') +output=$(echo "$sbatch_args" | jq .output_log | tr -d '"') +cpus_per_task=$(echo "$sbatch_args" | jq .cpus | tr -d '"') +memory=$(echo "$sbatch_args" | jq .ram | tr -d '"') +qos=$(echo "$sbatch_args" | jq .qos | tr -d '"') +batch_script_path=$(echo "$sbatch_args" | jq .batch_script_path | tr -d '"') -if [ "$6" == "48h" ] ; then +# $2 is a json of regular arguments used inside the `batch_submit_workflow_job.sh` +if [ "$qos" == "48h" ] ; then # QOS not set, the default of 48h is used - sbatch --partition="$1" --time="$2" --output="$3" --cpus-per-task="$4" --mem="$5" "$7" "$8" "$9" "${10}" "${11}" "${12}" "${13}" "${14}" "${15}" "${16}" "${17}" "${18}" "${19}" + sbatch --partition="$partition" --time="$deadline_time" --output="$output" --cpus-per-task="$cpus_per_task" --mem="$memory" "$batch_script_path" "$2" else - sbatch --partition="$1" --time="$2" --output="$3" --cpus-per-task="$4" --mem="$5" --qos="$6" "$7" "$8" "$9" "${10}" "${11}" "${12}" "${13}" "${14}" "${15}" "${16}" "${17}" "${18}" "${19}" + sbatch --partition="$partition" --time="$deadline_time" --output="$output" --cpus-per-task="$cpus_per_task" --mem="$memory" --qos="$qos" "$batch_script_path" "$2" fi - -echo "0:$0 1:$1 2:$2 3:$3 4:$4 5:$5 6:$6 7:$7 8:$8 9:$9 10:${10}" -echo "11:${11} 12:${12} 13:${13} 14:${14} 15:${15} 16:${16} 17:${17} 18:${18} 19:${19}" diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py index 5e0210d4..5ce9f88c 100644 --- a/src/utils/operandi_utils/hpc/constants.py +++ b/src/utils/operandi_utils/hpc/constants.py @@ -13,6 +13,7 @@ "HPC_SSH_CONNECTION_TRY_TIMES", "HPC_NHR_PROJECT", "HPC_NHR_CLUSTERS", + "HPC_USE_SLIM_IMAGES", "HPC_WRAPPER_SUBMIT_WORKFLOW_JOB", "HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS" ] @@ -61,14 +62,16 @@ HPC_DIR_BATCH_SCRIPTS = "batch_scripts" HPC_DIR_SLURM_WORKSPACES = "slurm_workspaces" # TODO: Fix the constant file name - it should be automatically resolved -HPC_BATCH_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/batch_submit_workflow_job.sh" -HPC_WRAPPER_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_submit_workflow_job.sh" +# TODO: Fix the naming when releasing the next Operandi version +HPC_BATCH_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/batch_submit_workflow_job2.sh" +HPC_WRAPPER_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_submit_workflow_job2.sh" HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_check_workflow_job_status.sh" HPC_JOB_DEADLINE_TIME_REGULAR = "48:00:00" -HPC_JOB_DEADLINE_TIME_TEST = "0:30:00" -HPC_NHR_JOB_DEFAULT_PARTITION = "standard96s:shared" -HPC_NHR_JOB_TEST_PARTITION = "standard96s:shared" +HPC_JOB_DEADLINE_TIME_TEST = "00:30:00" +# TODO: Use again "standard96s:shared" +HPC_NHR_JOB_DEFAULT_PARTITION = "standard96:shared" +HPC_NHR_JOB_TEST_PARTITION = "standard96:shared" # Check here: https://docs.hpc.gwdg.de/getting_started/transition/index.html HPC_JOB_QOS_SHORT = "2h" @@ -76,3 +79,6 @@ HPC_JOB_QOS_LONG = "7d" HPC_JOB_QOS_VERY_LONG = "14d" HPC_SSH_CONNECTION_TRY_TIMES = 30 + +# A switch to decide whether to use the fat ocrd_all image or slim images of the processors +HPC_USE_SLIM_IMAGES: bool = False diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow.nf index ca8b2a11..a748cf8b 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow.nf @@ -1,250 +1,291 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" log.info """\ - OPERANDI - HPC - Default Workflow - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: env mets_file_chunk env current_range_pages + script: - """ - current_range_pages=\$(${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) - echo "Current range is: \$current_range_pages" - mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) - echo "Mets file chunk path: \$mets_file_chunk" - \$(${params.singularity_wrapper} cp -p ${params.mets} \$mets_file_chunk) - """ + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(${params.env_wrapper_cmd_core} cp -p ${params.mets_path} \$mets_file_chunk) + """ } -process ocrd_cis_ocropy_binarize { +process ocrd_cis_ocropy_binarize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-binarize -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } -process ocrd_anybaseocr_crop { +process ocrd_anybaseocr_crop_1 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-anybaseocr-crop -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } -process ocrd_skimage_binarize { +process ocrd_skimage_binarize_2 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-skimage-binarize -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P method "li" - """ + """ + ${params.env_wrapper_cmd_step2} ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' + """ } -process ocrd_skimage_denoise { +process ocrd_skimage_denoise_3 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-skimage-denoise -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P level-of-operation "page" - """ + """ + ${params.env_wrapper_cmd_step3} ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ } -process ocrd_tesserocr_deskew { +process ocrd_tesserocr_deskew_4 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-deskew -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P operation_level "page" - """ + """ + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' + """ } -process ocrd_cis_ocropy_segment { +process ocrd_cis_ocropy_segment_5 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-segment -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P level-of-operation "page" - """ + """ + ${params.env_wrapper_cmd_step5} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ } -process ocrd_cis_ocropy_dewarp { +process ocrd_cis_ocropy_dewarp_6 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-dewarp -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } -process ocrd_calamari_recognize { +process ocrd_calamari_recognize_7 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-calamari-recognize -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P checkpoint_dir "qurator-gt4histocr-1.0" - """ + """ + ${params.env_wrapper_cmd_step7} ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ } process merging_mets { - // Must be a single instance - modifying the main mets file + debug true maxForks 1 + cpus params.cpus_per_fork + memory params.ram_per_fork input: val mets_file_chunk val page_range + script: - """ - ${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} - ${params.singularity_wrapper} rm ${mets_file_chunk} - """ + """ + ${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + ${params.env_wrapper_cmd_core} rm ${mets_file_chunk} + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_cis_ocropy_binarize(split_page_ranges.out[0], split_page_ranges.out[1], params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop(ocrd_cis_ocropy_binarize.out[0], ocrd_cis_ocropy_binarize.out[1], "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_binarize(ocrd_anybaseocr_crop.out[0], ocrd_anybaseocr_crop.out[1], "OCR-D-CROP", "OCR-D-BIN2") - ocrd_skimage_denoise(ocrd_skimage_binarize.out[0], ocrd_skimage_binarize.out[1], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew(ocrd_skimage_denoise.out[0], ocrd_skimage_denoise.out[1], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_cis_ocropy_segment(ocrd_tesserocr_deskew.out[0], ocrd_tesserocr_deskew.out[1], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp(ocrd_cis_ocropy_segment.out[0], ocrd_cis_ocropy_segment.out[1], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize(ocrd_cis_ocropy_dewarp.out[0], ocrd_cis_ocropy_dewarp.out[1], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") - merging_mets(ocrd_calamari_recognize.out[0], ocrd_calamari_recognize.out[1]) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_calamari_recognize_7.out[0], ocrd_calamari_recognize_7.out[1]) } diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow_with_MS.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow_with_MS.nf index 8833a502..23115cd0 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow_with_MS.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/default_workflow_with_MS.nf @@ -1,217 +1,273 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" -params.mets_socket = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" +params.mets_socket_path = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" log.info """\ - OPERANDI - HPC - Default Workflow with Mets Server - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - mets_socket : ${params.mets_socket} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + mets_socket_path: ${params.mets_socket_path} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: + env mets_file_chunk env current_range_pages - shell: - ''' - current_range_pages=$(!{params.singularity_wrapper} ocrd workspace -d !{params.workspace_dir} list-page -f comma-separated -D !{params.forks} -C !{range_multiplier}) - echo "Current range is: $current_range_pages" - ''' + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.mets_path}) + """ } -process ocrd_cis_ocropy_binarize { +process ocrd_cis_ocropy_binarize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-binarize -U ${params.mets_socket} -w ${params.workspace_dir} --page-id ${page_range} -m ${params.mets} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } -process ocrd_anybaseocr_crop { +process ocrd_anybaseocr_crop_1 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-anybaseocr-crop -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } -process ocrd_skimage_binarize { +process ocrd_skimage_binarize_2 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-skimage-binarize -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P method "li" - """ + """ + ${params.env_wrapper_cmd_step2} ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' + """ } -process ocrd_skimage_denoise { +process ocrd_skimage_denoise_3 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-skimage-denoise -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P level-of-operation "page" - """ + """ + ${params.env_wrapper_cmd_step3} ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ } -process ocrd_tesserocr_deskew { +process ocrd_tesserocr_deskew_4 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-deskew -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P operation_level "page" - """ + """ + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' + """ } -process ocrd_cis_ocropy_segment { +process ocrd_cis_ocropy_segment_5 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-segment -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P level-of-operation "page" - """ + """ + ${params.env_wrapper_cmd_step5} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ } -process ocrd_cis_ocropy_dewarp { +process ocrd_cis_ocropy_dewarp_6 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-dewarp -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } -process ocrd_calamari_recognize { +process ocrd_calamari_recognize_7 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-calamari-recognize -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P checkpoint_dir "qurator-gt4histocr-1.0" - """ + """ + ${params.env_wrapper_cmd_step7} ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_cis_ocropy_binarize(split_page_ranges.out, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop(ocrd_cis_ocropy_binarize.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_binarize(ocrd_anybaseocr_crop.out, "OCR-D-CROP", "OCR-D-BIN2") - ocrd_skimage_denoise(ocrd_skimage_binarize.out, "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew(ocrd_skimage_denoise.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_cis_ocropy_segment(ocrd_tesserocr_deskew.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp(ocrd_cis_ocropy_segment.out, "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize(ocrd_cis_ocropy_dewarp.out, "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") } diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow.nf index 57dd0b48..8d3908c2 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow.nf @@ -1,294 +1,345 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" +params.env_wrapper_cmd_step8 = "null" +params.env_wrapper_cmd_step9 = "null" log.info """\ - OPERANDI - HPC - Odem Workflow - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + env_wrapper_cmd_step8: ${params.env_wrapper_cmd_step8} + env_wrapper_cmd_step9: ${params.env_wrapper_cmd_step9} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: env mets_file_chunk env current_range_pages + script: - """ - current_range_pages=\$(${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) - echo "Current range is: \$current_range_pages" - mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) - echo "Mets file chunk path: \$mets_file_chunk" - \$(${params.singularity_wrapper} cp -p ${params.mets} \$mets_file_chunk) - """ + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(${params.env_wrapper_cmd_core} cp -p ${params.mets_path} \$mets_file_chunk) + """ } process ocrd_cis_ocropy_binarize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-binarize -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_anybaseocr_crop_1 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-anybaseocr-crop -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_cis_ocropy_denoise_2 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-denoise -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step2} ocrd-cis-ocropy-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_cis_ocropy_deskew_3 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-deskew -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P level-of-operation page - """ + """ + ${params.env_wrapper_cmd_step3} ocrd-cis-ocropy-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ } process ocrd_tesserocr_segment_region_4 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-segment-region -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P padding 5.0 -P find_tables false -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-segment-region -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"padding": 5.0, "find_tables": false, "dpi": 300}' + """ } process ocrd_segment_repair_5 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-segment-repair -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P plausibilize true -P plausibilize_merge_min_overlap 0.7 - """ + """ + ${params.env_wrapper_cmd_step5} ocrd-segment-repair -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"plausibilize": true, "plausibilize_merge_min_overlap": 0.7}' + """ } process ocrd_cis_ocropy_clip_6 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-clip -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-clip -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } process ocrd_cis_ocropy_segment_7 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-segment -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step7} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_cis_ocropy_dewarp_8 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-dewarp -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step8} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } process ocrd_tesserocr_recognize_9 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-recognize -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P model Fraktur - """ + """ + ${params.env_wrapper_cmd_step9} ocrd-tesserocr-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"model": "Fraktur"}' + """ } process merging_mets { - // Must be a single instance - modifying the main mets file + debug true maxForks 1 + cpus params.cpus_per_fork + memory params.ram_per_fork input: val mets_file_chunk val page_range + script: - """ - ${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} - ${params.singularity_wrapper} rm ${mets_file_chunk} - """ + """ + ${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + ${params.env_wrapper_cmd_core} rm ${mets_file_chunk} + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.input_file_group, "OCR-D-BINPAGE") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], "OCR-D-BINPAGE", "OCR-D-SEG-PAGE-ANYOCR") - ocrd_cis_ocropy_denoise_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], "OCR-D-SEG-PAGE-ANYOCR", "OCR-D-DENOISE-OCROPY") - ocrd_cis_ocropy_deskew_3(ocrd_cis_ocropy_denoise_2.out[0], ocrd_cis_ocropy_denoise_2.out[1], "OCR-D-DENOISE-OCROPY", "OCR-D-DESKEW-OCROPY") - ocrd_tesserocr_segment_region_4(ocrd_cis_ocropy_deskew_3.out[0], ocrd_cis_ocropy_deskew_3.out[1], "OCR-D-DESKEW-OCROPY", "OCR-D-SEG-BLOCK-TESSERACT") - ocrd_segment_repair_5(ocrd_tesserocr_segment_region_4.out[0], ocrd_tesserocr_segment_region_4.out[1], "OCR-D-SEG-BLOCK-TESSERACT", "OCR-D-SEGMENT-REPAIR") - ocrd_cis_ocropy_clip_6(ocrd_segment_repair_5.out[0], ocrd_segment_repair_5.out[1], "OCR-D-SEGMENT-REPAIR", "OCR-D-CLIP") - ocrd_cis_ocropy_segment_7(ocrd_cis_ocropy_clip_6.out[0], ocrd_cis_ocropy_clip_6.out[1], "OCR-D-CLIP", "OCR-D-SEGMENT-OCROPY") - ocrd_cis_ocropy_dewarp_8(ocrd_cis_ocropy_segment_7.out[0], ocrd_cis_ocropy_segment_7.out[1], "OCR-D-SEGMENT-OCROPY", "OCR-D-DEWARP") - ocrd_tesserocr_recognize_9(ocrd_cis_ocropy_dewarp_8.out[0], ocrd_cis_ocropy_dewarp_8.out[1], "OCR-D-DEWARP", "OCR-D-OCR") + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BINPAGE") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BINPAGE", "OCR-D-SEG-PAGE-ANYOCR") + ocrd_cis_ocropy_denoise_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-SEG-PAGE-ANYOCR", "OCR-D-DENOISE-OCROPY") + ocrd_cis_ocropy_deskew_3(ocrd_cis_ocropy_denoise_2.out[0], ocrd_cis_ocropy_denoise_2.out[1], ocrd_cis_ocropy_denoise_2.out[2], "OCR-D-DENOISE-OCROPY", "OCR-D-DESKEW-OCROPY") + ocrd_tesserocr_segment_region_4(ocrd_cis_ocropy_deskew_3.out[0], ocrd_cis_ocropy_deskew_3.out[1], ocrd_cis_ocropy_deskew_3.out[2], "OCR-D-DESKEW-OCROPY", "OCR-D-SEG-BLOCK-TESSERACT") + ocrd_segment_repair_5(ocrd_tesserocr_segment_region_4.out[0], ocrd_tesserocr_segment_region_4.out[1], ocrd_tesserocr_segment_region_4.out[2], "OCR-D-SEG-BLOCK-TESSERACT", "OCR-D-SEGMENT-REPAIR") + ocrd_cis_ocropy_clip_6(ocrd_segment_repair_5.out[0], ocrd_segment_repair_5.out[1], ocrd_segment_repair_5.out[2], "OCR-D-SEGMENT-REPAIR", "OCR-D-CLIP") + ocrd_cis_ocropy_segment_7(ocrd_cis_ocropy_clip_6.out[0], ocrd_cis_ocropy_clip_6.out[1], ocrd_cis_ocropy_clip_6.out[2], "OCR-D-CLIP", "OCR-D-SEGMENT-OCROPY") + ocrd_cis_ocropy_dewarp_8(ocrd_cis_ocropy_segment_7.out[0], ocrd_cis_ocropy_segment_7.out[1], ocrd_cis_ocropy_segment_7.out[2], "OCR-D-SEGMENT-OCROPY", "OCR-D-DEWARP") + ocrd_tesserocr_recognize_9(ocrd_cis_ocropy_dewarp_8.out[0], ocrd_cis_ocropy_dewarp_8.out[1], ocrd_cis_ocropy_dewarp_8.out[2], "OCR-D-DEWARP", "OCR-D-OCR") merging_mets(ocrd_tesserocr_recognize_9.out[0], ocrd_tesserocr_recognize_9.out[1]) } diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow_with_MS.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow_with_MS.nf index a3235f59..63fe0753 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow_with_MS.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/odem_workflow_with_MS.nf @@ -1,257 +1,327 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" -params.mets_socket = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" +params.mets_socket_path = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" +params.env_wrapper_cmd_step8 = "null" +params.env_wrapper_cmd_step9 = "null" log.info """\ - OPERANDI - HPC - Odem Workflow with Mets Server - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - mets_socket : ${params.mets_socket} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + mets_socket_path: ${params.mets_socket_path} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + env_wrapper_cmd_step8: ${params.env_wrapper_cmd_step8} + env_wrapper_cmd_step9: ${params.env_wrapper_cmd_step9} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: + env mets_file_chunk env current_range_pages - shell: - ''' - current_range_pages=$(!{params.singularity_wrapper} ocrd workspace -d !{params.workspace_dir} list-page -f comma-separated -D !{params.forks} -C !{range_multiplier}) - echo "Current range is: $current_range_pages" - ''' + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.mets_path}) + """ } process ocrd_cis_ocropy_binarize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-binarize -U ${params.mets_socket} -w ${params.workspace_dir} --page-id ${page_range} -m ${params.mets} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_anybaseocr_crop_1 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-anybaseocr-crop -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_cis_ocropy_denoise_2 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-denoise -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step2} ocrd-cis-ocropy-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_cis_ocropy_deskew_3 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-deskew -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P level-of-operation page - """ + """ + ${params.env_wrapper_cmd_step3} ocrd-cis-ocropy-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ } process ocrd_tesserocr_segment_region_4 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-segment-region -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P padding 5.0 -P find_tables false -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-segment-region -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"padding": 5.0, "find_tables": false, "dpi": 300}' + """ } process ocrd_segment_repair_5 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-segment-repair -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P plausibilize true -P plausibilize_merge_min_overlap 0.7 - """ + """ + ${params.env_wrapper_cmd_step5} ocrd-segment-repair -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"plausibilize": true, "plausibilize_merge_min_overlap": 0.7}' + """ } process ocrd_cis_ocropy_clip_6 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-clip -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-clip -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } process ocrd_cis_ocropy_segment_7 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-segment -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P dpi 300 - """ + """ + ${params.env_wrapper_cmd_step7} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"dpi": 300}' + """ } process ocrd_cis_ocropy_dewarp_8 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-dewarp -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step8} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } process ocrd_tesserocr_recognize_9 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + output: + val mets_path val page_range + val workspace_dir script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-recognize -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P model Fraktur - """ + """ + ${params.env_wrapper_cmd_step9} ocrd-tesserocr-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"model": "Fraktur"}' + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_cis_ocropy_binarize_0(split_page_ranges.out, params.input_file_group, "OCR-D-BINPAGE") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out, "OCR-D-BINPAGE", "OCR-D-SEG-PAGE-ANYOCR") - ocrd_cis_ocropy_denoise_2(ocrd_anybaseocr_crop_1.out, "OCR-D-SEG-PAGE-ANYOCR", "OCR-D-DENOISE-OCROPY") - ocrd_cis_ocropy_deskew_3(ocrd_cis_ocropy_denoise_2.out, "OCR-D-DENOISE-OCROPY", "OCR-D-DESKEW-OCROPY") - ocrd_tesserocr_segment_region_4(ocrd_cis_ocropy_deskew_3.out, "OCR-D-DESKEW-OCROPY", "OCR-D-SEG-BLOCK-TESSERACT") - ocrd_segment_repair_5(ocrd_tesserocr_segment_region_4.out, "OCR-D-SEG-BLOCK-TESSERACT", "OCR-D-SEGMENT-REPAIR") - ocrd_cis_ocropy_clip_6(ocrd_segment_repair_5.out, "OCR-D-SEGMENT-REPAIR", "OCR-D-CLIP") - ocrd_cis_ocropy_segment_7(ocrd_cis_ocropy_clip_6.out, "OCR-D-CLIP", "OCR-D-SEGMENT-OCROPY") - ocrd_cis_ocropy_dewarp_8(ocrd_cis_ocropy_segment_7.out, "OCR-D-SEGMENT-OCROPY", "OCR-D-DEWARP") - ocrd_tesserocr_recognize_9(ocrd_cis_ocropy_dewarp_8.out, "OCR-D-DEWARP", "OCR-D-OCR") + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BINPAGE") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BINPAGE", "OCR-D-SEG-PAGE-ANYOCR") + ocrd_cis_ocropy_denoise_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-SEG-PAGE-ANYOCR", "OCR-D-DENOISE-OCROPY") + ocrd_cis_ocropy_deskew_3(ocrd_cis_ocropy_denoise_2.out[0], ocrd_cis_ocropy_denoise_2.out[1], ocrd_cis_ocropy_denoise_2.out[2], "OCR-D-DENOISE-OCROPY", "OCR-D-DESKEW-OCROPY") + ocrd_tesserocr_segment_region_4(ocrd_cis_ocropy_deskew_3.out[0], ocrd_cis_ocropy_deskew_3.out[1], ocrd_cis_ocropy_deskew_3.out[2], "OCR-D-DESKEW-OCROPY", "OCR-D-SEG-BLOCK-TESSERACT") + ocrd_segment_repair_5(ocrd_tesserocr_segment_region_4.out[0], ocrd_tesserocr_segment_region_4.out[1], ocrd_tesserocr_segment_region_4.out[2], "OCR-D-SEG-BLOCK-TESSERACT", "OCR-D-SEGMENT-REPAIR") + ocrd_cis_ocropy_clip_6(ocrd_segment_repair_5.out[0], ocrd_segment_repair_5.out[1], ocrd_segment_repair_5.out[2], "OCR-D-SEGMENT-REPAIR", "OCR-D-CLIP") + ocrd_cis_ocropy_segment_7(ocrd_cis_ocropy_clip_6.out[0], ocrd_cis_ocropy_clip_6.out[1], ocrd_cis_ocropy_clip_6.out[2], "OCR-D-CLIP", "OCR-D-SEGMENT-OCROPY") + ocrd_cis_ocropy_dewarp_8(ocrd_cis_ocropy_segment_7.out[0], ocrd_cis_ocropy_segment_7.out[1], ocrd_cis_ocropy_segment_7.out[2], "OCR-D-SEGMENT-OCROPY", "OCR-D-DEWARP") + ocrd_tesserocr_recognize_9(ocrd_cis_ocropy_dewarp_8.out[0], ocrd_cis_ocropy_dewarp_8.out[1], ocrd_cis_ocropy_dewarp_8.out[2], "OCR-D-DEWARP", "OCR-D-OCR") } diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow.nf index 4230534d..eb37866b 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow.nf @@ -1,95 +1,102 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" log.info """\ - OPERANDI - HPC - SBB Workflow - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: env mets_file_chunk env current_range_pages + script: - """ - current_range_pages=\$(${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) - echo "Current range is: \$current_range_pages" - mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) - echo "Mets file chunk path: \$mets_file_chunk" - \$(${params.singularity_wrapper} cp -p ${params.mets} \$mets_file_chunk) - """ + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(${params.env_wrapper_cmd_core} cp -p ${params.mets_path} \$mets_file_chunk) + """ } -process ocrd_tesserocr_recognize { +process ocrd_tesserocr_recognize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir + script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-recognize -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} -P segmentation_level region -P textequiv_level word -P find_tables true -P model deu - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-tesserocr-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"segmentation_level": "region", "textequiv_level": "word", "find_tables": true, "model": "deu"}' + """ } process merging_mets { - // Must be a single instance - modifying the main mets file + debug true maxForks 1 + cpus params.cpus_per_fork + memory params.ram_per_fork input: val mets_file_chunk val page_range + script: - """ - ${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} - ${params.singularity_wrapper} rm ${mets_file_chunk} - """ + """ + ${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + ${params.env_wrapper_cmd_core} rm ${mets_file_chunk} + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_tesserocr_recognize(split_page_ranges.out[0], split_page_ranges.out[1], params.input_file_group, "OCR-D-OCR") - merging_mets(ocrd_tesserocr_recognize.out[0], ocrd_tesserocr_recognize.out[1]) + ocrd_tesserocr_recognize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-OCR") + merging_mets(ocrd_tesserocr_recognize_0.out[0], ocrd_tesserocr_recognize_0.out[1]) } diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow_with_MS.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow_with_MS.nf index 9c24f34b..e81b6094 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow_with_MS.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/sbb_workflow_with_MS.nf @@ -1,74 +1,84 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" -params.mets_socket = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" +params.mets_socket_path = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" log.info """\ - OPERANDI - HPC - SBB Workflow with Mets Server - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - mets_socket : ${params.mets_socket} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + mets_socket_path: ${params.mets_socket_path} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: + env mets_file_chunk env current_range_pages - shell: - ''' - current_range_pages=$(!{params.singularity_wrapper} ocrd workspace -d !{params.workspace_dir} list-page -f comma-separated -D !{params.forks} -C !{range_multiplier}) - echo "Current range is: $current_range_pages" - ''' + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.mets_path}) + """ } -process ocrd_tesserocr_recognize { +process ocrd_tesserocr_recognize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + + output: + val mets_path + val page_range + val workspace_dir + script: - """ - ${params.singularity_wrapper} ocrd-tesserocr-recognize -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} -P segmentation_level region -P textequiv_level word -P find_tables true -P model deu - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-tesserocr-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"segmentation_level": "region", "textequiv_level": "word", "find_tables": true, "model": "deu"}' + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_tesserocr_recognize(split_page_ranges.out[0], params.input_file_group, "OCR-D-BIN") + ocrd_tesserocr_recognize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-OCR") } diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow.nf index bb001c6d..0028a978 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow.nf @@ -1,95 +1,102 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" log.info """\ - OPERANDI - HPC - Template Workflow - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: env mets_file_chunk env current_range_pages + script: - """ - current_range_pages=\$(${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) - echo "Current range is: \$current_range_pages" - mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) - echo "Mets file chunk path: \$mets_file_chunk" - \$(${params.singularity_wrapper} cp -p ${params.mets} \$mets_file_chunk) - """ + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(${params.env_wrapper_cmd_core} cp -p ${params.mets_path} \$mets_file_chunk) + """ } -process ocrd_cis_ocropy_binarize { +process ocrd_cis_ocropy_binarize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: - val mets_file_chunk + val mets_path val page_range + val workspace_dir val input_group val output_group + output: - val mets_file_chunk + val mets_path val page_range + val workspace_dir + script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-binarize -w ${params.workspace_dir} -m ${mets_file_chunk} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } process merging_mets { - // Must be a single instance - modifying the main mets file + debug true maxForks 1 + cpus params.cpus_per_fork + memory params.ram_per_fork input: val mets_file_chunk val page_range + script: - """ - ${params.singularity_wrapper} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} - ${params.singularity_wrapper} rm ${mets_file_chunk} - """ + """ + ${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + ${params.env_wrapper_cmd_core} rm ${mets_file_chunk} + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_cis_ocropy_binarize(split_page_ranges.out[0], split_page_ranges.out[1], params.input_file_group, "OCR-D-BIN") - merging_mets(ocrd_cis_ocropy_binarize.out[0], ocrd_cis_ocropy_binarize.out[1]) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + merging_mets(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1]) } diff --git a/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow_with_MS.nf b/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow_with_MS.nf index d89e8ee3..0daa30ed 100755 --- a/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow_with_MS.nf +++ b/src/utils/operandi_utils/hpc/nextflow_workflows/template_workflow_with_MS.nf @@ -1,74 +1,84 @@ -nextflow.enable.dsl=2 +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 -// The values are assigned inside the batch script -// Based on internal values and options provided in the request -params.input_file_group = "null" -params.mets = "null" -params.mets_socket = "null" +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" params.workspace_dir = "null" -// amount of pages of the workspace params.pages = "null" -params.singularity_wrapper = "null" +params.mets_socket_path = "null" params.cpus = "null" params.ram = "null" params.forks = params.cpus -// Do not pass these parameters from the caller unless you know what you are doing params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" log.info """\ - OPERANDI - HPC - Template Workflow with Mets Server - =========================================== - input_file_group : ${params.input_file_group} - mets : ${params.mets} - mets_socket : ${params.mets_socket} - workspace_dir : ${params.workspace_dir} - pages : ${params.pages} - singularity_wrapper : ${params.singularity_wrapper} - cpus : ${params.cpus} - ram : ${params.ram} - forks : ${params.forks} - cpus_per_fork : ${params.cpus_per_fork} - ram_per_fork : ${params.ram_per_fork} - """ - .stripIndent() + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + mets_socket_path: ${params.mets_socket_path} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + """.stripIndent() process split_page_ranges { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: val range_multiplier + output: + env mets_file_chunk env current_range_pages - shell: - ''' - current_range_pages=$(!{params.singularity_wrapper} ocrd workspace -d !{params.workspace_dir} list-page -f comma-separated -D !{params.forks} -C !{range_multiplier}) - echo "Current range is: $current_range_pages" - ''' + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.mets_path}) + """ } -process ocrd_cis_ocropy_binarize { +process ocrd_cis_ocropy_binarize_0 { + debug true maxForks params.forks cpus params.cpus_per_fork memory params.ram_per_fork - debug true input: + val mets_path val page_range + val workspace_dir val input_group val output_group + + output: + val mets_path + val page_range + val workspace_dir + script: - """ - ${params.singularity_wrapper} ocrd-cis-ocropy-binarize -U ${params.mets_socket} -w ${params.workspace_dir} -m ${params.mets} --page-id ${page_range} -I ${input_group} -O ${output_group} - """ + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ } workflow { main: ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) split_page_ranges(ch_range_multipliers) - ocrd_cis_ocropy_binarize(split_page_ranges.out[0], params.input_file_group, "OCR-D-BIN") + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") } diff --git a/src/utils/operandi_utils/hpc/nhr_connector.py b/src/utils/operandi_utils/hpc/nhr_connector.py index 8d549bd4..6ea693bf 100644 --- a/src/utils/operandi_utils/hpc/nhr_connector.py +++ b/src/utils/operandi_utils/hpc/nhr_connector.py @@ -33,9 +33,10 @@ def __init__( self._ssh_reconnect_tries = 5 self._ssh_reconnect_tries_remaining = self._ssh_reconnect_tries # TODO: Make the sub cluster options selectable - self.project_root_dir = join(HPC_NHR_CLUSTERS["EmmyPhase2"]["scratch-emmy-hdd"], project_env) - self.batch_scripts_dir = join(self.project_root_dir, "batch_scripts") - self.slurm_workspaces_dir = join(self.project_root_dir, "slurm_workspaces") + self.project_root_dir: str = HPC_NHR_CLUSTERS["EmmyPhase2"]["scratch-emmy-hdd"] + self.project_root_dir_with_env: str = join(self.project_root_dir, project_env) + self.batch_scripts_dir: str = join(self.project_root_dir, project_env, "batch_scripts") + self.slurm_workspaces_dir: str = join(self.project_root_dir, project_env, "slurm_workspaces") @property def ssh_client(self): diff --git a/src/utils/operandi_utils/hpc/nhr_executor.py b/src/utils/operandi_utils/hpc/nhr_executor.py index 7636f2f2..36a48acc 100644 --- a/src/utils/operandi_utils/hpc/nhr_executor.py +++ b/src/utils/operandi_utils/hpc/nhr_executor.py @@ -1,14 +1,23 @@ +from json import dumps from logging import getLogger +from os.path import join from pathlib import Path from time import sleep +from typing import List -from operandi_utils.constants import StateJobSlurm +from operandi_utils.constants import StateJobSlurm, OCRD_PROCESSOR_EXECUTABLE_TO_IMAGE from .constants import ( HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_DEFAULT, HPC_NHR_JOB_DEFAULT_PARTITION, HPC_BATCH_SUBMIT_WORKFLOW_JOB, - HPC_WRAPPER_SUBMIT_WORKFLOW_JOB, HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS + HPC_USE_SLIM_IMAGES, HPC_WRAPPER_SUBMIT_WORKFLOW_JOB, HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS ) from .nhr_connector import NHRConnector +# Just some placeholders to be replaced with actual paths that are +# dynamically allocated inside the node that runs the HPC slurm job +PH_NODE_DIR_OCRD_MODELS = "PH_NODE_DIR_OCRD_MODELS" +PH_NODE_DIR_PROCESSOR_SIFS = "PH_NODE_DIR_PROCESSOR_SIFS" +PH_CMD_WRAPPER = "PH_CMD_WRAPPER" + class NHRExecutor(NHRConnector): def __init__(self) -> None: logger = getLogger(name=self.__class__.__name__) @@ -34,8 +43,9 @@ def execute_blocking(self, command, timeout=None, environment=None): def trigger_slurm_job( self, workflow_job_id: str, nextflow_script_path: Path, input_file_grp: str, workspace_id: str, mets_basename: str, nf_process_forks: int, ws_pages_amount: int, use_mets_server: bool, - file_groups_to_remove: str, cpus: int = 2, ram: int = 8, job_deadline_time: str = HPC_JOB_DEADLINE_TIME_TEST, - partition: str = HPC_NHR_JOB_DEFAULT_PARTITION, qos: str = HPC_JOB_QOS_DEFAULT + nf_executable_steps: List[str], file_groups_to_remove: str, cpus: int = 2, ram: int = 8, + job_deadline_time: str = HPC_JOB_DEADLINE_TIME_TEST, partition: str = HPC_NHR_JOB_DEFAULT_PARTITION, + qos: str = HPC_JOB_QOS_DEFAULT ) -> str: if ws_pages_amount < nf_process_forks: self.logger.warning( @@ -48,29 +58,59 @@ def trigger_slurm_job( use_mets_server_bash_flag = "true" if use_mets_server else "false" command = f"{HPC_WRAPPER_SUBMIT_WORKFLOW_JOB}" + sbatch_args = { + "partition": partition, + "job_deadline_time": job_deadline_time, + "output_log": f"{self.slurm_workspaces_dir}/{workflow_job_id}/slurm-job-%J.txt", + "cpus": cpus, + "ram": f"{ram}G", + "qos": qos, + "batch_script_path": HPC_BATCH_SUBMIT_WORKFLOW_JOB + } + + hpc_workflow_job_dir = join(self.slurm_workspaces_dir, workflow_job_id) + hpc_nf_script_path = join(self.slurm_workspaces_dir, workflow_job_id, nextflow_script_id) + hpc_workspace_dir = join(self.slurm_workspaces_dir, workflow_job_id, workspace_id) + + sif_ocrd_all = "ocrd_all_maximum_image.sif" + sif_ocrd_core = OCRD_PROCESSOR_EXECUTABLE_TO_IMAGE["ocrd"] + + if HPC_USE_SLIM_IMAGES: + ph_sif_core = f"{PH_NODE_DIR_PROCESSOR_SIFS}/{sif_ocrd_core}" + else: + ph_sif_core = f"{PH_NODE_DIR_PROCESSOR_SIFS}/{sif_ocrd_all}" + nf_run_command = self.cmd_nextflow_run( + hpc_nf_script_path=hpc_nf_script_path, hpc_ws_dir=hpc_workspace_dir, + bind_ocrd_models=f"{PH_NODE_DIR_OCRD_MODELS}/ocrd-resources:/usr/local/share/ocrd-resources", + sif_core=sif_ocrd_core, + sif_ocrd_all=sif_ocrd_all, input_file_grp=input_file_grp, mets_basename=mets_basename, + use_mets_server=use_mets_server, nf_executable_steps=nf_executable_steps, ws_pages_amount=ws_pages_amount, + cpus=cpus, ram=ram, forks=nf_process_forks, use_slim_images=HPC_USE_SLIM_IMAGES + ) - # SBATCH arguments passed to the batch script - command += f" {partition}" - command += f" {job_deadline_time}" - command += f" {self.slurm_workspaces_dir}/{workflow_job_id}/slurm-job-%J.txt" - command += f" {cpus}" - command += f" {ram}G" - command += f" {qos}" - - # Regular arguments passed to the batch script - command += f" {HPC_BATCH_SUBMIT_WORKFLOW_JOB}" - command += f" {self.slurm_workspaces_dir}" - command += f" {workflow_job_id}" - command += f" {nextflow_script_id}" - command += f" {input_file_grp}" - command += f" {workspace_id}" - command += f" {mets_basename}" - command += f" {cpus}" - command += f" {ram}" - command += f" {nf_process_forks}" - command += f" {ws_pages_amount}" - command += f" {use_mets_server_bash_flag}" - command += f" {file_groups_to_remove}" + if HPC_USE_SLIM_IMAGES: + ocrd_processor_images = ",".join([OCRD_PROCESSOR_EXECUTABLE_TO_IMAGE[exe] for exe in nf_executable_steps]) + ocrd_processor_images = f"{sif_ocrd_core},{ocrd_processor_images}" + else: + ocrd_processor_images = sif_ocrd_all + regular_args = { + "project_base_dir": self.project_root_dir, + "scratch_base_dir": self.slurm_workspaces_dir, + "ocrd_processor_images": ocrd_processor_images, + "workflow_job_id": workflow_job_id, + "workspace_id": workspace_id, + "use_mets_server_bash_flag": use_mets_server_bash_flag, + "file_groups_to_remove": file_groups_to_remove, + "hpc_workflow_job_dir": hpc_workflow_job_dir, + "hpc_workspace_dir": hpc_workspace_dir, + "nf_run_command": nf_run_command, + "print_ocrd_version_command": self.cmd_core_print_version(hpc_workspace_dir, ph_sif_core), + "start_mets_server_command": self.cmd_core_start_mets_server(hpc_workspace_dir, ph_sif_core), + "stop_mets_server_command": self.cmd_core_stop_mets_server(hpc_workspace_dir, ph_sif_core), + "list_file_groups_command": self.cmd_core_list_file_groups(hpc_workspace_dir, ph_sif_core), + "remove_file_group_command": self.cmd_core_remove_file_group(hpc_workspace_dir, ph_sif_core) + } + command += f" '{dumps(sbatch_args)}' '{dumps(regular_args)}'" self.logger.info(f"About to execute a force command: {command}") output, err, return_code = self.execute_blocking(command) @@ -147,3 +187,65 @@ def poll_till_end_slurm_job_state(self, slurm_job_id: str, interval: int = 5, ti # Timeout reached self.logger.info("Polling slurm job status timeout reached") return False + + @staticmethod + def cmd_nextflow_run( + hpc_nf_script_path: str, hpc_ws_dir: str, bind_ocrd_models: str, sif_core: str, sif_ocrd_all: str, + input_file_grp: str, mets_basename: str, use_mets_server: bool, nf_executable_steps: List[str], + ws_pages_amount: int, cpus: int, ram: int, forks: int, use_slim_images: bool + ) -> str: + nf_run_command = f"nextflow run {hpc_nf_script_path} -ansi-log false -with-report" + nf_run_command += f" --input_file_group {input_file_grp}" + nf_run_command += f" --mets_path /ws_data/{mets_basename}" + if use_mets_server: + nf_run_command += f" --mets_socket /ws_data/mets_server.sock" + nf_run_command += f" --workspace_dir /ws_data" + nf_run_command += f" --pages {ws_pages_amount}" + + sif_images = [OCRD_PROCESSOR_EXECUTABLE_TO_IMAGE[exe] for exe in nf_executable_steps] + apptainer_cmd = f"apptainer exec --bind {hpc_ws_dir}:/ws_data --bind {bind_ocrd_models}" + apptainer_cmd += f" --env OCRD_METS_CACHING=false" + apptainer_image = sif_core if use_slim_images else sif_ocrd_all + core_command = f"{apptainer_cmd} {PH_NODE_DIR_PROCESSOR_SIFS}/{apptainer_image}" + nf_run_command += f" --env_wrapper_cmd_core {PH_CMD_WRAPPER}{core_command}{PH_CMD_WRAPPER}" + + index = 0 + for sif_image in sif_images: + apptainer_image = sif_image if use_slim_images else sif_ocrd_all + step_command = f"{apptainer_cmd} {PH_NODE_DIR_PROCESSOR_SIFS}/{apptainer_image}" + nf_run_command += f" --env_wrapper_cmd_step{index} {PH_CMD_WRAPPER}{step_command}{PH_CMD_WRAPPER}" + index += 1 + nf_run_command += f" --cpus {cpus}" + nf_run_command += f" --ram {ram}" + nf_run_command += f" --forks {forks}" + return nf_run_command + + @staticmethod + def cmd_core_print_version(hpc_ws_dir: str, ph_sif_core: str) -> str: + return f"apptainer exec --bind {hpc_ws_dir}:/ws_data {ph_sif_core} ocrd --version" + + @staticmethod + def cmd_core_start_mets_server(hpc_ws_dir: str, ph_sif_core: str) -> str: + command = f"apptainer exec --bind {hpc_ws_dir}:/ws_data {ph_sif_core}" + command += f" ocrd workspace -d /ws_data -U /ws_data/mets_server.sock server start" + command += f" > {hpc_ws_dir}/mets_server.log 2>&1 &" + return command + + @staticmethod + def cmd_core_stop_mets_server(hpc_ws_dir: str, ph_sif_core: str) -> str: + command = f"apptainer exec --bind {hpc_ws_dir}:/ws_data {ph_sif_core}" + command += " ocrd workspace -d /ws_data -U /ws_data/mets_server.sock server stop" + return command + + @staticmethod + def cmd_core_list_file_groups(hpc_ws_dir: str, ph_sif_core: str) -> str: + command = f"apptainer exec --bind {hpc_ws_dir}:/ws_data {ph_sif_core}" + command += " ocrd workspace -d /ws_data list-group" + return command + + @staticmethod + def cmd_core_remove_file_group(hpc_ws_dir: str, ph_sif_core: str) -> str: + command = f"apptainer exec --bind {hpc_ws_dir}:/ws_data {ph_sif_core}" + command += " ocrd workspace -d /ws_data remove-group -r -f FILE_GROUP_PLACEHOLDER" + command += f" > {hpc_ws_dir}/remove_file_groups.log 2>&1" + return command diff --git a/src/utils/operandi_utils/hpc/ocrd_process_workflows/default_workflow.txt b/src/utils/operandi_utils/hpc/ocrd_process_workflows/default_workflow.txt new file mode 100644 index 00000000..b6374302 --- /dev/null +++ b/src/utils/operandi_utils/hpc/ocrd_process_workflows/default_workflow.txt @@ -0,0 +1,9 @@ +ocrd process \ + "cis-ocropy-binarize -I OCR-D-IMG -O OCR-D-BIN" \ + "anybaseocr-crop -I OCR-D-BIN -O OCR-D-CROP" \ + "skimage-binarize -I OCR-D-CROP -O OCR-D-BIN2 -P method li" \ + "skimage-denoise -I OCR-D-BIN2 -O OCR-D-BIN-DENOISE -P level-of-operation page" \ + "tesserocr-deskew -I OCR-D-BIN-DENOISE -O OCR-D-BIN-DENOISE-DESKEW -P operation_level page" \ + "cis-ocropy-segment -I OCR-D-BIN-DENOISE-DESKEW -O OCR-D-SEG -P level-of-operation page" \ + "cis-ocropy-dewarp -I OCR-D-SEG -O OCR-D-SEG-LINE-RESEG-DEWARP" \ + "calamari-recognize -I OCR-D-SEG-LINE-RESEG-DEWARP -O OCR-D-OCR -P checkpoint_dir qurator-gt4histocr-1.0" diff --git a/src/utils/operandi_utils/hpc/ocrd_process_workflows/odem_workflow.txt b/src/utils/operandi_utils/hpc/ocrd_process_workflows/odem_workflow.txt new file mode 100644 index 00000000..63d8e7bb --- /dev/null +++ b/src/utils/operandi_utils/hpc/ocrd_process_workflows/odem_workflow.txt @@ -0,0 +1,11 @@ +ocrd process \ + "cis-ocropy-binarize -I OCR-D-IMG -O OCR-D-BINPAGE -P dpi 300" \ + "anybaseocr-crop -I OCR-D-BINPAGE -O OCR-D-SEG-PAGE-ANYOCR -P dpi 300" \ + "cis-ocropy-denoise -I OCR-D-SEG-PAGE-ANYOCR -O OCR-D-DENOISE-OCROPY -P dpi 300" \ + "cis-ocropy-deskew -I OCR-D-DENOISE-OCROPY -O OCR-D-DESKEW-OCROPY -P level-of-operation page" \ + "tesserocr-segment-region -I OCR-D-DESKEW-OCROPY -O OCR-D-SEG-BLOCK-TESSERACT -P padding 5.0 -P find_tables false -P dpi 300" \ + "segment-repair -I OCR-D-SEG-BLOCK-TESSERACT -O OCR-D-SEGMENT-REPAIR -P plausibilize true -P plausibilize_merge_min_overlap 0.7" \ + "cis-ocropy-clip -I OCR-D-SEGMENT-REPAIR -O OCR-D-CLIP" \ + "cis-ocropy-segment -I OCR-D-CLIP -O OCR-D-SEGMENT-OCROPY -P dpi 300" \ + "cis-ocropy-dewarp -I OCR-D-SEGMENT-OCROPY -O OCR-D-DEWARP" \ + "tesserocr-recognize -I OCR-D-DEWARP -O OCR-D-OCR -P model Fraktur" diff --git a/src/utils/operandi_utils/hpc/ocrd_process_workflows/sbb_workflow.txt b/src/utils/operandi_utils/hpc/ocrd_process_workflows/sbb_workflow.txt new file mode 100644 index 00000000..49cdd8ad --- /dev/null +++ b/src/utils/operandi_utils/hpc/ocrd_process_workflows/sbb_workflow.txt @@ -0,0 +1,2 @@ +ocrd process \ + "tesserocr-recognize -I OCR-D-IMG -O OCR-D-OCR -P segmentation_level region -P textequiv_level word -P find_tables true -P model deu" diff --git a/src/utils/operandi_utils/hpc/ocrd_process_workflows/template_workflow.txt b/src/utils/operandi_utils/hpc/ocrd_process_workflows/template_workflow.txt new file mode 100644 index 00000000..2bbc6cda --- /dev/null +++ b/src/utils/operandi_utils/hpc/ocrd_process_workflows/template_workflow.txt @@ -0,0 +1,2 @@ +ocrd process \ + "cis-ocropy-binarize -I OCR-D-IMG -O OCR-D-BIN" diff --git a/src/utils/operandi_utils/oton/cli.py b/src/utils/operandi_utils/oton/cli.py index c1228124..5f21315f 100644 --- a/src/utils/operandi_utils/oton/cli.py +++ b/src/utils/operandi_utils/oton/cli.py @@ -15,19 +15,19 @@ def cli(): show_default=True, help='Path of the Nextflow workflow script to be generated.') @click.option('-E', '--environment', type=str, default="local", help='The environment of the output Nextflow file. One of: local, docker, apptainer.') -def convert(input_path: str, output_path: str, environment: str): +@click.option('-M', '--with_mets_server', type=bool, default=False, + help='Whether the Nextflow file will use a mets server or not. ' + 'If a Mets server is not used, then splitting and merging the mets files will be used.') +def convert(input_path: str, output_path: str, environment: str, with_mets_server): print(f"Converting from: {input_path}") print(f"Converting to: {output_path}") - if environment == "local": - OTONConverter().convert_oton_env_local(input_path, output_path) - elif environment == "docker": - OTONConverter().convert_oton_env_docker(input_path, output_path) - elif environment == "apptainer": - OTONConverter().convert_oton_env_apptainer(input_path, output_path) - else: - print("Unspecified environment type. Must be one of: local, docker, apptainer.") + environments = ["local", "docker", "apptainer"] + if environment not in environments: + print(f"Invalid environment value: {environment}. Must be one of: {environments}") exit(1) - print(f"Success: Converting workflow from ocrd process to Nextflow with {environment} processor calls") + OTONConverter().convert_oton(input_path, output_path, environment, with_mets_server) + print(f"Success: Converting workflow from ocrd process to Nextflow with {environment} processor calls. " + f"The Nextflow workflow will utilize a mets server: {with_mets_server}") @cli.command("validate", help="Validate an OCR-D workflow txt file.") diff --git a/src/utils/operandi_utils/oton/constants.py b/src/utils/operandi_utils/oton/constants.py index 1482b668..180fc94b 100644 --- a/src/utils/operandi_utils/oton/constants.py +++ b/src/utils/operandi_utils/oton/constants.py @@ -3,31 +3,14 @@ from pkg_resources import resource_filename from operandi_utils.constants import OPERANDI_VERSION - -__all__ = [ - "DIR_IN", - "DIR_OUT", - "METS_FILE", - - "OCRD_ALL_JSON", - "OTON_LOG_LEVEL", - "OTON_LOG_FORMAT", - - "PARAMS_KEY_INPUT_FILE_GRP", - "PARAMS_KEY_METS_PATH", - - "PH_ENV_WRAPPER", - "PH_DIR_IN", - "PH_DIR_OUT", - "PH_METS_FILE", - - "REPR_ENV_WRAPPER", - "REPR_INPUT_FILE_GRP", - "REPR_METS_PATH", - "REPR_WORKSPACE_DIR", - "SPACES", - "WORKFLOW_COMMENT" -] +BS: str = '{}' +SPACES = ' ' +CONST_DIR_IN: str = 'input_group' +CONST_DIR_OUT: str = 'output_group' +CONST_PAGE_RANGE: str = 'page_range' +CONST_METS_PATH: str = 'mets_path' +CONST_METS_SOCKET_PATH: str = 'mets_socket_path' +CONST_WORKSPACE_DIR: str = 'workspace_dir' OCRD_ALL_JSON_FILE = resource_filename(__name__, 'ocrd_all_tool.json') with open(OCRD_ALL_JSON_FILE) as f: @@ -36,26 +19,17 @@ OTON_LOG_LEVEL = environ.get("OTON_LOG_LEVEL", "INFO") OTON_LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s:%(funcName)s: %(lineno)s: %(message)s' +PARAMS_KEY_ENV_WRAPPER_CMD_CORE: str = 'params.env_wrapper_cmd_core' +PARAMS_KEY_ENV_WRAPPER_CMD_STEP: str = 'params.env_wrapper_cmd_step' PARAMS_KEY_INPUT_FILE_GRP: str = 'params.input_file_group' PARAMS_KEY_METS_PATH: str = 'params.mets_path' +PARAMS_KEY_METS_SOCKET_PATH: str = 'params.mets_socket_path' PARAMS_KEY_WORKSPACE_DIR: str = 'params.workspace_dir' -PARAMS_KEY_ENV_WRAPPER: str = 'params.env_wrapper' - -REPR_INPUT_FILE_GRP: str = f"""{PARAMS_KEY_INPUT_FILE_GRP} = "null\"""" -REPR_METS_PATH: str = f"""{PARAMS_KEY_METS_PATH} = "null\"""" -REPR_WORKSPACE_DIR: str = f"""{PARAMS_KEY_WORKSPACE_DIR} = "null\"""" -REPR_ENV_WRAPPER: str = f"""{PARAMS_KEY_ENV_WRAPPER} = "null\"""" - -DIR_IN: str = 'input_file_group' -DIR_OUT: str = 'output_file_group' -METS_FILE: str = 'mets_file' - -# Placeholders -BS: str = '{}' -PH_ENV_WRAPPER: str = f'${BS[0]}{PARAMS_KEY_ENV_WRAPPER}{BS[1]}' -PH_DIR_IN: str = f'${BS[0]}{DIR_IN}{BS[1]}' -PH_DIR_OUT: str = f'${BS[0]}{DIR_OUT}{BS[1]}' -PH_METS_FILE: str = f'${BS[0]}{METS_FILE}{BS[1]}' -SPACES = ' ' +PARAMS_KEY_PAGES: str = 'params.pages' +PARAMS_KEY_CPUS: str = 'params.cpus' +PARAMS_KEY_RAM: str = 'params.ram' +PARAMS_KEY_FORKS: str = 'params.forks' +PARAMS_KEY_CPUS_PER_FORK: str = 'params.cpus_per_fork' +PARAMS_KEY_RAM_PER_FORK: str = 'params.ram_per_fork' WORKFLOW_COMMENT = f"// This workflow was automatically generated by the v{OPERANDI_VERSION} operandi_utils.oton module" diff --git a/src/utils/operandi_utils/oton/nf_block_process.py b/src/utils/operandi_utils/oton/nf_block_process.py index d2f857e5..2f1cb832 100644 --- a/src/utils/operandi_utils/oton/nf_block_process.py +++ b/src/utils/operandi_utils/oton/nf_block_process.py @@ -1,20 +1,21 @@ from logging import getLevelName, getLogger from operandi_utils.oton.ocrd_validator import ProcessorCallArguments -from operandi_utils.oton.constants import OTON_LOG_LEVEL, PH_ENV_WRAPPER, SPACES +from operandi_utils.oton.constants import BS, OTON_LOG_LEVEL, PARAMS_KEY_ENV_WRAPPER_CMD_STEP, SPACES class NextflowBlockProcess: def __init__(self, processor_call_arguments: ProcessorCallArguments, index_pos: int, env_wrapper: bool = False): self.logger = getLogger(__name__) self.logger.setLevel(getLevelName(OTON_LOG_LEVEL)) + self.index_pos = str(index_pos) self.processor_call_arguments: ProcessorCallArguments = processor_call_arguments self.env_wrapper: bool = env_wrapper - self.nf_process_name: str = processor_call_arguments.executable.replace('-', '_') + "_" + str(index_pos) + self.nf_process_name: str = processor_call_arguments.executable.replace('-', '_') + f"_{self.index_pos}" self.directives = {} self.input_params = {} self.output_params = {} - + self.script = "" self.ocrd_command_bash = processor_call_arguments.dump_bash_form() self.ocrd_command_bash_placeholders = processor_call_arguments.dump_bash_form_with_placeholders() @@ -54,23 +55,26 @@ def dump_parameters_output(self) -> str: dump += '\n' return dump - def dump_script(self) -> str: + def dump_script(self, local_script: bool = False) -> str: + if local_script: + return self.script dump = '' dump += f'{SPACES}{SPACES}"""\n' dump += f'{SPACES}{SPACES}' if self.env_wrapper: - dump += f'{PH_ENV_WRAPPER} ' + dump += f'${BS[0]}{PARAMS_KEY_ENV_WRAPPER_CMD_STEP}{self.index_pos}{BS[1]} ' dump += f'{self.ocrd_command_bash_placeholders}\n' dump += f'{SPACES}{SPACES}"""\n' return dump - def file_representation(self): + def file_representation(self, local_script: bool = False): representation = f'process {self.nf_process_name}' representation += ' {\n' representation += self.dump_directives() representation += f'{SPACES}input:\n{self.dump_parameters_input()}' - representation += f'{SPACES}output:\n{self.dump_parameters_output()}' - representation += f'{SPACES}script:\n{self.dump_script()}' + if len(self.output_params) > 0: + representation += f'{SPACES}output:\n{self.dump_parameters_output()}' + representation += f'{SPACES}script:\n{self.dump_script(local_script=local_script)}' representation += '}\n' self.logger.debug(f"\n{representation}") return representation diff --git a/src/utils/operandi_utils/oton/nf_block_workflow.py b/src/utils/operandi_utils/oton/nf_block_workflow.py index 95eaf62e..e37b64ec 100644 --- a/src/utils/operandi_utils/oton/nf_block_workflow.py +++ b/src/utils/operandi_utils/oton/nf_block_workflow.py @@ -1,31 +1,55 @@ from logging import getLevelName, getLogger from typing import List -from operandi_utils.oton.constants import OTON_LOG_LEVEL, PARAMS_KEY_METS_PATH, PARAMS_KEY_INPUT_FILE_GRP, SPACES +from operandi_utils.oton.constants import ( + OTON_LOG_LEVEL, PARAMS_KEY_WORKSPACE_DIR, PARAMS_KEY_INPUT_FILE_GRP, PARAMS_KEY_FORKS, SPACES +) from operandi_utils.oton.nf_block_process import NextflowBlockProcess - class NextflowBlockWorkflow: - def __init__(self, workflow_name: str, nf_processes: List[NextflowBlockProcess]): + def __init__( + self, workflow_name: str, + nf_processes: List[NextflowBlockProcess], + nf_split_block: NextflowBlockProcess, + nf_merge_mets: NextflowBlockProcess, + with_mets_server: bool = False + ): self.logger = getLogger(__name__) self.logger.setLevel(getLevelName(OTON_LOG_LEVEL)) + self.with_mets_server = with_mets_server self.workflow_name = workflow_name self.workflow_calls: List[str] = [] - self.produce_workflow_calls(nf_processes) + self.produce_workflow_calls(nf_processes, nf_split_block, nf_merge_mets) - def produce_workflow_calls(self, nf_blocks_process: List[NextflowBlockProcess]): + def produce_workflow_calls( + self, + nf_blocks_process: List[NextflowBlockProcess], + nf_split_page_ranges: NextflowBlockProcess, + nf_merge_mets: NextflowBlockProcess + ): + self.workflow_calls.append(f"ch_range_multipliers = Channel.of(0..{PARAMS_KEY_FORKS}.intValue()-1)\n") + self.workflow_calls.append(f"{nf_split_page_ranges.nf_process_name}(ch_range_multipliers)\n") previous_nfp = None for block_process in nf_blocks_process: in_file_grps = block_process.processor_call_arguments.input_file_grps out_file_grps = block_process.processor_call_arguments.output_file_grps workflow_call = f"{block_process.nf_process_name}(" if previous_nfp is None: - workflow_call += f'{PARAMS_KEY_METS_PATH}, {PARAMS_KEY_INPUT_FILE_GRP}, "{out_file_grps}"' + workflow_call += ( + f'{nf_split_page_ranges.nf_process_name}.out[0], {nf_split_page_ranges.nf_process_name}.out[1], ' + f'{PARAMS_KEY_WORKSPACE_DIR}, {PARAMS_KEY_INPUT_FILE_GRP}, "{out_file_grps}"' + ) else: - workflow_call += f'{previous_nfp}.out, "{in_file_grps}", "{out_file_grps}"' + workflow_call += ( + f'{previous_nfp}.out[0], {previous_nfp}.out[1], {previous_nfp}.out[2], "{in_file_grps}",' + f' "{out_file_grps}"' + ) workflow_call += ")\n" previous_nfp = block_process.nf_process_name self.workflow_calls.append(workflow_call) + if not self.with_mets_server: + self.workflow_calls.append( + f"{nf_merge_mets.nf_process_name}({previous_nfp}.out[0], {previous_nfp}.out[1])\n") def file_representation(self): representation = 'workflow {\n' diff --git a/src/utils/operandi_utils/oton/nf_file_executable.py b/src/utils/operandi_utils/oton/nf_file_executable.py index a0cdae48..57d8d31c 100644 --- a/src/utils/operandi_utils/oton/nf_file_executable.py +++ b/src/utils/operandi_utils/oton/nf_file_executable.py @@ -3,13 +3,21 @@ from operandi_utils.oton.ocrd_validator import ProcessorCallArguments from operandi_utils.oton.constants import ( - DIR_IN, DIR_OUT, METS_FILE, + BS, CONST_DIR_IN, CONST_DIR_OUT, CONST_PAGE_RANGE, CONST_METS_PATH, CONST_WORKSPACE_DIR, OTON_LOG_LEVEL, PARAMS_KEY_INPUT_FILE_GRP, - REPR_ENV_WRAPPER, - REPR_INPUT_FILE_GRP, - REPR_METS_PATH, - REPR_WORKSPACE_DIR, + PARAMS_KEY_METS_PATH, + PARAMS_KEY_WORKSPACE_DIR, + PARAMS_KEY_ENV_WRAPPER_CMD_CORE, + PARAMS_KEY_ENV_WRAPPER_CMD_STEP, + PARAMS_KEY_FORKS, + PARAMS_KEY_PAGES, + PARAMS_KEY_CPUS, + PARAMS_KEY_CPUS_PER_FORK, + PARAMS_KEY_RAM, + PARAMS_KEY_RAM_PER_FORK, + PARAMS_KEY_METS_SOCKET_PATH, + SPACES, WORKFLOW_COMMENT ) from operandi_utils.oton.nf_block_process import NextflowBlockProcess @@ -21,99 +29,190 @@ def __init__(self): self.logger = getLogger(__name__) self.logger.setLevel(getLevelName(OTON_LOG_LEVEL)) - self.nf_lines_parameters: List[str] = [] + self.supported_environments = ["local", "docker", "apptainer"] + self.nf_lines_parameters = {} + self.nf_process_split_range = None + self.nf_process_merging_mets = None self.nf_blocks_process: List[NextflowBlockProcess] = [] self.nf_blocks_workflow: List[NextflowBlockWorkflow] = [] - def build_parameters_local(self): - self.nf_lines_parameters.append('nextflow.enable.dsl = 2') - self.nf_lines_parameters.append('') - - self.nf_lines_parameters.append(REPR_INPUT_FILE_GRP) - self.nf_lines_parameters.append(REPR_METS_PATH) - self.nf_lines_parameters.append(REPR_WORKSPACE_DIR) - - self.nf_lines_parameters.append('') - - def build_parameters_docker(self): - self.nf_lines_parameters.append('nextflow.enable.dsl = 2') - self.nf_lines_parameters.append('') - - self.nf_lines_parameters.append(REPR_INPUT_FILE_GRP) - self.nf_lines_parameters.append(REPR_METS_PATH) - self.nf_lines_parameters.append(REPR_WORKSPACE_DIR) - self.nf_lines_parameters.append(REPR_ENV_WRAPPER) - - self.nf_lines_parameters.append('') - - def build_parameters_apptainer(self): - self.nf_lines_parameters.append('nextflow.enable.dsl = 2') - self.nf_lines_parameters.append('') - - self.nf_lines_parameters.append(REPR_INPUT_FILE_GRP) - self.nf_lines_parameters.append(REPR_METS_PATH) - self.nf_lines_parameters.append(REPR_WORKSPACE_DIR) - self.nf_lines_parameters.append(REPR_ENV_WRAPPER) - - self.nf_lines_parameters.append('') - - def build_nextflow_processes_local(self, ocrd_processor: List[ProcessorCallArguments]): - index = 0 - for processor in ocrd_processor: - nf_process_block = NextflowBlockProcess(processor, index, env_wrapper=False) - nf_process_block.add_directive(directive='maxForks', value='1') - nf_process_block.add_parameter_input(parameter=METS_FILE, parameter_type='path') - nf_process_block.add_parameter_input(parameter=DIR_IN, parameter_type='val') - nf_process_block.add_parameter_input(parameter=DIR_OUT, parameter_type='val') - nf_process_block.add_parameter_output(parameter=METS_FILE, parameter_type='path') - self.nf_blocks_process.append(nf_process_block) - index += 1 - - def build_nextflow_processes_docker(self, ocrd_processor: List[ProcessorCallArguments]): + def build_parameters(self, environment: str, with_mets_server: bool): + if environment not in self.supported_environments: + raise ValueError(f"Invalid environment value: {environment}. Must be one of: {self.supported_environments}") + + self.nf_lines_parameters[PARAMS_KEY_INPUT_FILE_GRP] = '"null"' + self.nf_lines_parameters[PARAMS_KEY_METS_PATH] = '"null"' + self.nf_lines_parameters[PARAMS_KEY_WORKSPACE_DIR] = '"null"' + self.nf_lines_parameters[PARAMS_KEY_PAGES] = '"null"' + + if with_mets_server: + self.nf_lines_parameters[PARAMS_KEY_METS_SOCKET_PATH] = '"null"' + + if environment == "local": + self.nf_lines_parameters[PARAMS_KEY_FORKS] = '"4"' + if environment == "docker": + self.nf_lines_parameters[PARAMS_KEY_FORKS] = '"4"' + self.nf_lines_parameters[PARAMS_KEY_ENV_WRAPPER_CMD_CORE] = '"null"' + if environment == "apptainer": + self.nf_lines_parameters[PARAMS_KEY_CPUS] = '"null"' + self.nf_lines_parameters[PARAMS_KEY_RAM] = '"null"' + self.nf_lines_parameters[PARAMS_KEY_FORKS] = f'{PARAMS_KEY_CPUS}' + self.nf_lines_parameters[PARAMS_KEY_CPUS_PER_FORK] = \ + f'({PARAMS_KEY_CPUS}.toInteger() / {PARAMS_KEY_FORKS}.toInteger()).intValue()' + self.nf_lines_parameters[PARAMS_KEY_RAM_PER_FORK] = \ + f'sprintf("%dGB", ({PARAMS_KEY_RAM}.toInteger() / {PARAMS_KEY_FORKS}.toInteger()).intValue())' + self.nf_lines_parameters[PARAMS_KEY_ENV_WRAPPER_CMD_CORE] = '"null"' + + # TODO: Refactor later + def build_split_page_ranges_process(self, environment: str, with_mets_server: bool) -> NextflowBlockProcess: + block = NextflowBlockProcess(ProcessorCallArguments(executable="split-page-ranges"), 0) + block.nf_process_name = "split_page_ranges" + block.ocrd_command_bash = "" + block.ocrd_command_bash_placeholders = "" + + block.add_directive(directive='debug', value='true') + block.add_directive(directive='maxForks', value=PARAMS_KEY_FORKS) + if environment == "apptainer": + block.add_directive(directive='cpus', value=PARAMS_KEY_CPUS_PER_FORK) + block.add_directive(directive='memory', value=PARAMS_KEY_RAM_PER_FORK) + + block.add_parameter_input(parameter="range_multiplier", parameter_type="val") + block.add_parameter_output(parameter="mets_file_chunk", parameter_type="env") + block.add_parameter_output(parameter="current_range_pages", parameter_type="env") + + PH_RANGE_MULTIPLIER = '${range_multiplier}' + bash_cmd_ocrd_ws = ( + f"ocrd workspace -d ${BS[0]}{PARAMS_KEY_WORKSPACE_DIR}{BS[1]} list-page -f comma-separated " + f"-D ${BS[0]}{PARAMS_KEY_FORKS}{BS[1]} -C {PH_RANGE_MULTIPLIER}" + ) + bash_cmd_copy_mets_chunk = f"cp -p ${BS[0]}{PARAMS_KEY_METS_PATH}{BS[1]} \\$mets_file_chunk" + + script = f'{SPACES}{SPACES}"""\n{SPACES}{SPACES}' + script += f"current_range_pages=\\$(" + if environment == "apptainer" or environment == "docker": + script += f"${BS[0]}{PARAMS_KEY_ENV_WRAPPER_CMD_CORE}{BS[1]} " + script += f"{bash_cmd_ocrd_ws})\n" + script += f'{SPACES}{SPACES}echo "Current range is: \\$current_range_pages"\n' + + if with_mets_server: + script += f"{SPACES}{SPACES}mets_file_chunk=\\$(echo ${BS[0]}{PARAMS_KEY_METS_PATH}{BS[1]})\n" + + if not with_mets_server: + script += f"{SPACES}{SPACES}mets_file_chunk=\\$(echo ${BS[0]}{PARAMS_KEY_WORKSPACE_DIR}{BS[1]}/mets_{PH_RANGE_MULTIPLIER}.xml)\n" + script += f'{SPACES}{SPACES}echo "Mets file chunk path: \\$mets_file_chunk"\n' + script += f"{SPACES}{SPACES}\\$(" + if environment == "apptainer" or environment == "docker": + script += f"${BS[0]}{PARAMS_KEY_ENV_WRAPPER_CMD_CORE}{BS[1]} " + script += f"{bash_cmd_copy_mets_chunk})\n" + script += f'{SPACES}{SPACES}"""\n' + block.script = script + self.nf_process_split_range = block + return block + + # TODO: Refactor later + def build_merge_mets_process(self, environment: str) -> NextflowBlockProcess: + block = NextflowBlockProcess(ProcessorCallArguments(executable="merging-mets"), 0) + block.nf_process_name = "merging_mets" + block.ocrd_command_bash = "" + block.ocrd_command_bash_placeholders = "" + + block.add_directive(directive='debug', value='true') + # Warning, do not set that to another value. Merging of mets must always be a single instance + block.add_directive(directive='maxForks', value='1') + if environment == "apptainer": + block.add_directive(directive='cpus', value=PARAMS_KEY_CPUS_PER_FORK) + block.add_directive(directive='memory', value=PARAMS_KEY_RAM_PER_FORK) + + block.add_parameter_input(parameter="mets_file_chunk", parameter_type="val") + block.add_parameter_input(parameter="page_range", parameter_type="val") + + PH_METS_FILE_CHUNK = "${mets_file_chunk}" + PH_PAGE_RANGE = "${page_range}" + bash_cmd_ocrd_ws = ( + f"ocrd workspace -d ${BS[0]}{PARAMS_KEY_WORKSPACE_DIR}{BS[1]} " + f"merge --force --no-copy-files {PH_METS_FILE_CHUNK} " + f"--page-id {PH_PAGE_RANGE}" + ) + script = f'{SPACES}{SPACES}"""\n{SPACES}{SPACES}' + if environment == "apptainer" or environment == "docker": + script += f"${BS[0]}{PARAMS_KEY_ENV_WRAPPER_CMD_CORE}{BS[1]} " + script += f"{bash_cmd_ocrd_ws}\n{SPACES}{SPACES}" + if environment == "apptainer" or environment == "docker": + script += f"${BS[0]}{PARAMS_KEY_ENV_WRAPPER_CMD_CORE}{BS[1]} " + script += f"rm {PH_METS_FILE_CHUNK}\n" + script += f'{SPACES}{SPACES}"""\n' + block.script = script + self.nf_process_merging_mets = block + return block + + def build_nextflow_processes( + self, ocrd_processors: List[ProcessorCallArguments], environment: str, with_mets_server: bool = False + ): index = 0 - for processor in ocrd_processor: - nf_process_block = NextflowBlockProcess(processor, index, env_wrapper=True) - nf_process_block.add_directive(directive='maxForks', value='1') - nf_process_block.add_parameter_input(parameter=METS_FILE, parameter_type='path') - nf_process_block.add_parameter_input(parameter=DIR_IN, parameter_type='val') - nf_process_block.add_parameter_input(parameter=DIR_OUT, parameter_type='val') - nf_process_block.add_parameter_output(parameter=METS_FILE, parameter_type='path') + env_wrapper = True if environment == "docker" or environment == "apptainer" else False + self.build_split_page_ranges_process(environment=environment, with_mets_server=with_mets_server) + self.build_merge_mets_process(environment=environment) + for processor in ocrd_processors: + nf_process_block = NextflowBlockProcess(processor, index, env_wrapper=env_wrapper) + + # Add Nextflow process directives + nf_process_block.add_directive(directive='debug', value='true') + nf_process_block.add_directive(directive='maxForks', value=PARAMS_KEY_FORKS) + if environment == "apptainer": + nf_process_block.add_directive(directive='cpus', value=PARAMS_KEY_CPUS_PER_FORK) + nf_process_block.add_directive(directive='memory', value=PARAMS_KEY_RAM_PER_FORK) + + # Add Nextflow process parameters + nf_process_block.add_parameter_input(parameter=CONST_METS_PATH, parameter_type='val') + nf_process_block.add_parameter_input(parameter=CONST_PAGE_RANGE, parameter_type='val') + nf_process_block.add_parameter_input(parameter=CONST_WORKSPACE_DIR, parameter_type='val') + nf_process_block.add_parameter_input(parameter=CONST_DIR_IN, parameter_type='val') + nf_process_block.add_parameter_input(parameter=CONST_DIR_OUT, parameter_type='val') + + nf_process_block.add_parameter_output(parameter=CONST_METS_PATH, parameter_type='val') + nf_process_block.add_parameter_output(parameter=CONST_PAGE_RANGE, parameter_type='val') + nf_process_block.add_parameter_output(parameter=CONST_WORKSPACE_DIR, parameter_type='val') + self.nf_lines_parameters[f'{PARAMS_KEY_ENV_WRAPPER_CMD_STEP}{index}'] = '"null"' self.nf_blocks_process.append(nf_process_block) index += 1 - def build_nextflow_processes_apptainer(self, ocrd_processor: List[ProcessorCallArguments]): - index = 0 - for processor in ocrd_processor: - nf_process_block = NextflowBlockProcess(processor, index, env_wrapper=True) - nf_process_block.add_directive(directive='maxForks', value='1') - nf_process_block.add_parameter_input(parameter=METS_FILE, parameter_type='path') - nf_process_block.add_parameter_input(parameter=DIR_IN, parameter_type='val') - nf_process_block.add_parameter_input(parameter=DIR_OUT, parameter_type='val') - nf_process_block.add_parameter_output(parameter=METS_FILE, parameter_type='path') - self.nf_blocks_process.append(nf_process_block) - index += 1 + def build_log_info_prints(self) -> str: + log_info = f'log.info """\\\n' + log_info += f"{SPACES}OPERANDI HPC - Nextflow Workflow\n" + log_info += f"{SPACES}===================================================\n" + for key, value in self.nf_lines_parameters.items(): + log_info += f"{SPACES}{key[len('params.'):]}: ${BS[0]}{key}{BS[1]}\n" + log_info += f'{SPACES}""".stripIndent()\n' + return log_info - def __assign_first_file_grps_param(self): + def build_main_workflow(self, with_mets_server: bool): first_file_grps = self.nf_blocks_process[0].processor_call_arguments.input_file_grps - index = 0 - for parameter in self.nf_lines_parameters: - if PARAMS_KEY_INPUT_FILE_GRP in parameter: - self.nf_lines_parameters[index] = parameter.replace("null", first_file_grps) - break - index += 1 - - def build_main_workflow(self): - self.__assign_first_file_grps_param() - nf_workflow_block = NextflowBlockWorkflow(workflow_name="main", nf_processes=self.nf_blocks_process) + self.nf_lines_parameters[PARAMS_KEY_INPUT_FILE_GRP] = f'"{first_file_grps}"' + nf_workflow_block = NextflowBlockWorkflow( + workflow_name="main", + nf_processes=self.nf_blocks_process, + nf_split_block=self.nf_process_split_range, + nf_merge_mets=self.nf_process_merging_mets, + with_mets_server=with_mets_server + ) self.nf_blocks_workflow.append(nf_workflow_block) - def produce_nextflow_file(self, output_path: str): + # TODO: Refactor later + def produce_nextflow_file(self, output_path: str, environment: str, with_mets_server: bool): # Write Nextflow line tokens to an output file with open(output_path, mode='w', encoding='utf-8') as nextflow_file: nextflow_file.write(f"{WORKFLOW_COMMENT}\n") - for nextflow_line in self.nf_lines_parameters: - nextflow_file.write(f'{nextflow_line}\n') + nextflow_file.write("nextflow.enable.dsl = 2\n") + nextflow_file.write("\n") + for key, value in self.nf_lines_parameters.items(): + nextflow_file.write(f'{key} = {value}\n') + nextflow_file.write("\n") + nextflow_file.write(self.build_log_info_prints()) + nextflow_file.write("\n") + nextflow_file.write(f'{self.nf_process_split_range.file_representation(local_script=True)}\n') for block in self.nf_blocks_process: - nextflow_file.write(f'{block.file_representation()}\n') + nextflow_file.write(f'{block.file_representation(local_script=False)}\n') + if not with_mets_server: + nextflow_file.write(f'{self.nf_process_merging_mets.file_representation(local_script=True)}\n') for block in self.nf_blocks_workflow: nextflow_file.write(f'{block.file_representation()}\n') diff --git a/src/utils/operandi_utils/oton/oton_converter.py b/src/utils/operandi_utils/oton/oton_converter.py index 65ff07ff..33d4856b 100755 --- a/src/utils/operandi_utils/oton/oton_converter.py +++ b/src/utils/operandi_utils/oton/oton_converter.py @@ -6,27 +6,13 @@ class OTONConverter: def __init__(self): self.ocrd_validator = OCRDValidator() - def __convert_oton(self, input_path: str, output_path: str, environment: str): + # TODO: Refactor later + def convert_oton(self, input_path: str, output_path: str, environment: str, with_mets_server: bool): list_processor_call_arguments = self.ocrd_validator.validate(input_path) nf_file_executable = NextflowFileExecutable() - if environment == "local": - nf_file_executable.build_parameters_local() - nf_file_executable.build_nextflow_processes_local(ocrd_processor=list_processor_call_arguments) - elif environment == "docker": - nf_file_executable.build_parameters_docker() - nf_file_executable.build_nextflow_processes_docker(ocrd_processor=list_processor_call_arguments) - elif environment == "apptainer": - nf_file_executable.build_parameters_apptainer() - nf_file_executable.build_nextflow_processes_apptainer(ocrd_processor=list_processor_call_arguments) - nf_file_executable.build_main_workflow() - nf_file_executable.produce_nextflow_file(output_path) + nf_file_executable.build_parameters(environment=environment, with_mets_server=with_mets_server) + nf_file_executable.build_nextflow_processes( + ocrd_processors=list_processor_call_arguments, environment=environment, with_mets_server=with_mets_server) + nf_file_executable.build_main_workflow(with_mets_server=with_mets_server) + nf_file_executable.produce_nextflow_file(output_path, environment, with_mets_server) return nf_file_executable - - def convert_oton_env_local(self, input_path: str, output_path: str) -> NextflowFileExecutable: - return self.__convert_oton(input_path, output_path, environment="local") - - def convert_oton_env_docker(self, input_path: str, output_path: str) -> NextflowFileExecutable: - return self.__convert_oton(input_path, output_path, environment="docker") - - def convert_oton_env_apptainer(self, input_path: str, output_path: str) -> NextflowFileExecutable: - return self.__convert_oton(input_path, output_path, environment="apptainer") diff --git a/src/utils/operandi_utils/oton/process_call_arguments.py b/src/utils/operandi_utils/oton/process_call_arguments.py index 5f701eba..e1f7600e 100644 --- a/src/utils/operandi_utils/oton/process_call_arguments.py +++ b/src/utils/operandi_utils/oton/process_call_arguments.py @@ -1,7 +1,10 @@ from json import dumps as json_dumps from logging import getLevelName, getLogger from typing import Optional -from operandi_utils.oton.constants import OCRD_ALL_JSON, OTON_LOG_LEVEL, PH_DIR_IN, PH_DIR_OUT, PH_METS_FILE +from operandi_utils.oton.constants import ( + BS, CONST_DIR_IN, CONST_DIR_OUT, CONST_WORKSPACE_DIR, CONST_METS_PATH, CONST_METS_SOCKET_PATH, + OCRD_ALL_JSON, OTON_LOG_LEVEL +) # This class is based on ocrd.task_sequence.ProcessorTask class ProcessorCallArguments: @@ -11,7 +14,10 @@ def __init__( input_file_grps: Optional[str] = None, output_file_grps: Optional[str] = None, parameters: Optional[dict] = None, - mets_file_path: str = "./mets.xml" + workspace_dir: str = None, + mets_file_path: str = None, + mets_socket_path: str = None, + page_id: str = None ): if not executable: raise ValueError(f"Missing executable name") @@ -19,18 +25,28 @@ def __init__( self.logger.setLevel(getLevelName(OTON_LOG_LEVEL)) self.executable = f'ocrd-{executable}' + self.workspace_dir = workspace_dir self.mets_file_path = mets_file_path + self.mets_socket_path = mets_socket_path self.input_file_grps = input_file_grps self.output_file_grps = output_file_grps + self.page_id = page_id self.parameters = parameters if parameters else {} self.ocrd_tool_json = OCRD_ALL_JSON.get(self.executable, None) def dump_bash_form(self) -> str: dump = '' dump += f'{self.executable}' - dump += f' -m {self.mets_file_path}' + if self.mets_socket_path: + dump += f' -U {self.mets_socket_path}' + if self.workspace_dir: + dump += f' -w {self.workspace_dir}' + if self.mets_file_path: + dump += f' -m {self.mets_file_path}' dump += f' -I {self.input_file_grps}' dump += f' -O {self.output_file_grps}' + if self.page_id: + dump += f' --page_id {self.page_id}' if self.parameters: dump += f" -p '{json_dumps(self.parameters)}'" return dump @@ -38,9 +54,12 @@ def dump_bash_form(self) -> str: def dump_bash_form_with_placeholders(self): dump = '' dump += f'{self.executable}' - dump += f' -m {PH_METS_FILE}' - dump += f' -I {PH_DIR_IN}' - dump += f' -O {PH_DIR_OUT}' + if self.mets_socket_path: + dump += f' -U ${BS[0]}{CONST_METS_SOCKET_PATH}{BS[1]}' + dump += f' -w ${BS[0]}{CONST_WORKSPACE_DIR}{BS[1]}' + dump += f' -m ${BS[0]}{CONST_METS_PATH}{BS[1]}' + dump += f' -I ${BS[0]}{CONST_DIR_IN}{BS[1]}' + dump += f' -O ${BS[0]}{CONST_DIR_OUT}{BS[1]}' if self.parameters: dump += f" -p '{json_dumps(self.parameters)}'" return dump diff --git a/src/utils/operandi_utils/utils.py b/src/utils/operandi_utils/utils.py index b68bbfc3..c9a3cb1e 100644 --- a/src/utils/operandi_utils/utils.py +++ b/src/utils/operandi_utils/utils.py @@ -83,9 +83,11 @@ def is_url_responsive(url: str) -> bool: return False -def get_nf_workflows_dir() -> Path: +def get_nf_wfs_dir() -> Path: return Path(dirname(__file__), "hpc", "nextflow_workflows") +def get_ocrd_process_wfs_dir() -> Path: + return Path(dirname(__file__), "hpc", "ocrd_process_workflows") def generate_id(file_ext: str = None): generated_id = str(uuid4()) diff --git a/src/utils/setup.py b/src/utils/setup.py index 9a79e4db..77b08639 100644 --- a/src/utils/setup.py +++ b/src/utils/setup.py @@ -20,7 +20,9 @@ 'operandi_utils.oton', 'operandi_utils.rabbitmq' ], - package_data={'': ['batch_scripts/*.sh', 'nextflow_workflows/*.nf', 'ocrd_all_tool.json']}, + package_data={ + '': ['batch_scripts/*.sh', 'nextflow_workflows/*.nf', 'ocrd_process_workflows/*.txt', 'ocrd_all_tool.json'] + }, install_requires=install_requires, entry_points={ 'console_scripts': [ diff --git a/tests/assets/oton/constants.py b/tests/assets/oton/constants.py index f92a797f..6e540e84 100644 --- a/tests/assets/oton/constants.py +++ b/tests/assets/oton/constants.py @@ -6,12 +6,17 @@ IN_TXT_WF4 = f'{OTON_RESOURCES_DIR}/workflow4.txt' OUT_NF_WF1_APPTAINER = f'{OTON_RESOURCES_DIR}/test_output_nextflow1_apptainer.nf' +OUT_NF_WF1_APPTAINER_WITH_MS = f'{OTON_RESOURCES_DIR}/test_output_nextflow1_apptainer_with_MS.nf' OUT_NF_WF1_DOCKER = f'{OTON_RESOURCES_DIR}/test_output_nextflow1_docker.nf' -OUT_NF_WF1_LOCAL = f'{OTON_RESOURCES_DIR}/test_output_nextflow1.nf' +OUT_NF_WF1_DOCKER_WITH_MS = f'{OTON_RESOURCES_DIR}/test_output_nextflow1_docker_with_MS.nf' +OUT_NF_WF1_LOCAL = f'{OTON_RESOURCES_DIR}/test_output_nextflow1_local.nf' +OUT_NF_WF1_LOCAL_WITH_MS = f'{OTON_RESOURCES_DIR}/test_output_nextflow1_local_with_MS.nf' OUT_NF_WF2_LOCAL = f'{OTON_RESOURCES_DIR}/test_output_nextflow2.nf' OUT_NF_WF3_LOCAL = f'{OTON_RESOURCES_DIR}/test_output_nextflow3.nf' OUT_NF_WF4_LOCAL = f'{OTON_RESOURCES_DIR}/test_output_nextflow4.nf' + + INVALID_WF1 = f'{OTON_RESOURCES_DIR}/invalid_workflow1.txt' INVALID_WF2 = f'{OTON_RESOURCES_DIR}/invalid_workflow2.txt' INVALID_WF3 = f'{OTON_RESOURCES_DIR}/invalid_workflow3.txt' @@ -20,70 +25,112 @@ EXPECTED_WF1 = """ workflow { main: - ocrd_cis_ocropy_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN2") - ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out, "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out, "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out, "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_calamari_recognize_7.out[0], ocrd_calamari_recognize_7.out[1]) +} +""" + +EXPECTED_WF1_WITH_MS = """ +workflow { + main: + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") } """ EXPECTED_WF2 = """ workflow { main: - ocrd_cis_ocropy_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_denoise_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew_3(ocrd_skimage_denoise_2.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_tesserocr_segment_4(ocrd_tesserocr_deskew_3.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp_5(ocrd_tesserocr_segment_4.out, "OCR-D-SEG", "OCR-D-SEG-DEWARP") - ocrd_tesserocr_recognize_6(ocrd_cis_ocropy_dewarp_5.out, "OCR-D-SEG-DEWARP", "OCR-D-OCR") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_denoise_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_3(ocrd_skimage_denoise_2.out[0], ocrd_skimage_denoise_2.out[1], ocrd_skimage_denoise_2.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_tesserocr_segment_4(ocrd_tesserocr_deskew_3.out[0], ocrd_tesserocr_deskew_3.out[1], ocrd_tesserocr_deskew_3.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_5(ocrd_tesserocr_segment_4.out[0], ocrd_tesserocr_segment_4.out[1], ocrd_tesserocr_segment_4.out[2], "OCR-D-SEG", "OCR-D-SEG-DEWARP") + ocrd_tesserocr_recognize_6(ocrd_cis_ocropy_dewarp_5.out[0], ocrd_cis_ocropy_dewarp_5.out[1], ocrd_cis_ocropy_dewarp_5.out[2], "OCR-D-SEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_tesserocr_recognize_6.out[0], ocrd_tesserocr_recognize_6.out[1]) } """ EXPECTED_WF3 = """ workflow { main: - ocrd_dinglehopper_0(params.mets_path, params.input_file_group, "OCR-D-EVAL-SEG-BLOCK") - ocrd_dinglehopper_1(ocrd_dinglehopper_0.out, "OCR-D-GT-SEG-LINE,OCR-D-OCR", "OCR-D-EVAL-SEG-LINE") - ocrd_dinglehopper_2(ocrd_dinglehopper_1.out, "OCR-D-GT-SEG-PAGE,OCR-D-OCR", "OCR-D-EVAL-SEG-PAGE") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_dinglehopper_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-EVAL-SEG-BLOCK") + ocrd_dinglehopper_1(ocrd_dinglehopper_0.out[0], ocrd_dinglehopper_0.out[1], ocrd_dinglehopper_0.out[2], "OCR-D-GT-SEG-LINE,OCR-D-OCR", "OCR-D-EVAL-SEG-LINE") + ocrd_dinglehopper_2(ocrd_dinglehopper_1.out[0], ocrd_dinglehopper_1.out[1], ocrd_dinglehopper_1.out[2], "OCR-D-GT-SEG-PAGE,OCR-D-OCR", "OCR-D-EVAL-SEG-PAGE") + merging_mets(ocrd_dinglehopper_2.out[0], ocrd_dinglehopper_2.out[1]) } """ EXPECTED_WF4 = """ workflow { main: - ocrd_olena_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_olena_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_olena_binarize_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN2") - ocrd_cis_ocropy_denoise_3(ocrd_olena_binarize_2.out, "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_cis_ocropy_deskew_4(ocrd_cis_ocropy_denoise_3.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_tesserocr_segment_region_5(ocrd_cis_ocropy_deskew_4.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG-REG") - ocrd_segment_repair_6(ocrd_tesserocr_segment_region_5.out, "OCR-D-SEG-REG", "OCR-D-SEG-REPAIR") - ocrd_cis_ocropy_deskew_7(ocrd_segment_repair_6.out, "OCR-D-SEG-REPAIR", "OCR-D-SEG-REG-DESKEW") - ocrd_cis_ocropy_clip_8(ocrd_cis_ocropy_deskew_7.out, "OCR-D-SEG-REG-DESKEW", "OCR-D-SEG-REG-DESKEW-CLIP") - ocrd_tesserocr_segment_line_9(ocrd_cis_ocropy_clip_8.out, "OCR-D-SEG-REG-DESKEW-CLIP", "OCR-D-SEG-LINE") - ocrd_segment_repair_10(ocrd_tesserocr_segment_line_9.out, "OCR-D-SEG-LINE", "OCR-D-SEG-REPAIR-LINE") - ocrd_cis_ocropy_dewarp_11(ocrd_segment_repair_10.out, "OCR-D-SEG-REPAIR-LINE", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize_12(ocrd_cis_ocropy_dewarp_11.out, "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_olena_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_olena_binarize_0.out[0], ocrd_olena_binarize_0.out[1], ocrd_olena_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_olena_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_cis_ocropy_denoise_3(ocrd_olena_binarize_2.out[0], ocrd_olena_binarize_2.out[1], ocrd_olena_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_cis_ocropy_deskew_4(ocrd_cis_ocropy_denoise_3.out[0], ocrd_cis_ocropy_denoise_3.out[1], ocrd_cis_ocropy_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_tesserocr_segment_region_5(ocrd_cis_ocropy_deskew_4.out[0], ocrd_cis_ocropy_deskew_4.out[1], ocrd_cis_ocropy_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG-REG") + ocrd_segment_repair_6(ocrd_tesserocr_segment_region_5.out[0], ocrd_tesserocr_segment_region_5.out[1], ocrd_tesserocr_segment_region_5.out[2], "OCR-D-SEG-REG", "OCR-D-SEG-REPAIR") + ocrd_cis_ocropy_deskew_7(ocrd_segment_repair_6.out[0], ocrd_segment_repair_6.out[1], ocrd_segment_repair_6.out[2], "OCR-D-SEG-REPAIR", "OCR-D-SEG-REG-DESKEW") + ocrd_cis_ocropy_clip_8(ocrd_cis_ocropy_deskew_7.out[0], ocrd_cis_ocropy_deskew_7.out[1], ocrd_cis_ocropy_deskew_7.out[2], "OCR-D-SEG-REG-DESKEW", "OCR-D-SEG-REG-DESKEW-CLIP") + ocrd_tesserocr_segment_line_9(ocrd_cis_ocropy_clip_8.out[0], ocrd_cis_ocropy_clip_8.out[1], ocrd_cis_ocropy_clip_8.out[2], "OCR-D-SEG-REG-DESKEW-CLIP", "OCR-D-SEG-LINE") + ocrd_segment_repair_10(ocrd_tesserocr_segment_line_9.out[0], ocrd_tesserocr_segment_line_9.out[1], ocrd_tesserocr_segment_line_9.out[2], "OCR-D-SEG-LINE", "OCR-D-SEG-REPAIR-LINE") + ocrd_cis_ocropy_dewarp_11(ocrd_segment_repair_10.out[0], ocrd_segment_repair_10.out[1], ocrd_segment_repair_10.out[2], "OCR-D-SEG-REPAIR-LINE", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_12(ocrd_cis_ocropy_dewarp_11.out[0], ocrd_cis_ocropy_dewarp_11.out[1], ocrd_cis_ocropy_dewarp_11.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_calamari_recognize_12.out[0], ocrd_calamari_recognize_12.out[1]) } """ -PARAMETERS_COMMON = [ - 'nextflow.enable.dsl = 2', - 'params.mets_path = "null"', - 'params.workspace_dir = "null"' -] +PARAMETERS_COMMON = { + 'params.mets_path': '"null"', + 'params.workspace_dir': '"null"', + 'params.pages': '"null"', +} -PARAMETERS_LOCAL = [] +PARAMETERS_LOCAL = { + 'params.forks': '"4"', +} -PARAMETERS_DOCKER = [ - 'params.env_wrapper = "null"' -] +PARAMETERS_DOCKER = { + 'params.forks': '"4"', + 'params.env_wrapper_cmd_core': '"null"', + 'params.env_wrapper_cmd_step0': '"null"', + 'params.env_wrapper_cmd_step1': '"null"', + 'params.env_wrapper_cmd_step2': '"null"', +} -PARAMETERS_APPTAINER = [ - 'params.env_wrapper = "null"' -] +PARAMETERS_APPTAINER = { + 'params.cpus': '"null"', + 'params.ram': '"null"', + 'params.forks': 'params.cpus', + 'params.cpus_per_fork': '(params.cpus.toInteger() / params.forks.toInteger()).intValue()', + 'params.ram_per_fork': 'sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue())', + 'params.env_wrapper_cmd_core': '"null"', + 'params.env_wrapper_cmd_step0': '"null"', + 'params.env_wrapper_cmd_step1': '"null"', + 'params.env_wrapper_cmd_step2': '"null"', +} diff --git a/tests/assets/oton/test_output_nextflow1.nf b/tests/assets/oton/test_output_nextflow1.nf deleted file mode 100644 index 49034fbe..00000000 --- a/tests/assets/oton/test_output_nextflow1.nf +++ /dev/null @@ -1,154 +0,0 @@ -// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module -nextflow.enable.dsl = 2 - -params.input_file_group = "OCR-D-IMG" -params.mets_path = "null" -params.workspace_dir = "null" - -process ocrd_cis_ocropy_binarize_0 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-cis-ocropy-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} - """ -} - -process ocrd_anybaseocr_crop_1 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-anybaseocr-crop -m ${mets_file} -I ${input_file_group} -O ${output_file_group} - """ -} - -process ocrd_skimage_binarize_2 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-skimage-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"method": "li"}' - """ -} - -process ocrd_skimage_denoise_3 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-skimage-denoise -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' - """ -} - -process ocrd_tesserocr_deskew_4 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-tesserocr-deskew -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"operation_level": "page"}' - """ -} - -process ocrd_cis_ocropy_segment_5 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-cis-ocropy-segment -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' - """ -} - -process ocrd_cis_ocropy_dewarp_6 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-cis-ocropy-dewarp -m ${mets_file} -I ${input_file_group} -O ${output_file_group} - """ -} - -process ocrd_calamari_recognize_7 { - maxForks 1 - - input: - path mets_file - val input_file_group - val output_file_group - - output: - path mets_file - - script: - """ - ocrd-calamari-recognize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' - """ -} - -workflow { - main: - ocrd_cis_ocropy_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN2") - ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out, "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out, "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out, "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") -} diff --git a/tests/assets/oton/test_output_nextflow1_apptainer.nf b/tests/assets/oton/test_output_nextflow1_apptainer.nf index 21bd9148..a748cf8b 100644 --- a/tests/assets/oton/test_output_nextflow1_apptainer.nf +++ b/tests/assets/oton/test_output_nextflow1_apptainer.nf @@ -4,152 +4,288 @@ nextflow.enable.dsl = 2 params.input_file_group = "OCR-D-IMG" params.mets_path = "null" params.workspace_dir = "null" -params.env_wrapper = "null" +params.pages = "null" +params.cpus = "null" +params.ram = "null" +params.forks = params.cpus +params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() +params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(${params.env_wrapper_cmd_core} cp -p ${params.mets_path} \$mets_file_chunk) + """ +} process ocrd_cis_ocropy_binarize_0 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-cis-ocropy-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_anybaseocr_crop_1 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-anybaseocr-crop -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_skimage_binarize_2 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-skimage-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"method": "li"}' + ${params.env_wrapper_cmd_step2} ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' """ } process ocrd_skimage_denoise_3 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-skimage-denoise -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' + ${params.env_wrapper_cmd_step3} ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' """ } process ocrd_tesserocr_deskew_4 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-tesserocr-deskew -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"operation_level": "page"}' + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' """ } process ocrd_cis_ocropy_segment_5 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-cis-ocropy-segment -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' + ${params.env_wrapper_cmd_step5} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' """ } process ocrd_cis_ocropy_dewarp_6 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-cis-ocropy-dewarp -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_calamari_recognize_7 { - maxForks 1 + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step7} ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ +} + +process merging_mets { + debug true + maxForks 1 + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_file_chunk + val page_range script: """ - ${params.env_wrapper} ocrd-calamari-recognize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + ${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + ${params.env_wrapper_cmd_core} rm ${mets_file_chunk} """ } workflow { main: - ocrd_cis_ocropy_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN2") - ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out, "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out, "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out, "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_calamari_recognize_7.out[0], ocrd_calamari_recognize_7.out[1]) } diff --git a/tests/assets/oton/test_output_nextflow1_apptainer_with_MS.nf b/tests/assets/oton/test_output_nextflow1_apptainer_with_MS.nf new file mode 100644 index 00000000..23115cd0 --- /dev/null +++ b/tests/assets/oton/test_output_nextflow1_apptainer_with_MS.nf @@ -0,0 +1,273 @@ +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 + +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" +params.workspace_dir = "null" +params.pages = "null" +params.mets_socket_path = "null" +params.cpus = "null" +params.ram = "null" +params.forks = params.cpus +params.cpus_per_fork = (params.cpus.toInteger() / params.forks.toInteger()).intValue() +params.ram_per_fork = sprintf("%dGB", (params.ram.toInteger() / params.forks.toInteger()).intValue()) +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + mets_socket_path: ${params.mets_socket_path} + cpus: ${params.cpus} + ram: ${params.ram} + forks: ${params.forks} + cpus_per_fork: ${params.cpus_per_fork} + ram_per_fork: ${params.ram_per_fork} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.mets_path}) + """ +} + +process ocrd_cis_ocropy_binarize_0 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_anybaseocr_crop_1 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_skimage_binarize_2 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step2} ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' + """ +} + +process ocrd_skimage_denoise_3 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step3} ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_tesserocr_deskew_4 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' + """ +} + +process ocrd_cis_ocropy_segment_5 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step5} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_cis_ocropy_dewarp_6 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_calamari_recognize_7 { + debug true + maxForks params.forks + cpus params.cpus_per_fork + memory params.ram_per_fork + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step7} ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ +} + +workflow { + main: + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") +} diff --git a/tests/assets/oton/test_output_nextflow1_docker.nf b/tests/assets/oton/test_output_nextflow1_docker.nf index 21bd9148..638e6190 100644 --- a/tests/assets/oton/test_output_nextflow1_docker.nf +++ b/tests/assets/oton/test_output_nextflow1_docker.nf @@ -4,152 +4,260 @@ nextflow.enable.dsl = 2 params.input_file_group = "OCR-D-IMG" params.mets_path = "null" params.workspace_dir = "null" -params.env_wrapper = "null" +params.pages = "null" +params.forks = "4" +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + forks: ${params.forks} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(${params.env_wrapper_cmd_core} cp -p ${params.mets_path} \$mets_file_chunk) + """ +} process ocrd_cis_ocropy_binarize_0 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-cis-ocropy-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_anybaseocr_crop_1 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-anybaseocr-crop -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_skimage_binarize_2 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-skimage-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"method": "li"}' + ${params.env_wrapper_cmd_step2} ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' """ } process ocrd_skimage_denoise_3 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-skimage-denoise -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' + ${params.env_wrapper_cmd_step3} ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' """ } process ocrd_tesserocr_deskew_4 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-tesserocr-deskew -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"operation_level": "page"}' + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' """ } process ocrd_cis_ocropy_segment_5 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-cis-ocropy-segment -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' + ${params.env_wrapper_cmd_step5} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' """ } process ocrd_cis_ocropy_dewarp_6 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ${params.env_wrapper} ocrd-cis-ocropy-dewarp -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_calamari_recognize_7 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step7} ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ +} + +process merging_mets { + debug true + maxForks 1 + + input: + val mets_file_chunk + val page_range script: """ - ${params.env_wrapper} ocrd-calamari-recognize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + ${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + ${params.env_wrapper_cmd_core} rm ${mets_file_chunk} """ } workflow { main: - ocrd_cis_ocropy_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN2") - ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out, "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out, "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out, "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_calamari_recognize_7.out[0], ocrd_calamari_recognize_7.out[1]) } diff --git a/tests/assets/oton/test_output_nextflow1_docker_with_MS.nf b/tests/assets/oton/test_output_nextflow1_docker_with_MS.nf new file mode 100644 index 00000000..e702bba1 --- /dev/null +++ b/tests/assets/oton/test_output_nextflow1_docker_with_MS.nf @@ -0,0 +1,247 @@ +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 + +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" +params.workspace_dir = "null" +params.pages = "null" +params.mets_socket_path = "null" +params.forks = "4" +params.env_wrapper_cmd_core = "null" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + mets_socket_path: ${params.mets_socket_path} + forks: ${params.forks} + env_wrapper_cmd_core: ${params.env_wrapper_cmd_core} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(${params.env_wrapper_cmd_core} ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.mets_path}) + """ +} + +process ocrd_cis_ocropy_binarize_0 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step0} ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_anybaseocr_crop_1 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step1} ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_skimage_binarize_2 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step2} ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' + """ +} + +process ocrd_skimage_denoise_3 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step3} ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_tesserocr_deskew_4 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step4} ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' + """ +} + +process ocrd_cis_ocropy_segment_5 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step5} ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_cis_ocropy_dewarp_6 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step6} ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_calamari_recognize_7 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ${params.env_wrapper_cmd_step7} ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ +} + +workflow { + main: + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") +} diff --git a/tests/assets/oton/test_output_nextflow1_local.nf b/tests/assets/oton/test_output_nextflow1_local.nf new file mode 100644 index 00000000..0aad272f --- /dev/null +++ b/tests/assets/oton/test_output_nextflow1_local.nf @@ -0,0 +1,261 @@ +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 + +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" +params.workspace_dir = "null" +params.pages = "null" +params.forks = "4" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + forks: ${params.forks} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(cp -p ${params.mets_path} \$mets_file_chunk) + """ +} + +process ocrd_cis_ocropy_binarize_0 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_anybaseocr_crop_1 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_skimage_binarize_2 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' + """ +} + +process ocrd_skimage_denoise_3 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_tesserocr_deskew_4 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' + """ +} + +process ocrd_cis_ocropy_segment_5 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_cis_ocropy_dewarp_6 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_calamari_recognize_7 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ +} + +process merging_mets { + debug true + maxForks 1 + + input: + val mets_file_chunk + val page_range + + script: + """ + ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + rm ${mets_file_chunk} + """ +} + +workflow { + main: + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_calamari_recognize_7.out[0], ocrd_calamari_recognize_7.out[1]) +} diff --git a/tests/assets/oton/test_output_nextflow1_local_with_MS.nf b/tests/assets/oton/test_output_nextflow1_local_with_MS.nf new file mode 100644 index 00000000..58ed1547 --- /dev/null +++ b/tests/assets/oton/test_output_nextflow1_local_with_MS.nf @@ -0,0 +1,245 @@ +// This workflow was automatically generated by the v2.17.0 operandi_utils.oton module +nextflow.enable.dsl = 2 + +params.input_file_group = "OCR-D-IMG" +params.mets_path = "null" +params.workspace_dir = "null" +params.pages = "null" +params.mets_socket_path = "null" +params.forks = "4" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + mets_socket_path: ${params.mets_socket_path} + forks: ${params.forks} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.mets_path}) + """ +} + +process ocrd_cis_ocropy_binarize_0 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_anybaseocr_crop_1 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_skimage_binarize_2 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-skimage-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"method": "li"}' + """ +} + +process ocrd_skimage_denoise_3 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_tesserocr_deskew_4 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' + """ +} + +process ocrd_cis_ocropy_segment_5 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-cis-ocropy-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' + """ +} + +process ocrd_cis_ocropy_dewarp_6 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process ocrd_calamari_recognize_7 { + debug true + maxForks params.forks + + input: + val mets_path + val page_range + val workspace_dir + val input_group + val output_group + + output: + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ +} + +workflow { + main: + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_skimage_denoise_3(ocrd_skimage_binarize_2.out[0], ocrd_skimage_binarize_2.out[1], ocrd_skimage_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_4(ocrd_skimage_denoise_3.out[0], ocrd_skimage_denoise_3.out[1], ocrd_skimage_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_cis_ocropy_segment_5(ocrd_tesserocr_deskew_4.out[0], ocrd_tesserocr_deskew_4.out[1], ocrd_tesserocr_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_6(ocrd_cis_ocropy_segment_5.out[0], ocrd_cis_ocropy_segment_5.out[1], ocrd_cis_ocropy_segment_5.out[2], "OCR-D-SEG", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_7(ocrd_cis_ocropy_dewarp_6.out[0], ocrd_cis_ocropy_dewarp_6.out[1], ocrd_cis_ocropy_dewarp_6.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") +} diff --git a/tests/assets/oton/test_output_nextflow2.nf b/tests/assets/oton/test_output_nextflow2.nf index df41ad04..f6eaf54a 100644 --- a/tests/assets/oton/test_output_nextflow2.nf +++ b/tests/assets/oton/test_output_nextflow2.nf @@ -4,133 +4,233 @@ nextflow.enable.dsl = 2 params.input_file_group = "OCR-D-IMG" params.mets_path = "null" params.workspace_dir = "null" +params.pages = "null" +params.forks = "4" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + forks: ${params.forks} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(cp -p ${params.mets_path} \$mets_file_chunk) + """ +} process ocrd_cis_ocropy_binarize_0 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-cis-ocropy-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-cis-ocropy-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_anybaseocr_crop_1 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-anybaseocr-crop -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_skimage_denoise_2 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-skimage-denoise -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' + ocrd-skimage-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' """ } process ocrd_tesserocr_deskew_3 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-tesserocr-deskew -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"operation_level": "page"}' + ocrd-tesserocr-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"operation_level": "page"}' """ } process ocrd_tesserocr_segment_4 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-tesserocr-segment -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"shrink_polygons": true}' + ocrd-tesserocr-segment -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"shrink_polygons": true}' """ } process ocrd_cis_ocropy_dewarp_5 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-cis-ocropy-dewarp -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_tesserocr_recognize_6 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-tesserocr-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"textequiv_level": "glyph", "overwrite_segments": true, "model": "GT4HistOCR_50000000.997_191951"}' + """ +} + +process merging_mets { + debug true + maxForks 1 + + input: + val mets_file_chunk + val page_range script: """ - ocrd-tesserocr-recognize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"textequiv_level": "glyph", "overwrite_segments": true, "model": "GT4HistOCR_50000000.997_191951"}' + ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + rm ${mets_file_chunk} """ } workflow { main: - ocrd_cis_ocropy_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_skimage_denoise_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN-DENOISE") - ocrd_tesserocr_deskew_3(ocrd_skimage_denoise_2.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_tesserocr_segment_4(ocrd_tesserocr_deskew_3.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") - ocrd_cis_ocropy_dewarp_5(ocrd_tesserocr_segment_4.out, "OCR-D-SEG", "OCR-D-SEG-DEWARP") - ocrd_tesserocr_recognize_6(ocrd_cis_ocropy_dewarp_5.out, "OCR-D-SEG-DEWARP", "OCR-D-OCR") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_cis_ocropy_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_cis_ocropy_binarize_0.out[0], ocrd_cis_ocropy_binarize_0.out[1], ocrd_cis_ocropy_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_skimage_denoise_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN-DENOISE") + ocrd_tesserocr_deskew_3(ocrd_skimage_denoise_2.out[0], ocrd_skimage_denoise_2.out[1], ocrd_skimage_denoise_2.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_tesserocr_segment_4(ocrd_tesserocr_deskew_3.out[0], ocrd_tesserocr_deskew_3.out[1], ocrd_tesserocr_deskew_3.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG") + ocrd_cis_ocropy_dewarp_5(ocrd_tesserocr_segment_4.out[0], ocrd_tesserocr_segment_4.out[1], ocrd_tesserocr_segment_4.out[2], "OCR-D-SEG", "OCR-D-SEG-DEWARP") + ocrd_tesserocr_recognize_6(ocrd_cis_ocropy_dewarp_5.out[0], ocrd_cis_ocropy_dewarp_5.out[1], ocrd_cis_ocropy_dewarp_5.out[2], "OCR-D-SEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_tesserocr_recognize_6.out[0], ocrd_tesserocr_recognize_6.out[1]) } diff --git a/tests/assets/oton/test_output_nextflow3.nf b/tests/assets/oton/test_output_nextflow3.nf index 28a4fff3..aba90b96 100644 --- a/tests/assets/oton/test_output_nextflow3.nf +++ b/tests/assets/oton/test_output_nextflow3.nf @@ -4,61 +4,133 @@ nextflow.enable.dsl = 2 params.input_file_group = "OCR-D-GT-SEG-BLOCK,OCR-D-OCR" params.mets_path = "null" params.workspace_dir = "null" +params.pages = "null" +params.forks = "4" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + forks: ${params.forks} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(cp -p ${params.mets_path} \$mets_file_chunk) + """ +} process ocrd_dinglehopper_0 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-dinglehopper -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-dinglehopper -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_dinglehopper_1 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-dinglehopper -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-dinglehopper -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_dinglehopper_2 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-dinglehopper -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} + """ +} + +process merging_mets { + debug true + maxForks 1 + + input: + val mets_file_chunk + val page_range script: """ - ocrd-dinglehopper -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + rm ${mets_file_chunk} """ } workflow { main: - ocrd_dinglehopper_0(params.mets_path, params.input_file_group, "OCR-D-EVAL-SEG-BLOCK") - ocrd_dinglehopper_1(ocrd_dinglehopper_0.out, "OCR-D-GT-SEG-LINE,OCR-D-OCR", "OCR-D-EVAL-SEG-LINE") - ocrd_dinglehopper_2(ocrd_dinglehopper_1.out, "OCR-D-GT-SEG-PAGE,OCR-D-OCR", "OCR-D-EVAL-SEG-PAGE") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_dinglehopper_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-EVAL-SEG-BLOCK") + ocrd_dinglehopper_1(ocrd_dinglehopper_0.out[0], ocrd_dinglehopper_0.out[1], ocrd_dinglehopper_0.out[2], "OCR-D-GT-SEG-LINE,OCR-D-OCR", "OCR-D-EVAL-SEG-LINE") + ocrd_dinglehopper_2(ocrd_dinglehopper_1.out[0], ocrd_dinglehopper_1.out[1], ocrd_dinglehopper_1.out[2], "OCR-D-GT-SEG-PAGE,OCR-D-OCR", "OCR-D-EVAL-SEG-PAGE") + merging_mets(ocrd_dinglehopper_2.out[0], ocrd_dinglehopper_2.out[1]) } diff --git a/tests/assets/oton/test_output_nextflow4.nf b/tests/assets/oton/test_output_nextflow4.nf index 0fc411ed..0f8bb59c 100644 --- a/tests/assets/oton/test_output_nextflow4.nf +++ b/tests/assets/oton/test_output_nextflow4.nf @@ -4,241 +4,383 @@ nextflow.enable.dsl = 2 params.input_file_group = "OCR-D-IMG" params.mets_path = "null" params.workspace_dir = "null" +params.pages = "null" +params.forks = "4" +params.env_wrapper_cmd_step0 = "null" +params.env_wrapper_cmd_step1 = "null" +params.env_wrapper_cmd_step2 = "null" +params.env_wrapper_cmd_step3 = "null" +params.env_wrapper_cmd_step4 = "null" +params.env_wrapper_cmd_step5 = "null" +params.env_wrapper_cmd_step6 = "null" +params.env_wrapper_cmd_step7 = "null" +params.env_wrapper_cmd_step8 = "null" +params.env_wrapper_cmd_step9 = "null" +params.env_wrapper_cmd_step10 = "null" +params.env_wrapper_cmd_step11 = "null" +params.env_wrapper_cmd_step12 = "null" + +log.info """\ + OPERANDI HPC - Nextflow Workflow + =================================================== + input_file_group: ${params.input_file_group} + mets_path: ${params.mets_path} + workspace_dir: ${params.workspace_dir} + pages: ${params.pages} + forks: ${params.forks} + env_wrapper_cmd_step0: ${params.env_wrapper_cmd_step0} + env_wrapper_cmd_step1: ${params.env_wrapper_cmd_step1} + env_wrapper_cmd_step2: ${params.env_wrapper_cmd_step2} + env_wrapper_cmd_step3: ${params.env_wrapper_cmd_step3} + env_wrapper_cmd_step4: ${params.env_wrapper_cmd_step4} + env_wrapper_cmd_step5: ${params.env_wrapper_cmd_step5} + env_wrapper_cmd_step6: ${params.env_wrapper_cmd_step6} + env_wrapper_cmd_step7: ${params.env_wrapper_cmd_step7} + env_wrapper_cmd_step8: ${params.env_wrapper_cmd_step8} + env_wrapper_cmd_step9: ${params.env_wrapper_cmd_step9} + env_wrapper_cmd_step10: ${params.env_wrapper_cmd_step10} + env_wrapper_cmd_step11: ${params.env_wrapper_cmd_step11} + env_wrapper_cmd_step12: ${params.env_wrapper_cmd_step12} + """.stripIndent() + +process split_page_ranges { + debug true + maxForks params.forks + + input: + val range_multiplier + + output: + env mets_file_chunk + env current_range_pages + + script: + """ + current_range_pages=\$(ocrd workspace -d ${params.workspace_dir} list-page -f comma-separated -D ${params.forks} -C ${range_multiplier}) + echo "Current range is: \$current_range_pages" + mets_file_chunk=\$(echo ${params.workspace_dir}/mets_${range_multiplier}.xml) + echo "Mets file chunk path: \$mets_file_chunk" + \$(cp -p ${params.mets_path} \$mets_file_chunk) + """ +} process ocrd_olena_binarize_0 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-olena-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"impl": "sauvola"}' + ocrd-olena-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"impl": "sauvola"}' """ } process ocrd_anybaseocr_crop_1 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-anybaseocr-crop -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-anybaseocr-crop -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_olena_binarize_2 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-olena-binarize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"impl": "kim"}' + ocrd-olena-binarize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"impl": "kim"}' """ } process ocrd_cis_ocropy_denoise_3 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-cis-ocropy-denoise -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' + ocrd-cis-ocropy-denoise -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' """ } process ocrd_cis_ocropy_deskew_4 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-cis-ocropy-deskew -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "page"}' + ocrd-cis-ocropy-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "page"}' """ } process ocrd_tesserocr_segment_region_5 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-tesserocr-segment-region -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-tesserocr-segment-region -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_segment_repair_6 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-segment-repair -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"plausibilize": true}' + ocrd-segment-repair -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"plausibilize": true}' """ } process ocrd_cis_ocropy_deskew_7 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-cis-ocropy-deskew -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "region"}' + ocrd-cis-ocropy-deskew -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "region"}' """ } process ocrd_cis_ocropy_clip_8 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-cis-ocropy-clip -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"level-of-operation": "region"}' + ocrd-cis-ocropy-clip -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"level-of-operation": "region"}' """ } process ocrd_tesserocr_segment_line_9 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-tesserocr-segment-line -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-tesserocr-segment-line -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_segment_repair_10 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-segment-repair -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"sanitize": true}' + ocrd-segment-repair -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"sanitize": true}' """ } process ocrd_cis_ocropy_dewarp_11 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir script: """ - ocrd-cis-ocropy-dewarp -m ${mets_file} -I ${input_file_group} -O ${output_file_group} + ocrd-cis-ocropy-dewarp -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} """ } process ocrd_calamari_recognize_12 { - maxForks 1 + debug true + maxForks params.forks input: - path mets_file - val input_file_group - val output_file_group + val mets_path + val page_range + val workspace_dir + val input_group + val output_group output: - path mets_file + val mets_path + val page_range + val workspace_dir + + script: + """ + ocrd-calamari-recognize -w ${workspace_dir} -m ${mets_path} -I ${input_group} -O ${output_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + """ +} + +process merging_mets { + debug true + maxForks 1 + + input: + val mets_file_chunk + val page_range script: """ - ocrd-calamari-recognize -m ${mets_file} -I ${input_file_group} -O ${output_file_group} -p '{"checkpoint_dir": "qurator-gt4histocr-1.0"}' + ocrd workspace -d ${params.workspace_dir} merge --force --no-copy-files ${mets_file_chunk} --page-id ${page_range} + rm ${mets_file_chunk} """ } workflow { main: - ocrd_olena_binarize_0(params.mets_path, params.input_file_group, "OCR-D-BIN") - ocrd_anybaseocr_crop_1(ocrd_olena_binarize_0.out, "OCR-D-BIN", "OCR-D-CROP") - ocrd_olena_binarize_2(ocrd_anybaseocr_crop_1.out, "OCR-D-CROP", "OCR-D-BIN2") - ocrd_cis_ocropy_denoise_3(ocrd_olena_binarize_2.out, "OCR-D-BIN2", "OCR-D-BIN-DENOISE") - ocrd_cis_ocropy_deskew_4(ocrd_cis_ocropy_denoise_3.out, "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") - ocrd_tesserocr_segment_region_5(ocrd_cis_ocropy_deskew_4.out, "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG-REG") - ocrd_segment_repair_6(ocrd_tesserocr_segment_region_5.out, "OCR-D-SEG-REG", "OCR-D-SEG-REPAIR") - ocrd_cis_ocropy_deskew_7(ocrd_segment_repair_6.out, "OCR-D-SEG-REPAIR", "OCR-D-SEG-REG-DESKEW") - ocrd_cis_ocropy_clip_8(ocrd_cis_ocropy_deskew_7.out, "OCR-D-SEG-REG-DESKEW", "OCR-D-SEG-REG-DESKEW-CLIP") - ocrd_tesserocr_segment_line_9(ocrd_cis_ocropy_clip_8.out, "OCR-D-SEG-REG-DESKEW-CLIP", "OCR-D-SEG-LINE") - ocrd_segment_repair_10(ocrd_tesserocr_segment_line_9.out, "OCR-D-SEG-LINE", "OCR-D-SEG-REPAIR-LINE") - ocrd_cis_ocropy_dewarp_11(ocrd_segment_repair_10.out, "OCR-D-SEG-REPAIR-LINE", "OCR-D-SEG-LINE-RESEG-DEWARP") - ocrd_calamari_recognize_12(ocrd_cis_ocropy_dewarp_11.out, "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + ch_range_multipliers = Channel.of(0..params.forks.intValue()-1) + split_page_ranges(ch_range_multipliers) + ocrd_olena_binarize_0(split_page_ranges.out[0], split_page_ranges.out[1], params.workspace_dir, params.input_file_group, "OCR-D-BIN") + ocrd_anybaseocr_crop_1(ocrd_olena_binarize_0.out[0], ocrd_olena_binarize_0.out[1], ocrd_olena_binarize_0.out[2], "OCR-D-BIN", "OCR-D-CROP") + ocrd_olena_binarize_2(ocrd_anybaseocr_crop_1.out[0], ocrd_anybaseocr_crop_1.out[1], ocrd_anybaseocr_crop_1.out[2], "OCR-D-CROP", "OCR-D-BIN2") + ocrd_cis_ocropy_denoise_3(ocrd_olena_binarize_2.out[0], ocrd_olena_binarize_2.out[1], ocrd_olena_binarize_2.out[2], "OCR-D-BIN2", "OCR-D-BIN-DENOISE") + ocrd_cis_ocropy_deskew_4(ocrd_cis_ocropy_denoise_3.out[0], ocrd_cis_ocropy_denoise_3.out[1], ocrd_cis_ocropy_denoise_3.out[2], "OCR-D-BIN-DENOISE", "OCR-D-BIN-DENOISE-DESKEW") + ocrd_tesserocr_segment_region_5(ocrd_cis_ocropy_deskew_4.out[0], ocrd_cis_ocropy_deskew_4.out[1], ocrd_cis_ocropy_deskew_4.out[2], "OCR-D-BIN-DENOISE-DESKEW", "OCR-D-SEG-REG") + ocrd_segment_repair_6(ocrd_tesserocr_segment_region_5.out[0], ocrd_tesserocr_segment_region_5.out[1], ocrd_tesserocr_segment_region_5.out[2], "OCR-D-SEG-REG", "OCR-D-SEG-REPAIR") + ocrd_cis_ocropy_deskew_7(ocrd_segment_repair_6.out[0], ocrd_segment_repair_6.out[1], ocrd_segment_repair_6.out[2], "OCR-D-SEG-REPAIR", "OCR-D-SEG-REG-DESKEW") + ocrd_cis_ocropy_clip_8(ocrd_cis_ocropy_deskew_7.out[0], ocrd_cis_ocropy_deskew_7.out[1], ocrd_cis_ocropy_deskew_7.out[2], "OCR-D-SEG-REG-DESKEW", "OCR-D-SEG-REG-DESKEW-CLIP") + ocrd_tesserocr_segment_line_9(ocrd_cis_ocropy_clip_8.out[0], ocrd_cis_ocropy_clip_8.out[1], ocrd_cis_ocropy_clip_8.out[2], "OCR-D-SEG-REG-DESKEW-CLIP", "OCR-D-SEG-LINE") + ocrd_segment_repair_10(ocrd_tesserocr_segment_line_9.out[0], ocrd_tesserocr_segment_line_9.out[1], ocrd_tesserocr_segment_line_9.out[2], "OCR-D-SEG-LINE", "OCR-D-SEG-REPAIR-LINE") + ocrd_cis_ocropy_dewarp_11(ocrd_segment_repair_10.out[0], ocrd_segment_repair_10.out[1], ocrd_segment_repair_10.out[2], "OCR-D-SEG-REPAIR-LINE", "OCR-D-SEG-LINE-RESEG-DEWARP") + ocrd_calamari_recognize_12(ocrd_cis_ocropy_dewarp_11.out[0], ocrd_cis_ocropy_dewarp_11.out[1], ocrd_cis_ocropy_dewarp_11.out[2], "OCR-D-SEG-LINE-RESEG-DEWARP", "OCR-D-OCR") + merging_mets(ocrd_calamari_recognize_12.out[0], ocrd_calamari_recognize_12.out[1]) } diff --git a/tests/tests_server/test_endpoint_workflow.py b/tests/tests_server/test_endpoint_workflow.py index fac64915..2fc107c8 100644 --- a/tests/tests_server/test_endpoint_workflow.py +++ b/tests/tests_server/test_endpoint_workflow.py @@ -3,8 +3,20 @@ from tests.constants import WORKFLOW_DUMMY_TEXT from .helpers_asserts import assert_local_dir_workflow, assert_response_status_code +def test_post_workflow_script(operandi, auth, db_workflows, bytes_template_workflow): + # Post a new workflow script + wf_detail = "Test template workflow with mets server" + response = operandi.post( + url=f"/workflow?details={wf_detail}", files={"nextflow_script": bytes_template_workflow}, auth=auth) + assert_response_status_code(response.status_code, expected_floor=2) + workflow_id = response.json()['resource_id'] + assert_local_dir_workflow(workflow_id) + db_workflow = db_workflows.find_one({"workflow_id": workflow_id}) + assert_exists_db_resource(db_workflow, resource_key="workflow_id", resource_id=workflow_id) + assert db_workflow["details"] == wf_detail + assert db_workflow["uses_mets_server"] == False -def test_post_workflow_script(operandi, auth, db_workflows, bytes_template_workflow_with_ms): +def test_post_workflow_script_with_ms(operandi, auth, db_workflows, bytes_template_workflow_with_ms): # Post a new workflow script wf_detail = "Test template workflow with mets server" response = operandi.post( @@ -15,6 +27,7 @@ def test_post_workflow_script(operandi, auth, db_workflows, bytes_template_workf db_workflow = db_workflows.find_one({"workflow_id": workflow_id}) assert_exists_db_resource(db_workflow, resource_key="workflow_id", resource_id=workflow_id) assert db_workflow["details"] == wf_detail + assert db_workflow["uses_mets_server"] == True def test_put_workflow_script( @@ -38,6 +51,7 @@ def test_put_workflow_script( assert workflow_path1, "Failed to extract workflow path 1" assert workflow_details1, "Failed to extract workflow details 1" assert db_workflow["details"] == wf_detail + assert db_workflow["uses_mets_server"] == True # The second put request replaces the previously created workflow files = {"nextflow_script": bytes_default_workflow_with_ms} @@ -56,6 +70,7 @@ def test_put_workflow_script( assert workflow_path2, "Failed to extract workflow path 2" assert workflow_details2, "Failed to extract workflow details 2" assert db_workflow["details"] == wf_detail_put + assert db_workflow["uses_mets_server"] == True assert workflow_dir1 == workflow_dir2, \ f"Workflow dir paths should match, but does not: {workflow_dir1} != {workflow_dir2}" @@ -126,7 +141,7 @@ def test_convert_txt_to_nextflow_success(operandi, auth): # Convert the dummy text to bytes and create an in-memory file-like object dummy_file = BytesIO(WORKFLOW_DUMMY_TEXT.encode('utf-8')) files = {"txt_file": ("dummy.txt", dummy_file, "text/plain")} - params = {"environment": "local"} + params = {"environment": "local", "with_mets_server": False} # Simulate uploading the text file for conversion via POST response = operandi.post(url="/convert_workflow", files=files, auth=auth, params=params) @@ -134,9 +149,31 @@ def test_convert_txt_to_nextflow_success(operandi, auth): # Verify the status code and content assert_response_status_code(response.status_code, expected_floor=2) assert "params.mets_path" in nf_file_content - assert "params.env_wrapper" not in nf_file_content + assert "params.env_wrapper_cmd_core" not in nf_file_content + assert "params.mets_socket_path" not in nf_file_content + assert "merging_mets" in nf_file_content +def test_convert_txt_to_nextflow_success_with_mets_server(operandi, auth): + """ + Test the successful conversion of a text file to a Nextflow (.nf) file with mets server. + """ + + # Convert the dummy text to bytes and create an in-memory file-like object + dummy_file = BytesIO(WORKFLOW_DUMMY_TEXT.encode('utf-8')) + files = {"txt_file": ("dummy.txt", dummy_file, "text/plain")} + params = {"environment": "local", "with_mets_server": True} + + # Simulate uploading the text file for conversion via POST + response = operandi.post(url="/convert_workflow", files=files, auth=auth, params=params) + nf_file_content = response.content.decode('utf-8') + # Verify the status code and content + assert_response_status_code(response.status_code, expected_floor=2) + assert "params.mets_path" in nf_file_content + assert "params.env_wrapper_cmd_core" not in nf_file_content + assert "params.mets_socket_path" in nf_file_content + assert "merging_mets" not in nf_file_content + # Added by Faizan def test_convert_txt_to_nextflow_auth_failure(operandi): """ @@ -145,7 +182,7 @@ def test_convert_txt_to_nextflow_auth_failure(operandi): dummy_text = "Some dummy text" dummy_file = BytesIO(dummy_text.encode('utf-8')) files = {"txt_file": ("dummy.txt", dummy_file, "text/plain")} - params = {"environment": "local"} + params = {"environment": "local", "with_mets_server": False} auth = ('invalid_user', 'invalid_password') response = operandi.post(url="/convert_workflow", files=files, auth=auth, params=params) @@ -163,7 +200,7 @@ def test_convert_txt_to_nextflow_validator_failure(operandi, auth): invalid_text = "Invalid ocrd process text" dummy_file = BytesIO(invalid_text.encode('utf-8')) files = {"txt_file": ("invalid.txt", dummy_file, "text/plain")} - params = {"environment": "local"} + params = {"environment": "local", "with_mets_server": False} response = operandi.post(url="/convert_workflow", files=files, auth=auth, params=params) assert_response_status_code(response.status_code, expected_floor=4) @@ -179,9 +216,31 @@ def test_convert_txt_to_nextflow_docker_success(operandi, auth): # Convert the dummy text to bytes and create an in-memory file-like object dummy_file = BytesIO(WORKFLOW_DUMMY_TEXT.encode('utf-8')) files = {"txt_file": ("dummy.txt", dummy_file, "text/plain")} - params = {"environment": "docker"} + params = {"environment": "docker", "with_mets_server": False} + + response = operandi.post(url="/convert_workflow", files=files, auth=auth, params=params) + nf_file_content = response.content.decode('utf-8') + assert_response_status_code(response.status_code, expected_floor=2) + assert "params.mets_path" in nf_file_content + assert "params.env_wrapper_cmd_core" in nf_file_content + assert "params.mets_socket_path" not in nf_file_content + assert "merging_mets" in nf_file_content + + +def test_convert_txt_to_nextflow_docker_success_with_mets_server(operandi, auth): + """ + Test the successful conversion of a text file to a Nextflow (.nf) file with mets server. + """ + + # Convert the dummy text to bytes and create an in-memory file-like object + dummy_file = BytesIO(WORKFLOW_DUMMY_TEXT.encode('utf-8')) + files = {"txt_file": ("dummy.txt", dummy_file, "text/plain")} + params = {"environment": "docker", "with_mets_server": True} response = operandi.post(url="/convert_workflow", files=files, auth=auth, params=params) nf_file_content = response.content.decode('utf-8') assert_response_status_code(response.status_code, expected_floor=2) - assert "params.env_wrapper" in nf_file_content + assert "params.mets_path" in nf_file_content + assert "params.env_wrapper_cmd_core" in nf_file_content + assert "params.mets_socket_path" in nf_file_content + assert "merging_mets" not in nf_file_content diff --git a/tests/tests_utils/test_2_oton/assert_utils.py b/tests/tests_utils/test_2_oton/assert_utils.py index ede5c075..11d334b7 100644 --- a/tests/tests_utils/test_2_oton/assert_utils.py +++ b/tests/tests_utils/test_2_oton/assert_utils.py @@ -2,13 +2,18 @@ from os.path import isfile from re import sub + from tests.assets.oton.constants import PARAMETERS_APPTAINER, PARAMETERS_COMMON, PARAMETERS_DOCKER, PARAMETERS_LOCAL -def assert_common_features(nextflow_file_class, num_blocks_process: int, num_blocks_workflow: int): +def assert_common_features( + nextflow_file_class, num_blocks_process: int, num_blocks_workflow: int, with_mets_server: bool +): parameters = nextflow_file_class.nf_lines_parameters for parameter in PARAMETERS_COMMON: assert parameter in parameters + if with_mets_server: + assert parameters['params.mets_socket_path'] == '"null"', f"params.mets_socket_path is missing in {parameters}" blocks_process = nextflow_file_class.nf_blocks_process assert len(blocks_process) == num_blocks_process for block in blocks_process: @@ -22,31 +27,31 @@ def assert_common_features(nextflow_file_class, num_blocks_process: int, num_blo def assert_common_features_local(nextflow_file_class): parameters = nextflow_file_class.nf_lines_parameters for parameter in PARAMETERS_LOCAL: - assert parameter in parameters + assert parameter in parameters, f"{parameter} is not in {parameters}" blocks_process = nextflow_file_class.nf_blocks_process for block in blocks_process: - assert '${params.env_wrapper}' not in block.dump_script(), \ - "${params.env_wrapper} found but should not exist in " + f"'{block.ocrd_command_bash_placeholders}'" + assert 'params.env_wrapper_cmd_step' not in block.dump_script(), \ + "params.env_wrapper_cmd_step found but should not exist in " + f"'{block.ocrd_command_bash_placeholders}'" def assert_common_features_docker(nextflow_file_class): parameters = nextflow_file_class.nf_lines_parameters for parameter in PARAMETERS_DOCKER: - assert parameter in parameters + assert parameter in parameters, f"{parameter} is not in {parameters}" blocks_process = nextflow_file_class.nf_blocks_process for block in blocks_process: - assert '${params.env_wrapper}' in block.dump_script(), \ - "${params.env_wrapper} not found but should exist in " + f"'{block.ocrd_command_bash_placeholders}'" + assert 'params.env_wrapper_cmd_step' in block.dump_script(), \ + "params.env_wrapper_cmd_step not found but should exist in " + f"'{block.ocrd_command_bash_placeholders}'" def assert_common_features_apptainer(nextflow_file_class): parameters = nextflow_file_class.nf_lines_parameters for parameter in PARAMETERS_APPTAINER: - assert parameter in parameters + assert parameter in parameters, f"{parameter} is not in {parameters}" blocks_process = nextflow_file_class.nf_blocks_process for block in blocks_process: - assert '${params.env_wrapper}' in block.dump_script(), \ - "${params.env_wrapper} not found but should exist in " + f"'{block.ocrd_command_bash_placeholders}'" + assert 'params.env_wrapper_cmd_step' in block.dump_script(), \ + "params.env_wrapper_cmd_step not found but should exist in " + f"'{block.ocrd_command_bash_placeholders}'" def assert_compare_workflow_blocks(output_file_path, expected_wf, clean_files: bool = False): diff --git a/tests/tests_utils/test_2_oton/test_1_parser_2_parse_arguments.py b/tests/tests_utils/test_2_oton/test_1_parser_2_parse_arguments.py index d97777e0..114a9c1c 100644 --- a/tests/tests_utils/test_2_oton/test_1_parser_2_parse_arguments.py +++ b/tests/tests_utils/test_2_oton/test_1_parser_2_parse_arguments.py @@ -7,7 +7,7 @@ def test_basic(ocrd_parser): assert processor_call_arguments.executable == "ocrd-cis-ocropy-binarize" assert processor_call_arguments.input_file_grps == "OCR-D-IMG" assert processor_call_arguments.output_file_grps == "OCR-D-BIN" - assert processor_call_arguments.mets_file_path == "./mets.xml" + assert not processor_call_arguments.mets_file_path assert processor_call_arguments.parameters == {} @@ -29,7 +29,7 @@ def test_with_params_separated(ocrd_parser): assert processor_call_arguments.executable == "ocrd-calamari-recognize" assert processor_call_arguments.input_file_grps == "OCR-D-INPUT" assert processor_call_arguments.output_file_grps == "OCR-D-OCR" - assert processor_call_arguments.mets_file_path == "./mets.xml" + assert not processor_call_arguments.mets_file_path assert processor_call_arguments.parameters == {"checkpoint_dir": "qurator-gt4histocr-1.0", "dummy": "dummy"} @@ -41,5 +41,5 @@ def test_with_params_clustered(ocrd_parser): assert processor_call_arguments.executable == "ocrd-calamari-recognize" assert processor_call_arguments.input_file_grps == "OCR-D-INPUT" assert processor_call_arguments.output_file_grps == "OCR-D-OCR" - assert processor_call_arguments.mets_file_path == "./mets.xml" + assert not processor_call_arguments.mets_file_path assert processor_call_arguments.parameters == {"checkpoint_dir": "qurator-gt4histocr-1.0", "dummy": "dummy"} diff --git a/tests/tests_utils/test_2_oton/test_3_converter_1_local.py b/tests/tests_utils/test_2_oton/test_3_converter_1_local.py index 2e8b003b..1232a756 100644 --- a/tests/tests_utils/test_2_oton/test_3_converter_1_local.py +++ b/tests/tests_utils/test_2_oton/test_3_converter_1_local.py @@ -1,38 +1,44 @@ from tests.assets.oton.constants import ( - EXPECTED_WF1, EXPECTED_WF2, EXPECTED_WF3, EXPECTED_WF4, + EXPECTED_WF1, EXPECTED_WF1_WITH_MS, EXPECTED_WF2, EXPECTED_WF3, EXPECTED_WF4, IN_TXT_WF1, IN_TXT_WF2, IN_TXT_WF3, IN_TXT_WF4, - OUT_NF_WF1_LOCAL, OUT_NF_WF2_LOCAL, OUT_NF_WF3_LOCAL, OUT_NF_WF4_LOCAL + OUT_NF_WF1_LOCAL, OUT_NF_WF1_LOCAL_WITH_MS, OUT_NF_WF2_LOCAL, OUT_NF_WF3_LOCAL, OUT_NF_WF4_LOCAL ) from tests.tests_utils.test_2_oton.assert_utils import ( assert_common_features, assert_common_features_local, assert_compare_workflow_blocks) def test_convert_wf1_with_env_local(oton_converter): - nextflow_file_class = oton_converter.convert_oton_env_local(IN_TXT_WF1, OUT_NF_WF1_LOCAL) - assert 'params.input_file_group = "OCR-D-IMG"' in nextflow_file_class.nf_lines_parameters - assert_common_features(nextflow_file_class, 8, 1) + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF1, OUT_NF_WF1_LOCAL, "local", False) + assert nextflow_file_class.nf_lines_parameters['params.input_file_group'] == '"OCR-D-IMG"' + assert_common_features(nextflow_file_class, 8, 1, False) assert_compare_workflow_blocks(OUT_NF_WF1_LOCAL, EXPECTED_WF1) +def test_convert_wf1_with_env_local_with_mets_server(oton_converter): + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF1, OUT_NF_WF1_LOCAL_WITH_MS, "local", True) + assert nextflow_file_class.nf_lines_parameters['params.input_file_group'] == '"OCR-D-IMG"' + assert_common_features(nextflow_file_class, 8, 1, True) + assert_compare_workflow_blocks(OUT_NF_WF1_LOCAL_WITH_MS, EXPECTED_WF1_WITH_MS) + def test_convert_wf2_with_env_local(oton_converter): - nextflow_file_class = oton_converter.convert_oton_env_local(IN_TXT_WF2, OUT_NF_WF2_LOCAL) - assert 'params.input_file_group = "OCR-D-IMG"' in nextflow_file_class.nf_lines_parameters - assert_common_features(nextflow_file_class, 7, 1) + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF2, OUT_NF_WF2_LOCAL, "local", False) + assert nextflow_file_class.nf_lines_parameters['params.input_file_group'] == '"OCR-D-IMG"' + assert_common_features(nextflow_file_class, 7, 1, False) assert_common_features_local(nextflow_file_class) assert_compare_workflow_blocks(OUT_NF_WF2_LOCAL, EXPECTED_WF2) def test_convert_wf3_with_env_local(oton_converter): - nextflow_file_class = oton_converter.convert_oton_env_local(IN_TXT_WF3, OUT_NF_WF3_LOCAL) - assert 'params.input_file_group = "OCR-D-GT-SEG-BLOCK,OCR-D-OCR"' in nextflow_file_class.nf_lines_parameters - assert_common_features(nextflow_file_class, 3, 1) + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF3, OUT_NF_WF3_LOCAL, "local", False) + assert nextflow_file_class.nf_lines_parameters['params.input_file_group'] == '"OCR-D-GT-SEG-BLOCK,OCR-D-OCR"' + assert_common_features(nextflow_file_class, 3, 1, False) assert_common_features_local(nextflow_file_class) assert_compare_workflow_blocks(OUT_NF_WF3_LOCAL, EXPECTED_WF3) def test_convert_wf4_with_env_local(oton_converter): - nextflow_file_class = oton_converter.convert_oton_env_local(IN_TXT_WF4, OUT_NF_WF4_LOCAL) - assert 'params.input_file_group = "OCR-D-IMG"' in nextflow_file_class.nf_lines_parameters - assert_common_features(nextflow_file_class, 13, 1) + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF4, OUT_NF_WF4_LOCAL, "local", False) + assert nextflow_file_class.nf_lines_parameters['params.input_file_group'] == '"OCR-D-IMG"' + assert_common_features(nextflow_file_class, 13, 1, False) assert_common_features_local(nextflow_file_class) assert_compare_workflow_blocks(OUT_NF_WF4_LOCAL, EXPECTED_WF4) diff --git a/tests/tests_utils/test_2_oton/test_3_converter_2_docker.py b/tests/tests_utils/test_2_oton/test_3_converter_2_docker.py index fbe938f5..ad7ea6ff 100644 --- a/tests/tests_utils/test_2_oton/test_3_converter_2_docker.py +++ b/tests/tests_utils/test_2_oton/test_3_converter_2_docker.py @@ -1,10 +1,17 @@ -from tests.assets.oton.constants import EXPECTED_WF1, IN_TXT_WF1, OUT_NF_WF1_DOCKER +from tests.assets.oton.constants import ( + EXPECTED_WF1, EXPECTED_WF1_WITH_MS, IN_TXT_WF1, OUT_NF_WF1_DOCKER, OUT_NF_WF1_DOCKER_WITH_MS) from tests.tests_utils.test_2_oton.assert_utils import ( assert_common_features, assert_common_features_docker, assert_compare_workflow_blocks) def test_convert_wf1_with_env_docker(oton_converter): - nextflow_file_class = oton_converter.convert_oton_env_docker(IN_TXT_WF1, OUT_NF_WF1_DOCKER) - assert_common_features(nextflow_file_class, 8, 1) + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF1, OUT_NF_WF1_DOCKER, "docker", False) + assert_common_features(nextflow_file_class, 8, 1, False) assert_common_features_docker(nextflow_file_class) assert_compare_workflow_blocks(OUT_NF_WF1_DOCKER, EXPECTED_WF1) + +def test_convert_wf1_with_env_docker_with_mets_server(oton_converter): + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF1, OUT_NF_WF1_DOCKER_WITH_MS, "docker", True) + assert_common_features(nextflow_file_class, 8, 1, True) + assert_common_features_docker(nextflow_file_class) + assert_compare_workflow_blocks(OUT_NF_WF1_DOCKER_WITH_MS, EXPECTED_WF1_WITH_MS) diff --git a/tests/tests_utils/test_2_oton/test_3_converter_3_apptainer.py b/tests/tests_utils/test_2_oton/test_3_converter_3_apptainer.py index 13fae1ee..02038fc4 100644 --- a/tests/tests_utils/test_2_oton/test_3_converter_3_apptainer.py +++ b/tests/tests_utils/test_2_oton/test_3_converter_3_apptainer.py @@ -1,10 +1,17 @@ -from tests.assets.oton.constants import EXPECTED_WF1, IN_TXT_WF1, OUT_NF_WF1_APPTAINER +from tests.assets.oton.constants import ( + EXPECTED_WF1, EXPECTED_WF1_WITH_MS, IN_TXT_WF1, OUT_NF_WF1_APPTAINER, OUT_NF_WF1_APPTAINER_WITH_MS) from tests.tests_utils.test_2_oton.assert_utils import ( assert_common_features, assert_common_features_apptainer, assert_compare_workflow_blocks) def test_convert_wf1_with_env_apptainer(oton_converter): - nextflow_file_class = oton_converter.convert_oton_env_apptainer(IN_TXT_WF1, OUT_NF_WF1_APPTAINER) - assert_common_features(nextflow_file_class, 8, 1) + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF1, OUT_NF_WF1_APPTAINER, "apptainer", False) + assert_common_features(nextflow_file_class, 8, 1, False) assert_common_features_apptainer(nextflow_file_class) assert_compare_workflow_blocks(OUT_NF_WF1_APPTAINER, EXPECTED_WF1) + +def test_convert_wf1_with_env_apptainer_with_mets_server(oton_converter): + nextflow_file_class = oton_converter.convert_oton(IN_TXT_WF1, OUT_NF_WF1_APPTAINER_WITH_MS, "apptainer", True) + assert_common_features(nextflow_file_class, 8, 1, True) + assert_common_features_apptainer(nextflow_file_class) + assert_compare_workflow_blocks(OUT_NF_WF1_APPTAINER_WITH_MS, EXPECTED_WF1_WITH_MS) diff --git a/tests/tests_utils/_test_3_hpc/__init__.py b/tests/tests_utils/test_3_hpc/__init__.py similarity index 100% rename from tests/tests_utils/_test_3_hpc/__init__.py rename to tests/tests_utils/test_3_hpc/__init__.py diff --git a/tests/tests_utils/_test_3_hpc/test_1_nhr_executor.py b/tests/tests_utils/test_3_hpc/test_1_nhr_executor.py similarity index 91% rename from tests/tests_utils/_test_3_hpc/test_1_nhr_executor.py rename to tests/tests_utils/test_3_hpc/test_1_nhr_executor.py index 2ba80b1d..35d5cc46 100644 --- a/tests/tests_utils/_test_3_hpc/test_1_nhr_executor.py +++ b/tests/tests_utils/test_3_hpc/test_1_nhr_executor.py @@ -6,7 +6,7 @@ def test_hpc_connector_executor_mk_dir(hpc_nhr_command_executor): - test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir_with_env, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'mkdir -p {test_dir_name}'") assert return_code == 0, err @@ -15,7 +15,7 @@ def test_hpc_connector_executor_mk_dir(hpc_nhr_command_executor): def test_hpc_connector_executor_rm_dir_negative(hpc_nhr_command_executor): - test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir_with_env, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'rm {test_dir_name}'") assert return_code == 1 @@ -25,7 +25,7 @@ def test_hpc_connector_executor_rm_dir_negative(hpc_nhr_command_executor): def test_hpc_connector_executor_rm_dir_positive(hpc_nhr_command_executor): - test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir_with_env, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'rm -rf {test_dir_name}'") assert return_code == 0 @@ -34,7 +34,7 @@ def test_hpc_connector_executor_rm_dir_positive(hpc_nhr_command_executor): def test_hpc_connector_executor_cd_dir(hpc_nhr_command_executor): - test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir_with_env, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'cd {test_dir_name}'") assert return_code == 1 diff --git a/tests/tests_utils/_test_3_hpc/test_2_nhr_transfer.py b/tests/tests_utils/test_3_hpc/test_2_nhr_transfer.py similarity index 96% rename from tests/tests_utils/_test_3_hpc/test_2_nhr_transfer.py rename to tests/tests_utils/test_3_hpc/test_2_nhr_transfer.py index 3a81b884..be740443 100644 --- a/tests/tests_utils/_test_3_hpc/test_2_nhr_transfer.py +++ b/tests/tests_utils/test_3_hpc/test_2_nhr_transfer.py @@ -15,7 +15,7 @@ def test_hpc_connector_transfer_file(hpc_nhr_data_transfer, path_batch_script_em """ assert_exists_file(str(path_batch_script_empty)) - test_hpc_file_path = Path(hpc_nhr_data_transfer.project_root_dir, BATCH_SCRIPT_EMPTY) + test_hpc_file_path = Path(hpc_nhr_data_transfer.project_root_dir_with_env, BATCH_SCRIPT_EMPTY) hpc_nhr_data_transfer.put_file(local_src=path_batch_script_empty, remote_dst=str(test_hpc_file_path)) sleep(2) test_local_received_file_path = Path(OPERANDI_SERVER_BASE_DIR, BATCH_SCRIPT_EMPTY) @@ -29,7 +29,7 @@ def test_hpc_connector_transfer_dir(hpc_nhr_data_transfer, path_dummy_workspace_ Testing the put_dir and get_dir functionality of the HPC transfer """ assert_exists_dir(str(path_dummy_workspace_data_dir)) - test_hpc_dir_path = Path(hpc_nhr_data_transfer.project_root_dir, ID_WORKSPACE) + test_hpc_dir_path = Path(hpc_nhr_data_transfer.project_root_dir_with_env, ID_WORKSPACE) hpc_nhr_data_transfer.put_dir(local_src=str(path_dummy_workspace_data_dir), remote_dst=str(test_hpc_dir_path)) sleep(5) test_local_received_dir_path = Path(OPERANDI_SERVER_BASE_DIR, ID_WORKSPACE) diff --git a/tests/tests_utils/_test_3_hpc/test_3_nhr_combined.py b/tests/tests_utils/test_3_hpc/test_3_nhr_combined.py similarity index 94% rename from tests/tests_utils/_test_3_hpc/test_3_nhr_combined.py rename to tests/tests_utils/test_3_hpc/test_3_nhr_combined.py index 3309b538..13c772e1 100644 --- a/tests/tests_utils/_test_3_hpc/test_3_nhr_combined.py +++ b/tests/tests_utils/test_3_hpc/test_3_nhr_combined.py @@ -61,7 +61,8 @@ def test_hpc_connector_run_batch_script( slurm_job_id = hpc_nhr_command_executor.trigger_slurm_job( workflow_job_id=ID_WORKFLOW_JOB, nextflow_script_path=Path(template_workflow), input_file_grp=DEFAULT_FILE_GRP, workspace_id=ID_WORKSPACE, - mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8, use_mets_server=False, + mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8, + use_mets_server=False, nf_executable_steps=["ocrd-cis-ocropy-binarize"], file_groups_to_remove="", cpus=2, ram=16, job_deadline_time=HPC_JOB_DEADLINE_TIME_TEST, partition=HPC_NHR_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT) finished_successfully = hpc_nhr_command_executor.poll_till_end_slurm_job_state( @@ -84,7 +85,8 @@ def test_hpc_connector_run_batch_script_with_ms( workflow_job_id=ID_WORKFLOW_JOB_WITH_MS, nextflow_script_path=Path(template_workflow_with_ms), input_file_grp=DEFAULT_FILE_GRP, workspace_id=ID_WORKSPACE_WITH_MS, mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8, - use_mets_server=True, file_groups_to_remove="", cpus=3, ram=16, job_deadline_time=HPC_JOB_DEADLINE_TIME_TEST, + use_mets_server=True, nf_executable_steps=["ocrd-cis-ocropy-binarize"], + file_groups_to_remove="", cpus=3, ram=16, job_deadline_time=HPC_JOB_DEADLINE_TIME_TEST, partition=HPC_NHR_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT) finished_successfully = hpc_nhr_command_executor.poll_till_end_slurm_job_state( slurm_job_id=slurm_job_id, interval=5, timeout=300)