Merge pull request #23 from apriltuesday/EVA-3225
EVA-3225 - Add native execution for validation and refactor
apriltuesday authored Feb 20, 2024
2 parents 22cc935 + 0d25132 commit b37f3b8
Showing 16 changed files with 164 additions and 101 deletions.
8 changes: 5 additions & 3 deletions bin/eva-sub-cli.py
@@ -5,11 +5,11 @@

from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli import main
from eva_sub_cli.main import VALIDATE, SUBMIT
from eva_sub_cli import main
from eva_sub_cli.main import VALIDATE, SUBMIT, DOCKER, NATIVE

def validate_command_line_arguments(args, argparser):

def validate_command_line_arguments(args, argparser):
if args.vcf_files_mapping and (args.vcf_files or args.assembly_fasta):
print("Specify vcf_files and assembly_fasta OR a vcf_files_mapping in CSV. Not both")
argparser.print_usage()
@@ -54,6 +54,8 @@ def validate_command_line_arguments(args, argparser):
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
'successfully before and then run the submission.')
argparser.add_argument('--executor', choices=[DOCKER, NATIVE], default=NATIVE,
help='Select an execution type for running validation')
credential_group = argparser.add_argument_group('Credential', 'Specify the Webin credential you want to use to '
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
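
The new --executor option makes native execution the default. A minimal sketch of the resulting command-line behaviour, using only the argument added in this file (the rest of the parser is omitted):

    import argparse

    # Constants mirroring DOCKER / NATIVE introduced in eva_sub_cli/main.py
    DOCKER = 'docker'
    NATIVE = 'native'

    argparser = argparse.ArgumentParser(description='reduced eva-sub-cli parser')
    argparser.add_argument('--executor', choices=[DOCKER, NATIVE], default=NATIVE,
                           help='Select an execution type for running validation')

    # Omitting --executor now selects the native toolchain rather than Docker
    assert argparser.parse_args([]).executor == NATIVE
    assert argparser.parse_args(['--executor', 'docker']).executor == DOCKER
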
24 changes: 15 additions & 9 deletions eva_sub_cli/main.py
@@ -2,15 +2,17 @@
import csv
import os
from ebi_eva_common_pyutils.config import WritableConfig
from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__
from eva_sub_cli.docker_validator import DockerValidator
from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA
from eva_sub_cli.validators.docker_validator import DockerValidator
from eva_sub_cli.validators.native_validator import NativeValidator
from eva_sub_cli.validators.validator import READY_FOR_SUBMISSION_TO_EVA
from eva_sub_cli.submit import StudySubmitter

VALIDATE = 'validate'
SUBMIT = 'submit'
DOCKER = 'docker'
NATIVE = 'native'


def get_vcf_files(mapping_file):
@@ -33,7 +35,7 @@ def create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta):


def orchestrate_process(submission_dir, vcf_files_mapping, vcf_files, assembly_fasta, metadata_json, metadata_xlsx,
tasks, resume, username=None, password=None, **kwargs):
tasks, executor, resume, username=None, password=None, **kwargs):
# load config
config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE)
sub_config = WritableConfig(config_file_path, version=__version__)
@@ -52,11 +54,15 @@ def orchestrate_process(submission_dir, vcf_files_mapping, vcf_files, assembly_f
tasks.append(VALIDATE)

if VALIDATE in tasks:
with DockerValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
submission_config=sub_config) as validator:
validator.validate()
validator.create_reports()
validator.update_config_with_validation_result()
if executor == DOCKER:
with DockerValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
submission_config=sub_config) as validator:
validator.validate_and_report()
# default to native execution
else:
with NativeValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
submission_config=sub_config) as validator:
validator.validate_and_report()
if SUBMIT in tasks:
with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config,
username=username, password=password) as submitter:
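
Both branches above differ only in the validator class they instantiate. An equivalent way to express the dispatch, shown here as a sketch rather than code from this commit, keeps a single with-block:

    from eva_sub_cli.validators.docker_validator import DockerValidator
    from eva_sub_cli.validators.native_validator import NativeValidator

    DOCKER = 'docker'
    NATIVE = 'native'

    def run_validation(executor, vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx, sub_config):
        # Same behaviour as the if/else in orchestrate_process: anything other
        # than DOCKER falls through to native execution.
        validator_class = DockerValidator if executor == DOCKER else NativeValidator
        with validator_class(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
                             submission_config=sub_config) as validator:
            validator.validate_and_report()
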
16 changes: 9 additions & 7 deletions eva_sub_cli/nextflow/validation.nf
@@ -19,14 +19,17 @@ params.output_dir = null
params.metadata_json = null
params.metadata_xlsx = null

// executables
// executables - external tools
params.executable = [
"vcf_validator": "vcf_validator",
"vcf_assembly_checker": "vcf_assembly_checker",
"biovalidator": "biovalidator"
]
// python scripts - installed as part of eva-sub-cli
params.python_scripts = [
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"biovalidator": "biovalidator"
"xlsx2json": "xlsx2json.py"
]
// validation tasks
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check", "insdc_check"]
@@ -60,7 +63,6 @@ def joinBasePath(path) {
output_dir = joinBasePath(params.output_dir)

workflow {

// Prepare the file path
vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
@@ -170,7 +172,7 @@ process convert_xlsx_2_json {
metadata_json = metadata_xlsx.getBaseName() + '.json'

"""
$params.executable.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --conversion_configuration $conversion_configuration
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --conversion_configuration $conversion_configuration
"""
}

@@ -205,7 +207,7 @@ process sample_name_concordance {

script:
"""
$params.executable.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
"""
}

@@ -224,6 +226,6 @@ process insdc_checker {

script:
"""
$params.executable.fasta_checker --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
$params.python_scripts.fasta_checker --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
"""
}
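
Separating params.executable from params.python_scripts means external tool locations can be overridden without touching the packaged Python scripts. A hypothetical standalone launch of the workflow, built the same way NativeValidator assembles its command further down (all paths are placeholders):

    import subprocess

    # Hypothetical invocation; every path below is a placeholder.
    cmd = (
        "nextflow run eva_sub_cli/nextflow/validation.nf "
        "--vcf_files_mapping /data/sub1/vcf_mapping.csv "
        "--metadata_json /data/sub1/metadata.json "
        "--output_dir /data/sub1/validation_output "
        "--executable.vcf_validator /opt/vcf-validator/bin/vcf_validator"  # override an external tool only
    )
    subprocess.run(cmd, shell=True, check=True)
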
2 changes: 1 addition & 1 deletion eva_sub_cli/submit.py
@@ -9,7 +9,7 @@

from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__, SUBMISSION_WS_VAR
from eva_sub_cli.auth import get_auth
from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA
from eva_sub_cli.validators.validator import READY_FOR_SUBMISSION_TO_EVA

SUB_CLI_CONFIG_KEY_SUBMISSION_ID = "submission_id"
SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL = "submission_upload_url"
Empty file.
eva_sub_cli/docker_validator.py → eva_sub_cli/validators/docker_validator.py
@@ -6,32 +6,28 @@
import time

from ebi_eva_common_pyutils.command_utils import run_command_with_output

from eva_sub_cli.reporter import Reporter
from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli.validators.validator import Validator, VALIDATION_OUTPUT_DIR

logger = logging_config.get_logger(__name__)

docker_path = 'docker'
container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev4'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

VALIDATION_OUTPUT_DIR = "validation_output"


class DockerValidator(Reporter):
class DockerValidator(Validator):

def __init__(self, mapping_file, output_dir, metadata_json=None,
metadata_xlsx=None, container_name=None, docker_path='docker', submission_config=None):
# validator write to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
metadata_json=metadata_json, metadata_xlsx=metadata_xlsx,
submission_config=submission_config)
self.docker_path = docker_path
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
self.container_name = container_name
if self.container_name is None:
self.container_name = container_image.split('/')[1] + '.' + container_tag
@@ -59,16 +55,6 @@ def get_docker_validation_cmd(self):
return docker_cmd

def run_docker_validator(self):
# verify mapping file exists
if not os.path.exists(self.mapping_file):
raise RuntimeError(f'Mapping file {self.mapping_file} not found')

# verify all files mentioned in metadata files exist
files_missing, missing_files_list = self.check_if_file_missing()
if files_missing:
raise RuntimeError(f"some files (vcf/fasta) mentioned in metadata file could not be found. "
f"Missing files list {missing_files_list}")

# check if docker container is ready for running validation
self.verify_docker_env()

@@ -94,24 +80,6 @@ def run_docker_validator(self):
except subprocess.CalledProcessError as ex:
logger.error(ex)

def check_if_file_missing(self):
files_missing = False
missing_files_list = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
if not os.path.exists(row['vcf']):
files_missing = True
missing_files_list.append(row['vcf'])
if not os.path.exists(row['fasta']):
files_missing = True
missing_files_list.append(row['fasta'])
if not os.path.exists(row['report']):
files_missing = True
missing_files_list.append(row['report'])

return files_missing, missing_files_list

def verify_docker_is_installed(self):
try:
run_command_with_output(
@@ -123,8 +91,8 @@ def verify_docker_is_installed(self):
raise RuntimeError(f"Please make sure docker ({self.docker_path}) is installed and available on the path")

def verify_container_is_running(self):
container_run_cmd_ouptut = run_command_with_output("check if container is running", f"{self.docker_path} ps", return_process_output=True)
if container_run_cmd_ouptut is not None and self.container_name in container_run_cmd_ouptut:
container_run_cmd_output = run_command_with_output("check if container is running", f"{self.docker_path} ps", return_process_output=True)
if container_run_cmd_output is not None and self.container_name in container_run_cmd_output:
logger.info(f"Container ({self.container_name}) is running")
return True
else:
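
The mapping-file and file-existence checks removed above now live in the shared Validator base class, so an execution backend only has to implement _validate(). A hypothetical minimal subclass, purely for illustration and not part of this commit:

    from ebi_eva_common_pyutils.command_utils import run_command_with_output

    from eva_sub_cli.validators.validator import Validator

    class HypotheticalClusterValidator(Validator):
        """Illustration only: the single hook a new backend has to supply."""

        def _validate(self):
            # File-existence checks and result collection are inherited from Validator;
            # this method only needs to launch the validation workflow somewhere.
            # run_validation.sh is a made-up script name.
            run_command_with_output('Run validation on a cluster',
                                    f'bash run_validation.sh {self.mapping_file} {self.output_dir}')
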
62 changes: 62 additions & 0 deletions eva_sub_cli/validators/native_validator.py
@@ -0,0 +1,62 @@
import os
import subprocess

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import logging_config

from eva_sub_cli.validators.validator import Validator

logger = logging_config.get_logger(__name__)


class NativeValidator(Validator):

def __init__(self, mapping_file, output_dir, metadata_json=None, metadata_xlsx=None,
vcf_validator_path='vcf_validator', assembly_checker_path='vcf_assembly_checker',
biovalidator_path='biovalidator', submission_config=None):
super().__init__(mapping_file, output_dir, metadata_json=metadata_json, metadata_xlsx=metadata_xlsx,
submission_config=submission_config)
self.vcf_validator_path = vcf_validator_path
self.assembly_checker_path = assembly_checker_path
self.biovalidator_path = biovalidator_path

def _validate(self):
self.run_validator()

def run_validator(self):
self.verify_executables_installed()
try:
command = self.get_validation_cmd()
run_command_with_output("Run Validation using Nextflow", command)
except subprocess.CalledProcessError as ex:
logger.error(ex)

def get_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
metadata_flag = f"--metadata_xlsx {self.metadata_xlsx}"
else:
metadata_flag = f"--metadata_json {self.metadata_json}"
path_to_workflow = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'nextflow/validation.nf')
return (
f"nextflow run {path_to_workflow} "
f"--vcf_files_mapping {self.mapping_file} "
f"{metadata_flag} "
f"--output_dir {self.output_dir} "
f"--executable.vcf_validator {self.vcf_validator_path} "
f"--executable.vcf_assembly_checker {self.assembly_checker_path} "
f"--executable.biovalidator {self.biovalidator_path}"
)

def verify_executables_installed(self):
for name, path in [('vcf-validator', self.vcf_validator_path),
('vcf-assembly-checker', self.assembly_checker_path),
('biovalidator', self.biovalidator_path)]:
try:
run_command_with_output(
f"Check {name} is installed and available on the path",
f"{path} --version"
)
except subprocess.CalledProcessError as ex:
logger.error(ex)
raise RuntimeError(f"Please make sure {name} ({path}) is installed and available on the path")
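
A usage sketch for the new class (paths are placeholders; nextflow, vcf_validator, vcf_assembly_checker and biovalidator must already be on the PATH):

    from eva_sub_cli.validators.native_validator import NativeValidator

    # Placeholder paths for a hypothetical submission directory
    with NativeValidator(mapping_file='/data/sub1/vcf_mapping.csv',
                         output_dir='/data/sub1',
                         metadata_json='/data/sub1/metadata.json') as validator:
        validator.validate_and_report()  # runs the Nextflow workflow, then writes reports and updates the config
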
48 changes: 45 additions & 3 deletions eva_sub_cli/reporter.py → eva_sub_cli/validators/validator.py
@@ -13,11 +13,13 @@
from eva_sub_cli.report import generate_html_report
from ebi_eva_common_pyutils.logger import logging_config

VALIDATION_OUTPUT_DIR = "validation_output"
VALIDATION_RESULTS = 'validation_results'
READY_FOR_SUBMISSION_TO_EVA = 'ready_for_submission_to_eva'

logger = logging_config.get_logger(__name__)


def resolve_single_file_path(file_path):
files = glob.glob(file_path)
if len(files) == 0:
@@ -26,9 +28,10 @@ def resolve_single_file_path(file_path):
return files[0]


class Reporter:
class Validator:

def __init__(self, mapping_file, output_dir, submission_config: WritableConfig = None):
def __init__(self, mapping_file, output_dir, metadata_json=None, metadata_xlsx=None,
submission_config: WritableConfig = None):
self.output_dir = output_dir
self.mapping_file = mapping_file
vcf_files, fasta_files = self._find_vcf_and_fasta_files()
@@ -37,6 +40,8 @@ def __init__(self, mapping_file, output_dir, submission_config: WritableConfig =
self.results = {}
self.project_title = None # TODO fill this from metadata?
self.validation_date = datetime.datetime.now()
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
if submission_config:
self.sub_config = submission_config
else:
@@ -60,13 +65,50 @@ def _find_vcf_and_fasta_files(self):
fasta_files.append(row['fasta'])
return vcf_files, fasta_files

def validate_and_report(self):
self.validate()
self.report()

def validate(self):
self.verify_files_present()
self._validate()
self._collect_validation_workflow_results()

def report(self):
self.create_reports()
self.update_config_with_validation_result()

def _validate(self):
raise NotImplementedError

def verify_files_present(self):
# verify mapping file exists
if not os.path.exists(self.mapping_file):
raise RuntimeError(f'Mapping file {self.mapping_file} not found')

# verify all files mentioned in metadata files exist
files_missing, missing_files_list = self.check_if_file_missing()
if files_missing:
raise RuntimeError(f"some files (vcf/fasta) mentioned in metadata file could not be found. "
f"Missing files list {missing_files_list}")

def check_if_file_missing(self):
files_missing = False
missing_files_list = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
if not os.path.exists(row['vcf']):
files_missing = True
missing_files_list.append(row['vcf'])
if not os.path.exists(row['fasta']):
files_missing = True
missing_files_list.append(row['fasta'])
if not os.path.exists(row['report']):
files_missing = True
missing_files_list.append(row['report'])
return files_missing, missing_files_list

def update_config_with_validation_result(self):
self.sub_config.set(VALIDATION_RESULTS, value=self.results)
self.sub_config.set(READY_FOR_SUBMISSION_TO_EVA, value=self.verify_ready_for_submission_to_eva())
@@ -160,7 +202,7 @@ def vcf_check_errors_is_critical(self, error):
return False
return True

def _collect_validation_workflow_results(self, ):
def _collect_validation_workflow_results(self):
# Collect information from the output and summarise in the config
self._collect_vcf_check_results()
self._collect_assembly_check_results()
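
check_if_file_missing and _find_vcf_and_fasta_files both read the mapping file with csv.DictReader and expect vcf, fasta and report columns. A small sketch of producing such a file (file names are placeholders):

    import csv

    # Placeholder rows: each entry pairs a VCF with its reference FASTA and assembly report.
    rows = [
        {'vcf': '/data/sub1/cohort1.vcf.gz', 'fasta': '/data/sub1/GCA_000001.fa',
         'report': '/data/sub1/GCA_000001_assembly_report.txt'},
        {'vcf': '/data/sub1/cohort2.vcf.gz', 'fasta': '/data/sub1/GCA_000001.fa',
         'report': '/data/sub1/GCA_000001_assembly_report.txt'},
    ]
    with open('vcf_mapping.csv', 'w', newline='') as out:
        writer = csv.DictWriter(out, fieldnames=['vcf', 'fasta', 'report'])
        writer.writeheader()
        writer.writerows(rows)
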
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name='eva_sub_cli',
packages=['eva_sub_cli'],
packages=['eva_sub_cli', 'eva_sub_cli.validators'],
package_data={'eva_sub_cli': ['nextflow/*', 'etc/*', 'VERSION', 'jinja_templates/*']},
version=version,
license='Apache',