Merge pull request #20 from tcezard/EVA3488_remove_report

EVA-3488 - Remove requirement for assembly report / other refactor
EBIvariation · Feb 14, 2024 · 6b19408 · 6b19408
2 parents cc4644d + d5296c0
commit 6b19408
Show file tree

Hide file tree

Showing 8 changed files with 166 additions and 79 deletions.
diff --git a/README.md b/README.md
@@ -2,23 +2,27 @@
 EVA Submission Command Line Interface for Validation
 
 
-
-
 ## Installation
 
 TBD
 
 ## Input files for the validation and submission tool
 
-### The VCF file and association with reference genome
+There are two ways of specifying the VCF files and their associated assembly:
+
+### Using  `--vcf_files` and `--assembly_fasta`
+
+This allows you to provide multiple VCF files to validate and a single associated genome file.
+The VCF files and the associated genome file must use the same chromosome naming convention.
+
+### Using  `--vcf_files_mapping`
 
 The paths to the VCF files are provided via a CSV file that links each VCF to its respective fasta sequence. This allows 
 us to support a different assembly for each VCF file.
 The CSV file `vcf_mapping.csv` contains the columns `vcf`, `fasta` and `report`, which provide respectively:
  - The VCF to validate/upload
  - The assembly in fasta format that was used to derive the VCF
- - The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)
-
+ - (Optional) The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)
 
 Example:
 ```shell
@@ -48,7 +52,7 @@ To validate and submit run the following command
 
 ```shell
 eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
-               --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir
+               --vcf_files vcf_file1.vcf vcf_file2.vcf --assembly_fasta assembly.fa --submission_dir submission_dir
 ```
 
 ### Validate only

diff --git a/bin/eva-sub-cli.py b/bin/eva-sub-cli.py
@@ -1,33 +1,63 @@
 #!/usr/bin/env python
-
+import os
+import sys
 from argparse import ArgumentParser
 
 from ebi_eva_common_pyutils.logger import logging_config
 
 from eva_sub_cli import  main
 from eva_sub_cli.main import VALIDATE, SUBMIT
 
+def validate_command_line_arguments(args, argparser):
+    """Check that the parsed command line arguments form a valid combination.
+
+    On any invalid combination an explanation and the usage are printed and the
+    process exits with status 1. The function returns None when all checks pass.
+
+    :param args: namespace of parsed arguments; reads vcf_files_mapping, vcf_files,
+                 assembly_fasta, tasks, username and password
+    :param argparser: the parser that produced args, used to print the usage
+    """
+
+    def _exit_with_usage(message):
+        # Single exit path so every failure reports the same way
+        print(message)
+        argparser.print_usage()
+        sys.exit(1)
+
+    # The two ways of describing the input are mutually exclusive
+    if args.vcf_files_mapping and (args.vcf_files or args.assembly_fasta):
+        _exit_with_usage("Specify vcf_files and assembly_fasta OR a vcf_files_mapping in CSV. Not both")
+
+    # --vcf_files and --assembly_fasta only make sense together
+    if bool(args.vcf_files) != bool(args.assembly_fasta):
+        _exit_with_usage("When using --vcf_files and --assembly_fasta, both need to be specified")
+
+    # None of the input options is individually required by the parser, so make sure
+    # that at least one way of describing the input was used
+    if not args.vcf_files_mapping and not args.vcf_files:
+        _exit_with_usage("Specify the input with --vcf_files and --assembly_fasta OR with --vcf_files_mapping")
+
+    # Submitting requires credentials from the command line or from the environment
+    has_username = args.username or os.environ.get('ENAWEBINACCOUNT')
+    has_password = args.password or os.environ.get('ENAWEBINPASSWORD')
+    if SUBMIT in args.tasks and not (has_username and has_password):
+        _exit_with_usage("To submit your data, you need to provide a Webin username and password")
+
 
 if __name__ == "__main__":
     argparser = ArgumentParser(description='EVA Submission CLI - validate and submit data to EVA')
-    argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT],
-                           help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
-                                'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
-                                'successfully before and then run the submission.')
     argparser.add_argument('--submission_dir', required=True, type=str,
                            help='Full path to the directory where all processing will be done '
                                 'and submission info is/will be stored')
-    argparser.add_argument("--vcf_files_mapping", required=True,
+    vcf_group = argparser.add_argument_group(
+        'Input VCF and assembly',
+        "Specify the VCF files and associated assembly with the following options. If you used different assemblies "
+        "for different VCF files then use --vcf_file_mapping"
+    )
+    vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
+    vcf_group.add_argument('--assembly_fasta',
+                           help="The fasta file containing the reference genome from which the variants were derived")
+    vcf_group.add_argument("--vcf_files_mapping",
                            help="csv file with the mappings for vcf files, fasta and assembly report")
-    group = argparser.add_mutually_exclusive_group(required=True)
-    group.add_argument("--metadata_json",
-                       help="Json file that describe the project, analysis, samples and files")
-    group.add_argument("--metadata_xlsx",
-                       help="Excel spreadsheet  that describe the project, analysis, samples and files")
-    argparser.add_argument("--username",
-                           help="Username used for connecting to the ENA webin account")
-    argparser.add_argument("--password",
-                           help="Password used for connecting to the ENA webin account")
+
+    metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet or in a JSON file')
+    metadata_group = metadata_group.add_mutually_exclusive_group(required=True)
+    metadata_group.add_argument("--metadata_json",
+                               help="Json file that describe the project, analysis, samples and files")
+    metadata_group.add_argument("--metadata_xlsx",
+                               help="Excel spreadsheet  that describe the project, analysis, samples and files")
+    argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT],
+                           help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
+                                'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
+                                'successfully before and then run the submission.')
+    credential_group = argparser.add_argument_group('Credential', 'Specify the Webin credential you want to use to '
+                                                                  'upload to the EVA')
+    credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
+    credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
     argparser.add_argument("--resume", default=False, action='store_true',
                            help="Resume the process execution from where it left of. This is currently only supported "
                                 "for the upload part of the SUBMIT task.")
@@ -36,5 +66,6 @@
 
     logging_config.add_stdout_handler()
 
-    main.orchestrate_process(args.submission_dir, args.vcf_files_mapping, args.metadata_json, args.metadata_xlsx,
-                             args.tasks, args.resume)
+    validate_command_line_arguments(args, argparser)
+    # Pass on all the arguments
+    main.orchestrate_process(**args.__dict__)
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -16,8 +16,6 @@ RUN curl -LJo /usr/local/bin/vcf_validator  https://github.com/EBIvariation/vcf-
 # Install biovalidator and make it executable
 RUN git clone https://github.com/elixir-europe/biovalidator.git  \
     && cd biovalidator  \
-    && chmod +x src/biovalidator.js \
-    && sed -i 's/dist/src/' package.json \
     && npm install \
     && npm link
 

diff --git a/eva_sub_cli/docker_validator.py b/eva_sub_cli/docker_validator.py
@@ -7,18 +7,16 @@
 
 from ebi_eva_common_pyutils.command_utils import run_command_with_output
 
-from eva_sub_cli import ETC_DIR
 from eva_sub_cli.reporter import Reporter
 from ebi_eva_common_pyutils.logger import logging_config
 
 logger = logging_config.get_logger(__name__)
 
 docker_path = 'docker'
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev3'
+container_tag = 'v0.0.1.dev4'
 container_validation_dir = '/opt/vcf_validation'
-container_validation_output_dir = '/opt/vcf_validation/vcf_validation_output'
-container_etc_dir = '/opt/eva_sub_cli/etc'
+container_validation_output_dir = 'vcf_validation_output'
 
 VALIDATION_OUTPUT_DIR = "validation_output"
 
@@ -31,14 +29,12 @@ def __init__(self, mapping_file, output_dir, metadata_json=None,
         # If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
         super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
                          submission_config=submission_config)
-
         self.docker_path = docker_path
         self.metadata_json = metadata_json
         self.metadata_xlsx = metadata_xlsx
         self.container_name = container_name
         if self.container_name is None:
             self.container_name = container_image.split('/')[1] + '.' + container_tag
-        self.spreadsheet2json_conf = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
 
     def _validate(self):
         self.run_docker_validator()
@@ -47,21 +43,19 @@ def get_docker_validation_cmd(self):
         if self.metadata_xlsx and not self.metadata_json:
             docker_cmd = (
                 f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
-                f"--vcf_files_mapping {container_validation_dir}/{self.mapping_file} "
-                f"--metadata_xlsx {container_validation_dir}/{self.metadata_xlsx} "
-                f"--conversion_configuration {container_validation_dir}/{self.spreadsheet2json_conf} "
-                f"--schema_dir {container_etc_dir} "
+                f"--base_dir {container_validation_dir} "
+                f"--vcf_files_mapping {self.mapping_file} "
+                f"--metadata_xlsx {self.metadata_xlsx} "
                 f"--output_dir {container_validation_output_dir}"
             )
         else:
             docker_cmd = (
                 f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
-                f"--vcf_files_mapping {container_validation_dir}/{self.mapping_file} "
-                f"--metadata_json {container_validation_dir}/{self.metadata_json} "
-                f"--schema_dir {container_etc_dir} "
+                f"--base_dir {container_validation_dir} "
+                f"--vcf_files_mapping {self.mapping_file} "
+                f"--metadata_json {self.metadata_json} "
                 f"--output_dir {container_validation_output_dir}"
             )
-        print(docker_cmd)
         return docker_cmd
 
     def run_docker_validator(self):
@@ -89,14 +83,13 @@ def run_docker_validator(self):
             self.copy_files_to_container()
 
             docker_cmd = self.get_docker_validation_cmd()
-            print(docker_cmd)
             # start validation
             # FIXME: If nextflow fails in the docker exec still exit with error code 0
             run_command_with_output("Run Validation using Nextflow", docker_cmd)
             # copy validation result to user host
             run_command_with_output(
                 "Copy validation output from container to host",
-                f"{self.docker_path} cp {self.container_name}:{container_validation_output_dir} {self.output_dir}"
+                f"{self.docker_path} cp {self.container_name}:{container_validation_dir}/{container_validation_output_dir} {self.output_dir}"
             )
         except subprocess.CalledProcessError as ex:
             logger.error(ex)
@@ -140,9 +133,7 @@ def verify_container_is_running(self):
 
     def verify_container_is_stopped(self):
         container_stop_cmd_output = run_command_with_output(
-            "check if container is stopped",
-            f"{self.docker_path} ps -a"
-            , return_process_output=True
+            "check if container is stopped", f"{self.docker_path} ps -a",  return_process_output=True
         )
         if container_stop_cmd_output is not None and self.container_name in container_stop_cmd_output:
             logger.info(f"Container ({self.container_name}) is in stop state")
@@ -236,13 +227,14 @@ def _copy(file_description, file_path):
             _copy('json metadata file', self.metadata_json)
         if self.metadata_xlsx:
             _copy('excel metadata file', self.metadata_xlsx)
-            _copy('configuration', self.spreadsheet2json_conf)
         with open(self.mapping_file) as open_file:
             reader = csv.DictReader(open_file, delimiter=',')
             for row in reader:
                 _copy('vcf files', row['vcf'])
                 _copy('fasta files', row['fasta'])
-                _copy('assembly report files', row['report'])
+                # report is optional
+                if row['report']:
+                    _copy('assembly report files', row['report'])
 
 
 if __name__ == "__main__":

diff --git a/eva_sub_cli/main.py b/eva_sub_cli/main.py
@@ -12,6 +12,7 @@
 VALIDATE = 'validate'
 SUBMIT = 'submit'
 
+
 def get_vcf_files(mapping_file):
     vcf_files = []
     with open(mapping_file) as open_file:
@@ -21,12 +22,28 @@ def get_vcf_files(mapping_file):
     return vcf_files
 
 
-def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metadata_xlsx, tasks, resume):
+def create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta):
+    """Write a CSV mapping file that associates every VCF file with one assembly fasta.
+
+    The file is written as ``vcf_mapping_file.csv`` inside ``submission_dir`` with the
+    header ``vcf,fasta,report``. Paths are stored as absolute paths and no report
+    value is written for any row.
+
+    :param submission_dir: directory in which the mapping file is written
+    :param vcf_files: paths to the VCF files, one row is written per file
+    :param assembly_fasta: path to the fasta file shared by all the VCF files
+    :return: path to the mapping file that was written
+    """
+    mapping_file = os.path.join(submission_dir, 'vcf_mapping_file.csv')
+    # Same fasta for every row, so resolve it once
+    fasta_path = os.path.abspath(assembly_fasta)
+    # newline='' lets the csv module control the line endings, as its documentation requires
+    with open(mapping_file, 'w', newline='') as open_file:
+        writer = csv.writer(open_file, delimiter=',')
+        writer.writerow(['vcf', 'fasta', 'report'])
+        # Rows only carry the vcf and fasta values; the report column is left out
+        writer.writerows([os.path.abspath(vcf_file), fasta_path] for vcf_file in vcf_files)
+    return mapping_file
+
+
+def orchestrate_process(submission_dir, vcf_files_mapping, vcf_files, assembly_fasta, metadata_json, metadata_xlsx,
+                        tasks, resume, username=None, password=None, **kwargs):
     # load config
     config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE)
     sub_config = WritableConfig(config_file_path, version=__version__)
 
+    # Get the provided metadata
     metadata_file = metadata_json or metadata_xlsx
+
+    # Get the provided VCF and assembly
+    if vcf_files and assembly_fasta:
+        vcf_files_mapping = create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta)
     vcf_files = get_vcf_files(vcf_files_mapping)
 
     # Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
@@ -41,5 +58,6 @@ def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metada
             validator.create_reports()
             validator.update_config_with_validation_result()
     if SUBMIT in tasks:
-        with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config) as submitter:
+        with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config,
+                            username=username, password=password) as submitter:
             submitter.submit(resume=resume)