From bf3ad5beaac886eea182ee9037c15ad1ca82ba5e Mon Sep 17 00:00:00 2001
From: tcezard <tcezard@ebi.ac.uk>
Date: Mon, 12 Feb 2024 12:55:08 +0000
Subject: [PATCH 1/2] Refactor validation nextflow to be able to run it without
 docker and remove the requirement for files that are part of eva-sub-cli

Make assembly report optional
Allow vcf files and assembly to be provided from the command line
---
 README.md                                 | 16 +++---
 bin/eva-sub-cli.py                        | 65 +++++++++++++++++------
 docker/Dockerfile                         |  2 -
 eva_sub_cli/docker_validator.py           | 34 +++++-------
 eva_sub_cli/main.py                       | 22 +++++++-
 eva_sub_cli/nextflow/validation.nf        | 59 ++++++++++++--------
 tests/ignore_test_nextflow_validation.yml |  6 +++
 tests/test_main.py                        | 41 +++++++++++---
 8 files changed, 166 insertions(+), 79 deletions(-)
 create mode 100644 tests/ignore_test_nextflow_validation.yml

diff --git a/README.md b/README.md
index c7849cc..81bcacb 100644
--- a/README.md
+++ b/README.md
@@ -2,23 +2,27 @@
 EVA Submission Command Line Interface for Validation
 
 
-
-
 ## Installation
 
 TBD
 
 ## Input files for the validation and submission tool
 
-### The VCF file and association with reference genome
+There are two ways of specifying the VCF files and associated assembly
+
+### Using  `--vcf_files` and `--assembly_fasta`
+
+This allows you to provide multiple VCF file to validate and a single genome file associated. 
+The VCF file and genome associated must use the same chromosome naming convention 
+
+### Using  `--vcf_files_mapping`
 
 The path to the VCF files are provided via CSV file that links the VCF to their respective fasta sequence. This allows 
 us to support different assemblies for each VCF file 
 The CSV file `vcf_mapping.csv` contains the following columns vcf, fasta, report providing respectively:
  - The VCF to validate/upload
  - The assembly in fasta format that was used to derive the VCF
- - The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)
-
+ - (Optional) The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)
 
 Example:
 ```shell
@@ -48,7 +52,7 @@ To validate and submit run the following command
 
 ```shell
 eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
-               --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir
+               --vcf_files vcf_file1.vcf vcf_file2.vcf --assembly_fasta assembly.fa --submission_dir submission_dir
 ```
 
 ### Validate only
diff --git a/bin/eva-sub-cli.py b/bin/eva-sub-cli.py
index cefd2fb..bf8973b 100755
--- a/bin/eva-sub-cli.py
+++ b/bin/eva-sub-cli.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
-
+import os
+import sys
 from argparse import ArgumentParser
 
 from ebi_eva_common_pyutils.logger import logging_config
@@ -7,27 +8,56 @@
 from eva_sub_cli import  main
 from eva_sub_cli.main import VALIDATE, SUBMIT
 
+def validate_command_line_arguments(args, argparser):
+
+    if args.vcf_files_mapping and (args.vcf_files or args.assembly_fasta):
+        print("Specify vcf_files and assembly_fasta OR a vcf_files_mapping in CSV. Not both")
+        argparser.print_usage()
+        sys.exit(1)
+
+    if (args.vcf_files and not args.assembly_fasta) or (not args.vcf_files and args.assembly_fasta):
+        print("When using --vcf_files and --assembly_fasta, both needs to be specified")
+        argparser.print_usage()
+        sys.exit(1)
+
+    if SUBMIT in args.tasks and (
+            not (args.username or os.environ.get('ENAWEBINACCOUNT')) or
+            not (args.password or os.environ.get('ENAWEBINPASSWORD'))):
+        print("To submit your data, you need to provide a Webin username and password")
+        argparser.print_usage()
+        sys.exit(1)
+
 
 if __name__ == "__main__":
     argparser = ArgumentParser(description='EVA Submission CLI - validate and submit data to EVA')
-    argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT],
-                           help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
-                                'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
-                                'successfully before and then run the submission.')
     argparser.add_argument('--submission_dir', required=True, type=str,
                            help='Full path to the directory where all processing will be done '
                                 'and submission info is/will be stored')
-    argparser.add_argument("--vcf_files_mapping", required=True,
+    vcf_group = argparser.add_argument_group(
+        'Input VCF and assembly',
+        "Specify the VCF files and associated assembly with the following options. If you used different assembly "
+        "for different VCF files then use --vcf_file_mapping"
+    )
+    vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf file to validate")
+    vcf_group.add_argument('--assembly_fasta',
+                           help="The fasta file containing the reference genome from which the variant were derived")
+    vcf_group.add_argument("--vcf_files_mapping",
                            help="csv file with the mappings for vcf files, fasta and assembly report")
-    group = argparser.add_mutually_exclusive_group(required=True)
-    group.add_argument("--metadata_json",
-                       help="Json file that describe the project, analysis, samples and files")
-    group.add_argument("--metadata_xlsx",
-                       help="Excel spreadsheet  that describe the project, analysis, samples and files")
-    argparser.add_argument("--username",
-                           help="Username used for connecting to the ENA webin account")
-    argparser.add_argument("--password",
-                           help="Password used for connecting to the ENA webin account")
+
+    metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet of in a JSON file')
+    metadata_group = metadata_group.add_mutually_exclusive_group(required=True)
+    metadata_group.add_argument("--metadata_json",
+                               help="Json file that describe the project, analysis, samples and files")
+    metadata_group.add_argument("--metadata_xlsx",
+                               help="Excel spreadsheet  that describe the project, analysis, samples and files")
+    argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT],
+                           help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
+                                'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
+                                'successfully before and then run the submission.')
+    credential_group = argparser.add_argument_group('Credential', 'Specify the Webin credential you want to use to '
+                                                                  'upload to the EVA')
+    credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
+    credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
     argparser.add_argument("--resume", default=False, action='store_true',
                            help="Resume the process execution from where it left of. This is currently only supported "
                                 "for the upload part of the SUBMIT task.")
@@ -36,5 +66,6 @@
 
     logging_config.add_stdout_handler()
 
-    main.orchestrate_process(args.submission_dir, args.vcf_files_mapping, args.metadata_json, args.metadata_xlsx,
-                             args.tasks, args.resume)
+    validate_command_line_arguments(args, argparser)
+    # Pass on all the arguments
+    main.orchestrate_process(**args.__dict__)
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 65e4f5c..0575701 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -16,8 +16,6 @@ RUN curl -LJo /usr/local/bin/vcf_validator  https://github.com/EBIvariation/vcf-
 # Install biovalidator and make it executable
 RUN git clone https://github.com/elixir-europe/biovalidator.git  \
     && cd biovalidator  \
-    && chmod +x src/biovalidator.js \
-    && sed -i 's/dist/src/' package.json \
     && npm install \
     && npm link
 
diff --git a/eva_sub_cli/docker_validator.py b/eva_sub_cli/docker_validator.py
index a4dd68e..1d09376 100644
--- a/eva_sub_cli/docker_validator.py
+++ b/eva_sub_cli/docker_validator.py
@@ -7,7 +7,6 @@
 
 from ebi_eva_common_pyutils.command_utils import run_command_with_output
 
-from eva_sub_cli import ETC_DIR
 from eva_sub_cli.reporter import Reporter
 from ebi_eva_common_pyutils.logger import logging_config
 
@@ -15,10 +14,9 @@
 
 docker_path = 'docker'
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev3'
+container_tag = 'v0.0.1.dev4'
 container_validation_dir = '/opt/vcf_validation'
-container_validation_output_dir = '/opt/vcf_validation/vcf_validation_output'
-container_etc_dir = '/opt/eva_sub_cli/etc'
+container_validation_output_dir = 'vcf_validation_output'
 
 VALIDATION_OUTPUT_DIR = "validation_output"
 
@@ -31,14 +29,12 @@ def __init__(self, mapping_file, output_dir, metadata_json=None,
         # If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
         super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
                          submission_config=submission_config)
-
         self.docker_path = docker_path
         self.metadata_json = metadata_json
         self.metadata_xlsx = metadata_xlsx
         self.container_name = container_name
         if self.container_name is None:
             self.container_name = container_image.split('/')[1] + '.' + container_tag
-        self.spreadsheet2json_conf = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
 
     def _validate(self):
         self.run_docker_validator()
@@ -47,21 +43,19 @@ def get_docker_validation_cmd(self):
         if self.metadata_xlsx and not self.metadata_json:
             docker_cmd = (
                 f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
-                f"--vcf_files_mapping {container_validation_dir}/{self.mapping_file} "
-                f"--metadata_xlsx {container_validation_dir}/{self.metadata_xlsx} "
-                f"--conversion_configuration {container_validation_dir}/{self.spreadsheet2json_conf} "
-                f"--schema_dir {container_etc_dir} "
+                f"--base_dir {container_validation_dir} "
+                f"--vcf_files_mapping {self.mapping_file} "
+                f"--metadata_xlsx {self.metadata_xlsx} "
                 f"--output_dir {container_validation_output_dir}"
             )
         else:
             docker_cmd = (
                 f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
-                f"--vcf_files_mapping {container_validation_dir}/{self.mapping_file} "
-                f"--metadata_json {container_validation_dir}/{self.metadata_json} "
-                f"--schema_dir {container_etc_dir} "
+                f"--base_dir {container_validation_dir} "
+                f"--vcf_files_mapping {self.mapping_file} "
+                f"--metadata_json {self.metadata_json} "
                 f"--output_dir {container_validation_output_dir}"
             )
-        print(docker_cmd)
         return docker_cmd
 
     def run_docker_validator(self):
@@ -89,14 +83,13 @@ def run_docker_validator(self):
             self.copy_files_to_container()
 
             docker_cmd = self.get_docker_validation_cmd()
-            print(docker_cmd)
             # start validation
             # FIXME: If nextflow fails in the docker exec still exit with error code 0
             run_command_with_output("Run Validation using Nextflow", docker_cmd)
             # copy validation result to user host
             run_command_with_output(
                 "Copy validation output from container to host",
-                f"{self.docker_path} cp {self.container_name}:{container_validation_output_dir} {self.output_dir}"
+                f"{self.docker_path} cp {self.container_name}:{container_validation_dir}/{container_validation_output_dir} {self.output_dir}"
             )
         except subprocess.CalledProcessError as ex:
             logger.error(ex)
@@ -140,9 +133,7 @@ def verify_container_is_running(self):
 
     def verify_container_is_stopped(self):
         container_stop_cmd_output = run_command_with_output(
-            "check if container is stopped",
-            f"{self.docker_path} ps -a"
-            , return_process_output=True
+            "check if container is stopped", f"{self.docker_path} ps -a",  return_process_output=True
         )
         if container_stop_cmd_output is not None and self.container_name in container_stop_cmd_output:
             logger.info(f"Container ({self.container_name}) is in stop state")
@@ -236,13 +227,14 @@ def _copy(file_description, file_path):
             _copy('json metadata file', self.metadata_json)
         if self.metadata_xlsx:
             _copy('excel metadata file', self.metadata_xlsx)
-            _copy('configuration', self.spreadsheet2json_conf)
         with open(self.mapping_file) as open_file:
             reader = csv.DictReader(open_file, delimiter=',')
             for row in reader:
                 _copy('vcf files', row['vcf'])
                 _copy('fasta files', row['fasta'])
-                _copy('assembly report files', row['report'])
+                # report is optional
+                if row['report']:
+                    _copy('assembly report files', row['report'])
 
 
 if __name__ == "__main__":
diff --git a/eva_sub_cli/main.py b/eva_sub_cli/main.py
index 4f19aa0..f897d53 100755
--- a/eva_sub_cli/main.py
+++ b/eva_sub_cli/main.py
@@ -12,6 +12,7 @@
 VALIDATE = 'validate'
 SUBMIT = 'submit'
 
+
 def get_vcf_files(mapping_file):
     vcf_files = []
     with open(mapping_file) as open_file:
@@ -21,12 +22,28 @@ def get_vcf_files(mapping_file):
     return vcf_files
 
 
-def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metadata_xlsx, tasks, resume):
+def create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta):
+    mapping_file = os.path.join(submission_dir, 'vcf_mapping_file.csv')
+    with open(mapping_file, 'w') as open_file:
+        writer = csv.writer(open_file, delimiter=',')
+        writer.writerow(['vcf', 'fasta', 'report'])
+        for vcf_file in vcf_files:
+            writer.writerow([os.path.abspath(vcf_file), os.path.abspath(assembly_fasta)])
+    return mapping_file
+
+
+def orchestrate_process(submission_dir, vcf_files_mapping, vcf_files, assembly_fasta, metadata_json, metadata_xlsx,
+                        tasks, resume, username=None, password=None, **kwargs):
     # load config
     config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE)
     sub_config = WritableConfig(config_file_path, version=__version__)
 
+    # Get the provided metadata
     metadata_file = metadata_json or metadata_xlsx
+
+    # Get the provided VCF and assembly
+    if vcf_files and assembly_fasta:
+        vcf_files_mapping = create_vcf_files_mapping(submission_dir, vcf_files, assembly_fasta)
     vcf_files = get_vcf_files(vcf_files_mapping)
 
     # Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
@@ -41,5 +58,6 @@ def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metada
             validator.create_reports()
             validator.update_config_with_validation_result()
     if SUBMIT in tasks:
-        with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config) as submitter:
+        with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config,
+                            username=username, password=password) as submitter:
             submitter.submit(resume=resume)
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
index ef15b3a..14c7664 100644
--- a/eva_sub_cli/nextflow/validation.nf
+++ b/eva_sub_cli/nextflow/validation.nf
@@ -11,7 +11,6 @@ def helpMessage() {
             --output_dir            output_directory where the reports will be output
             --metadata_json         Json file describing the project, analysis, samples and files
             --metadata_xlsx         Excel file describing the project, analysis, samples and files
-            --schema_dir            Directory containing the JSON schemas used for validation
     """
 }
 
@@ -31,15 +30,14 @@ params.executable = [
 ]
 // validation tasks
 params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check", "insdc_check"]
-// container validation dir (prefix for vcf files)
-params.container_validation_dir = "/opt/vcf_validation"
+// prefix to prepend to all provided path
+params.base_dir = ""
 // help
 params.help = null
 
 // Show help message
 if (params.help) exit 0, helpMessage()
 
-
 // Test input files
 if (!params.vcf_files_mapping || !params.output_dir || (!params.metadata_json && !params.metadata_xlsx)) {
     if (!params.vcf_files_mapping)      log.warn('Provide a csv file with the mappings (vcf, fasta, assembly report) --vcf_files_mapping')
@@ -49,18 +47,31 @@ if (!params.vcf_files_mapping || !params.output_dir || (!params.metadata_json &&
     exit 1, helpMessage()
 }
 
+schema_dir = file(projectDir).parent + '/etc'
+conversion_configuration = schema_dir + '/spreadsheet2json_conf.yaml'
+
+def joinBasePath(path) {
+    if (path){
+        return params.base_dir + '/' + path
+    }
+    return 'NO_FILE'
+}
+
+output_dir = joinBasePath(params.output_dir)
 
 workflow {
-    vcf_channel = Channel.fromPath(params.vcf_files_mapping)
+
+    // Prepare the file path
+    vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
         .splitCsv(header:true)
         .map{row -> tuple(
-            file(params.container_validation_dir+row.vcf),
-            file(params.container_validation_dir+row.fasta),
-            file(params.container_validation_dir+row.report)
+            file(joinBasePath(row.vcf)),
+            file(joinBasePath(row.fasta)),
+            file(joinBasePath(row.report))
         )}
-    vcf_files = Channel.fromPath(params.vcf_files_mapping)
+    vcf_files = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
         .splitCsv(header:true)
-        .map{row -> file(params.container_validation_dir+row.vcf)}
+        .map{row -> file(joinBasePath(row.vcf))}
 
     if ("vcf_check" in params.validation_tasks) {
         check_vcf_valid(vcf_channel)
@@ -69,10 +80,10 @@ workflow {
         check_vcf_reference(vcf_channel)
     }
     if (params.metadata_xlsx && !params.metadata_json){
-        convert_xlsx_2_json(params.metadata_xlsx, params.conversion_configuration)
+        convert_xlsx_2_json(joinBasePath(params.metadata_xlsx))
         metadata_json = convert_xlsx_2_json.out.metadata_json
     } else{
-        metadata_json = params.metadata_json
+        metadata_json = joinBasePath(params.metadata_json)
     }
     if ("metadata_check" in params.validation_tasks){
         metadata_json_validation(metadata_json)
@@ -81,9 +92,9 @@ workflow {
         sample_name_concordance(metadata_json, vcf_files.collect())
     }
     if ("insdc_check" in params.validation_tasks){
-        fasta_files = Channel.fromPath(params.vcf_files_mapping)
+        fasta_files = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
         .splitCsv(header:true)
-        .map{row -> file(params.container_validation_dir+row.fasta)}
+        .map{row -> file(joinBasePath(row.fasta))}
         .unique()
         insdc_checker(fasta_files)
     }
@@ -93,7 +104,7 @@ workflow {
 * Validate the VCF file format
 */
 process check_vcf_valid {
-    publishDir "$params.output_dir",
+    publishDir output_dir,
             overwrite: false,
             mode: "copy"
 
@@ -117,7 +128,7 @@ process check_vcf_valid {
 * Validate the VCF reference allele
 */
 process check_vcf_reference {
-    publishDir "$params.output_dir",
+    publishDir output_dir,
             overwrite: true,
             mode: "copy"
 
@@ -132,23 +143,25 @@ process check_vcf_reference {
     when:
     "assembly_check" in params.validation_tasks
 
+    script:
+    def report_opt = report.name != 'NO_FILE' ? "-a $report" : ''
+
     """
     trap 'if [[ \$? == 1 || \$? == 139 ]]; then exit 0; fi' EXIT
 
     mkdir -p assembly_check
-    $params.executable.vcf_assembly_checker -i $vcf -f $fasta -a $report -r summary,text,valid  -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
+    $params.executable.vcf_assembly_checker -i $vcf -f $fasta $report_opt -r summary,text,valid  -o assembly_check --require-genbank > assembly_check/${vcf}.assembly_check.log 2>&1
     """
 }
 
 
 process convert_xlsx_2_json {
-    publishDir "$params.output_dir",
+    publishDir output_dir,
             overwrite: true,
             mode: "copy"
 
     input:
     path(metadata_xlsx)
-    path(conversion_configuration)
 
     output:
     path "metadata.json", emit: metadata_json
@@ -162,7 +175,7 @@ process convert_xlsx_2_json {
 }
 
 process metadata_json_validation {
-    publishDir "$params.output_dir",
+    publishDir output_dir,
             overwrite: true,
             mode: "copy"
 
@@ -174,12 +187,12 @@ process metadata_json_validation {
 
     script:
     """
-    $params.executable.biovalidator --schema $params.schema_dir/eva_schema.json --ref $params.schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt
+    $params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt
     """
 }
 
 process sample_name_concordance {
-    publishDir "$params.output_dir",
+    publishDir output_dir,
             overwrite: true,
             mode: "copy"
 
@@ -199,7 +212,7 @@ process sample_name_concordance {
 
 
 process insdc_checker {
-    publishDir "$params.output_dir",
+    publishDir output_dir,
             overwrite: true,
             mode: "copy"
 
diff --git a/tests/ignore_test_nextflow_validation.yml b/tests/ignore_test_nextflow_validation.yml
new file mode 100644
index 0000000..f01116b
--- /dev/null
+++ b/tests/ignore_test_nextflow_validation.yml
@@ -0,0 +1,6 @@
+- name: My pipeline
+  command: nextflow run eva_sub_cli/nextflow/validation.nf --base_dir /Users/tcezard/PycharmProjects/eva-sub-cli --vcf_files_mapping tests/resources/vcf_files.csv --output_dir tests/resources/validation_output/ --metadata_xlsx tests/resources/EVA_Submission_test.xlsx
+  files:
+    - path: "tests/resources/validation_output/assembly_check/input_passed.vcf.assembly_check.log"
+    - path: "tests/resources/validation_output/assembly_check/input_passed.vcf.text_assembly_report.*.txt"
+    - path: "tests/resources/validation_output/assembly_check/input_passed.vcf.assembly_check.log"
diff --git a/tests/test_main.py b/tests/test_main.py
index 1e63dd2..69c8779 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -18,17 +18,25 @@ class TestMain(unittest.TestCase):
     test_sub_dir = os.path.join(resource_dir, 'test_sub_dir')
     config_file = os.path.join(test_sub_dir, SUB_CLI_CONFIG_FILE)
 
-    mapping_file = os.path.join(test_sub_dir, 'vcf_files_metadata.csv')
+    mapping_file = os.path.join(test_sub_dir, 'vcf_mapping_file.csv')
+    vcf_files = [os.path.join(test_sub_dir, 'vcf_file1.vcf'), os.path.join(test_sub_dir, 'vcf_file2.vcf')]
+    assembly_fasta = os.path.join(test_sub_dir, 'genome.fa')
     metadata_json = os.path.join(test_sub_dir, 'sub_metadata.json')
     metadata_xlsx = os.path.join(test_sub_dir, 'sub_metadata.xlsx')
 
+    def setUp(self) -> None:
+        os.makedirs(self.test_sub_dir)
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.test_sub_dir)
+
     def test_orchestrate_validate(self):
         with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf,  \
                 patch('eva_sub_cli.main.WritableConfig') as m_config, \
                 patch('eva_sub_cli.main.DockerValidator') as m_docker_validator:
             orchestrate_process(
-                self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[VALIDATE],
-                resume=False
+                self.test_sub_dir, self.mapping_file, None, None, self.metadata_json, self.metadata_xlsx,
+                tasks=[VALIDATE], resume=False
             )
             m_get_vcf.assert_called_once_with(self.mapping_file)
             m_docker_validator.assert_any_call(
@@ -40,7 +48,6 @@ def test_orchestrate_validate(self):
                 validator.create_reports.assert_called_once_with()
                 validator.update_config_with_validation_result.assert_called_once_with()
 
-
     def test_orchestrate_validate_submit(self):
         with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf, \
                 patch('eva_sub_cli.main.WritableConfig') as m_config, \
@@ -50,7 +57,7 @@ def test_orchestrate_validate_submit(self):
             m_config.return_value = {}
 
             orchestrate_process(
-                self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT],
+                self.test_sub_dir, self.mapping_file, None, None, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT],
                 resume=False
             )
             m_get_vcf.assert_called_once_with(self.mapping_file)
@@ -66,7 +73,7 @@ def test_orchestrate_validate_submit(self):
 
             # Submit was created
             m_submitter.assert_any_call(self.test_sub_dir, m_get_vcf.return_value, self.metadata_json,
-                                        submission_config=m_config.return_value)
+                                        submission_config=m_config.return_value, username=None, password=None)
             with m_submitter() as submitter:
                 submitter.submit.assert_called_once_with(resume=False)
 
@@ -79,7 +86,7 @@ def test_orchestrate_submit_no_validate(self):
             m_config.return_value = {READY_FOR_SUBMISSION_TO_EVA: True}
 
             orchestrate_process(
-                self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT],
+                self.test_sub_dir, self.mapping_file, None, None, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT],
                 resume=False
             )
             m_get_vcf.assert_called_once_with(self.mapping_file)
@@ -88,6 +95,24 @@ def test_orchestrate_submit_no_validate(self):
 
             # Submit was created
             m_submitter.assert_any_call(self.test_sub_dir, m_get_vcf.return_value, self.metadata_json,
-                                        submission_config=m_config.return_value)
+                                        submission_config=m_config.return_value, username=None, password=None)
             with m_submitter() as submitter:
                 submitter.submit.assert_called_once_with(resume=False)
+
+    def test_orchestrate_with_vcf_files(self):
+        with patch('eva_sub_cli.main.WritableConfig') as m_config, \
+                patch('eva_sub_cli.main.DockerValidator') as m_docker_validator:
+            orchestrate_process(
+                self.test_sub_dir, None, self.vcf_files, self.assembly_fasta, self.metadata_json, self.metadata_xlsx,
+                tasks=[VALIDATE], resume=False
+            )
+            # Mapping file was created from the vcf and assembly files
+            assert os.path.exists(self.mapping_file)
+            m_docker_validator.assert_any_call(
+                self.mapping_file, self.test_sub_dir, self.metadata_json, self.metadata_xlsx,
+                submission_config=m_config.return_value
+            )
+            with m_docker_validator() as validator:
+                validator.validate.assert_called_once_with()
+                validator.create_reports.assert_called_once_with()
+                validator.update_config_with_validation_result.assert_called_once_with()

From d5296c0062ab9aabb0df800569ca74619f4d1e0e Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Wed, 14 Feb 2024 14:12:26 +0000
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: nitin-ebi <79518737+nitin-ebi@users.noreply.github.com>
Co-authored-by: April Shen <april.tuesday@gmail.com>
---
 README.md          |  4 ++--
 bin/eva-sub-cli.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 81bcacb..3e7a21b 100644
--- a/README.md
+++ b/README.md
@@ -12,8 +12,8 @@ There are two ways of specifying the VCF files and associated assembly
 
 ### Using  `--vcf_files` and `--assembly_fasta`
 
-This allows you to provide multiple VCF file to validate and a single genome file associated. 
-The VCF file and genome associated must use the same chromosome naming convention 
+This allows you to provide multiple VCF files to validate and a single associated genome file.
+The VCF files and the associated genome file must use the same chromosome naming convention.
 
 ### Using  `--vcf_files_mapping`
 
diff --git a/bin/eva-sub-cli.py b/bin/eva-sub-cli.py
index bf8973b..54561f9 100755
--- a/bin/eva-sub-cli.py
+++ b/bin/eva-sub-cli.py
@@ -16,7 +16,7 @@ def validate_command_line_arguments(args, argparser):
         sys.exit(1)
 
     if (args.vcf_files and not args.assembly_fasta) or (not args.vcf_files and args.assembly_fasta):
-        print("When using --vcf_files and --assembly_fasta, both needs to be specified")
+        print("When using --vcf_files and --assembly_fasta, both need to be specified")
         argparser.print_usage()
         sys.exit(1)
 
@@ -35,16 +35,16 @@ def validate_command_line_arguments(args, argparser):
                                 'and submission info is/will be stored')
     vcf_group = argparser.add_argument_group(
         'Input VCF and assembly',
-        "Specify the VCF files and associated assembly with the following options. If you used different assembly "
-        "for different VCF files then use --vcf_file_mapping"
+        "Specify the VCF files and associated assembly with the following options. If you used different assemblies "
+        "for different VCF files then use --vcf_files_mapping"
     )
-    vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf file to validate")
+    vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
     vcf_group.add_argument('--assembly_fasta',
-                           help="The fasta file containing the reference genome from which the variant were derived")
+                           help="The fasta file containing the reference genome from which the variants were derived")
     vcf_group.add_argument("--vcf_files_mapping",
                            help="csv file with the mappings for vcf files, fasta and assembly report")
 
-    metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet of in a JSON file')
+    metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet or in a JSON file')
     metadata_group = metadata_group.add_mutually_exclusive_group(required=True)
     metadata_group.add_argument("--metadata_json",
                                help="Json file that describe the project, analysis, samples and files")