EBIvariation · tcezard · Feb 4, 2024 · Jan 26, 2024 · Jan 29, 2024 · Jan 29, 2024
diff --git a/README.md b/README.md
@@ -1,2 +1,76 @@
 # eva-sub-cli
 EVA Submission Command Line Interface for Validation
+
+
+
+
+## Installation
+
+TBD
+
+## Input files for the validation and submission tool
+
+### The VCF file and association with reference genome
+
+The path to the VCF files are provided via CSV file that links the VCF to their respective fasta sequence. This allows 
+us to support different assemblies for each VCF file 
+The CSV file `vcf_mapping.csv` contains the following columns vcf, fasta, report providing respectively:
+ - The VCF to validate/upload
+ - The assembly in fasta format that was used to derive the VCF
+ - The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files)
+
+
+Example:
+```shell
+vcf,fasta,report
+/full/path/to/vcf_file1.vcf,/full/path/to/genome.fa,/full/path/to/genome_assembly_report.txt
+/full/path/to/vcf_file2.vcf,/full/path/to/genome.fa,/full/path/to/genome_assembly_report.txt
+/full/path/to/vcf_file3.vcf,/full/path/to/genome2.fa,/full/path/to/genome_assembly_report2.txt
+```
+
+### The metadata spreadsheet 
+
+The metadata template can be found within the etc folder at `eva_sub_cli/etc/EVA_Submission_template.xlsx`
+It should be populated following the instruction provided within the template
+
+### The metadata JSON
+
+The metadata can also be provided via a JSON file which should conform to the schema located  at 
+`eva_sub_cli/etc/eva_schema.json` 
+
+More detail documentation to follow 
+
+## Execution
+
+### Validate and submit your dataset
+
+To validate and submit run the following command
+
+```shell
+eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
+               --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir
+```
+
+### Validate only
+
+To validate and not submit run the following command
+
+```shell
+eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
+               --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir 
+               --tasks VALIDATE
+```
+### Submit only
+
+All submission must have been validated. You cannot run the submission without validation. Once validated running 
+
+```shell
+eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
+               --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir
+```
+or 
+```shell
+eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \
+               --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir --tasks SUBMIT
+```
+Will only submit the data and not validate.
diff --git a/bin/eva-sub-cli.py b/bin/eva-sub-cli.py
@@ -1,36 +1,19 @@
 #!/usr/bin/env python
-import csv
-import os
+
 from argparse import ArgumentParser
 
-from ebi_eva_common_pyutils.config import WritableConfig
 from ebi_eva_common_pyutils.logger import logging_config
 
-from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__
-from eva_sub_cli.docker_validator import DockerValidator, docker_path, container_image
-from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA
-from eva_sub_cli.submit import StudySubmitter
-
-VALIDATE = 'validate'
-SUBMIT = 'submit'
-RESUME_SUBMISSION = 'resume_submission'
-
-logging_config.add_stdout_handler()
-
-
-def get_vcf_files(mapping_file):
-    vcf_files = []
-    with open(mapping_file) as open_file:
-        reader = csv.DictReader(open_file, delimiter=',')
-        for row in reader:
-            vcf_files.append(row['vcf'])
-    return vcf_files
+from eva_sub_cli import  main
+from eva_sub_cli.main import VALIDATE, SUBMIT
 
 
 if __name__ == "__main__":
     argparser = ArgumentParser(description='EVA Submission CLI - validate and submit data to EVA')
-    argparser.add_argument('--task', required=True, choices=[VALIDATE, SUBMIT, RESUME_SUBMISSION],
-                           help='Select a task to perform')
+    argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT],
+                           help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the outcome of '
+                                'previous runs. Selecting SUBMIT will run validate only if the validation was not performed '
+                                'successfully before and then run the submission.')
     argparser.add_argument('--submission_dir', required=True, type=str,
                            help='Full path to the directory where all processing will be done '
                                 'and submission info is/will be stored')
@@ -41,38 +24,17 @@ def get_vcf_files(mapping_file):
                        help="Json file that describe the project, analysis, samples and files")
     group.add_argument("--metadata_xlsx",
                        help="Excel spreadsheet  that describe the project, analysis, samples and files")
-    group.add_argument("--username",
-                       help="Username used for connecting to the ENA webin account")
-    group.add_argument("--password",
-                       help="Password used for connecting to the ENA webin account")
+    argparser.add_argument("--username",
+                           help="Username used for connecting to the ENA webin account")
+    argparser.add_argument("--password",
+                           help="Password used for connecting to the ENA webin account")
+    argparser.add_argument("--resume", default=False, action='store_true',
+                           help="Resume the process execution from where it left of. This is currently only supported "
+                                "for the upload part of the SUBMIT task.")
 
     args = argparser.parse_args()
 
-    # load config
-    config_file_path = os.path.join(args.submission_dir, SUB_CLI_CONFIG_FILE)
-    sub_config = WritableConfig(config_file_path, version=__version__)
-
-    vcf_files = get_vcf_files(args.vcf_files_mapping)
-    metadata_file = args.metadata_json or args.metadata_xlsx
-
-    if args.task == RESUME_SUBMISSION:
-        # if validation is not passed, process task submit (validate and submit)
-        if READY_FOR_SUBMISSION_TO_EVA not in sub_config or not sub_config[READY_FOR_SUBMISSION_TO_EVA]:
-            args.task = SUBMIT
-        else:
-            # if validation is passed, upload files without validating again
-            with StudySubmitter(args.submission_dir, vcf_files, metadata_file, submission_config=sub_config,
-                                username=args.username, password=args.password) as submitter:
-                submitter.upload_submission()
-
-    if args.task == VALIDATE or args.task == SUBMIT:
-        with DockerValidator(args.vcf_files_mapping, args.submission_dir, args.metadata_json, args.metadata_xlsx,
-                             submission_config=sub_config) as validator:
-            validator.validate()
-            validator.create_reports()
-            validator.update_config_with_validation_result()
+    logging_config.add_stdout_handler()
 
-    if args.task == SUBMIT:
-        with StudySubmitter(args.submission_dir, vcf_files, metadata_file, submission_config=sub_config,
-                            username=args.username, password=args.password) as submitter:
-            submitter.submit()
+    main.orchestrate_process(args.submission_dir, args.vcf_files_mapping, args.metadata_json, args.metadata_xlsx,
+                             args.tasks, args.resume)
diff --git a/eva_sub_cli/auth.py b/eva_sub_cli/auth.py
@@ -102,5 +102,5 @@ def get_auth(username=None, password=None):
     global auth
     if auth:
         return auth
-    auth = WebinAuth(username, password)
+    auth = WebinAuth(username=username, password=password)
     return auth
diff --git a/eva_sub_cli/etc/EVA_Submission_template.xlsx b/eva_sub_cli/etc/EVA_Submission_template.xlsx
diff --git a/eva_sub_cli/main.py b/eva_sub_cli/main.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+import csv
+import os
+from ebi_eva_common_pyutils.config import WritableConfig
+from ebi_eva_common_pyutils.logger import logging_config
+
+from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__
+from eva_sub_cli.docker_validator import DockerValidator
+from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA
+from eva_sub_cli.submit import StudySubmitter
+
+VALIDATE = 'validate'
+SUBMIT = 'submit'
+
+def get_vcf_files(mapping_file):
+    vcf_files = []
+    with open(mapping_file) as open_file:
+        reader = csv.DictReader(open_file, delimiter=',')
+        for row in reader:
+            vcf_files.append(row['vcf'])
+    return vcf_files
+
+
+def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metadata_xlsx, tasks, resume):
+    # load config
+    config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE)
+    sub_config = WritableConfig(config_file_path, version=__version__)
+
+    metadata_file = metadata_json or metadata_xlsx
+    vcf_files = get_vcf_files(vcf_files_mapping)
+
+    # Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
+    if SUBMIT in tasks and not sub_config.get(READY_FOR_SUBMISSION_TO_EVA, False):
+        if VALIDATE not in tasks:
+            tasks.append(VALIDATE)
+
+    if VALIDATE in tasks:
+        with DockerValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx,
+                             submission_config=sub_config) as validator:
+            validator.validate()
+            validator.create_reports()
+            validator.update_config_with_validation_result()
+    if SUBMIT in tasks:
+        with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config) as submitter:
+            submitter.submit(resume=resume)
diff --git a/eva_sub_cli/submit.py b/eva_sub_cli/submit.py
@@ -41,20 +41,19 @@ def update_config_with_submission_id_and_upload_url(self, submission_id, upload_
         self.sub_config.set(SUB_CLI_CONFIG_KEY_SUBMISSION_ID, value=submission_id)
         self.sub_config.set(SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL, value=upload_url)
 
-    def upload_submission(self, submission_upload_url=None):
+    def _upload_submission(self):
         if READY_FOR_SUBMISSION_TO_EVA not in self.sub_config or not self.sub_config[READY_FOR_SUBMISSION_TO_EVA]:
             raise Exception(f'There are still validation errors that needs to be addressed. '
                             f'Please review, address and re-validate before uploading.')
 
-        if not submission_upload_url:
-            submission_upload_url = self.sub_config[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL]
+        submission_upload_url = self.sub_config[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL]
 
         for f in self.vcf_files:
-            self.upload_file(submission_upload_url, f)
-        self.upload_file(submission_upload_url, self.metadata_file)
+            self._upload_file(submission_upload_url, f)
+        self._upload_file(submission_upload_url, self.metadata_file)
 
     @retry(tries=5, delay=10, backoff=5)
-    def upload_file(self, submission_upload_url, input_file):
+    def _upload_file(self, submission_upload_url, input_file):
         base_name = os.path.basename(input_file)
         self.info(f'Transfer {base_name} to EVA FTP')
         r = requests.put(urljoin(submission_upload_url, base_name), data=open(input_file, 'rb'))
@@ -67,21 +66,20 @@ def verify_submission_dir(self, submission_dir):
         if not os.access(submission_dir, os.W_OK):
             raise Exception(f"The directory '{submission_dir}' does not have write permissions.")
 
-    def submit(self):
+    def submit(self, resume=False):
         if READY_FOR_SUBMISSION_TO_EVA not in self.sub_config or not self.sub_config[READY_FOR_SUBMISSION_TO_EVA]:
             raise Exception(f'There are still validation errors that need to be addressed. '
                             f'Please review, address and re-validate before submitting.')
-
-        self.verify_submission_dir(self.submission_dir)
-        response = requests.post(self.submission_initiate_url,
-                                 headers={'Accept': 'application/hal+json',
-                                          'Authorization': 'Bearer ' + self.auth.token})
-        response.raise_for_status()
-        response_json = response.json()
-        self.info("Submission ID {} received!!".format(response_json["submissionId"]))
-
-        # update config with submission id and upload url
-        self.update_config_with_submission_id_and_upload_url(response_json["submissionId"], response_json["uploadUrl"])
+        if not (resume or self.sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL)):
+            self.verify_submission_dir(self.submission_dir)
+            response = requests.post(self.submission_initiate_url,
+                                     headers={'Accept': 'application/hal+json',
+                                              'Authorization': 'Bearer ' + self.auth.token})
+            response.raise_for_status()
+            response_json = response.json()
+            self.info("Submission ID {} received!!".format(response_json["submissionId"]))
+            # update config with submission id and upload url
+            self.update_config_with_submission_id_and_upload_url(response_json["submissionId"], response_json["uploadUrl"])
 
         # upload submission
-        self.upload_submission(response_json["uploadUrl"])
+        self._upload_submission()
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 pyyaml
 jinja2
-minify_html
+minify_html==0.11.1
 openpyxl
 requests
 jsonschema

diff --git a/...urces/EVA_Submission_template.V1.1.4.xlsx → tests/resources/EVA_Submission_test.xlsx b/...urces/EVA_Submission_template.V1.1.4.xlsx → tests/resources/EVA_Submission_test.xlsx
diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
@@ -62,7 +62,7 @@ def setUp(self):
             container_name='eva-sub-cli-test'
         )
         shutil.copyfile(
-            os.path.join(self.resources_folder, 'EVA_Submission_template.V1.1.4.xlsx'),
+            os.path.join(self.resources_folder, 'EVA_Submission_test.xlsx'),
             self.metadata_xlsx
         )
 

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -0,0 +1,93 @@
+import json
+import os
+import shutil
+import unittest
+from unittest.mock import MagicMock, patch, Mock
+
+import yaml
+from ebi_eva_common_pyutils.config import WritableConfig
+
+from eva_sub_cli import SUB_CLI_CONFIG_FILE
+from eva_sub_cli.main import orchestrate_process, VALIDATE, SUBMIT
+from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA
+from eva_sub_cli.submit import StudySubmitter, SUB_CLI_CONFIG_KEY_SUBMISSION_ID, SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL
+
+
+class TestMain(unittest.TestCase):
+    resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
+    test_sub_dir = os.path.join(resource_dir, 'test_sub_dir')
+    config_file = os.path.join(test_sub_dir, SUB_CLI_CONFIG_FILE)
+
+    mapping_file = os.path.join(test_sub_dir, 'vcf_files_metadata.csv')
+    metadata_json = os.path.join(test_sub_dir, 'sub_metadata.json')
+    metadata_xlsx = os.path.join(test_sub_dir, 'sub_metadata.xlsx')
+
+    def test_orchestrate_validate(self):
+        with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf,  \
+                patch('eva_sub_cli.main.WritableConfig') as m_config, \
+                patch('eva_sub_cli.main.DockerValidator') as m_docker_validator:
+            orchestrate_process(
+                self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[VALIDATE],
+                resume=False
+            )
+            m_get_vcf.assert_called_once_with(self.mapping_file)
+            m_docker_validator.assert_any_call(
+                self.mapping_file, self.test_sub_dir, self.metadata_json, self.metadata_xlsx,
+                submission_config=m_config.return_value
+            )
+            with m_docker_validator() as validator:
+                validator.validate.assert_called_once_with()
+                validator.create_reports.assert_called_once_with()
+                validator.update_config_with_validation_result.assert_called_once_with()
+
+
+    def test_orchestrate_validate_submit(self):
+        with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf, \
+                patch('eva_sub_cli.main.WritableConfig') as m_config, \
+                patch('eva_sub_cli.main.DockerValidator') as m_docker_validator, \
+                patch('eva_sub_cli.main.StudySubmitter') as m_submitter:
+            # Empty config
+            m_config.return_value = {}
+
+            orchestrate_process(
+                self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT],
+                resume=False
+            )
+            m_get_vcf.assert_called_once_with(self.mapping_file)
+            # Validate was run because the config show it was not run successfully before
+            m_docker_validator.assert_any_call(
+                self.mapping_file, self.test_sub_dir, self.metadata_json, self.metadata_xlsx,
+                submission_config=m_config.return_value
+            )
+            with m_docker_validator() as validator:
+                validator.validate.assert_called_once_with()
+                validator.create_reports.assert_called_once_with()
+                validator.update_config_with_validation_result.assert_called_once_with()
+
+            # Submit was created
+            m_submitter.assert_any_call(self.test_sub_dir, m_get_vcf.return_value, self.metadata_json,
+                                        submission_config=m_config.return_value)
+            with m_submitter() as submitter:
+                submitter.submit.assert_called_once_with(resume=False)
+
+    def test_orchestrate_submit_no_validate(self):
+        with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf, \
+                patch('eva_sub_cli.main.WritableConfig') as m_config, \
+                patch('eva_sub_cli.main.DockerValidator') as m_docker_validator, \
+                patch('eva_sub_cli.main.StudySubmitter') as m_submitter:
+            # Empty config
+            m_config.return_value = {READY_FOR_SUBMISSION_TO_EVA: True}
+
+            orchestrate_process(
+                self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT],
+                resume=False
+            )
+            m_get_vcf.assert_called_once_with(self.mapping_file)
+            # Validate was not run because the config showed it was run successfully before
+            assert m_docker_validator.call_count == 0
+
+            # Submit was created
+            m_submitter.assert_any_call(self.test_sub_dir, m_get_vcf.return_value, self.metadata_json,
+                                        submission_config=m_config.return_value)
+            with m_submitter() as submitter:
+                submitter.submit.assert_called_once_with(resume=False)