EVA-3635 - Fixes from Testing #53

Merged: 8 commits, Aug 30, 2024
Changes from 6 commits
12 changes: 9 additions & 3 deletions eva_sub_cli/executables/cli.py
@@ -40,7 +40,7 @@ def main():
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
help='Full path to the directory where all processing will be done '
help='Path to the directory where all processing will be done '
'and submission info is/will be stored')
vcf_group = argparser.add_argument_group(
'Input VCF and assembly',
@@ -57,7 +57,7 @@ def main():
help="Json file that describe the project, analysis, samples and files")
metadata_group.add_argument("--metadata_xlsx",
help="Excel spreadsheet that describe the project, analysis, samples and files")
argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
argparser.add_argument('--tasks', nargs='+', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the'
' outcome of previous runs. Selecting SUBMIT will run validate only if the validation'
' was not performed successfully before and then run the submission.')
@@ -67,12 +67,18 @@ def main():
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')

args = argparser.parse_args()

validate_command_line_arguments(args, argparser)

logging_config.add_stdout_handler()
args.submission_dir = os.path.abspath(args.submission_dir)

if args.debug:
logging_config.add_stdout_handler(logging.DEBUG)
else:
logging_config.add_stdout_handler()
logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)

try:
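As context for the nargs change above, a minimal self-contained argparse sketch (not the actual CLI module) shows the practical difference: with nargs='+' a bare --tasks with no value is rejected, whereas the previous nargs='*' would silently accept it and yield an empty task list.

from argparse import ArgumentParser

parser = ArgumentParser(prog='demo')
# nargs='+' requires at least one value after --tasks; nargs='*' would allow zero.
parser.add_argument('--tasks', nargs='+', choices=['validate', 'submit'],
                    default=['submit'], type=str.lower)

print(parser.parse_args([]).tasks)                       # ['submit'] (default applies)
print(parser.parse_args(['--tasks', 'VALIDATE']).tasks)  # ['validate'] (lower-cased before the choices check)
# parser.parse_args(['--tasks'])  # would exit with "expected at least one argument"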
2 changes: 1 addition & 1 deletion eva_sub_cli/executables/xlsx2json.py
@@ -372,6 +372,6 @@ def main():
try:
parser.json(args.metadata_json)
except Exception as e:
parser.add_error(e)
parser.add_error(f'An Error was raised while converting the spreadsheet to JSON: {repr(e)}')
finally:
parser.save_errors(args.errors_yaml)
3 changes: 1 addition & 2 deletions eva_sub_cli/metadata_utils.py
@@ -22,8 +22,7 @@ def get_files_per_analysis(metadata):
"""Returns mapping of analysis alias to filenames, based on metadata."""
files_per_analysis = defaultdict(list)
for file_info in metadata.get('files', []):
if file_info.get('fileType') == 'vcf':
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
return {
analysis_alias: set(filepaths)
for analysis_alias, filepaths in files_per_analysis.items()
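A quick illustration of the behavioural change above, as a standalone sketch that reuses the function body from the diff (the sample metadata dict is made up): with the fileType filter removed, every file listed in the metadata is grouped under its analysis alias, not only the VCFs.

from collections import defaultdict

def get_files_per_analysis(metadata):
    """Return a mapping of analysis alias to the set of all file names (no fileType filter)."""
    files_per_analysis = defaultdict(list)
    for file_info in metadata.get('files', []):
        files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
    return {alias: set(names) for alias, names in files_per_analysis.items()}

metadata = {'files': [
    {'analysisAlias': 'A1', 'fileName': 'variants.vcf.gz', 'fileType': 'vcf'},
    {'analysisAlias': 'A1', 'fileName': 'variants.vcf.gz.csi', 'fileType': 'csi'},
]}
print(get_files_per_analysis(metadata))  # {'A1': {'variants.vcf.gz', 'variants.vcf.gz.csi'}}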
14 changes: 9 additions & 5 deletions eva_sub_cli/nextflow/validation.nf
@@ -195,12 +195,13 @@ process convert_xlsx_2_json {
output:
path "metadata.json", emit: metadata_json, optional: true
path "metadata_conversion_errors.yml", emit: errors_yaml
path "xlsx2json.log", emit: xlsx2json_log

script:
metadata_json = metadata_xlsx.getBaseName() + '.json'

"""
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration > xlsx2json.log 2>&1
"""
}

@@ -217,7 +218,7 @@ process metadata_json_validation {

script:
"""
$params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt
$params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt 2>&1
"""
}

@@ -232,10 +233,11 @@ process sample_name_concordance {

output:
path "sample_checker.yml", emit: sample_checker_yml
path "sample_checker.log", emit: sample_checker_log

script:
"""
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log
"""
}

@@ -251,10 +253,11 @@ process insdc_checker {

output:
path "${fasta_file}_check.yml", emit: fasta_checker_yml
path "fasta_checker.log", emit: fasta_checker_log

script:
"""
$params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
$params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml > fasta_checker.log 2>&1
"""
}

@@ -269,9 +272,10 @@ process metadata_semantic_check {

output:
path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml
path "semantic_checker.log", emit: semantic_checker_log

script:
"""
$params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml
$params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log
"""
}
28 changes: 20 additions & 8 deletions eva_sub_cli/orchestrator.py
@@ -54,7 +54,7 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re
project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, True)

for mapping in vcf_files_mapping:
writer.writerow(mapping);
writer.writerow(mapping)

return project_title, mapping_file

@@ -80,6 +80,7 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_

return project_title, vcf_fasta_report_mapping


def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_req=False):
workbook = load_workbook(metadata_xlsx)

@@ -108,23 +109,27 @@ def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_
files_headers[cell.value] = cell.column - 1

for row in files_sheet.iter_rows(min_row=2, values_only=True):
file_name = row[files_headers['File Name']]
file_name = os.path.abspath(row[files_headers['File Name']])
analysis_alias = row[files_headers['Analysis Alias']]
reference_fasta = analysis_alias_dict[analysis_alias]
reference_fasta = os.path.abspath(analysis_alias_dict[analysis_alias])
if not (file_name and os.path.isfile(file_name)):
raise FileNotFoundError(f'The variant file {file_name} provided in spreadsheet {metadata_xlsx} does not exist')
if not (reference_fasta and os.path.isfile(reference_fasta)):
raise FileNotFoundError(f'The reference fasta {reference_fasta} in spreadsheet {metadata_xlsx} does not exist')
vcf_fasta_report_mapping.append([os.path.abspath(file_name), os.path.abspath(reference_fasta), ''])

return project_title, vcf_fasta_report_mapping


def check_validation_required(tasks, sub_config):
def check_validation_required(tasks, sub_config, username=None, password=None):
# Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
if SUBMIT in tasks:
if not sub_config.get(READY_FOR_SUBMISSION_TO_EVA, False):
return True
submission_id = sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_ID, None)
if submission_id:
try:
submission_status = SubmissionWSClient().get_submission_status(submission_id)
submission_status = SubmissionWSClient(username, password).get_submission_status(submission_id)
Member Author: Passing the username and password is required because the auth is a global variable; subsequent calls will not have the username/password if the auth is initially created without them.

Contributor: I can't remember, did you check whether it worked without the global auth?

Member Author: I did try briefly and it didn't work the first time, so I didn't push it since it wasn't part of this ticket.

(A minimal sketch of this global-auth behaviour appears after this file's diff.)

if submission_status == 'FAILED':
return True
else:
Expand All @@ -150,13 +155,20 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso

metadata_file = metadata_json or metadata_xlsx
if not os.path.exists(os.path.abspath(metadata_file)):
raise FileNotFoundError(f'The provided metadata file {metadata_file} does not exist')
raise FileNotFoundError(f'The provided metadata file {os.path.abspath(metadata_file)} does not exist')

if metadata_json:
metadata_json = os.path.abspath(metadata_json)
if metadata_xlsx:
metadata_xlsx = os.path.abspath(metadata_xlsx)

# Get the provided Project Title and VCF files mapping (VCF, Fasta and Report)
project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx)
project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(
submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx
)
vcf_files = get_vcf_files(vcf_files_mapping)

if VALIDATE not in tasks and check_validation_required(tasks, sub_config):
if VALIDATE not in tasks and check_validation_required(tasks, sub_config, username, password):
tasks.append(VALIDATE)

if VALIDATE in tasks:
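A minimal, hypothetical sketch of the global-auth behaviour discussed in the review thread above (illustrative names only, not the real eva-sub-cli auth module): because the shared auth object is created once and then reused, credentials supplied on later calls are ignored, which is why check_validation_required now threads username and password through to SubmissionWSClient.

# Hypothetical module-level auth singleton mirroring the pattern described in the thread.
_auth = None

class _WebinAuth:
    def __init__(self, username=None, password=None):
        self.username = username
        self.password = password

def get_auth(username=None, password=None):
    """Create the shared auth object on first use; later calls reuse it unchanged."""
    global _auth
    if _auth is None:
        _auth = _WebinAuth(username, password)
    return _auth

first = get_auth()                   # first caller passes no credentials
second = get_auth('user', 'secret')  # same instance returned, credentials ignored
assert first is second and second.username is None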
8 changes: 4 additions & 4 deletions eva_sub_cli/submission_ws.py
@@ -43,19 +43,19 @@ def _submission_status_url(self, submission_id):

def mark_submission_uploaded(self, submission_id, metadata_json):
response = requests.put(self._submission_uploaded_url(submission_id),
headers={'Accept': 'application/hal+json', 'Authorization': 'Bearer ' + self.auth.token},
data=metadata_json)
headers={'Accept': 'application/json', 'Authorization': 'Bearer ' + self.auth.token},
json=metadata_json)
response.raise_for_status()
return response.json()

def initiate_submission(self):
response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/hal+json',
response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/json',
'Authorization': 'Bearer ' + self.auth.token})
response.raise_for_status()
return response.json()

@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_submission_status(self, submission_id):
response = requests.get(self.get_submission_status_url(submission_id))
response = requests.get(self._submission_status_url(submission_id))
response.raise_for_status()
return response.text
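A small sketch of the requests behaviour behind the data= to json= change above (placeholder URL and payload): json= serialises the object to a JSON body and sets Content-Type: application/json, whereas data= given a dict would form-encode it, which a JSON endpoint would not expect.

import requests

payload = {'example': 'value'}  # placeholder payload, not the real metadata JSON

# json= serialises the dict and sets the JSON content type.
with_json = requests.Request('PUT', 'https://example.org/upload', json=payload).prepare()
# data= with a dict is form-encoded instead.
with_data = requests.Request('PUT', 'https://example.org/upload', data=payload).prepare()

print(with_json.headers['Content-Type'])  # application/json
print(with_data.headers['Content-Type'])  # application/x-www-form-urlencoded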
3 changes: 2 additions & 1 deletion eva_sub_cli/submit.py
@@ -64,7 +64,8 @@ def _upload_submission(self):
def _upload_file(self, submission_upload_url, input_file):
base_name = os.path.basename(input_file)
self.debug(f'Transfer {base_name} to EVA FTP')
r = requests.put(os.path.join(submission_upload_url, base_name), data=open(input_file, 'rb'))
with open(input_file, 'rb') as f:
r = requests.put(os.path.join(submission_upload_url, base_name), data=f)
r.raise_for_status()
self.debug(f'Upload of {base_name} completed')

18 changes: 9 additions & 9 deletions eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev14'
container_tag = 'v0.0.1.dev15'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

@@ -100,10 +100,10 @@ def verify_container_is_running(self):
try:
container_run_cmd_output = self._run_quiet_command("check if container is running", f"{self.docker_path} ps", return_process_output=True)
if container_run_cmd_output is not None and self.container_name in container_run_cmd_output:
logger.info(f"Container ({self.container_name}) is running")
logger.debug(f"Container ({self.container_name}) is running")
return True
else:
logger.info(f"Container ({self.container_name}) is not running")
logger.debug(f"Container ({self.container_name}) is not running")
return False
except subprocess.CalledProcessError as ex:
logger.error(ex)
@@ -114,10 +114,10 @@ def verify_container_is_stopped(self):
"check if container is stopped", f"{self.docker_path} ps -a", return_process_output=True
)
if container_stop_cmd_output is not None and self.container_name in container_stop_cmd_output:
logger.info(f"Container ({self.container_name}) is in stop state")
logger.debug(f"Container ({self.container_name}) is in stop state")
return True
else:
logger.info(f"Container ({self.container_name}) is not in stop state")
logger.debug(f"Container ({self.container_name}) is not in stop state")
return False

def try_restarting_container(self):
@@ -137,14 +137,14 @@ def verify_image_available_locally(self):
return_process_output=True
)
if container_images_cmd_ouptut is not None and re.search(container_image + r'\s+' + container_tag, container_images_cmd_ouptut):
logger.info(f"Container ({container_image}) image is available locally")
logger.debug(f"Container ({container_image}) image is available locally")
return True
else:
logger.info(f"Container ({container_image}) image is not available locally")
logger.debug(f"Container ({container_image}) image is not available locally")
return False

def run_container(self):
logger.info(f"Trying to run container {self.container_name}")
logger.debug(f"Trying to run container {self.container_name}")
try:
self._run_quiet_command(
"Try running container",
@@ -166,7 +166,7 @@ def stop_running_container(self):
)

def download_container_image(self):
logger.info(f"Pulling container ({container_image}) image")
logger.debug(f"Pulling container ({container_image}) image")
try:
self._run_quiet_command("pull container image", f"{self.docker_path} pull {container_image}:{container_tag}")
except subprocess.CalledProcessError as ex:
4 changes: 4 additions & 0 deletions eva_sub_cli/validators/native_validator.py
@@ -24,11 +24,15 @@ def _validate(self):

def run_validator(self):
self.verify_executables_installed()
curr_wd = os.getcwd()
try:
command = self.get_validation_cmd()
os.chdir(self.submission_dir)
self._run_quiet_command("Run Validation using Nextflow", command)
except subprocess.CalledProcessError as ex:
logger.error(ex)
finally:
os.chdir(curr_wd)

def get_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
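The save-and-restore working-directory pattern added above can also be packaged as a small context manager; a generic sketch follows (Python 3.11 ships an equivalent contextlib.chdir, and the call site shown is hypothetical).

import os
from contextlib import contextmanager

@contextmanager
def working_directory(path):
    """Temporarily change the working directory, restoring the original on exit."""
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)

# Usage sketch (hypothetical call site):
# with working_directory(submission_dir):
#     run_nextflow_validation()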
17 changes: 11 additions & 6 deletions eva_sub_cli/validators/validator.py
@@ -545,6 +545,10 @@ def _collect_file_info_to_metadata(self):
file_name_2_md5[os.path.basename(vcf_file)] = md5sum
file_path_2_file_size[vcf_file] = file_size
file_name_2_file_size[os.path.basename(vcf_file)] = file_size
else:
self.error(
f"Cannot locate file_info.txt at {os.path.join(self.output_dir, 'other_validations', 'file_info.txt')}"
)
if self.metadata_json_post_validation:
with open(self.metadata_json_post_validation) as open_file:
try:
@@ -553,12 +557,11 @@
files_from_metadata = json_data.get('files', [])
if files_from_metadata:
for file_dict in json_data.get('files', []):
if file_dict.get('fileType') == 'vcf':
file_path = self._validation_file_path_for(file_dict.get('fileName'))
file_dict['md5'] = file_path_2_md5.get(file_path) or \
file_name_2_md5.get(file_dict.get('fileName')) or ''
file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
file_name_2_file_size.get(file_dict.get('fileName')) or ''
file_path = self._validation_file_path_for(file_dict.get('fileName'))
file_dict['md5'] = file_path_2_md5.get(file_path) or \
file_name_2_md5.get(file_dict.get('fileName')) or ''
file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
file_name_2_file_size.get(file_dict.get('fileName')) or ''
file_rows.append(file_dict)
else:
self.error('No file found in metadata and multiple analysis alias exist: '
@@ -570,6 +573,8 @@
if json_data:
with open(self.metadata_json_post_validation, 'w') as open_file:
json.dump(json_data, open_file)
else:
self.error(f'Cannot locate the metadata in JSON format in {os.path.join(self.output_dir, "metadata.json")}')

def get_vcf_fasta_analysis_mapping(self):
vcf_fasta_analysis_mapping = []