From 8f63c9e3f855bef65aaaa9c5b18cb2312256bb35 Mon Sep 17 00:00:00 2001
From: tcezard <tcezard@ebi.ac.uk>
Date: Mon, 8 Jan 2024 12:30:51 +0000
Subject: [PATCH 1/4] Nextflow: retrieve the correct file after combine

---
 eva_submission/nextflow/accession_and_load.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eva_submission/nextflow/accession_and_load.nf b/eva_submission/nextflow/accession_and_load.nf
index f450794..dd252ce 100644
--- a/eva_submission/nextflow/accession_and_load.nf
+++ b/eva_submission/nextflow/accession_and_load.nf
@@ -122,7 +122,7 @@ workflow {
                 .splitCsv(header:true)
                 .map{row -> tuple(file(row.vcf_file).name, file(row.vcf_file), row.assembly_accession, row.aggregation, file(row.fasta), file(row.report))}
                 .combine(normalise_vcf.out.vcf_tuples, by:0)     // Join based on the vcf_filename
-                .map {tuple(it[0], it[7], it[2], it[3], it[4], it[5])}   // vcf_filename, normalised vcf, assembly_accession, aggregation, fasta, report
+                .map {tuple(it[0], it[6], it[2], it[3], it[4], it[5])}   // vcf_filename, normalised vcf, assembly_accession, aggregation, fasta, report
             accession_vcf(normalised_vcfs_ch)
             sort_and_compress_vcf(accession_vcf.out.accession_done)
             csi_vcfs = sort_and_compress_vcf.out.compressed_vcf

From dfda98a8d8e396d9b0a0ca608549be0133fbccff Mon Sep 17 00:00:00 2001
From: tcezard <tcezard@ebi.ac.uk>
Date: Mon, 8 Jan 2024 12:31:47 +0000
Subject: [PATCH 2/4] Ensure the nextflow process is not run when no tasks are
 needed

---
 eva_submission/eload_ingestion.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/eva_submission/eload_ingestion.py b/eva_submission/eload_ingestion.py
index 8417ced..29fcd03 100644
--- a/eva_submission/eload_ingestion.py
+++ b/eva_submission/eload_ingestion.py
@@ -642,11 +642,14 @@ def run_nextflow(self, workflow_name, params, resume, tasks=all_tasks):
                 task for task in tasks
                 if self.eload_cfg.query(self.config_section, workflow_name, 'nextflow_dir', task) == self.nextflow_complete_value
             ]
-            if completed_tasks:
-                self.info(f'Task(s) {", ".join(completed_tasks)} already completed, skipping.')
+            for task in completed_tasks:
+                self.info(f'Task {task} already completed, skipping.')
             # Remove completed tasks
             for task in completed_tasks:
                 tasks.remove(task)
+            if not tasks:
+                self.info(f'No more tasks to perform: skipping nextflow run.')
+                return
             # Retrieve the work directory for the remaining tasks
             work_dirs = [
                 self.eload_cfg.query(self.config_section, workflow_name, 'nextflow_dir', task)

From 50c63026f3e3b1ed2a0c73821328958c07fbe7ee Mon Sep 17 00:00:00 2001
From: tcezard <tcezard@ebi.ac.uk>
Date: Mon, 8 Jan 2024 12:32:17 +0000
Subject: [PATCH 3/4] Add biosamples module to the deployed version

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6c32f01..daf18aa 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,8 @@
 
 setup(
     name='eva_submission',
-    packages=['eva_submission', 'eva_submission.ENA_submission', 'eva_submission.xlsx', 'eva_submission.steps'],
+    packages=['eva_submission', 'eva_submission.ENA_submission', 'eva_submission.xlsx', 'eva_submission.steps',
+              'eva_submission.biosample_submission'],
     package_data={'eva_submission': ['nextflow/*', 'etc/*', 'VERSION']},
     version=version,
     license='Apache',

From c4b43ea81bc52491c64e88e97c43944a23348ba9 Mon Sep 17 00:00:00 2001
From: tcezard <tcezard@ebi.ac.uk>
Date: Mon, 8 Jan 2024 13:29:34 +0000
Subject: [PATCH 4/4] Check the biosample's validity with the right function

---
 eva_submission/xlsx/xlsx_validation.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/eva_submission/xlsx/xlsx_validation.py b/eva_submission/xlsx/xlsx_validation.py
index f7534b7..d19c746 100644
--- a/eva_submission/xlsx/xlsx_validation.py
+++ b/eva_submission/xlsx/xlsx_validation.py
@@ -140,7 +140,7 @@ def check_biosamples_accessions(self):
                 sample_accession = row.get('Sample Accession').strip()
                 try:
                     sample_data = self.communicator.follows_link('samples', join_url=sample_accession)
-                    self._validate_existing_bioSample(sample_data, row.get('row_num'), sample_accession)
+                    self._validate_existing_biosample(sample_data, row.get('row_num'), sample_accession)
                 except ValueError:
                     self.error_list.append(
                         f'In Sample, row {row.get("row_num")} BioSamples accession {sample_accession} '
@@ -208,7 +208,7 @@ def _check_date(self, date):
                str(date).lower() in not_provided_check_list
 
     def check_date(self, row, key, required=True):
-        if required and key not in row:
+        if required and not row.get(key):
             self.error_list.append(f'In row {row.get("row_num")}, {key} is required and missing')
             return
         if key in row and self._check_date(row[key]):
@@ -226,13 +226,13 @@ def _check_date_str_format(self, d):
     def _validate_existing_biosample(self, sample_data, row_num, accession):
         """This function only check if the existing sample has the expected fields present"""
         found_collection_date=False
-        for key in  ['collection_date', 'collection date']:
+        for key in ['collection_date', 'collection date']:
             if key in sample_data['characteristics'] and \
                     self._check_date(sample_data['characteristics'][key][0]['text']):
                 found_collection_date = True
         if not found_collection_date:
             self.error_list.append(
-                f'In row {row_num}, samples accession {accession} does not have a valid collection date')
+                f'In row {row_num}, existing sample accession {accession} does not have a valid collection date')
         found_geo_loc = False
         for key in ['geographic location (country and/or sea)']:
             if key in sample_data['characteristics'] and \
@@ -240,4 +240,4 @@ def _validate_existing_biosample(self, sample_data, row_num, accession):
                 found_geo_loc = True
         if not found_geo_loc:
             self.error_list.append(
-                f'In row {row_num}, samples accession {accession} does not have a valid geographic location')
+                f'In row {row_num}, existing sample accession {accession} does not have a valid geographic location')