From 8f63c9e3f855bef65aaaa9c5b18cb2312256bb35 Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 8 Jan 2024 12:30:51 +0000 Subject: [PATCH 1/4] Nextflow: retrieve the correct file after combine --- eva_submission/nextflow/accession_and_load.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eva_submission/nextflow/accession_and_load.nf b/eva_submission/nextflow/accession_and_load.nf index f450794..dd252ce 100644 --- a/eva_submission/nextflow/accession_and_load.nf +++ b/eva_submission/nextflow/accession_and_load.nf @@ -122,7 +122,7 @@ workflow { .splitCsv(header:true) .map{row -> tuple(file(row.vcf_file).name, file(row.vcf_file), row.assembly_accession, row.aggregation, file(row.fasta), file(row.report))} .combine(normalise_vcf.out.vcf_tuples, by:0) // Join based on the vcf_filename - .map {tuple(it[0], it[7], it[2], it[3], it[4], it[5])} // vcf_filename, normalised vcf, assembly_accession, aggregation, fasta, report + .map {tuple(it[0], it[6], it[2], it[3], it[4], it[5])} // vcf_filename, normalised vcf, assembly_accession, aggregation, fasta, report accession_vcf(normalised_vcfs_ch) sort_and_compress_vcf(accession_vcf.out.accession_done) csi_vcfs = sort_and_compress_vcf.out.compressed_vcf From dfda98a8d8e396d9b0a0ca608549be0133fbccff Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 8 Jan 2024 12:31:47 +0000 Subject: [PATCH 2/4] Ensure the nextflow process is not run when no tasks are needed --- eva_submission/eload_ingestion.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/eva_submission/eload_ingestion.py b/eva_submission/eload_ingestion.py index 8417ced..29fcd03 100644 --- a/eva_submission/eload_ingestion.py +++ b/eva_submission/eload_ingestion.py @@ -642,11 +642,14 @@ def run_nextflow(self, workflow_name, params, resume, tasks=all_tasks): task for task in tasks if self.eload_cfg.query(self.config_section, workflow_name, 'nextflow_dir', task) == self.nextflow_complete_value ] - if completed_tasks: - self.info(f'Task(s) {", ".join(completed_tasks)} already completed, skipping.') + for task in completed_tasks: + self.info(f'Task {task} already completed, skipping.') # Remove completed tasks for task in completed_tasks: tasks.remove(task) + if not tasks: + self.info(f'No more to perform: Skip nextflow run.') + return # Retrieve the work directory for the remaining tasks work_dirs = [ self.eload_cfg.query(self.config_section, workflow_name, 'nextflow_dir', task) From 50c63026f3e3b1ed2a0c73821328958c07fbe7ee Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 8 Jan 2024 12:32:17 +0000 Subject: [PATCH 3/4] Add biosamples module to the deployed version --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6c32f01..daf18aa 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,8 @@ setup( name='eva_submission', - packages=['eva_submission', 'eva_submission.ENA_submission', 'eva_submission.xlsx', 'eva_submission.steps'], + packages=['eva_submission', 'eva_submission.ENA_submission', 'eva_submission.xlsx', 'eva_submission.steps', + 'eva_submission.biosample_submission'], package_data={'eva_submission': ['nextflow/*', 'etc/*', 'VERSION']}, version=version, license='Apache', From c4b43ea81bc52491c64e88e97c43944a23348ba9 Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 8 Jan 2024 13:29:34 +0000 Subject: [PATCH 4/4] Check the biosample's validity with the right function --- eva_submission/xlsx/xlsx_validation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eva_submission/xlsx/xlsx_validation.py b/eva_submission/xlsx/xlsx_validation.py index f7534b7..d19c746 100644 --- a/eva_submission/xlsx/xlsx_validation.py +++ b/eva_submission/xlsx/xlsx_validation.py @@ -140,7 +140,7 @@ def check_biosamples_accessions(self): sample_accession = row.get('Sample Accession').strip() try: sample_data = self.communicator.follows_link('samples', join_url=sample_accession) - self._validate_existing_bioSample(sample_data, row.get('row_num'), sample_accession) + self._validate_existing_biosample(sample_data, row.get('row_num'), sample_accession) except ValueError: self.error_list.append( f'In Sample, row {row.get("row_num")} BioSamples accession {sample_accession} ' @@ -208,7 +208,7 @@ def _check_date(self, date): str(date).lower() in not_provided_check_list def check_date(self, row, key, required=True): - if required and key not in row: + if required and not row.get(key): self.error_list.append(f'In row {row.get("row_num")}, {key} is required and missing') return if key in row and self._check_date(row[key]): @@ -226,13 +226,13 @@ def _check_date_str_format(self, d): def _validate_existing_biosample(self, sample_data, row_num, accession): """This function only check if the existing sample has the expected fields present""" found_collection_date=False - for key in ['collection_date', 'collection date']: + for key in ['collection_date', 'collection date']: if key in sample_data['characteristics'] and \ self._check_date(sample_data['characteristics'][key][0]['text']): found_collection_date = True if not found_collection_date: self.error_list.append( - f'In row {row_num}, samples accession {accession} does not have a valid collection date') + f'In row {row_num}, existing sample accession {accession} does not have a valid collection date') found_geo_loc = False for key in ['geographic location (country and/or sea)']: if key in sample_data['characteristics'] and \ @@ -240,4 +240,4 @@ def _validate_existing_biosample(self, sample_data, row_num, accession): found_geo_loc = True if not found_geo_loc: self.error_list.append( - f'In row {row_num}, samples accession {accession} does not have a valid geographic location') + f'In row {row_num}, existing sample accession {accession} does not have a valid geographic location')