diff --git a/eva_submission/ENA_submission/upload_to_ENA.py b/eva_submission/ENA_submission/upload_to_ENA.py index b81fe54..7f35b85 100644 --- a/eva_submission/ENA_submission/upload_to_ENA.py +++ b/eva_submission/ENA_submission/upload_to_ENA.py @@ -129,8 +129,8 @@ def upload_xml_files_to_ena(self, dry_ena_upload=False): response = self._post_xml_file_to_ena(cfg.query('ena', 'submit_async'), file_dict) if response.status_code == 200: json_data = response.json() - if 'links' in json_data: - xml_link = [link_dict['href'] for link_dict in json_data['links'] if link_dict['rel'] == 'poll-xml'][0] + if '_links' in json_data: + xml_link = [link_dict['href'] for link_dict in json_data['_links'] if link_dict['rel'] == 'poll-xml'][0] self.results['submissionId'] = json_data['submissionId'] self.results['poll-links'] = xml_link self.monitor_results() diff --git a/eva_submission/eload_submission.py b/eva_submission/eload_submission.py index c58b6cd..141d4f3 100755 --- a/eva_submission/eload_submission.py +++ b/eva_submission/eload_submission.py @@ -122,6 +122,7 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None 'Sample Accession': self.eload_cfg['brokering']['Biosamples']['Samples'][sample_row.get('Sample Name')] }) else: + sample_row['Analysis Alias'] = self._unique_alias(sample_row['Analysis Alias']) sample_rows.append(sample_row) file_rows = [] @@ -131,7 +132,7 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None vcf_file_info = self.eload_cfg['brokering']['analyses'][analysis]['vcf_files'][vcf_file_name] # Add the vcf file file_rows.append({ - 'Analysis Alias': analysis, + 'Analysis Alias': self._unique_alias(analysis), 'File Name': self.eload + '/' + os.path.basename(vcf_file_name), 'File Type': 'vcf', 'MD5': vcf_file_info['md5'] @@ -139,7 +140,7 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None # Add the index file file_rows.append({ - 'Analysis Alias': analysis, + 'Analysis Alias': self._unique_alias(analysis), 'File Name': self.eload + '/' + os.path.basename(vcf_file_info['csi']), 'File Type': 'csi', 'MD5': vcf_file_info['csi_md5'] @@ -154,9 +155,12 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None analysis_rows = reader.analysis for analysis_row in analysis_rows: - if self.eload not in analysis_row['Analysis Alias']: - # Add the eload id to ensure that the analysis alias is unique - analysis_row['Analysis Alias'] = self._unique_alias(analysis_row['Analysis Alias']) + # Add the eload id to ensure that the analysis alias is unique + analysis_row['Analysis Alias'] = self._unique_alias(analysis_row['Analysis Alias']) + # Ensure that the reference used in the brokering is the same as the one used during validation + analysis_row['Reference'] = self.eload_cfg.query( + 'brokering', 'analyses', analysis_row['Analysis Alias'], 'assembly_accession' + ) if output_spreadsheet: eva_xls_writer = EvaXlsxWriter(input_spreadsheet, output_spreadsheet) diff --git a/eva_submission/xlsx/xlsx_validation.py b/eva_submission/xlsx/xlsx_validation.py index a6aae6c..ad4b38c 100644 --- a/eva_submission/xlsx/xlsx_validation.py +++ b/eva_submission/xlsx/xlsx_validation.py @@ -1,5 +1,5 @@ import os -from datetime import datetime +import datetime import yaml from cerberus import Validator @@ -153,8 +153,20 @@ def check_date(self, row, key, required=True): if required and key not in row: self.error_list.append(f'In row {row.get("row_num")}, {key} is required and missing') return - if key in row and (isinstance(row[key], datetime) or str(row[key]).lower() in not_provided_check_list): + if key in row and ( + isinstance(row[key], datetime.date) or + isinstance(row[key], datetime.datetime) or + self._check_date_str_format(row[key]) or + str(row[key]).lower() in not_provided_check_list + ): return self.error_list.append(f'In row {row.get("row_num")}, {key} is not a date or "not provided": ' f'it is set to "{row.get(key)}"') + def _check_date_str_format(self, d): + try: + datetime.datetime.strptime(d, "%Y-%m-%d") + return True + except ValueError: + return False + diff --git a/tests/resources/metadata_2_analysis.xlsx b/tests/resources/metadata_2_analysis.xlsx index 3e31fe0..ea207f4 100644 Binary files a/tests/resources/metadata_2_analysis.xlsx and b/tests/resources/metadata_2_analysis.xlsx differ diff --git a/tests/test_eload_brokering.py b/tests/test_eload_brokering.py index d941456..8ac83ce 100644 --- a/tests/test_eload_brokering.py +++ b/tests/test_eload_brokering.py @@ -221,19 +221,19 @@ def test_update_metadata_from_config_for_files(self): metadata_file = os.path.join(self.resources_folder, 'metadata_2_analysis.xlsx') ena_metadata_file = os.path.join(self.eload.eload_dir, 'metadata_2_analysis_for_brokering.xlsx') analyses = { - 'GAE': { + 'ELOAD_3_GAE': { 'assembly_accession': 'GCA_000001405.1', 'vcf_files': { 'path/to/GAE.vcf.gz': { - 'csi': 'path/to/GAE.vcf.gz.csi', - 'csi_md5': '', - 'md5': '', - 'original_vcf': 'path/to/original_GAE.vcf.gz', - 'output_vcf_file': None + 'csi': 'path/to/GAE.vcf.gz.csi', + 'csi_md5': '', + 'md5': '', + 'original_vcf': 'path/to/original_GAE.vcf.gz', + 'output_vcf_file': None } } }, - 'GAE2': { + 'ELOAD_3_GAE2': { 'assembly_accession': 'GCA_000001405.1', 'vcf_files': { 'path/to/GAE2.vcf.gz': { @@ -247,16 +247,26 @@ def test_update_metadata_from_config_for_files(self): } } self.eload.eload_cfg.set('brokering', 'analyses', value=analyses) - self.eload.update_metadata_spreadsheet(metadata_file, ena_metadata_file) - # Check that the Files get set to the merged file name and that the analysis alias is modified + source_reader = EvaXlsxReader(metadata_file) + self.eload.update_metadata_spreadsheet(metadata_file, ena_metadata_file) reader = EvaXlsxReader(ena_metadata_file) + # Check that the Files get set to the merged file name and that the analysis alias is modified + assert len(source_reader.files) == 12 + assert len(reader.files) == 4 assert reader.files == [ - {'Analysis Alias': 'GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz', 'File Type': 'vcf', 'MD5': None, 'row_num': 2}, - {'Analysis Alias': 'GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, 'row_num': 3}, - {'Analysis Alias': 'GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz', 'File Type': 'vcf', 'MD5': None, 'row_num': 4}, - {'Analysis Alias': 'GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, 'row_num': 5} + {'Analysis Alias': 'ELOAD_3_GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz', 'File Type': 'vcf', 'MD5': None, + 'row_num': 2}, + {'Analysis Alias': 'ELOAD_3_GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, + 'row_num': 3}, + {'Analysis Alias': 'ELOAD_3_GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz', 'File Type': 'vcf', 'MD5': None, + 'row_num': 4}, + {'Analysis Alias': 'ELOAD_3_GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, + 'row_num': 5} ] + # Updated the reference genome to what the brokering contains + assert source_reader.analysis[1]['Reference'] == 'GCA_000001405' + assert reader.analysis[1]['Reference'] == 'GCA_000001405.1' def test_archival_confirmation_text(self): self.eload.eload_cfg.set('submission', 'project_title', value='Great project') diff --git a/tests/test_upload_to_ENA.py b/tests/test_upload_to_ENA.py index 6edeb99..48c4950 100644 --- a/tests/test_upload_to_ENA.py +++ b/tests/test_upload_to_ENA.py @@ -97,7 +97,7 @@ def test_parse_ena_receipt_multiple_analyses(self): def test_single_upload_xml_files_to_ena(self): with patch.object(ENAUploader, '_post_xml_file_to_ena') as mock_post,\ patch('eva_submission.ENA_submission.upload_to_ENA.requests.get') as mock_get: - json_data = {'submissionId': 'ERA123456', 'links': [{'rel': 'poll-xml', 'href': 'https://example.com/link'}]} + json_data = {'submissionId': 'ERA123456', '_links': [{'rel': 'poll-xml', 'href': 'https://example.com/link'}]} mock_post.return_value = Mock(status_code=200, json=Mock(return_value=json_data)) mock_get.return_value = Mock(status_code=200, text=self.receipt) self.assertFalse(os.path.isfile(self.uploader_async.converter.single_submission_file)) diff --git a/tests/test_xlsx_validation.py b/tests/test_xlsx_validation.py index 3ee1a37..c6cf26b 100644 --- a/tests/test_xlsx_validation.py +++ b/tests/test_xlsx_validation.py @@ -1,3 +1,4 @@ +import datetime import os import shutil from unittest import TestCase @@ -73,3 +74,22 @@ def test_correct_scientific_name_in_metadata(self): assert len([s for s in scientific_name_list if s == 'Homo Sapiens']) == 0 assert len([s for s in scientific_name_list if s == 'HS']) == 10 + def test_check_date(self): + assert self.validator.error_list == [] + row = {"row_num": 1, "collection_date": 'not provided'} + self.validator.check_date(row, 'collection_date', required=True) + assert self.validator.error_list == [] + + row = {"row_num": 1, "collection_date": datetime.date(year=2019, month=6, day=8)} + self.validator.check_date(row, 'collection_date', required=True) + assert self.validator.error_list == [] + + row = {"row_num": 1, "collection_date": '2019-06-08'} + self.validator.check_date(row, 'collection_date', required=True) + assert self.validator.error_list == [] + + row = {"row_num": 1, "collection_date": '2019-06-08,2019-06-09'} + self.validator.check_date(row, 'collection_date', required=True) + assert self.validator.error_list == [ + 'In row 1, collection_date is not a date or "not provided": it is set to "2019-06-08,2019-06-09"' + ]