Fix from submission (#168)

* Ensure unique analysis when writing to spreadsheet
* Fix link retrieval from upload single file to ENA
* Fix validation of date in metadata
* Update the reference assembly in the metadata spreadsheet before brokering
EBIvariation · Sep 12, 2023 · 0d8acbb · 0d8acbb
1 parent c65adb5
commit 0d8acbb
Show file tree

Hide file tree

Showing 7 changed files with 69 additions and 23 deletions.
diff --git a/eva_submission/ENA_submission/upload_to_ENA.py b/eva_submission/ENA_submission/upload_to_ENA.py
@@ -129,8 +129,8 @@ def upload_xml_files_to_ena(self, dry_ena_upload=False):
         response = self._post_xml_file_to_ena(cfg.query('ena', 'submit_async'), file_dict)
         if response.status_code == 200:
             json_data = response.json()
-            if 'links' in json_data:
-                xml_link = [link_dict['href'] for link_dict in json_data['links'] if link_dict['rel'] == 'poll-xml'][0]
+            if '_links' in json_data:
+                xml_link = [link_dict['href'] for link_dict in json_data['_links'] if link_dict['rel'] == 'poll-xml'][0]
                 self.results['submissionId'] = json_data['submissionId']
                 self.results['poll-links'] = xml_link
                 self.monitor_results()

diff --git a/eva_submission/eload_submission.py b/eva_submission/eload_submission.py
@@ -122,6 +122,7 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
                     'Sample Accession': self.eload_cfg['brokering']['Biosamples']['Samples'][sample_row.get('Sample Name')]
                 })
             else:
+                sample_row['Analysis Alias'] = self._unique_alias(sample_row['Analysis Alias'])
                 sample_rows.append(sample_row)
 
         file_rows = []
@@ -131,15 +132,15 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
                 vcf_file_info = self.eload_cfg['brokering']['analyses'][analysis]['vcf_files'][vcf_file_name]
                 # Add the vcf file
                 file_rows.append({
-                    'Analysis Alias': analysis,
+                    'Analysis Alias': self._unique_alias(analysis),
                     'File Name': self.eload + '/' + os.path.basename(vcf_file_name),
                     'File Type': 'vcf',
                     'MD5': vcf_file_info['md5']
                 })
 
                 # Add the index file
                 file_rows.append({
-                    'Analysis Alias': analysis,
+                    'Analysis Alias': self._unique_alias(analysis),
                     'File Name': self.eload + '/' + os.path.basename(vcf_file_info['csi']),
                     'File Type': 'csi',
                     'MD5': vcf_file_info['csi_md5']
@@ -154,9 +155,12 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
 
         analysis_rows = reader.analysis
         for analysis_row in analysis_rows:
-            if self.eload not in analysis_row['Analysis Alias']:
-                # Add the eload id to ensure that the analysis alias is unique
-                analysis_row['Analysis Alias'] = self._unique_alias(analysis_row['Analysis Alias'])
+            # Add the eload id to ensure that the analysis alias is unique
+            analysis_row['Analysis Alias'] = self._unique_alias(analysis_row['Analysis Alias'])
+            # Ensure that the reference used in the brokering is the same as the one used during validation
+            analysis_row['Reference'] = self.eload_cfg.query(
+                'brokering', 'analyses', analysis_row['Analysis Alias'], 'assembly_accession'
+            )
 
         if output_spreadsheet:
             eva_xls_writer = EvaXlsxWriter(input_spreadsheet, output_spreadsheet)

diff --git a/eva_submission/xlsx/xlsx_validation.py b/eva_submission/xlsx/xlsx_validation.py
@@ -1,5 +1,5 @@
 import os
-from datetime import datetime
+import datetime
 
 import yaml
 from cerberus import Validator
@@ -153,8 +153,20 @@ def check_date(self, row, key, required=True):
         if required and key not in row:
             self.error_list.append(f'In row {row.get("row_num")}, {key} is required and missing')
             return
-        if key in row and (isinstance(row[key], datetime) or str(row[key]).lower() in not_provided_check_list):
+        if key in row and (
+                isinstance(row[key], datetime.date) or
+                isinstance(row[key], datetime.datetime) or
+                self._check_date_str_format(row[key]) or
+                str(row[key]).lower() in not_provided_check_list
+        ):
             return
         self.error_list.append(f'In row {row.get("row_num")}, {key} is not a date or "not provided": '
                                f'it is set to "{row.get(key)}"')
 
+    def _check_date_str_format(self, d):
+        try:
+            datetime.datetime.strptime(d, "%Y-%m-%d")
+            return True
+        except ValueError:
+            return False
+
diff --git a/tests/resources/metadata_2_analysis.xlsx b/tests/resources/metadata_2_analysis.xlsx
diff --git a/tests/test_eload_brokering.py b/tests/test_eload_brokering.py
@@ -221,19 +221,19 @@ def test_update_metadata_from_config_for_files(self):
         metadata_file = os.path.join(self.resources_folder, 'metadata_2_analysis.xlsx')
         ena_metadata_file = os.path.join(self.eload.eload_dir, 'metadata_2_analysis_for_brokering.xlsx')
         analyses = {
-            'GAE': {
+            'ELOAD_3_GAE': {
                 'assembly_accession': 'GCA_000001405.1',
                 'vcf_files': {
                     'path/to/GAE.vcf.gz': {
-                      'csi': 'path/to/GAE.vcf.gz.csi',
-                      'csi_md5': '',
-                      'md5': '',
-                      'original_vcf': 'path/to/original_GAE.vcf.gz',
-                      'output_vcf_file': None
+                        'csi': 'path/to/GAE.vcf.gz.csi',
+                        'csi_md5': '',
+                        'md5': '',
+                        'original_vcf': 'path/to/original_GAE.vcf.gz',
+                        'output_vcf_file': None
                     }
                 }
             },
-            'GAE2': {
+            'ELOAD_3_GAE2': {
                 'assembly_accession': 'GCA_000001405.1',
                 'vcf_files': {
                     'path/to/GAE2.vcf.gz': {
@@ -247,16 +247,26 @@ def test_update_metadata_from_config_for_files(self):
             }
         }
         self.eload.eload_cfg.set('brokering', 'analyses', value=analyses)
-        self.eload.update_metadata_spreadsheet(metadata_file, ena_metadata_file)
 
-        # Check that the Files get set to the merged file name and that the analysis alias is modified
+        source_reader = EvaXlsxReader(metadata_file)
+        self.eload.update_metadata_spreadsheet(metadata_file, ena_metadata_file)
         reader = EvaXlsxReader(ena_metadata_file)
+        # Check that the Files get set to the merged file name and that the analysis alias is modified
+        assert len(source_reader.files) == 12
+        assert len(reader.files) == 4
         assert reader.files == [
-            {'Analysis Alias': 'GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz', 'File Type': 'vcf', 'MD5': None, 'row_num': 2},
-            {'Analysis Alias': 'GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, 'row_num': 3},
-            {'Analysis Alias': 'GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz', 'File Type': 'vcf', 'MD5': None, 'row_num': 4},
-            {'Analysis Alias': 'GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, 'row_num': 5}
+            {'Analysis Alias': 'ELOAD_3_GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz', 'File Type': 'vcf', 'MD5': None,
+             'row_num': 2},
+            {'Analysis Alias': 'ELOAD_3_GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz.csi', 'File Type': 'csi', 'MD5': None,
+             'row_num': 3},
+            {'Analysis Alias': 'ELOAD_3_GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz', 'File Type': 'vcf', 'MD5': None,
+             'row_num': 4},
+            {'Analysis Alias': 'ELOAD_3_GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz.csi', 'File Type': 'csi', 'MD5': None,
+             'row_num': 5}
         ]
+        # Updated the reference genome to what the brokering contains
+        assert source_reader.analysis[1]['Reference'] == 'GCA_000001405'
+        assert reader.analysis[1]['Reference'] == 'GCA_000001405.1'
 
     def test_archival_confirmation_text(self):
         self.eload.eload_cfg.set('submission', 'project_title', value='Great project')

diff --git a/tests/test_upload_to_ENA.py b/tests/test_upload_to_ENA.py
@@ -97,7 +97,7 @@ def test_parse_ena_receipt_multiple_analyses(self):
     def test_single_upload_xml_files_to_ena(self):
         with patch.object(ENAUploader, '_post_xml_file_to_ena') as mock_post,\
              patch('eva_submission.ENA_submission.upload_to_ENA.requests.get') as mock_get:
-            json_data = {'submissionId': 'ERA123456', 'links': [{'rel': 'poll-xml', 'href': 'https://example.com/link'}]}
+            json_data = {'submissionId': 'ERA123456', '_links': [{'rel': 'poll-xml', 'href': 'https://example.com/link'}]}
             mock_post.return_value = Mock(status_code=200, json=Mock(return_value=json_data))
             mock_get.return_value = Mock(status_code=200, text=self.receipt)
             self.assertFalse(os.path.isfile(self.uploader_async.converter.single_submission_file))

diff --git a/tests/test_xlsx_validation.py b/tests/test_xlsx_validation.py
@@ -1,3 +1,4 @@
+import datetime
 import os
 import shutil
 from unittest import TestCase
@@ -73,3 +74,22 @@ def test_correct_scientific_name_in_metadata(self):
         assert len([s for s in scientific_name_list if s == 'Homo Sapiens']) == 0
         assert len([s for s in scientific_name_list if s == 'HS']) == 10
 
+    def test_check_date(self):
+        assert self.validator.error_list == []
+        row = {"row_num": 1, "collection_date": 'not provided'}
+        self.validator.check_date(row, 'collection_date', required=True)
+        assert self.validator.error_list == []
+
+        row = {"row_num": 1, "collection_date": datetime.date(year=2019, month=6, day=8)}
+        self.validator.check_date(row, 'collection_date', required=True)
+        assert self.validator.error_list == []
+
+        row = {"row_num": 1, "collection_date": '2019-06-08'}
+        self.validator.check_date(row, 'collection_date', required=True)
+        assert self.validator.error_list == []
+
+        row = {"row_num": 1, "collection_date": '2019-06-08,2019-06-09'}
+        self.validator.check_date(row, 'collection_date', required=True)
+        assert self.validator.error_list == [
+            'In row 1, collection_date is not a date or "not provided": it is set to "2019-06-08,2019-06-09"'
+        ]