Skip to content

Commit

Permalink
Fix from submission (#168)
Browse files Browse the repository at this point in the history
* Ensure unique analysis alias when writing to spreadsheet
* Fix link retrieval when uploading a single file to ENA
* Fix validation of date in metadata
* Update the reference assembly in the metadata spreadsheet before brokering
  • Loading branch information
tcezard authored Sep 12, 2023
1 parent c65adb5 commit 0d8acbb
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 23 deletions.
4 changes: 2 additions & 2 deletions eva_submission/ENA_submission/upload_to_ENA.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ def upload_xml_files_to_ena(self, dry_ena_upload=False):
response = self._post_xml_file_to_ena(cfg.query('ena', 'submit_async'), file_dict)
if response.status_code == 200:
json_data = response.json()
if 'links' in json_data:
xml_link = [link_dict['href'] for link_dict in json_data['links'] if link_dict['rel'] == 'poll-xml'][0]
if '_links' in json_data:
xml_link = [link_dict['href'] for link_dict in json_data['_links'] if link_dict['rel'] == 'poll-xml'][0]
self.results['submissionId'] = json_data['submissionId']
self.results['poll-links'] = xml_link
self.monitor_results()
Expand Down
14 changes: 9 additions & 5 deletions eva_submission/eload_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
'Sample Accession': self.eload_cfg['brokering']['Biosamples']['Samples'][sample_row.get('Sample Name')]
})
else:
sample_row['Analysis Alias'] = self._unique_alias(sample_row['Analysis Alias'])
sample_rows.append(sample_row)

file_rows = []
Expand All @@ -131,15 +132,15 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None
vcf_file_info = self.eload_cfg['brokering']['analyses'][analysis]['vcf_files'][vcf_file_name]
# Add the vcf file
file_rows.append({
'Analysis Alias': analysis,
'Analysis Alias': self._unique_alias(analysis),
'File Name': self.eload + '/' + os.path.basename(vcf_file_name),
'File Type': 'vcf',
'MD5': vcf_file_info['md5']
})

# Add the index file
file_rows.append({
'Analysis Alias': analysis,
'Analysis Alias': self._unique_alias(analysis),
'File Name': self.eload + '/' + os.path.basename(vcf_file_info['csi']),
'File Type': 'csi',
'MD5': vcf_file_info['csi_md5']
Expand All @@ -154,9 +155,12 @@ def update_metadata_spreadsheet(self, input_spreadsheet, output_spreadsheet=None

analysis_rows = reader.analysis
for analysis_row in analysis_rows:
if self.eload not in analysis_row['Analysis Alias']:
# Add the eload id to ensure that the analysis alias is unique
analysis_row['Analysis Alias'] = self._unique_alias(analysis_row['Analysis Alias'])
# Add the eload id to ensure that the analysis alias is unique
analysis_row['Analysis Alias'] = self._unique_alias(analysis_row['Analysis Alias'])
# Ensure that the reference used in the brokering is the same as the one used during validation
analysis_row['Reference'] = self.eload_cfg.query(
'brokering', 'analyses', analysis_row['Analysis Alias'], 'assembly_accession'
)

if output_spreadsheet:
eva_xls_writer = EvaXlsxWriter(input_spreadsheet, output_spreadsheet)
Expand Down
16 changes: 14 additions & 2 deletions eva_submission/xlsx/xlsx_validation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from datetime import datetime
import datetime

import yaml
from cerberus import Validator
Expand Down Expand Up @@ -153,8 +153,20 @@ def check_date(self, row, key, required=True):
if required and key not in row:
self.error_list.append(f'In row {row.get("row_num")}, {key} is required and missing')
return
if key in row and (isinstance(row[key], datetime) or str(row[key]).lower() in not_provided_check_list):
if key in row and (
isinstance(row[key], datetime.date) or
isinstance(row[key], datetime.datetime) or
self._check_date_str_format(row[key]) or
str(row[key]).lower() in not_provided_check_list
):
return
self.error_list.append(f'In row {row.get("row_num")}, {key} is not a date or "not provided": '
f'it is set to "{row.get(key)}"')

def _check_date_str_format(self, d):
try:
datetime.datetime.strptime(d, "%Y-%m-%d")
return True
except ValueError:
return False

Binary file modified tests/resources/metadata_2_analysis.xlsx
Binary file not shown.
36 changes: 23 additions & 13 deletions tests/test_eload_brokering.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,19 +221,19 @@ def test_update_metadata_from_config_for_files(self):
metadata_file = os.path.join(self.resources_folder, 'metadata_2_analysis.xlsx')
ena_metadata_file = os.path.join(self.eload.eload_dir, 'metadata_2_analysis_for_brokering.xlsx')
analyses = {
'GAE': {
'ELOAD_3_GAE': {
'assembly_accession': 'GCA_000001405.1',
'vcf_files': {
'path/to/GAE.vcf.gz': {
'csi': 'path/to/GAE.vcf.gz.csi',
'csi_md5': '',
'md5': '',
'original_vcf': 'path/to/original_GAE.vcf.gz',
'output_vcf_file': None
'csi': 'path/to/GAE.vcf.gz.csi',
'csi_md5': '',
'md5': '',
'original_vcf': 'path/to/original_GAE.vcf.gz',
'output_vcf_file': None
}
}
},
'GAE2': {
'ELOAD_3_GAE2': {
'assembly_accession': 'GCA_000001405.1',
'vcf_files': {
'path/to/GAE2.vcf.gz': {
Expand All @@ -247,16 +247,26 @@ def test_update_metadata_from_config_for_files(self):
}
}
self.eload.eload_cfg.set('brokering', 'analyses', value=analyses)
self.eload.update_metadata_spreadsheet(metadata_file, ena_metadata_file)

# Check that the Files get set to the merged file name and that the analysis alias is modified
source_reader = EvaXlsxReader(metadata_file)
self.eload.update_metadata_spreadsheet(metadata_file, ena_metadata_file)
reader = EvaXlsxReader(ena_metadata_file)
# Check that the Files get set to the merged file name and that the analysis alias is modified
assert len(source_reader.files) == 12
assert len(reader.files) == 4
assert reader.files == [
{'Analysis Alias': 'GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz', 'File Type': 'vcf', 'MD5': None, 'row_num': 2},
{'Analysis Alias': 'GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, 'row_num': 3},
{'Analysis Alias': 'GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz', 'File Type': 'vcf', 'MD5': None, 'row_num': 4},
{'Analysis Alias': 'GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz.csi', 'File Type': 'csi', 'MD5': None, 'row_num': 5}
{'Analysis Alias': 'ELOAD_3_GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz', 'File Type': 'vcf', 'MD5': None,
'row_num': 2},
{'Analysis Alias': 'ELOAD_3_GAE', 'File Name': 'ELOAD_3/GAE.vcf.gz.csi', 'File Type': 'csi', 'MD5': None,
'row_num': 3},
{'Analysis Alias': 'ELOAD_3_GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz', 'File Type': 'vcf', 'MD5': None,
'row_num': 4},
{'Analysis Alias': 'ELOAD_3_GAE2', 'File Name': 'ELOAD_3/GAE2.vcf.gz.csi', 'File Type': 'csi', 'MD5': None,
'row_num': 5}
]
# Updated the reference genome to what the brokering contains
assert source_reader.analysis[1]['Reference'] == 'GCA_000001405'
assert reader.analysis[1]['Reference'] == 'GCA_000001405.1'

def test_archival_confirmation_text(self):
self.eload.eload_cfg.set('submission', 'project_title', value='Great project')
Expand Down
2 changes: 1 addition & 1 deletion tests/test_upload_to_ENA.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def test_parse_ena_receipt_multiple_analyses(self):
def test_single_upload_xml_files_to_ena(self):
with patch.object(ENAUploader, '_post_xml_file_to_ena') as mock_post,\
patch('eva_submission.ENA_submission.upload_to_ENA.requests.get') as mock_get:
json_data = {'submissionId': 'ERA123456', 'links': [{'rel': 'poll-xml', 'href': 'https://example.com/link'}]}
json_data = {'submissionId': 'ERA123456', '_links': [{'rel': 'poll-xml', 'href': 'https://example.com/link'}]}
mock_post.return_value = Mock(status_code=200, json=Mock(return_value=json_data))
mock_get.return_value = Mock(status_code=200, text=self.receipt)
self.assertFalse(os.path.isfile(self.uploader_async.converter.single_submission_file))
Expand Down
20 changes: 20 additions & 0 deletions tests/test_xlsx_validation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import os
import shutil
from unittest import TestCase
Expand Down Expand Up @@ -73,3 +74,22 @@ def test_correct_scientific_name_in_metadata(self):
assert len([s for s in scientific_name_list if s == 'Homo Sapiens']) == 0
assert len([s for s in scientific_name_list if s == 'HS']) == 10

def test_check_date(self):
    """Exercise check_date with accepted values and one rejected value.

    'not provided', a datetime.date and a YYYY-MM-DD string must leave the
    validator's error list empty; a comma-separated pair of dates must add
    exactly one error message.
    """
    validator = self.validator
    assert validator.error_list == []

    # Values that must be accepted without recording any error.
    accepted_values = (
        'not provided',
        datetime.date(year=2019, month=6, day=8),
        '2019-06-08',
    )
    for value in accepted_values:
        row = {"row_num": 1, "collection_date": value}
        validator.check_date(row, 'collection_date', required=True)
        assert validator.error_list == []

    # Two dates in one cell is neither a date nor "not provided".
    row = {"row_num": 1, "collection_date": '2019-06-08,2019-06-09'}
    validator.check_date(row, 'collection_date', required=True)
    expected_errors = [
        'In row 1, collection_date is not a date or "not provided": it is set to "2019-06-08,2019-06-09"'
    ]
    assert validator.error_list == expected_errors

0 comments on commit 0d8acbb

Please sign in to comment.