diff --git a/eva_sub_cli/executables/xlsx2json.py b/eva_sub_cli/executables/xlsx2json.py
index 179c0d3..e3c40b3 100644
--- a/eva_sub_cli/executables/xlsx2json.py
+++ b/eva_sub_cli/executables/xlsx2json.py
@@ -52,14 +52,16 @@ def __init__(self, xlsx_filename, conf_filename):
         try:
             self.workbook = load_workbook(xlsx_filename, read_only=True)
         except Exception as e:
-            self.add_error(f'Error loading {xlsx_filename}: {e}')
+            self.add_error(f'Error loading {xlsx_filename}: {repr(e)}')
+            self.file_loaded = False
             return
-        self.worksheets = None
+        self.worksheets = []
         self._active_worksheet = None
         self.row_offset = {}
         self.headers = {}
-        self.valid = None
+        self.file_loaded = True
         self.errors = []
+        self.valid_worksheets()

     @property
     def active_worksheet(self):
@@ -77,14 +79,8 @@ def active_worksheet(self, worksheet):

     def valid_worksheets(self):
         """
-        Get the list of the names of worksheets which have all the configured required headers
-        :return: list of valid worksheet names in the Excel file
-        :rtype: list
+        Get the list of the names of worksheets which have the expected title and header row.
         """
-        if self.worksheets is not None:
-            return self.worksheets
-
-        self.worksheets = []
         sheet_titles = self.workbook.sheetnames

         for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
@@ -97,31 +93,10 @@ def valid_worksheets(self):
             header_row = self.xlsx_conf[title].get(HEADERS_KEY_ROW, 1)
             if worksheet.max_row < header_row + 1:
                 continue
-            # Check required headers are present
+            # Store headers and worksheet title
             self.headers[title] = [cell.value if cell.value is None else cell.value.strip()
                                    for cell in worksheet[header_row]]
-            required_headers = self.xlsx_conf[title].get(REQUIRED_HEADERS_KEY_NAME, [])
-            if set(required_headers) <= set(self.headers[title]):  # issubset
-                self.worksheets.append(title)
-            else:
-                missing_headers = set(required_headers) - set(self.headers[title])
-                for header in missing_headers:
-                    self.add_error(f'Worksheet {title} is missing required header {header}',
-                                   sheet=title, column=header)
-
-        return self.worksheets
-
-    def is_valid(self):
-        """
-        Check that is all the worksheets contain required headers
-        :return: True if all the worksheets contain required headers. False otherwise
-        :rtype: bool
-        """
-        if self.valid is None:
-            self.valid = True
-            self.valid_worksheets()
-
-        return self.valid
+            self.worksheets.append(title)

     @staticmethod
     def cast_value(value, type_name):
@@ -219,16 +194,17 @@ def get_biosample_object(self, data):
         scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]

         # BioSample expects any of organism or species field
-        data[SPECIES] = data[scientific_name]
+        if scientific_name in data:
+            data[SPECIES] = data[scientific_name]

         # BioSample name goes in its own attribute, not part of characteristics
-        biosample_name = data.pop(sample_name)
-        # For all characteristics, BioSample expects value in arrays of objects
-        data = {k: [{'text': self.serialize(v)}] for k, v in data.items()}
+        biosample_name = data.pop(sample_name, None)
+        # For all characteristics, BioSample expects value in arrays of objects

         biosample_object = {
-            "name": biosample_name,
-            "characteristics": data
+            'characteristics': {k: [{'text': self.serialize(v)}] for k, v in data.items()}
         }
+        if biosample_name is not None:
+            biosample_object['name'] = biosample_name

         return biosample_object

@@ -263,20 +239,6 @@ def get_sample_json_data(self):
                 json_value.pop(analysis_alias)
                 json_value.pop(sample_name_in_vcf)

-                # Check for headers that are required only in this case
-                sample_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_NAME_KEY]
-                scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]
-                if sample_name not in json_value:
-                    self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
-                                   f'{SAMPLE_NAME_KEY} populated',
-                                   sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY)
-                    return None
-                if scientific_name not in json_value:
-                    self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
-                                   f'{SCIENTIFIC_NAME_KEY} populated',
-                                   sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY)
-                    return None
-
                 biosample_obj = self.get_biosample_object(json_value)
                 sample_data.update(bioSampleObject=biosample_obj)
             sample_json[json_key].append(sample_data)
@@ -284,9 +246,8 @@
         return sample_json

     def json(self, output_json_file):
-        # First check that all sheets present have the required headers;
-        # also guards against the case where conversion fails in init
-        if not self.is_valid():
+        # If the file could not be loaded at all, return without generating JSON.
+        if not self.file_loaded:
             return
         json_data = {}
         for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
@@ -295,8 +256,6 @@ def json(self, output_json_file):
                 json_data.update(self.get_project_json_data())
             elif title == SAMPLE:
                 sample_data = self.get_sample_json_data()
-                if sample_data is None:  # missing conditionally required headers
-                    return
                 json_data.update(sample_data)
             else:
                 json_data[self.xlsx_conf[WORKSHEETS_KEY_NAME][title]] = []
@@ -324,7 +283,6 @@ def add_error(self, message, sheet='', row='', column=''):
         """Adds a conversion error using the same structure as other validation errors,
         and marks the spreadsheet as invalid."""
         self.errors.append({'sheet': sheet, 'row': row, 'column': column, 'description': message})
-        self.valid = False

     def save_errors(self, errors_yaml_file):
         with open(errors_yaml_file, 'w') as open_file:
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
index ced4365..8417144 100644
--- a/eva_sub_cli/semantic_metadata.py
+++ b/eva_sub_cli/semantic_metadata.py
@@ -1,4 +1,3 @@
-import re
 import json
 from copy import deepcopy

@@ -6,7 +5,7 @@
 from retry import retry

 from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
-from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
+from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena, get_scientific_name_and_common_name
 from ebi_eva_common_pyutils.logger import AppLogger

 from eva_sub_cli.date_utils import check_date
@@ -22,9 +21,11 @@
 BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
 CHARACTERISTICS_KEY = 'characteristics'
 TAX_ID_KEY = 'taxId'
+SCI_NAME_KEYS = ['species', 'Species', 'organism', 'Organism']
 ANALYSIS_ALIAS_KEY = 'analysisAlias'
 ANALYSIS_RUNS_KEY = 'runAccessions'

+
 def cast_list(l, type_to_cast=str):
     for e in l:
         yield type_to_cast(e)
@@ -36,7 +37,7 @@ def __init__(self, metadata, sample_checklist='ERC000011'):
         self.sample_checklist = sample_checklist
         self.metadata = metadata
         self.errors = []
-        # Caches whether taxonomy code is valid or not
+        # Caches whether taxonomy code is valid or not, and maps to scientific name if valid
        self.taxonomy_valid = {}
         self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples')

@@ -47,8 +48,9 @@ def write_result_yaml(self, output_path):
     def check_all(self):
         self.check_all_project_accessions()
         self.check_all_taxonomy_codes()
+        self.check_all_scientific_names()
         self.check_existing_biosamples()
-        self.check_all_analysis_run_accessions
+        self.check_all_analysis_run_accessions()
         self.check_analysis_alias_coherence()

     def check_all_project_accessions(self):
@@ -68,15 +70,34 @@ def check_all_taxonomy_codes(self):
             self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
-            if BIOSAMPLE_OBJECT_KEY in sample:
+            if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
                 self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
                                          f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')

+    def check_all_scientific_names(self):
+        """Check that all scientific names are consistent with taxonomy codes."""
+        for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
+            if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
+                characteristics = sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]
+                # Get the scientific name from the taxonomy (if valid)
+                tax_code = int(characteristics[TAX_ID_KEY][0]['text'])
+                sci_name_from_tax = self.taxonomy_valid[tax_code]
+                if not sci_name_from_tax:
+                    continue
+                # Check if scientific name in sample matches
+                for sci_name_key in SCI_NAME_KEYS:
+                    if sci_name_key in characteristics:
+                        sci_name = characteristics[sci_name_key][0]['text']
+                        if sci_name_from_tax.lower() != sci_name.lower():
+                            self.add_error(
+                                f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{sci_name_key}',
+                                f'Species {sci_name} does not match taxonomy {tax_code} ({sci_name_from_tax})')
+
     def check_all_analysis_run_accessions(self):
         """Check that the Run accession are valid and exist in ENA"""
         for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
             json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
-            if analysis[ANALYSIS_RUNS_KEY]:
+            if ANALYSIS_RUNS_KEY in analysis and analysis[ANALYSIS_RUNS_KEY]:
                 for run_acc in analysis[ANALYSIS_RUNS_KEY]:
                     self.check_accession_in_ena(run_acc, 'Run', json_path)

@@ -98,8 +119,8 @@ def check_taxonomy_code(self, taxonomy_code, json_path):
             self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
         else:
             try:
-                download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
-                self.taxonomy_valid[taxonomy_code] = True
+                sci_name, _ = get_scientific_name_and_common_name(taxonomy_code)
+                self.taxonomy_valid[taxonomy_code] = sci_name
             except Exception:
                 self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
                 self.taxonomy_valid[taxonomy_code] = False
diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
index 6b01e49..c9e370b 100644
--- a/eva_sub_cli/validators/docker_validator.py
+++ b/eva_sub_cli/validators/docker_validator.py
@@ -1,4 +1,3 @@
-import argparse
 import csv
 import os
 import re
@@ -12,7 +11,7 @@
 logger = logging_config.get_logger(__name__)

 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev16'
+container_tag = 'v0.0.1.dev17'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py
index 321abc4..274b7a6 100644
--- a/eva_sub_cli/validators/validation_results_parsers.py
+++ b/eva_sub_cli/validators/validation_results_parsers.py
@@ -134,6 +134,10 @@ def clean_read(ifile):
         if line.startswith('Validation failed with following error(s):'):
             collect = True
         else:
+            while line and not line.startswith('/'):
+                # Sometimes there are multiple (possibly redundant) errors listed under a single property,
+                # we only report the first
+                line = clean_read(open_file)
             line2 = clean_read(open_file)
             if line is None or line2 is None:
                 break  # EOF
@@ -164,6 +168,9 @@ def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
     attributes_dict = {}
     attributes_dict.update(xls2json_conf[sheet].get('required', {}))
     attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
+    attributes_dict['Scientific Name'] = 'species'
+    attributes_dict['BioSample Name'] = 'name'
+
     for attribute in attributes_dict:
         if attributes_dict[attribute] == json_attribute:
             return attribute
@@ -185,7 +192,12 @@ def parse_metadata_property(property_str):


 def parse_sample_metadata_property(property_str):
+    # Check characteristics
     match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
     if match:
         return 'sample', match.group(1), match.group(2)
+    # Check name
+    match = re.match(r'/sample/(\d+)/bioSampleObject/name', property_str)
+    if match:
+        return 'sample', match.group(1), 'name'
     return None, None, None
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
index f3e591d..79e23b2 100755
--- a/eva_sub_cli/validators/validator.py
+++ b/eva_sub_cli/validators/validator.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 import csv
 import datetime
-import glob
 import json
 import logging
 import os
@@ -345,7 +344,7 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
                 sheet = convert_metadata_sheet(sheet_json, xls2json_conf)
                 row = convert_metadata_row(sheet, row_json, xls2json_conf)
                 column = convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
-                if row_json is None and attribute_json is None:
+                if row_json is None and attribute_json is None and sheet is not None:
                     new_description = f'Sheet "{sheet}" is missing'
                 elif row_json is None:
                     if 'have required' not in error['description']:
diff --git a/tests/resources/EVA_Submission_test_fails.xlsx b/tests/resources/EVA_Submission_test_fails.xlsx
index 2c06d0d..716160c 100644
Binary files a/tests/resources/EVA_Submission_test_fails.xlsx and b/tests/resources/EVA_Submission_test_fails.xlsx differ
diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml b/tests/resources/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml
index fd5af64..13b09b9 100644
--- a/tests/resources/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml
+++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml
@@ -1,4 +1,4 @@
-- column: Tax ID
-  description: Worksheet Project is missing required header Tax ID
+- column: ''
+  description: 'Error loading problem.xlsx: Exception()'
   row: ''
-  sheet: Project
+  sheet: ''
diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
index c0c130c..39ed5d0 100644
--- a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
+++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
@@ -2,6 +2,8 @@
   property: /project/childProjects/1
 - description: 1234 is not a valid taxonomy code
   property: /sample/2/bioSampleObject/characteristics/taxId
+- description: Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)
+  property: /sample/1/bioSampleObject/characteristics/Organism
 - description: alias1 present in Analysis not in Samples
   property: /sample/analysisAlias
 - description: alias_1,alias_2 present in Samples not in Analysis
diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt
index 3b805ec..0bc227e 100644
--- a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt
+++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt
@@ -26,4 +26,19 @@ should have required property 'bioSampleObject'
 /sample/0
     should match exactly one schema in oneOf

+/sample/3/bioSampleObject/name
+    must have required property 'name'
+    must have required property 'name'
+    must have required property 'name'
+/sample/3/bioSampleObject/characteristics/organism
+    must have required property 'organism'
+    must have required property 'organism'
+/sample/3/bioSampleObject/characteristics/Organism
+    must have required property 'Organism'
+/sample/3/bioSampleObject/characteristics/species
+    must have required property 'species'
+/sample/3/bioSampleObject/characteristics/Species
+    must have required property 'Species'
+/sample/3/bioSampleObject/characteristics
+    must match a schema in anyOf
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
index fec6214..35d37c0 100644
--- a/tests/test_semantic_metadata.py
+++ b/tests/test_semantic_metadata.py
@@ -74,9 +74,9 @@ def test_check_all_taxonomy_codes(self):
             ]
         }
         checker = SemanticMetadataChecker(metadata)
-        with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
+        with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name:
             # Mock should only be called once per taxonomy code
-            m_ena_download.side_effect = [True, Exception('problem downloading')]
+            m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')]
             checker.check_all_taxonomy_codes()
         self.assertEqual(checker.errors, [
             {
@@ -85,6 +85,47 @@
             }
         ])

+    def test_check_all_scientific_names(self):
+        metadata = {
+            "sample": [
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "9606"}],
+                            "Organism": [{"text": "homo sapiens"}]
+                        }
+                    }
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "9606"}],
+                            "Organism": [{"text": "sheep sapiens"}]
+                        }
+                    }
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "1234"}]
+                        }
+                    }
+                }
+            ]
+        }
+        checker = SemanticMetadataChecker(metadata)
+        checker.taxonomy_valid = {
+            1234: False,
+            9606: "Homo sapiens"
+        }
+        checker.check_all_scientific_names()
+        self.assertEqual(checker.errors, [
+            {
+                'property': '/sample/1/bioSampleObject/characteristics/Organism',
+                'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'
+            }
+        ])
+
     def test_check_existing_biosamples_with_checklist(self):
         checker = SemanticMetadataChecker(metadata)
         with patch.object(SemanticMetadataChecker, '_get_biosample',
diff --git a/tests/test_validator.py b/tests/test_validator.py
index a9031e3..0cfaf73 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -1,10 +1,81 @@
 import os.path
+from copy import deepcopy
 from unittest import TestCase

 from eva_sub_cli.validators.validator import Validator, VALIDATION_OUTPUT_DIR
 from tests.test_utils import create_mapping_file


+expected_validation_results = {
+    'shallow_validation': {'requested': False},
+    'vcf_check': {
+        'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0,
+                             'critical_count': 0, 'critical_list': []}
+    },
+    'assembly_check': {
+        'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0,
+                             'match': 247, 'total': 247}
+    },
+    'sample_check': {
+        'overall_differences': False,
+        'results_per_analysis': {
+            'AA': {
+                'difference': False,
+                'more_metadata_submitted_files': [],
+                'more_per_submitted_files_metadata': {},
+                'more_submitted_files_metadata': []
+            }
+        }
+    },
+    'fasta_check': {
+        'input_passed.fa': {'all_insdc': False, 'sequences': [
+            {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'},
+            {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'}
+        ]},
+    },
+    'metadata_check': {
+        'json_errors': [
+            {'property': '/files', 'description': "should have required property 'files'"},
+            {'property': '/project/title', 'description': "should have required property 'title'"},
+            {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
+            {'property': '/project/holdDate', 'description': 'must match format "date"'},
+            {'property': '/analysis/0/description',
+             'description': "should have required property 'description'"},
+            {'property': '/analysis/0/referenceGenome',
+             'description': "should have required property 'referenceGenome'"},
+            {'property': '/sample/0/bioSampleAccession',
+             'description': "should have required property 'bioSampleAccession'"},
+            {'property': '/sample/0/bioSampleObject',
+             'description': "should have required property 'bioSampleObject'"},
+            {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
+            {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/organism',
+             'description': "must have required property 'organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Organism',
+             'description': "must have required property 'Organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/species',
+             'description': "must have required property 'species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Species',
+             'description': "must have required property 'Species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics',
+             'description': 'must match a schema in anyOf'},
+            {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
+            {'property': '/sample/2/bioSampleObject/characteristics/taxId',
+             'description': '1234 is not a valid taxonomy code'},
+            {'property': '/sample/1/bioSampleObject/characteristics/Organism',
+             'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'},
+            {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
+            {'property': '/sample/analysisAlias',
+             'description': 'alias_1,alias_2 present in Samples not in Analysis'},
+        ],
+        'spreadsheet_errors': [
+            {'sheet': '', 'row': '', 'column': '',
+             'description': 'Error loading problem.xlsx: Exception()'}
+        ]
+    }
+}
+
+
 class TestValidator(TestCase):
     resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
     vcf_files = os.path.join(resource_dir, 'vcf_files')
@@ -34,158 +105,57 @@ def tearDown(self) -> None:
             if os.path.exists(f):
                 os.remove(f)

-    def test__collect_validation_workflow_results_with_metadata_xlsx(self):
-        expected_results = {
-            'shallow_validation': {'requested': False},
-            'vcf_check': {
-                'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, 'critical_count': 0, 'critical_list': []}
-            },
-            'assembly_check': {
-                'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0, 'match': 247, 'total': 247}
-            },
-            'sample_check': {
-                'overall_differences': False,
-                'results_per_analysis': {
-                    'AA': {
-                        'difference': False,
-                        'more_metadata_submitted_files': [],
-                        'more_per_submitted_files_metadata': {},
-                        'more_submitted_files_metadata': []
-                    }
-                }
-            },
-            'fasta_check': {
-                'input_passed.fa': {'all_insdc': False, 'sequences': [
-                    {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'},
-                    {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'}
-                ]},
-            },
-            'metadata_check': {
-                'json_errors': [
-                    {'property': '/files', 'description': "should have required property 'files'"},
-                    {'property': '/project/title', 'description': "should have required property 'title'"},
-                    {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
-                    {'property': '/project/holdDate', 'description': 'must match format "date"'},
-                    {'property': '/analysis/0/description', 'description': "should have required property 'description'"},
-                    {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"},
-                    {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
-                    {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
-                    {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
-                    {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
-                    {'property': '/sample/2/bioSampleObject/characteristics/taxId',
-                     'description': '1234 is not a valid taxonomy code'},
-                    {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
-                    {'property': '/sample/analysisAlias',
-                     'description': 'alias_1,alias_2 present in Samples not in Analysis'},
-                ],
-                'spreadsheet_errors': [
-                    # NB. Wouldn't normally get conversion error + validation errors together, but it is supported.
-                    {'sheet': 'Project', 'row': '', 'column': 'Tax ID',
-                     'description': 'Worksheet Project is missing required header Tax ID'},
-                    {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Project Title',
-                     'description': 'Column "Project Title" is not populated'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
-                     'description': 'Column "Tax ID" is not populated'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
-                     'description': 'must match format "date"'},
-                    {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
-                     'description': 'Column "Description" is not populated'},
-                    {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
-                     'description': 'Column "Reference" is not populated'},
-                    {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
-                     'description': 'Column "Sample Accession" is not populated'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
-                     'description': 'PRJEBNA does not exist or is private'},
-                    {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
-                     'description': '1234 is not a valid taxonomy code'},
-                    {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
-                     'description': 'alias1 present in Analysis not in Samples'},
-                    {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
-                     'description': 'alias_1,alias_2 present in Samples not in Analysis'}
-                ]
-            }
-        }
-
-        self.validator._collect_validation_workflow_results()
+    def run_collect_results(self, validator_to_run):
+        validator_to_run._collect_validation_workflow_results()
         # Drop report paths from comparison (test will fail if missing)
-        del self.validator.results['metadata_check']['json_report_path']
-        del self.validator.results['metadata_check']['spreadsheet_report_path']
-        del self.validator.results['sample_check']['report_path']
-        for file in self.validator.results['vcf_check'].values():
+        del validator_to_run.results['metadata_check']['json_report_path']
+        if 'spreadsheet_report_path' in validator_to_run.results['metadata_check']:
+            del validator_to_run.results['metadata_check']['spreadsheet_report_path']
+        del validator_to_run.results['sample_check']['report_path']
+        for file in validator_to_run.results['vcf_check'].values():
             del file['report_path']
-        for file in self.validator.results['assembly_check'].values():
+        for file in validator_to_run.results['assembly_check'].values():
             del file['report_path']
-        assert self.validator.results == expected_results

     def test__collect_validation_workflow_results_with_metadata_json(self):
-        expected_results = {
-            'shallow_validation': {'requested': False},
-            'vcf_check': {
-                'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0,
-                                     'critical_count': 0, 'critical_list': []}
-            },
-            'assembly_check': {
-                'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0,
-                                     'match': 247, 'total': 247}
-            },
-            'sample_check': {
-                'overall_differences': False,
-                'results_per_analysis': {
-                    'AA': {
-                        'difference': False,
-                        'more_metadata_submitted_files': [],
-                        'more_per_submitted_files_metadata': {},
-                        'more_submitted_files_metadata': []
-                    }
-                }
-            },
-            'fasta_check': {
-                'input_passed.fa': {'all_insdc': False, 'sequences': [
-                    {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'},
-                    {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'}
-                ]},
-            },
-            'metadata_check': {
-                'json_errors': [
-                    {'property': '/files', 'description': "should have required property 'files'"},
-                    {'property': '/project/title', 'description': "should have required property 'title'"},
-                    {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
-                    {'property': '/project/holdDate', 'description': 'must match format "date"'},
-                    {'property': '/analysis/0/description',
-                     'description': "should have required property 'description'"},
-                    {'property': '/analysis/0/referenceGenome',
-                     'description': "should have required property 'referenceGenome'"},
-                    {'property': '/sample/0/bioSampleAccession',
-                     'description': "should have required property 'bioSampleAccession'"},
-                    {'property': '/sample/0/bioSampleObject',
-                     'description': "should have required property 'bioSampleObject'"},
-                    {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
-                    {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
-                    {'property': '/sample/2/bioSampleObject/characteristics/taxId',
-                     'description': '1234 is not a valid taxonomy code'},
-                    {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
-                    {'property': '/sample/analysisAlias',
-                     'description': 'alias_1,alias_2 present in Samples not in Analysis'},
-                ],
-                'spreadsheet_errors': [
-                    {'sheet': 'Project', 'row': '', 'column': 'Tax ID',
-                     'description': 'Worksheet Project is missing required header Tax ID'}
-                ]
-            }
-        }
+        self.run_collect_results(self.validator_json)
+        assert self.validator_json.results == expected_validation_results

-        self.validator_json._collect_validation_workflow_results()
-        # Drop report paths from comparison (test will fail if missing)
-        del self.validator_json.results['metadata_check']['json_report_path']
-        del self.validator_json.results['sample_check']['report_path']
-        for file in self.validator_json.results['vcf_check'].values():
-            del file['report_path']
-        for file in self.validator_json.results['assembly_check'].values():
-            del file['report_path']
+    def test__collect_validation_workflow_results_with_metadata_xlsx(self):
+        expected_results = deepcopy(expected_validation_results)
+        expected_results['metadata_check']['spreadsheet_errors'] = [
+            # NB. Wouldn't normally get conversion error + validation errors together, but it is supported.
+            {'sheet': '', 'row': '', 'column': '',
+             'description': 'Error loading problem.xlsx: Exception()'},
+            {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Project Title',
+             'description': 'Column "Project Title" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
+             'description': 'Column "Tax ID" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
+             'description': 'must match format "date"'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
+             'description': 'Column "Description" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
+             'description': 'Column "Reference" is not populated'},
+            {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
+             'description': 'Column "Sample Accession" is not populated'},
+            {'sheet': 'Sample', 'row': 6, 'column': 'BioSample Name',
+             'description': 'Column "BioSample Name" is not populated'},
+            {'sheet': 'Sample', 'row': 6, 'column': 'Scientific Name',
+             'description': 'Column "Scientific Name" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
+             'description': 'PRJEBNA does not exist or is private'},
+            {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
+             'description': '1234 is not a valid taxonomy code'},
+            {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+             'description': 'alias1 present in Analysis not in Samples'},
+            {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+             'description': 'alias_1,alias_2 present in Samples not in Analysis'}
+        ]

-        assert self.validator_json.results == expected_results
+        self.run_collect_results(self.validator)
+        assert self.validator.results == expected_results

     def test_create_report(self):
         self.validator._collect_validation_workflow_results()
@@ -201,10 +171,22 @@ def test_parse_biovalidator_validation_results(self):
             {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
             {'property': '/project/holdDate', 'description': 'must match format "date"'},
             {'property': '/analysis/0/description', 'description': "should have required property 'description'"},
-            {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"},
-            {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
+            {'property': '/analysis/0/referenceGenome',
+             'description': "should have required property 'referenceGenome'"},
+            {'property': '/sample/0/bioSampleAccession',
+             'description': "should have required property 'bioSampleAccession'"},
             {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
-            {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
+            {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
+            {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/organism',
+             'description': "must have required property 'organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Organism',
+             'description': "must have required property 'Organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/species',
+             'description': "must have required property 'species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Species',
+             'description': "must have required property 'Species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics', 'description': 'must match a schema in anyOf'}
         ]

     def test_convert_biovalidator_validation_to_spreadsheet(self):
@@ -223,6 +205,19 @@
             {'property': '/sample/0/bioSampleObject',
              'description': "should have required property 'bioSampleObject'"},
             {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
+            # Missing BioSamples attributes
+            {'property': '/sample/3/bioSampleObject/name',
+             'description': "must have required property 'name'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/organism',
+             'description': "must have required property 'organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Organism',
+             'description': "must have required property 'Organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/species',
+             'description': "must have required property 'species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Species',
+             'description': "must have required property 'Species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics',
+             'description': 'must match a schema in anyOf'},
             # Semantic checks
             {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
             {'property': '/sample/2/bioSampleObject/characteristics/taxId',
@@ -248,6 +243,10 @@
              'description': 'Column "Reference" is not populated'},
             {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
              'description': 'Column "Sample Accession" is not populated'},
+            {'sheet': 'Sample', 'row': 6, 'column': 'BioSample Name',
+             'description': 'Column "BioSample Name" is not populated'},
+            {'sheet': 'Sample', 'row': 6, 'column': 'Scientific Name',
+             'description': 'Column "Scientific Name" is not populated'},
             {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
              'description': 'PRJEBNA does not exist or is private'},
             {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', 'description': '1234 is not a valid taxonomy code'},
@@ -261,8 +260,8 @@ def test_collect_conversion_errors(self):
         self.validator.results['metadata_check'] = {}
         self.validator._load_spreadsheet_conversion_errors()
         assert self.validator.results['metadata_check']['spreadsheet_errors'] == [{
-            'column': 'Tax ID',
-            'description': 'Worksheet Project is missing required header Tax ID',
+            'column': '',
+            'description': 'Error loading problem.xlsx: Exception()',
             'row': '',
-            'sheet': 'Project'
+            'sheet': ''
         }]
diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py
index 16c9006..b9c2e45 100644
--- a/tests/test_xlsx2json.py
+++ b/tests/test_xlsx2json.py
@@ -63,7 +63,7 @@ def test_create_xls_template(self):
         create_xls_template_from_yaml(metadata_file, self.conf_filename)
         assert os.path.exists(metadata_file)

-    def test_json_conversion_fails(self):
+    def test_json_conversion_succeeds_with_invalid_metadata(self):
         xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_test_fails.xlsx')
         self.parser = XlsxParser(xls_filename, self.conf_filename)
         output_json = os.path.join(self.resource_dir, 'EVA_Submission_test_output.json')
@@ -71,15 +71,22 @@ def test_json_conversion_fails(self):
         self.parser.json(output_json)
         self.parser.save_errors(errors_yaml)

-        assert not os.path.exists(output_json)
+        # confirm no errors
         with open(errors_yaml) as open_file:
             errors_data = yaml.safe_load(open_file)
-            assert errors_data == [{
-                'sheet': 'Project',
-                'row': '',
-                'column': 'Tax ID',
-                'description': 'Worksheet Project is missing required header Tax ID'
-            }]
+            assert errors_data == []
+
+        # json file exists but missing fields
+        assert os.path.exists(output_json)
+        with open(output_json) as open_file:
+            json_data = json.load(open_file)
+            assert sorted(json_data.keys()) == ['analysis', 'files', 'project', 'sample', 'submitterDetails']
+            # required field taxId is missing
+            assert 'taxId' not in json_data['project']
+            # novel sample is missing scientific name in characteristics and sample name
+            novel_sample = json_data['sample'][3]['bioSampleObject']
+            assert 'name' not in novel_sample
+            assert 'species' not in novel_sample['characteristics']

     def get_expected_json(self):
         return {