From 71de02a88409adb40bfbe518c3cb5f44d3fb46e5 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 11 Sep 2024 09:31:29 +0100 Subject: [PATCH] add scientific name check and fix tests --- eva_sub_cli/semantic_metadata.py | 37 ++- .../metadata_semantic_check.yml | 2 + tests/test_semantic_metadata.py | 45 ++- tests/test_validator.py | 286 +++++++----------- 4 files changed, 189 insertions(+), 181 deletions(-) diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index ced4365..8417144 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -1,4 +1,3 @@ -import re import json from copy import deepcopy @@ -6,7 +5,7 @@ from retry import retry from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator -from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena +from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena, get_scientific_name_and_common_name from ebi_eva_common_pyutils.logger import AppLogger from eva_sub_cli.date_utils import check_date @@ -22,9 +21,11 @@ BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession' CHARACTERISTICS_KEY = 'characteristics' TAX_ID_KEY = 'taxId' +SCI_NAME_KEYS = ['species', 'Species', 'organism', 'Organism'] ANALYSIS_ALIAS_KEY = 'analysisAlias' ANALYSIS_RUNS_KEY = 'runAccessions' + def cast_list(l, type_to_cast=str): for e in l: yield type_to_cast(e) @@ -36,7 +37,7 @@ def __init__(self, metadata, sample_checklist='ERC000011'): self.sample_checklist = sample_checklist self.metadata = metadata self.errors = [] - # Caches whether taxonomy code is valid or not + # Caches whether taxonomy code is valid or not, and maps to scientific name if valid self.taxonomy_valid = {} self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples') @@ -47,8 +48,9 @@ def write_result_yaml(self, output_path): def check_all(self): self.check_all_project_accessions() self.check_all_taxonomy_codes() + self.check_all_scientific_names() self.check_existing_biosamples() - self.check_all_analysis_run_accessions + self.check_all_analysis_run_accessions() self.check_analysis_alias_coherence() def check_all_project_accessions(self): @@ -68,15 +70,34 @@ def check_all_taxonomy_codes(self): self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}') # Check sample taxonomies for novel samples for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): - if BIOSAMPLE_OBJECT_KEY in sample: + if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]: self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'], f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') + def check_all_scientific_names(self): + """Check that all scientific names are consistent with taxonomy codes.""" + for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): + if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]: + characteristics = sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY] + # Get the scientific name from the taxonomy (if valid) + tax_code = int(characteristics[TAX_ID_KEY][0]['text']) + sci_name_from_tax = self.taxonomy_valid[tax_code] + if not sci_name_from_tax: + continue + # Check if scientific name in sample matches + for sci_name_key in SCI_NAME_KEYS: + if sci_name_key in characteristics: + sci_name = characteristics[sci_name_key][0]['text'] + if sci_name_from_tax.lower() != sci_name.lower(): + self.add_error( + f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{sci_name_key}', + f'Species {sci_name} does not match taxonomy {tax_code} ({sci_name_from_tax})') + def check_all_analysis_run_accessions(self): """Check that the Run accession are valid and exist in ENA""" for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]): json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}' - if analysis[ANALYSIS_RUNS_KEY]: + if ANALYSIS_RUNS_KEY in analysis and analysis[ANALYSIS_RUNS_KEY]: for run_acc in analysis[ANALYSIS_RUNS_KEY]: self.check_accession_in_ena(run_acc, 'Run', json_path) @@ -98,8 +119,8 @@ def check_taxonomy_code(self, taxonomy_code, json_path): self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') else: try: - download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}') - self.taxonomy_valid[taxonomy_code] = True + sci_name, _ = get_scientific_name_and_common_name(taxonomy_code) + self.taxonomy_valid[taxonomy_code] = sci_name except Exception: self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') self.taxonomy_valid[taxonomy_code] = False diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml index c0c130c..39ed5d0 100644 --- a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml +++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml @@ -2,6 +2,8 @@ property: /project/childProjects/1 - description: 1234 is not a valid taxonomy code property: /sample/2/bioSampleObject/characteristics/taxId +- description: Species sheep sapiens does not match taxonomy 9606 (Homo sapiens) + property: /sample/1/bioSampleObject/characteristics/Organism - description: alias1 present in Analysis not in Samples property: /sample/analysisAlias - description: alias_1,alias_2 present in Samples not in Analysis diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py index fec6214..35d37c0 100644 --- a/tests/test_semantic_metadata.py +++ b/tests/test_semantic_metadata.py @@ -74,9 +74,9 @@ def test_check_all_taxonomy_codes(self): ] } checker = SemanticMetadataChecker(metadata) - with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: + with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name: # Mock should only be called once per taxonomy code - m_ena_download.side_effect = [True, Exception('problem downloading')] + m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')] checker.check_all_taxonomy_codes() self.assertEqual(checker.errors, [ { @@ -85,6 +85,47 @@ def test_check_all_taxonomy_codes(self): } ]) + def test_check_all_scientific_names(self): + metadata = { + "sample": [ + { + "bioSampleObject": { + "characteristics": { + "taxId": [{"text": "9606"}], + "Organism": [{"text": "homo sapiens"}] + } + } + }, + { + "bioSampleObject": { + "characteristics": { + "taxId": [{"text": "9606"}], + "Organism": [{"text": "sheep sapiens"}] + } + } + }, + { + "bioSampleObject": { + "characteristics": { + "taxId": [{"text": "1234"}] + } + } + } + ] + } + checker = SemanticMetadataChecker(metadata) + checker.taxonomy_valid = { + 1234: False, + 9606: "Homo sapiens" + } + checker.check_all_scientific_names() + self.assertEqual(checker.errors, [ + { + 'property': '/sample/1/bioSampleObject/characteristics/Organism', + 'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)' + } + ]) + def test_check_existing_biosamples_with_checklist(self): checker = SemanticMetadataChecker(metadata) with patch.object(SemanticMetadataChecker, '_get_biosample', diff --git a/tests/test_validator.py b/tests/test_validator.py index 6a65a87..0cfaf73 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1,10 +1,81 @@ import os.path +from copy import deepcopy from unittest import TestCase from eva_sub_cli.validators.validator import Validator, VALIDATION_OUTPUT_DIR from tests.test_utils import create_mapping_file +expected_validation_results = { + 'shallow_validation': {'requested': False}, + 'vcf_check': { + 'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, + 'critical_count': 0, 'critical_list': []} + }, + 'assembly_check': { + 'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0, + 'match': 247, 'total': 247} + }, + 'sample_check': { + 'overall_differences': False, + 'results_per_analysis': { + 'AA': { + 'difference': False, + 'more_metadata_submitted_files': [], + 'more_per_submitted_files_metadata': {}, + 'more_submitted_files_metadata': [] + } + } + }, + 'fasta_check': { + 'input_passed.fa': {'all_insdc': False, 'sequences': [ + {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'}, + {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'} + ]}, + }, + 'metadata_check': { + 'json_errors': [ + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, + {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, + {'property': '/project/holdDate', 'description': 'must match format "date"'}, + {'property': '/analysis/0/description', + 'description': "should have required property 'description'"}, + {'property': '/analysis/0/referenceGenome', + 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0/bioSampleAccession', + 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0/bioSampleObject', + 'description': "should have required property 'bioSampleObject'"}, + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, + {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"}, + {'property': '/sample/3/bioSampleObject/characteristics/organism', + 'description': "must have required property 'organism'"}, + {'property': '/sample/3/bioSampleObject/characteristics/Organism', + 'description': "must have required property 'Organism'"}, + {'property': '/sample/3/bioSampleObject/characteristics/species', + 'description': "must have required property 'species'"}, + {'property': '/sample/3/bioSampleObject/characteristics/Species', + 'description': "must have required property 'Species'"}, + {'property': '/sample/3/bioSampleObject/characteristics', + 'description': 'must match a schema in anyOf'}, + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, + {'property': '/sample/2/bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code'}, + {'property': '/sample/1/bioSampleObject/characteristics/Organism', + 'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'}, + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'}, + ], + 'spreadsheet_errors': [ + {'sheet': '', 'row': '', 'column': '', + 'description': 'Error loading problem.xlsx: Exception()'} + ] + } +} + + class TestValidator(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') vcf_files = os.path.join(resource_dir, 'vcf_files') @@ -34,184 +105,57 @@ def tearDown(self) -> None: if os.path.exists(f): os.remove(f) - def test__collect_validation_workflow_results_with_metadata_xlsx(self): - expected_results = { - 'shallow_validation': {'requested': False}, - 'vcf_check': { - 'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, 'critical_count': 0, 'critical_list': []} - }, - 'assembly_check': { - 'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0, 'match': 247, 'total': 247} - }, - 'sample_check': { - 'overall_differences': False, - 'results_per_analysis': { - 'AA': { - 'difference': False, - 'more_metadata_submitted_files': [], - 'more_per_submitted_files_metadata': {}, - 'more_submitted_files_metadata': [] - } - } - }, - 'fasta_check': { - 'input_passed.fa': {'all_insdc': False, 'sequences': [ - {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'}, - {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'} - ]}, - }, - 'metadata_check': { - 'json_errors': [ - {'property': '/files', 'description': "should have required property 'files'"}, - {'property': '/project/title', 'description': "should have required property 'title'"}, - {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, - {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0/description', 'description': "should have required property 'description'"}, - {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, - {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, - {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"}, - {'property': '/sample/3/bioSampleObject/characteristics/organism', - 'description': "must have required property 'organism'"}, - {'property': '/sample/3/bioSampleObject/characteristics/Organism', - 'description': "must have required property 'Organism'"}, - {'property': '/sample/3/bioSampleObject/characteristics/species', - 'description': "must have required property 'species'"}, - {'property': '/sample/3/bioSampleObject/characteristics/Species', - 'description': "must have required property 'Species'"}, - {'property': '/sample/3/bioSampleObject/characteristics', - 'description': 'must match a schema in anyOf'}, - {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, - {'property': '/sample/2/bioSampleObject/characteristics/taxId', - 'description': '1234 is not a valid taxonomy code'}, - {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, - {'property': '/sample/analysisAlias', - 'description': 'alias_1,alias_2 present in Samples not in Analysis'}, - ], - 'spreadsheet_errors': [ - # NB. Wouldn't normally get conversion error + validation errors together, but it is supported. - {'sheet': '', 'row': '', 'column': '', - 'description': 'Error loading problem.xlsx: Exception()'}, - {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, - {'sheet': 'Project', 'row': 2, 'column': 'Project Title', - 'description': 'Column "Project Title" is not populated'}, - {'sheet': 'Project', 'row': 2, 'column': 'Tax ID', - 'description': 'Column "Tax ID" is not populated'}, - {'sheet': 'Project', 'row': 2, 'column': 'Hold Date', - 'description': 'must match format "date"'}, - {'sheet': 'Analysis', 'row': 2, 'column': 'Description', - 'description': 'Column "Description" is not populated'}, - {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', - 'description': 'Column "Reference" is not populated'}, - {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', - 'description': 'Column "Sample Accession" is not populated'}, - {'sheet': 'Sample', 'row': 6, 'column': 'BioSample Name', - 'description': 'Column "BioSample Name" is not populated'}, - {'sheet': 'Sample', 'row': 6, 'column': 'Scientific Name', - 'description': 'Column "Scientific Name" is not populated'}, - {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)', - 'description': 'PRJEBNA does not exist or is private'}, - {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', - 'description': '1234 is not a valid taxonomy code'}, - {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', - 'description': 'alias1 present in Analysis not in Samples'}, - {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', - 'description': 'alias_1,alias_2 present in Samples not in Analysis'} - ] - } - } - - self.validator._collect_validation_workflow_results() + def run_collect_results(self, validator_to_run): + validator_to_run._collect_validation_workflow_results() # Drop report paths from comparison (test will fail if missing) - del self.validator.results['metadata_check']['json_report_path'] - del self.validator.results['metadata_check']['spreadsheet_report_path'] - del self.validator.results['sample_check']['report_path'] - for file in self.validator.results['vcf_check'].values(): + del validator_to_run.results['metadata_check']['json_report_path'] + if 'spreadsheet_report_path' in validator_to_run.results['metadata_check']: + del validator_to_run.results['metadata_check']['spreadsheet_report_path'] + del validator_to_run.results['sample_check']['report_path'] + for file in validator_to_run.results['vcf_check'].values(): del file['report_path'] - for file in self.validator.results['assembly_check'].values(): + for file in validator_to_run.results['assembly_check'].values(): del file['report_path'] - assert self.validator.results == expected_results - def test__collect_validation_workflow_results_with_metadata_json(self): - expected_results = { - 'shallow_validation': {'requested': False}, - 'vcf_check': { - 'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, - 'critical_count': 0, 'critical_list': []} - }, - 'assembly_check': { - 'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0, - 'match': 247, 'total': 247} - }, - 'sample_check': { - 'overall_differences': False, - 'results_per_analysis': { - 'AA': { - 'difference': False, - 'more_metadata_submitted_files': [], - 'more_per_submitted_files_metadata': {}, - 'more_submitted_files_metadata': [] - } - } - }, - 'fasta_check': { - 'input_passed.fa': {'all_insdc': False, 'sequences': [ - {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'}, - {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'} - ]}, - }, - 'metadata_check': { - 'json_errors': [ - {'property': '/files', 'description': "should have required property 'files'"}, - {'property': '/project/title', 'description': "should have required property 'title'"}, - {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, - {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0/description', - 'description': "should have required property 'description'"}, - {'property': '/analysis/0/referenceGenome', - 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0/bioSampleAccession', - 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0/bioSampleObject', - 'description': "should have required property 'bioSampleObject'"}, - {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, - {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"}, - {'property': '/sample/3/bioSampleObject/characteristics/organism', - 'description': "must have required property 'organism'"}, - {'property': '/sample/3/bioSampleObject/characteristics/Organism', - 'description': "must have required property 'Organism'"}, - {'property': '/sample/3/bioSampleObject/characteristics/species', - 'description': "must have required property 'species'"}, - {'property': '/sample/3/bioSampleObject/characteristics/Species', - 'description': "must have required property 'Species'"}, - {'property': '/sample/3/bioSampleObject/characteristics', - 'description': 'must match a schema in anyOf'}, - {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, - {'property': '/sample/2/bioSampleObject/characteristics/taxId', - 'description': '1234 is not a valid taxonomy code'}, - {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, - {'property': '/sample/analysisAlias', - 'description': 'alias_1,alias_2 present in Samples not in Analysis'}, - ], - 'spreadsheet_errors': [ - {'sheet': '', 'row': '', 'column': '', - 'description': 'Error loading problem.xlsx: Exception()'} - ] - } - } + self.run_collect_results(self.validator_json) + assert self.validator_json.results == expected_validation_results - self.validator_json._collect_validation_workflow_results() - # Drop report paths from comparison (test will fail if missing) - del self.validator_json.results['metadata_check']['json_report_path'] - del self.validator_json.results['sample_check']['report_path'] - for file in self.validator_json.results['vcf_check'].values(): - del file['report_path'] - for file in self.validator_json.results['assembly_check'].values(): - del file['report_path'] + def test__collect_validation_workflow_results_with_metadata_xlsx(self): + expected_results = deepcopy(expected_validation_results) + expected_results['metadata_check']['spreadsheet_errors'] = [ + # NB. Wouldn't normally get conversion error + validation errors together, but it is supported. + {'sheet': '', 'row': '', 'column': '', + 'description': 'Error loading problem.xlsx: Exception()'}, + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': 2, 'column': 'Project Title', + 'description': 'Column "Project Title" is not populated'}, + {'sheet': 'Project', 'row': 2, 'column': 'Tax ID', + 'description': 'Column "Tax ID" is not populated'}, + {'sheet': 'Project', 'row': 2, 'column': 'Hold Date', + 'description': 'must match format "date"'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', + 'description': 'Column "Description" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', + 'description': 'Column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', + 'description': 'Column "Sample Accession" is not populated'}, + {'sheet': 'Sample', 'row': 6, 'column': 'BioSample Name', + 'description': 'Column "BioSample Name" is not populated'}, + {'sheet': 'Sample', 'row': 6, 'column': 'Scientific Name', + 'description': 'Column "Scientific Name" is not populated'}, + {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)', + 'description': 'PRJEBNA does not exist or is private'}, + {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', + 'description': '1234 is not a valid taxonomy code'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias1 present in Analysis not in Samples'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'} + ] - assert self.validator_json.results == expected_results + self.run_collect_results(self.validator) + assert self.validator.results == expected_results def test_create_report(self): self.validator._collect_validation_workflow_results()