Skip to content

Commit

Permalink
add scientific name check and fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
apriltuesday committed Sep 11, 2024
1 parent f7d293e commit 71de02a
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 181 deletions.
37 changes: 29 additions & 8 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import re
import json
from copy import deepcopy

import yaml

from retry import retry
from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena, get_scientific_name_and_common_name
from ebi_eva_common_pyutils.logger import AppLogger

from eva_sub_cli.date_utils import check_date
Expand All @@ -22,9 +21,11 @@
BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
CHARACTERISTICS_KEY = 'characteristics'
TAX_ID_KEY = 'taxId'
# Characteristic keys that may carry a sample's scientific name; every key present on a sample
# is compared against the scientific name resolved for the sample's taxonomy id.
SCI_NAME_KEYS = ['species', 'Species', 'organism', 'Organism']
ANALYSIS_ALIAS_KEY = 'analysisAlias'
ANALYSIS_RUNS_KEY = 'runAccessions'


def cast_list(l, type_to_cast=str):
    """Lazily yield every element of ``l`` converted with ``type_to_cast``.

    :param l: iterable of values to convert
    :param type_to_cast: callable applied to each element (defaults to ``str``)
    :return: generator producing the converted values in input order
    """
    for e in l:
        yield type_to_cast(e)
Expand All @@ -36,7 +37,7 @@ def __init__(self, metadata, sample_checklist='ERC000011'):
self.sample_checklist = sample_checklist
self.metadata = metadata
self.errors = []
# Caches whether taxonomy code is valid or not
# Caches whether taxonomy code is valid or not, and maps to scientific name if valid
self.taxonomy_valid = {}
self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples')

Expand All @@ -47,8 +48,9 @@ def write_result_yaml(self, output_path):
def check_all(self):
    """Run every semantic check on the metadata, in a fixed order."""
    self.check_all_project_accessions()
    self.check_all_taxonomy_codes()
    # Must run after the taxonomy check: it reads the scientific names cached in
    # self.taxonomy_valid while the taxonomy codes are being validated.
    self.check_all_scientific_names()
    self.check_existing_biosamples()
    # The method has to be called; a bare attribute reference is a no-op statement.
    self.check_all_analysis_run_accessions()
    self.check_analysis_alias_coherence()

def check_all_project_accessions(self):
Expand All @@ -68,15 +70,34 @@ def check_all_taxonomy_codes(self):
self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
# Check sample taxonomies for novel samples
for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
if BIOSAMPLE_OBJECT_KEY in sample:
if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')

def check_all_scientific_names(self):
    """Check that all scientific names are consistent with taxonomy codes.

    For every sample that carries a BioSample object with a taxonomy id, the scientific name
    cached in ``self.taxonomy_valid`` for that id is compared (case-insensitively) with each
    characteristic listed in ``SCI_NAME_KEYS`` that is present on the sample. Every mismatch is
    reported through ``self.add_error``.

    Samples are skipped when their taxonomy id is not an integer, has not been cached, or is
    cached as invalid (falsy value): there is no reference name to compare against.
    """
    for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
        if BIOSAMPLE_OBJECT_KEY not in sample:
            continue
        characteristics = sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]
        if TAX_ID_KEY not in characteristics:
            continue
        # Get the scientific name from the taxonomy (if valid)
        try:
            tax_code = int(characteristics[TAX_ID_KEY][0]['text'])
        except (TypeError, ValueError):
            # Non-numeric taxonomy id: nothing to compare with.
            # NOTE(review): presumably already reported by the taxonomy code check - confirm.
            continue
        # .get() so a code that was never validated does not raise KeyError
        sci_name_from_tax = self.taxonomy_valid.get(tax_code)
        if not sci_name_from_tax:
            continue
        # Check if scientific name in sample matches
        for sci_name_key in SCI_NAME_KEYS:
            if sci_name_key not in characteristics:
                continue
            sci_name = characteristics[sci_name_key][0]['text']
            if sci_name_from_tax.lower() != sci_name.lower():
                self.add_error(
                    f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{sci_name_key}',
                    f'Species {sci_name} does not match taxonomy {tax_code} ({sci_name_from_tax})')

def check_all_analysis_run_accessions(self):
    """Check that the run accessions of every analysis are valid and exist in ENA.

    Analyses without run accessions (key absent or value empty) are skipped. Each accession is
    checked with ``self.check_accession_in_ena`` using the analysis' run-accession JSON path.
    """
    for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
        json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
        # .get() covers both a missing key and an empty value without raising KeyError
        for run_acc in analysis.get(ANALYSIS_RUNS_KEY) or []:
            self.check_accession_in_ena(run_acc, 'Run', json_path)

Expand All @@ -98,8 +119,8 @@ def check_taxonomy_code(self, taxonomy_code, json_path):
self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
else:
try:
download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
self.taxonomy_valid[taxonomy_code] = True
sci_name, _ = get_scientific_name_and_common_name(taxonomy_code)
self.taxonomy_valid[taxonomy_code] = sci_name
except Exception:
self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
self.taxonomy_valid[taxonomy_code] = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
property: /project/childProjects/1
- description: 1234 is not a valid taxonomy code
property: /sample/2/bioSampleObject/characteristics/taxId
- description: Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)
property: /sample/1/bioSampleObject/characteristics/Organism
- description: alias1 present in Analysis not in Samples
property: /sample/analysisAlias
- description: alias_1,alias_2 present in Samples not in Analysis
Expand Down
45 changes: 43 additions & 2 deletions tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def test_check_all_taxonomy_codes(self):
]
}
checker = SemanticMetadataChecker(metadata)
with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name:
# Mock should only be called once per taxonomy code
m_ena_download.side_effect = [True, Exception('problem downloading')]
m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')]
checker.check_all_taxonomy_codes()
self.assertEqual(checker.errors, [
{
Expand All @@ -85,6 +85,47 @@ def test_check_all_taxonomy_codes(self):
}
])

def test_check_all_scientific_names(self):
    """Only a species name that disagrees with the cached taxonomy name is reported.

    Sample 0 matches up to letter case, sample 1 mismatches, and sample 2 has a taxonomy id
    cached as invalid and no species characteristic, so it must be skipped silently.
    """
    metadata = {
        "sample": [
            {
                "bioSampleObject": {
                    "characteristics": {
                        "taxId": [{"text": "9606"}],
                        "Organism": [{"text": "homo sapiens"}]
                    }
                }
            },
            {
                "bioSampleObject": {
                    "characteristics": {
                        "taxId": [{"text": "9606"}],
                        "Organism": [{"text": "sheep sapiens"}]
                    }
                }
            },
            {
                "bioSampleObject": {
                    "characteristics": {
                        "taxId": [{"text": "1234"}]
                    }
                }
            }
        ]
    }
    checker = SemanticMetadataChecker(metadata)
    # Pre-populate the taxonomy cache so the check runs without calling check_all_taxonomy_codes
    checker.taxonomy_valid = {
        1234: False,
        9606: "Homo sapiens"
    }
    checker.check_all_scientific_names()
    self.assertEqual(checker.errors, [
        {
            'property': '/sample/1/bioSampleObject/characteristics/Organism',
            'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'
        }
    ])

def test_check_existing_biosamples_with_checklist(self):
checker = SemanticMetadataChecker(metadata)
with patch.object(SemanticMetadataChecker, '_get_biosample',
Expand Down
Loading

0 comments on commit 71de02a

Please sign in to comment.