add scientific name check and fix tests

EBIvariation · Sep 11, 2024 · 71de02a · 71de02a
1 parent f7d293e
commit 71de02a
Show file tree

Hide file tree

Showing 4 changed files with 189 additions and 181 deletions.
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
@@ -1,12 +1,11 @@
-import re
 import json
 from copy import deepcopy
 
 import yaml
 
 from retry import retry
 from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
-from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
+from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena, get_scientific_name_and_common_name
 from ebi_eva_common_pyutils.logger import AppLogger
 
 from eva_sub_cli.date_utils import check_date
@@ -22,9 +21,11 @@
 BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
 CHARACTERISTICS_KEY = 'characteristics'
 TAX_ID_KEY = 'taxId'
+SCI_NAME_KEYS = ['species', 'Species', 'organism', 'Organism']
 ANALYSIS_ALIAS_KEY = 'analysisAlias'
 ANALYSIS_RUNS_KEY = 'runAccessions'
 
+
 def cast_list(l, type_to_cast=str):
     for e in l:
         yield type_to_cast(e)
@@ -36,7 +37,7 @@ def __init__(self, metadata, sample_checklist='ERC000011'):
         self.sample_checklist = sample_checklist
         self.metadata = metadata
         self.errors = []
-        # Caches whether taxonomy code is valid or not
+        # Caches whether taxonomy code is valid or not, and maps to scientific name if valid
         self.taxonomy_valid = {}
         self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples')
 
@@ -47,8 +48,9 @@ def write_result_yaml(self, output_path):
     def check_all(self):
+        """Run every semantic check on the metadata, one after the other.
+
+        The checks cover project accessions, taxonomy codes, scientific names, existing BioSamples,
+        analysis run accessions and analysis alias coherence. The scientific name check reads the
+        ``taxonomy_valid`` cache written by the taxonomy code check, so it is run after it.
+        """
         self.check_all_project_accessions()
         self.check_all_taxonomy_codes()
+        self.check_all_scientific_names()
         self.check_existing_biosamples()
-        self.check_all_analysis_run_accessions
+        self.check_all_analysis_run_accessions()
         self.check_analysis_alias_coherence()
 
     def check_all_project_accessions(self):
@@ -68,15 +70,34 @@ def check_all_taxonomy_codes(self):
             self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
-            if BIOSAMPLE_OBJECT_KEY in sample:
+            if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
                 self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
                                          f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
 
+    def check_all_scientific_names(self):
+        """Check that the scientific names given for samples are consistent with their taxonomy codes.
+
+        Only samples that carry a BioSample object with a taxonomy id are considered. The scientific name
+        cached in ``self.taxonomy_valid`` for that taxonomy id is compared, ignoring case, with every
+        characteristic listed in ``SCI_NAME_KEYS`` that the sample provides. Each mismatch is reported
+        through ``self.add_error`` against the JSON path of the offending characteristic.
+
+        Samples whose taxonomy id is cached as invalid, or is not in the cache at all, are skipped.
+        """
+        for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
+            if BIOSAMPLE_OBJECT_KEY not in sample:
+                continue
+            characteristics = sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]
+            if TAX_ID_KEY not in characteristics:
+                continue
+            tax_code = int(characteristics[TAX_ID_KEY][0]['text'])
+            # Use .get() so a taxonomy code that was never looked up is skipped instead of raising KeyError.
+            # The cached value is False for an invalid code and the scientific name for a valid one.
+            sci_name_from_tax = self.taxonomy_valid.get(tax_code)
+            if not sci_name_from_tax:
+                continue
+            # Check if any scientific name given in the sample disagrees with the taxonomy
+            for sci_name_key in SCI_NAME_KEYS:
+                if sci_name_key not in characteristics:
+                    continue
+                sci_name = characteristics[sci_name_key][0]['text']
+                if sci_name_from_tax.lower() != sci_name.lower():
+                    self.add_error(
+                        f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{sci_name_key}',
+                        f'Species {sci_name} does not match taxonomy {tax_code} ({sci_name_from_tax})')
+
     def check_all_analysis_run_accessions(self):
         """Check that the Run accession are valid and exist in ENA"""
         for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
             json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
-            if analysis[ANALYSIS_RUNS_KEY]:
+            if ANALYSIS_RUNS_KEY in analysis and analysis[ANALYSIS_RUNS_KEY]:
                 for run_acc in analysis[ANALYSIS_RUNS_KEY]:
                     self.check_accession_in_ena(run_acc, 'Run', json_path)
 
@@ -98,8 +119,8 @@ def check_taxonomy_code(self, taxonomy_code, json_path):
                 self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
         else:
             try:
-                download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
-                self.taxonomy_valid[taxonomy_code] = True
+                sci_name, _ = get_scientific_name_and_common_name(taxonomy_code)
+                self.taxonomy_valid[taxonomy_code] = sci_name
             except Exception:
                 self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
                 self.taxonomy_valid[taxonomy_code] = False

diff --git a/...ources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/...ources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
@@ -2,6 +2,8 @@
   property: /project/childProjects/1
 - description: 1234 is not a valid taxonomy code
   property: /sample/2/bioSampleObject/characteristics/taxId
+- description: Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)
+  property: /sample/1/bioSampleObject/characteristics/Organism
 - description: alias1 present in Analysis not in Samples
   property: /sample/analysisAlias
 - description: alias_1,alias_2 present in Samples not in Analysis

diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
@@ -74,9 +74,9 @@ def test_check_all_taxonomy_codes(self):
             ]
         }
         checker = SemanticMetadataChecker(metadata)
-        with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
+        with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name:
             # Mock should only be called once per taxonomy code
-            m_ena_download.side_effect = [True, Exception('problem downloading')]
+            m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')]
             checker.check_all_taxonomy_codes()
             self.assertEqual(checker.errors, [
                 {
@@ -85,6 +85,47 @@ def test_check_all_taxonomy_codes(self):
                 }
             ])
 
+    def test_check_all_scientific_names(self):
+        """Only the sample whose organism disagrees with its taxonomy's scientific name is reported."""
+        # (taxId, Organism) per sample; None means the sample has no Organism characteristic
+        tax_and_organism = [
+            ("9606", "homo sapiens"),
+            ("9606", "sheep sapiens"),
+            ("1234", None),
+        ]
+        samples = []
+        for tax_id, organism in tax_and_organism:
+            characteristics = {"taxId": [{"text": tax_id}]}
+            if organism is not None:
+                characteristics["Organism"] = [{"text": organism}]
+            samples.append({"bioSampleObject": {"characteristics": characteristics}})
+        checker = SemanticMetadataChecker({"sample": samples})
+        # Fill the taxonomy cache by hand: 9606 maps to its scientific name, 1234 is marked invalid
+        checker.taxonomy_valid = {
+            9606: "Homo sapiens",
+            1234: False
+        }
+        checker.check_all_scientific_names()
+        expected_errors = [
+            {
+                'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)',
+                'property': '/sample/1/bioSampleObject/characteristics/Organism'
+            }
+        ]
+        self.assertEqual(checker.errors, expected_errors)
+
     def test_check_existing_biosamples_with_checklist(self):
         checker = SemanticMetadataChecker(metadata)
         with patch.object(SemanticMetadataChecker, '_get_biosample',