From 71de02a88409adb40bfbe518c3cb5f44d3fb46e5 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Wed, 11 Sep 2024 09:31:29 +0100
Subject: [PATCH] add scientific name check and fix tests

---
 eva_sub_cli/semantic_metadata.py              |  37 ++-
 .../metadata_semantic_check.yml               |   2 +
 tests/test_semantic_metadata.py               |  45 ++-
 tests/test_validator.py                       | 286 +++++++-----------
 4 files changed, 189 insertions(+), 181 deletions(-)

diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
index ced4365..8417144 100644
--- a/eva_sub_cli/semantic_metadata.py
+++ b/eva_sub_cli/semantic_metadata.py
@@ -1,4 +1,3 @@
-import re
 import json
 from copy import deepcopy
 
@@ -6,7 +5,7 @@
 
 from retry import retry
 from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
-from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
+from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena, get_scientific_name_and_common_name
 from ebi_eva_common_pyutils.logger import AppLogger
 
 from eva_sub_cli.date_utils import check_date
@@ -22,9 +21,11 @@
 BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
 CHARACTERISTICS_KEY = 'characteristics'
 TAX_ID_KEY = 'taxId'
+SCI_NAME_KEYS = ['species', 'Species', 'organism', 'Organism']
 ANALYSIS_ALIAS_KEY = 'analysisAlias'
 ANALYSIS_RUNS_KEY = 'runAccessions'
 
+
 def cast_list(l, type_to_cast=str):
     for e in l:
         yield type_to_cast(e)
@@ -36,7 +37,7 @@ def __init__(self, metadata, sample_checklist='ERC000011'):
         self.sample_checklist = sample_checklist
         self.metadata = metadata
         self.errors = []
-        # Caches whether taxonomy code is valid or not
+        # Caches whether taxonomy code is valid or not, and maps to scientific name if valid
         self.taxonomy_valid = {}
         self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples')
 
@@ -47,8 +48,9 @@ def write_result_yaml(self, output_path):
     def check_all(self):
         self.check_all_project_accessions()
         self.check_all_taxonomy_codes()
+        self.check_all_scientific_names()
         self.check_existing_biosamples()
-        self.check_all_analysis_run_accessions
+        self.check_all_analysis_run_accessions()
         self.check_analysis_alias_coherence()
 
     def check_all_project_accessions(self):
@@ -68,15 +70,34 @@ def check_all_taxonomy_codes(self):
             self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
-            if BIOSAMPLE_OBJECT_KEY in sample:
+            if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
                 self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
                                          f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
 
+    def check_all_scientific_names(self):
+        """Check that all scientific names are consistent with taxonomy codes."""
+        for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
+            if BIOSAMPLE_OBJECT_KEY in sample and TAX_ID_KEY in sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]:
+                characteristics = sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY]
+                # Get the scientific name from the taxonomy (if valid)
+                tax_code = int(characteristics[TAX_ID_KEY][0]['text'])
+                sci_name_from_tax = self.taxonomy_valid[tax_code]
+                if not sci_name_from_tax:
+                    continue
+                # Check if scientific name in sample matches
+                for sci_name_key in SCI_NAME_KEYS:
+                    if sci_name_key in characteristics:
+                        sci_name = characteristics[sci_name_key][0]['text']
+                        if sci_name_from_tax.lower() != sci_name.lower():
+                            self.add_error(
+                                f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{sci_name_key}',
+                                f'Species {sci_name} does not match taxonomy {tax_code} ({sci_name_from_tax})')
+
     def check_all_analysis_run_accessions(self):
         """Check that the Run accession are valid and exist in ENA"""
         for idx, analysis in enumerate(self.metadata[ANALYSIS_KEY]):
             json_path = f'/{ANALYSIS_KEY}/{idx}/{ANALYSIS_RUNS_KEY}'
-            if analysis[ANALYSIS_RUNS_KEY]:
+            if ANALYSIS_RUNS_KEY in analysis and analysis[ANALYSIS_RUNS_KEY]:
                 for run_acc in analysis[ANALYSIS_RUNS_KEY]:
                     self.check_accession_in_ena(run_acc, 'Run', json_path)
 
@@ -98,8 +119,8 @@ def check_taxonomy_code(self, taxonomy_code, json_path):
                 self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
         else:
             try:
-                download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
-                self.taxonomy_valid[taxonomy_code] = True
+                sci_name, _ = get_scientific_name_and_common_name(taxonomy_code)
+                self.taxonomy_valid[taxonomy_code] = sci_name
             except Exception:
                 self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
                 self.taxonomy_valid[taxonomy_code] = False
diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
index c0c130c..39ed5d0 100644
--- a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
+++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
@@ -2,6 +2,8 @@
   property: /project/childProjects/1
 - description: 1234 is not a valid taxonomy code
   property: /sample/2/bioSampleObject/characteristics/taxId
+- description: Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)
+  property: /sample/1/bioSampleObject/characteristics/Organism
 - description: alias1 present in Analysis not in Samples
   property: /sample/analysisAlias
 - description: alias_1,alias_2 present in Samples not in Analysis
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
index fec6214..35d37c0 100644
--- a/tests/test_semantic_metadata.py
+++ b/tests/test_semantic_metadata.py
@@ -74,9 +74,9 @@ def test_check_all_taxonomy_codes(self):
             ]
         }
         checker = SemanticMetadataChecker(metadata)
-        with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
+        with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name:
             # Mock should only be called once per taxonomy code
-            m_ena_download.side_effect = [True, Exception('problem downloading')]
+            m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')]
             checker.check_all_taxonomy_codes()
             self.assertEqual(checker.errors, [
                 {
@@ -85,6 +85,47 @@ def test_check_all_taxonomy_codes(self):
                 }
             ])
 
+    def test_check_all_scientific_names(self):
+        metadata = {
+            "sample": [
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "9606"}],
+                            "Organism": [{"text": "homo sapiens"}]
+                        }
+                    }
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "9606"}],
+                            "Organism": [{"text": "sheep sapiens"}]
+                        }
+                    }
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": [{"text": "1234"}]
+                        }
+                    }
+                }
+            ]
+        }
+        checker = SemanticMetadataChecker(metadata)
+        checker.taxonomy_valid = {
+            1234: False,
+            9606: "Homo sapiens"
+        }
+        checker.check_all_scientific_names()
+        self.assertEqual(checker.errors, [
+            {
+                'property': '/sample/1/bioSampleObject/characteristics/Organism',
+                'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'
+            }
+        ])
+
     def test_check_existing_biosamples_with_checklist(self):
         checker = SemanticMetadataChecker(metadata)
         with patch.object(SemanticMetadataChecker, '_get_biosample',
diff --git a/tests/test_validator.py b/tests/test_validator.py
index 6a65a87..0cfaf73 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -1,10 +1,81 @@
 import os.path
+from copy import deepcopy
 from unittest import TestCase
 
 from eva_sub_cli.validators.validator import Validator, VALIDATION_OUTPUT_DIR
 from tests.test_utils import create_mapping_file
 
 
+expected_validation_results = {
+    'shallow_validation': {'requested': False},
+    'vcf_check': {
+        'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0,
+                             'critical_count': 0, 'critical_list': []}
+    },
+    'assembly_check': {
+        'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0,
+                             'match': 247, 'total': 247}
+    },
+    'sample_check': {
+        'overall_differences': False,
+        'results_per_analysis': {
+            'AA': {
+                'difference': False,
+                'more_metadata_submitted_files': [],
+                'more_per_submitted_files_metadata': {},
+                'more_submitted_files_metadata': []
+            }
+        }
+    },
+    'fasta_check': {
+        'input_passed.fa': {'all_insdc': False, 'sequences': [
+            {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'},
+            {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'}
+        ]},
+    },
+    'metadata_check': {
+        'json_errors': [
+            {'property': '/files', 'description': "should have required property 'files'"},
+            {'property': '/project/title', 'description': "should have required property 'title'"},
+            {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
+            {'property': '/project/holdDate', 'description': 'must match format "date"'},
+            {'property': '/analysis/0/description',
+             'description': "should have required property 'description'"},
+            {'property': '/analysis/0/referenceGenome',
+             'description': "should have required property 'referenceGenome'"},
+            {'property': '/sample/0/bioSampleAccession',
+             'description': "should have required property 'bioSampleAccession'"},
+            {'property': '/sample/0/bioSampleObject',
+             'description': "should have required property 'bioSampleObject'"},
+            {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
+            {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/organism',
+             'description': "must have required property 'organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Organism',
+             'description': "must have required property 'Organism'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/species',
+             'description': "must have required property 'species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics/Species',
+             'description': "must have required property 'Species'"},
+            {'property': '/sample/3/bioSampleObject/characteristics',
+             'description': 'must match a schema in anyOf'},
+            {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
+            {'property': '/sample/2/bioSampleObject/characteristics/taxId',
+             'description': '1234 is not a valid taxonomy code'},
+            {'property': '/sample/1/bioSampleObject/characteristics/Organism',
+             'description': 'Species sheep sapiens does not match taxonomy 9606 (Homo sapiens)'},
+            {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
+            {'property': '/sample/analysisAlias',
+             'description': 'alias_1,alias_2 present in Samples not in Analysis'},
+        ],
+        'spreadsheet_errors': [
+            {'sheet': '', 'row': '', 'column': '',
+             'description': 'Error loading problem.xlsx: Exception()'}
+        ]
+    }
+}
+
+
 class TestValidator(TestCase):
     resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
     vcf_files = os.path.join(resource_dir, 'vcf_files')
@@ -34,184 +105,57 @@ def tearDown(self) -> None:
             if os.path.exists(f):
                 os.remove(f)
 
-    def test__collect_validation_workflow_results_with_metadata_xlsx(self):
-        expected_results = {
-            'shallow_validation': {'requested': False},
-            'vcf_check': {
-                'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, 'critical_count': 0, 'critical_list': []}
-            },
-            'assembly_check': {
-                'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0, 'match': 247, 'total': 247}
-            },
-            'sample_check': {
-                'overall_differences': False,
-                'results_per_analysis': {
-                    'AA': {
-                        'difference': False,
-                        'more_metadata_submitted_files': [],
-                        'more_per_submitted_files_metadata': {},
-                        'more_submitted_files_metadata': []
-                    }
-                }
-            },
-            'fasta_check': {
-                'input_passed.fa': {'all_insdc': False, 'sequences': [
-                    {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'},
-                    {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'}
-                ]},
-            },
-            'metadata_check': {
-                'json_errors': [
-                    {'property': '/files', 'description': "should have required property 'files'"},
-                    {'property': '/project/title', 'description': "should have required property 'title'"},
-                    {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
-                    {'property': '/project/holdDate', 'description': 'must match format "date"'},
-                    {'property': '/analysis/0/description', 'description': "should have required property 'description'"},
-                    {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"},
-                    {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
-                    {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
-                    {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
-                    {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/organism',
-                     'description': "must have required property 'organism'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/Organism',
-                     'description': "must have required property 'Organism'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/species',
-                     'description': "must have required property 'species'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/Species',
-                     'description': "must have required property 'Species'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics',
-                     'description': 'must match a schema in anyOf'},
-                    {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
-                    {'property': '/sample/2/bioSampleObject/characteristics/taxId',
-                     'description': '1234 is not a valid taxonomy code'},
-                    {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
-                    {'property': '/sample/analysisAlias',
-                     'description': 'alias_1,alias_2 present in Samples not in Analysis'},
-                ],
-                'spreadsheet_errors': [
-                    # NB. Wouldn't normally get conversion error + validation errors together, but it is supported.
-                    {'sheet': '', 'row': '', 'column': '',
-                     'description': 'Error loading problem.xlsx: Exception()'},
-                    {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Project Title',
-                     'description': 'Column "Project Title" is not populated'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
-                     'description': 'Column "Tax ID" is not populated'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
-                     'description': 'must match format "date"'},
-                    {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
-                     'description': 'Column "Description" is not populated'},
-                    {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
-                     'description': 'Column "Reference" is not populated'},
-                    {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
-                     'description': 'Column "Sample Accession" is not populated'},
-                    {'sheet': 'Sample', 'row': 6, 'column': 'BioSample Name',
-                     'description': 'Column "BioSample Name" is not populated'},
-                    {'sheet': 'Sample', 'row': 6, 'column': 'Scientific Name',
-                     'description': 'Column "Scientific Name" is not populated'},
-                    {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
-                     'description': 'PRJEBNA does not exist or is private'},
-                    {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
-                     'description': '1234 is not a valid taxonomy code'},
-                    {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
-                     'description': 'alias1 present in Analysis not in Samples'},
-                    {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
-                     'description': 'alias_1,alias_2 present in Samples not in Analysis'}
-                ]
-            }
-        }
-
-        self.validator._collect_validation_workflow_results()
+    def run_collect_results(self, validator_to_run):
+        validator_to_run._collect_validation_workflow_results()
         # Drop report paths from comparison (test will fail if missing)
-        del self.validator.results['metadata_check']['json_report_path']
-        del self.validator.results['metadata_check']['spreadsheet_report_path']
-        del self.validator.results['sample_check']['report_path']
-        for file in self.validator.results['vcf_check'].values():
+        del validator_to_run.results['metadata_check']['json_report_path']
+        if 'spreadsheet_report_path' in validator_to_run.results['metadata_check']:
+            del validator_to_run.results['metadata_check']['spreadsheet_report_path']
+        del validator_to_run.results['sample_check']['report_path']
+        for file in validator_to_run.results['vcf_check'].values():
             del file['report_path']
-        for file in self.validator.results['assembly_check'].values():
+        for file in validator_to_run.results['assembly_check'].values():
             del file['report_path']
 
-        assert self.validator.results == expected_results
-
     def test__collect_validation_workflow_results_with_metadata_json(self):
-        expected_results = {
-            'shallow_validation': {'requested': False},
-            'vcf_check': {
-                'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0,
-                                     'critical_count': 0, 'critical_list': []}
-            },
-            'assembly_check': {
-                'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0,
-                                     'match': 247, 'total': 247}
-            },
-            'sample_check': {
-                'overall_differences': False,
-                'results_per_analysis': {
-                    'AA': {
-                        'difference': False,
-                        'more_metadata_submitted_files': [],
-                        'more_per_submitted_files_metadata': {},
-                        'more_submitted_files_metadata': []
-                    }
-                }
-            },
-            'fasta_check': {
-                'input_passed.fa': {'all_insdc': False, 'sequences': [
-                    {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'},
-                    {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'}
-                ]},
-            },
-            'metadata_check': {
-                'json_errors': [
-                    {'property': '/files', 'description': "should have required property 'files'"},
-                    {'property': '/project/title', 'description': "should have required property 'title'"},
-                    {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
-                    {'property': '/project/holdDate', 'description': 'must match format "date"'},
-                    {'property': '/analysis/0/description',
-                     'description': "should have required property 'description'"},
-                    {'property': '/analysis/0/referenceGenome',
-                     'description': "should have required property 'referenceGenome'"},
-                    {'property': '/sample/0/bioSampleAccession',
-                     'description': "should have required property 'bioSampleAccession'"},
-                    {'property': '/sample/0/bioSampleObject',
-                     'description': "should have required property 'bioSampleObject'"},
-                    {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
-                    {'property': '/sample/3/bioSampleObject/name', 'description': "must have required property 'name'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/organism',
-                     'description': "must have required property 'organism'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/Organism',
-                     'description': "must have required property 'Organism'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/species',
-                     'description': "must have required property 'species'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics/Species',
-                     'description': "must have required property 'Species'"},
-                    {'property': '/sample/3/bioSampleObject/characteristics',
-                     'description': 'must match a schema in anyOf'},
-                    {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
-                    {'property': '/sample/2/bioSampleObject/characteristics/taxId',
-                     'description': '1234 is not a valid taxonomy code'},
-                    {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
-                    {'property': '/sample/analysisAlias',
-                     'description': 'alias_1,alias_2 present in Samples not in Analysis'},
-                ],
-                'spreadsheet_errors': [
-                    {'sheet': '', 'row': '', 'column': '',
-                     'description': 'Error loading problem.xlsx: Exception()'}
-                ]
-            }
-        }
+        self.run_collect_results(self.validator_json)
+        assert self.validator_json.results == expected_validation_results
 
-        self.validator_json._collect_validation_workflow_results()
-        # Drop report paths from comparison (test will fail if missing)
-        del self.validator_json.results['metadata_check']['json_report_path']
-        del self.validator_json.results['sample_check']['report_path']
-        for file in self.validator_json.results['vcf_check'].values():
-            del file['report_path']
-        for file in self.validator_json.results['assembly_check'].values():
-            del file['report_path']
+    def test__collect_validation_workflow_results_with_metadata_xlsx(self):
+        expected_results = deepcopy(expected_validation_results)
+        expected_results['metadata_check']['spreadsheet_errors'] = [
+            # NB. Wouldn't normally get conversion error + validation errors together, but it is supported.
+            {'sheet': '', 'row': '', 'column': '',
+             'description': 'Error loading problem.xlsx: Exception()'},
+            {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Project Title',
+             'description': 'Column "Project Title" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Tax ID',
+             'description': 'Column "Tax ID" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Hold Date',
+             'description': 'must match format "date"'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
+             'description': 'Column "Description" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
+             'description': 'Column "Reference" is not populated'},
+            {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
+             'description': 'Column "Sample Accession" is not populated'},
+            {'sheet': 'Sample', 'row': 6, 'column': 'BioSample Name',
+             'description': 'Column "BioSample Name" is not populated'},
+            {'sheet': 'Sample', 'row': 6, 'column': 'Scientific Name',
+             'description': 'Column "Scientific Name" is not populated'},
+            {'sheet': 'Project', 'row': 2, 'column': 'Child Project(s)',
+             'description': 'PRJEBNA does not exist or is private'},
+            {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
+             'description': '1234 is not a valid taxonomy code'},
+            {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+             'description': 'alias1 present in Analysis not in Samples'},
+            {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+             'description': 'alias_1,alias_2 present in Samples not in Analysis'}
+        ]
 
-        assert self.validator_json.results == expected_results
+        self.run_collect_results(self.validator)
+        assert self.validator.results == expected_results
 
     def test_create_report(self):
         self.validator._collect_validation_workflow_results()