import datetime

# Values coming from https://www.ebi.ac.uk/ena/browser/view/ERC000011
not_provided_check_list = ['not provided', 'not collected', 'restricted access', 'missing: control sample',
                           'missing: sample group', 'missing: synthetic construct', 'missing: lab stock',
                           'missing: third party data', 'missing: data agreement established pre-2023',
                           'missing: endangered species', 'missing: human-identifiable']

# Textual date layouts accepted by check_date_str_format, most precise first.
_DATE_FORMATS = ("%Y-%m-%d", "%Y-%m", "%Y")


def check_date(date):
    """Return True when *date* is an acceptable collection date.

    A value is accepted when it is one of:
      * a ``datetime.date`` (``datetime.datetime`` is a subclass, so it is
        covered by the same test);
      * a string matching ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY``;
      * a value whose lower-cased string form is one of the "missing value"
        terms in ``not_provided_check_list``.

    Any other value, including ``None`` and non-string types such as ``int``,
    yields False rather than raising.
    """
    if isinstance(date, datetime.date):
        return True
    return check_date_str_format(date) or str(date).lower() in not_provided_check_list


def check_date_str_format(d):
    """Return True when *d* is a string in one of the supported date layouts.

    Non-string input returns False: ``strptime`` would raise ``TypeError`` for
    it, which callers of this predicate do not expect to handle.
    """
    if not isinstance(d, str):
        return False
    for fmt in _DATE_FORMATS:
        try:
            datetime.datetime.strptime(d, fmt)
        except ValueError:
            # Not this layout; try the next, less precise one.
            continue
        return True
    return False
# Methods of the semantic metadata checker (each takes the checker as `self`).


def check_all(self):
    """Run every semantic check in order; findings accumulate via add_error."""
    self.check_all_project_accessions()
    self.check_all_taxonomy_codes()
    self.check_existing_biosamples()
    self.check_analysis_alias_coherence()


def check_existing_biosamples(self):
    """Check that existing BioSamples are accessible and contain the required attributes.

    Only samples that carry a BioSample accession are looked up. A
    ``ValueError`` raised by the lookup is reported as "does not exist or is
    private"; a sample that is retrieved is passed on for attribute
    validation.
    """
    for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
        if BIOSAMPLE_ACCESSION_KEY not in sample:
            continue
        sample_accession = sample[BIOSAMPLE_ACCESSION_KEY]
        json_path = f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_ACCESSION_KEY}'
        try:
            sample_data = self.communicator.follows_link('samples', join_url=sample_accession)
        except ValueError:
            self.add_error(json_path, f'{sample_accession} does not exist or is private')
        else:
            # Validation runs outside the try so that an error raised while
            # validating is never misreported as a missing/private sample.
            self.validate_existing_biosample(sample_data, sample_accession, json_path)


def validate_existing_biosample(self, sample_data, accession, json_path):
    """Check if the existing sample has the expected fields present.

    Records one error at *json_path* when no valid collection date is found
    and another when no non-empty geographic location is found. A sample with
    no ``characteristics`` mapping, an empty value list, or an entry without
    ``text`` is treated as missing the attribute instead of raising.

    :param sample_data: mapping describing the sample; its ``characteristics``
        entry maps an attribute name to a list of ``{'text': ...}`` entries
    :param accession: accession of the sample, used in the error messages
    :param json_path: location in the metadata the errors are attached to
    """
    characteristics = (sample_data or {}).get('characteristics') or {}

    def first_text(key):
        # Text of the first value recorded for `key`, or None when unavailable.
        values = characteristics.get(key)
        if not values:
            return None
        first = values[0]
        return first.get('text') if isinstance(first, dict) else None

    collection_dates = (first_text(key) for key in ('collection_date', 'collection date'))
    if not any(text is not None and check_date(text) for text in collection_dates):
        self.add_error(json_path, f'Existing sample {accession} does not have a valid collection date')

    geo_keys = ('geographic location (country and/or sea)', 'geo loc name')
    if not any(first_text(key) for key in geo_keys):
        self.add_error(json_path, f'Existing sample {accession} does not have a valid geographic location')
Montferrier-sur-Lez'}] + } + } + invalid_sample = { + 'accession': 'SAME00003', + 'characteristics': { + 'organism': [{'text': 'Viridiplantae'}] + } + } + + with patch.object(NoAuthHALCommunicator, 'follows_link', + side_effect=[valid_sample, ValueError, invalid_sample]) as m_follows_link: + checker.check_existing_biosamples() + self.assertEqual(checker.errors, [ + { + 'property': '/sample/1/bioSampleAccession', + 'description': 'SAME00002 does not exist or is private' + }, + { + 'property': '/sample/2/bioSampleAccession', + 'description': 'Existing sample SAME00003 does not have a valid collection date' + }, + { + 'property': '/sample/2/bioSampleAccession', + 'description': 'Existing sample SAME00003 does not have a valid geographic location' + } + ]) + def test_check_analysis_alias_coherence(self): metadata = { "analysis": [