Skip to content

Commit

Permalink
Merge pull request #44 from apriltuesday/EVA-3624
Browse files Browse the repository at this point in the history
EVA-3624: Metadata checks on existing BioSamples
  • Loading branch information
apriltuesday authored Jul 15, 2024
2 parents d12c158 + 20752c8 commit bc047ac
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 2 deletions.
32 changes: 32 additions & 0 deletions eva_sub_cli/date_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import datetime

# Values coming from https://www.ebi.ac.uk/ena/browser/view/ERC000011
# Placeholder terms accepted in place of an actual date. check_date() lowercases
# the candidate value before looking it up here, so every entry must stay lowercase.
not_provided_check_list = ['not provided', 'not collected', 'restricted access', 'missing: control sample',
                           'missing: sample group', 'missing: synthetic construct', 'missing: lab stock',
                           'missing: third party data', 'missing: data agreement established pre-2023',
                           'missing: endangered species', 'missing: human-identifiable']


def check_date(date):
    """Tell whether ``date`` is an acceptable date value.

    A value is accepted when it is a ``datetime.date``/``datetime.datetime``
    instance, when ``check_date_str_format`` recognises it, or when its
    lowercased string form is one of the ``not_provided_check_list`` terms.

    :param date: value to check
    :return: True if the value is accepted, False otherwise
    """
    # Real date objects need no parsing.
    if isinstance(date, (datetime.date, datetime.datetime)):
        return True
    # Otherwise try the supported textual date formats.
    if check_date_str_format(date):
        return True
    # Finally accept the explicit "missing value" placeholders.
    return str(date).lower() in not_provided_check_list


def check_date_str_format(d):
    """Tell whether ``d`` is a string that parses as a supported date format.

    The formats tried, in order, are ``%Y-%m-%d``, ``%Y-%m`` and ``%Y``
    (as interpreted by ``datetime.datetime.strptime``).

    :param d: candidate value; anything that is not a ``str`` is rejected
    :return: True if ``d`` parses with one of the formats, False otherwise
    """
    # strptime raises TypeError (not ValueError) for non-string input such as
    # None or an int, so reject those up front instead of letting it escape.
    if not isinstance(d, str):
        return False
    for date_format in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            datetime.datetime.strptime(d, date_format)
        except ValueError:
            # Not this format; try the next, less specific one.
            continue
        return True
    return False
33 changes: 33 additions & 0 deletions eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import yaml

from retry import retry
from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator
from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
from ebi_eva_common_pyutils.logger import AppLogger

from eva_sub_cli.date_utils import check_date

PROJECT_KEY = 'project'
ANALYSIS_KEY = 'analysis'
Expand All @@ -13,6 +15,7 @@
CHILD_PROJECTS_KEY = 'childProjects'
PEER_PROJECTS_KEY = 'peerProjects'
BIOSAMPLE_OBJECT_KEY = 'bioSampleObject'
BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
CHARACTERISTICS_KEY = 'characteristics'
TAX_ID_KEY = 'taxId'
ANALYSIS_ALIAS_KEY = 'analysisAlias'
Expand All @@ -30,6 +33,7 @@ def __init__(self, metadata):
self.errors = []
# Caches whether taxonomy code is valid or not
self.taxonomy_valid = {}
self.communicator = NoAuthHALCommunicator(bsd_url='https://www.ebi.ac.uk/biosamples')

def write_result_yaml(self, output_path):
with open(output_path, 'w') as open_yaml:
Expand All @@ -38,6 +42,7 @@ def write_result_yaml(self, output_path):
def check_all(self):
    """Run every semantic check, in a fixed order.

    Order: project accessions, taxonomy codes, existing BioSamples,
    analysis alias coherence.
    """
    ordered_checks = (
        self.check_all_project_accessions,
        self.check_all_taxonomy_codes,
        self.check_existing_biosamples,
        self.check_analysis_alias_coherence,
    )
    for run_check in ordered_checks:
        run_check()

def check_all_project_accessions(self):
Expand Down Expand Up @@ -90,6 +95,34 @@ def add_error(self, property, description):
"""
self.errors.append({'property': property, 'description': description})

def check_existing_biosamples(self):
    """Look up every sample that references a BioSample accession and validate it.

    Samples without a BioSample accession are ignored. A ``ValueError`` raised
    while retrieving or validating a sample is recorded as an error stating
    that the accession does not exist or is private.
    """
    for position, sample_entry in enumerate(self.metadata[SAMPLE_KEY]):
        if BIOSAMPLE_ACCESSION_KEY not in sample_entry:
            continue
        accession = sample_entry[BIOSAMPLE_ACCESSION_KEY]
        # Location of the accession within the metadata, used in reported errors.
        error_location = f'/{SAMPLE_KEY}/{position}/{BIOSAMPLE_ACCESSION_KEY}'
        try:
            fetched_sample = self.communicator.follows_link('samples', join_url=accession)
            self.validate_existing_biosample(fetched_sample, accession, error_location)
        except ValueError:
            self.add_error(error_location, f'{accession} does not exist or is private')

def validate_existing_biosample(self, sample_data, accession, json_path):
    """Check that an existing sample has a valid collection date and a geographic location.

    One error is recorded through ``self.add_error`` for each missing or
    invalid field. A sample with no ``characteristics``, an empty value list
    or an entry without ``text`` is reported as lacking the field instead of
    raising.

    :param sample_data: mapping describing the sample, expected to hold a
        ``characteristics`` mapping of attribute name to a list of
        ``{'text': ...}`` entries
    :param accession: accession of the sample, used in the error messages
    :param json_path: location in the metadata the errors are attached to
    """
    characteristics = sample_data.get('characteristics') or {}

    def _first_text(key):
        """Return the text of the first entry stored under ``key``, or None."""
        entries = characteristics.get(key) or []
        if not entries:
            return None
        return entries[0].get('text')

    collection_date_texts = (
        _first_text(key) for key in ('collection_date', 'collection date')
    )
    # check_date is only consulted for values that are actually present.
    if not any(text is not None and check_date(text) for text in collection_date_texts):
        self.add_error(json_path, f'Existing sample {accession} does not have a valid collection date')

    geo_location_keys = ('geographic location (country and/or sea)', 'geo loc name')
    if not any(_first_text(key) for key in geo_location_keys):
        self.add_error(json_path, f'Existing sample {accession} does not have a valid geographic location')

def check_analysis_alias_coherence(self):
"""Check that the same analysis aliases are used in analysis, sample, and files."""
analysis_aliases = [analysis[ANALYSIS_ALIAS_KEY] for analysis in self.metadata[ANALYSIS_KEY]]
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev9'
container_tag = 'v0.0.1.dev10'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ebi_eva_common_pyutils==0.6.3
ebi_eva_common_pyutils==0.6.8
jinja2
jsonschema
minify_html==0.11.1
Expand Down
44 changes: 44 additions & 0 deletions tests/test_semantic_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from unittest import TestCase
from unittest.mock import patch

from ebi_eva_common_pyutils.biosamples_communicators import NoAuthHALCommunicator

from eva_sub_cli.semantic_metadata import SemanticMetadataChecker


Expand Down Expand Up @@ -58,6 +60,48 @@ def test_check_all_taxonomy_codes(self):
}
])

def test_check_existing_biosamples(self):
    """An unreachable sample and a sample lacking date and location are both reported."""
    complete_sample = {
        'accession': 'SAME00001',
        'characteristics': {
            'organism': [{'text': 'Viridiplantae'}],
            'collection date': [{'text': '2018'}],
            'geo loc name': [{'text': 'France: Montferrier-sur-Lez'}]
        }
    }
    incomplete_sample = {
        'accession': 'SAME00003',
        'characteristics': {
            'organism': [{'text': 'Viridiplantae'}]
        }
    }
    expected_errors = [
        {
            'property': '/sample/1/bioSampleAccession',
            'description': 'SAME00002 does not exist or is private'
        },
        {
            'property': '/sample/2/bioSampleAccession',
            'description': 'Existing sample SAME00003 does not have a valid collection date'
        },
        {
            'property': '/sample/2/bioSampleAccession',
            'description': 'Existing sample SAME00003 does not have a valid geographic location'
        }
    ]
    checker = SemanticMetadataChecker({
        "sample": [
            {"bioSampleAccession": "SAME00001"},
            {"bioSampleAccession": "SAME00002"},
            {"bioSampleAccession": "SAME00003"}
        ]
    })

    # One outcome per sample, in order: found and complete, lookup failure, found but incomplete.
    lookup_outcomes = [complete_sample, ValueError, incomplete_sample]
    with patch.object(NoAuthHALCommunicator, 'follows_link', side_effect=lookup_outcomes):
        checker.check_existing_biosamples()
        self.assertEqual(checker.errors, expected_errors)

def test_check_analysis_alias_coherence(self):
metadata = {
"analysis": [
Expand Down

0 comments on commit bc047ac

Please sign in to comment.