Skip to content

Commit

Permalink
EVA-3681 - New process to override BioSamples created in NCBI (#224)
Browse files Browse the repository at this point in the history
* New process to override BioSamples created in NCBI
Co-authored-by: April Shen <april.tuesday@gmail.com>
Co-authored-by: nitin-ebi <79518737+nitin-ebi@users.noreply.github.com>
  • Loading branch information
tcezard authored Oct 22, 2024
1 parent 654217b commit 4cb6c27
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 35 deletions.
9 changes: 6 additions & 3 deletions bin/modify_existing_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@ def main():
'The spreadsheet needs to contain a Sheet called Sample.')
arg_parser.add_argument('--metadata_file', required=True,
help='Spreadsheet file containing the sample information')
arg_parser.add_argument('--action', required=True, choices=('overwrite', 'curate', 'derive'),
arg_parser.add_argument('--action', required=True, choices=('overwrite', 'curate', 'derive', 'override'),
help='Type of modification of the BioSamples that should be made. '
'"overwrite" require that EVA owns the BioSample entity. '
'"curate" will create curation object on top of the BioSample. These are not '
'"overwrite" and "override" will both change the original sample (precuration) with '
'the modified sample defined in the spreadsheet. overwrite will use EVA credentials '
'where override will use superuser credentials. overwrite require that EVA owns the '
'BioSample entity. override requires that the samples are from NCBI.'
'"curate" will create curation object on top of the BioSample. These are not '
'used by ENA. '
'"derive" will create a new BioSample derived from the old one.')
args = arg_parser.parse_args()
Expand Down
71 changes: 53 additions & 18 deletions eva_submission/biosample_submission/biosamples_submitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@

class BioSamplesSubmitter(AppLogger):

valid_actions = ('create', 'overwrite', 'curate', 'derive')
valid_actions = ('create', 'overwrite', 'override', 'curate', 'derive')
characteristics_allowed_to_override = ('collection_date', 'geographic location (country and/or sea)')

def __init__(self, communicators, submit_type=('create',), allow_removal=False):
assert len(communicators) > 0, 'Specify at least one communicator object to BioSamplesSubmitter'
Expand All @@ -39,27 +40,35 @@ def __init__(self, communicators, submit_type=('create',), allow_removal=False):
self.submit_type = submit_type
self.allow_removal = allow_removal

@lru_cache
def _get_existing_sample(self, accession):
return self.default_communicator.follows_link('samples', method='GET', join_url=accession)
@lru_cache(maxsize=0)
def _get_existing_sample(self, accession, include_curation=False):
if include_curation:
append_to_url = accession
else:
append_to_url = accession + '?curationdomain='
return self.default_communicator.follows_link('samples', method='GET', join_url=append_to_url)

def can_create(self, sample):
return 'create' in self.submit_type and 'accession' not in sample

def can_overwrite(self, sample):
""" We should overwrite a samples when it is owned by a domain supported current uploader"""
return 'overwrite' in self.submit_type and \
'accession' in sample and \
self._get_communicator_for_sample(sample)
""" We should overwrite a sample when it is owned by a domain supported by the current uploader
or when we use a superuser to override the original sample"""
return 'accession' in sample and (
'overwrite' in self.submit_type and self._get_communicator_for_sample(sample) or
'override' in self.submit_type and self._allowed_to_override(sample)
)

def can_curate(self, sample):
""" We can curate a samples if it has an existing accessionr"""
""" We can curate a samples if it has an existing accession"""
return 'curate' in self.submit_type and 'accession' in sample

def can_derive(self, sample):
return 'derive' in self.submit_type and 'accession' in sample

def _get_communicator_for_sample(self, sample):
if 'override' in self.submit_type:
return self.communicators[0]
sample_data = self._get_existing_sample(sample.get('accession'))
# This check If one of the account own the BioSample by checking if the 'domain' or 'webinSubmissionAccountId'
# are the same as the one who submitted the sample
Expand All @@ -68,19 +77,26 @@ def _get_communicator_for_sample(self, sample):
return communicator
return None

def _allowed_to_override(self, sample):
if sample.get('accession', '').startswith('SAMN'):
return True
else:
self.warning(f'Sample {sample.get("accession")} cannot be overridden because it is not an NCBI sample ')

def validate_in_bsd(self, samples_data):
for sample in samples_data:
if self.can_overwrite(sample):
sample = self.create_sample_to_overwrite(sample)

# If we're only retrieving don't need to validate.
if self.can_create(sample) or self.can_overwrite(sample):
sample.update(self.default_communicator.communicator_attributes)
if self.can_create(sample):
sample.update(self.default_communicator.communicator_attributes)
self.default_communicator.follows_link('samples', join_url='validate', method='POST', json=sample)

def convert_sample_data_to_curation_object(self, future_sample):
"""Curation object can only change 3 attributes characteristics, externalReferences and relationships"""
current_sample = self._get_existing_sample(future_sample.get('accession'))
current_sample = self._get_existing_sample(future_sample.get('accession'), include_curation=True)
#FIXME: Remove this hack when this is fixed on BioSample's side
# remove null values in externalReferences that causes crash when POSTing the curation object
if 'externalReferences' in current_sample:
Expand Down Expand Up @@ -149,6 +165,13 @@ def _update_from_array(key, sample_source, sample_dest):

def _update_samples_with(self, sample_source, sample_dest):
"""Update a BioSample object with the value of another"""
if 'override' in self.submit_type:
# Ensure that override only change geographic location and collection date
tmp_sample_source = {'characteristics': {}}
for attribute in self.characteristics_allowed_to_override:
if attribute in sample_source['characteristics']:
tmp_sample_source['characteristics'][attribute] = sample_source['characteristics'][attribute]
sample_source = tmp_sample_source
for attribute in sample_source['characteristics']:
if attribute not in sample_dest['characteristics']:
sample_dest['characteristics'][attribute] = sample_source['characteristics'][attribute]
Expand All @@ -165,9 +188,9 @@ def create_sample_to_overwrite(self, sample):
# We only add existing characteristics if we do not want to remove anything
if self.can_overwrite(sample) and not self.allow_removal:
# retrieve the sample without any curation and add the new data on top
current_sample = self._get_existing_sample(sample.get('accession'))
self._update_samples_with(current_sample, sample)
return sample
destination_sample = self._get_existing_sample(sample.get('accession'))
self._update_samples_with(sample, destination_sample)
return destination_sample

def create_derived_sample(self, sample):
skipped_attributes = ['SRA accession']
Expand Down Expand Up @@ -196,25 +219,27 @@ def submit_biosamples_to_bsd(self, samples_data):
Then it returns a map of sample name to sample accession.
"""
for sample in samples_data:
sample.update(self.default_communicator.communicator_attributes)
if self.can_create(sample):
sample.update(self.default_communicator.communicator_attributes)
sample_json = self.default_communicator.follows_link('samples', method='POST', json=sample)
self.debug('Accession sample ' + sample.get('name') + ' as ' + sample_json.get('accession'))
self.debug('Accession sample ' + sample.get('name', '') + ' as ' + sample_json.get('accession'))
elif self.can_overwrite(sample):
sample_to_overwrite = self.create_sample_to_overwrite(sample)
self.debug('Overwrite sample ' + sample.get('name') + ' with accession ' + sample.get('accession'))

self.debug('Overwrite sample ' + sample_to_overwrite.get('name', '') + ' with accession ' + sample_to_overwrite.get('accession'))
# Use the communicator that can own the sample to overwrite it.
communicator = self._get_communicator_for_sample(sample)
sample_json = communicator.follows_link('samples', method='PUT', join_url=sample.get('accession'),
json=sample_to_overwrite)
elif self.can_curate(sample):
self.debug('Update sample ' + sample.get('name') + ' with accession ' + sample.get('accession'))
self.debug('Update sample ' + sample.get('name', '') + ' with accession ' + sample.get('accession'))
curation_object = self.convert_sample_data_to_curation_object(sample)
curation_json = self.default_communicator.follows_link(
'samples', method='POST', join_url=sample.get('accession')+'/curationlinks', json=curation_object
)
sample_json = sample
elif self.can_derive(sample):
sample.update(self.default_communicator.communicator_attributes)
derived_sample, original_accessions = self.create_derived_sample(sample)
sample_json = self.default_communicator.follows_link('samples', method='POST', json=derived_sample)
if 'relationships' not in sample_json:
Expand All @@ -241,6 +266,14 @@ class SampleSubmitter(AppLogger):

def __init__(self, submit_type):
communicators = []

if 'override' in submit_type:
assert len(submit_type) == 1, f'override can only be used as a single action'
communicators.append(AAPHALCommunicator(
cfg.query('biosamples', 'aap_url'), cfg.query('biosamples', 'bsd_url'),
cfg.query('biosamples', 'aap_super_user'), cfg.query('biosamples', 'aap_super_password'),
cfg.query('biosamples', 'aap_super_domain')
))
# If the config has the credential for using webin with BioSamples use webin first
if cfg.query('biosamples', 'webin_url') and cfg.query('biosamples', 'webin_username') and \
cfg.query('biosamples', 'webin_password'):
Expand All @@ -253,6 +286,7 @@ def __init__(self, submit_type):
cfg.query('biosamples', 'username'), cfg.query('biosamples', 'password'),
cfg.query('biosamples', 'domain')
))

self.submitter = BioSamplesSubmitter(communicators, submit_type)
self.sample_data = None

Expand Down Expand Up @@ -433,6 +467,7 @@ def map_metadata_to_bsd_data(self):
self.apply_mapping(bsd_sample_entry, 'organization', organisations)

bsd_sample_entry['release'] = _now
bsd_sample_entry['last_updated_by'] = 'EVA'
payloads.append(bsd_sample_entry)

return payloads
Expand Down
4 changes: 2 additions & 2 deletions eva_submission/nextflow/accession_and_load.nf
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ process prepare_genome {
export PYTHONPATH="$params.executable.python.script_path"
$params.executable.python.interpreter -m eva_submission.steps.rename_contigs_from_insdc_in_assembly \
--assembly_accession $assembly_accession --assembly_fasta $fasta --custom_fasta ${fasta.getSimpleName()}_custom.fa \
--assembly_report $report --vcf_files $vcf_files
--assembly_report $report --vcf_files $vcf_files
"""
}

Expand All @@ -188,7 +188,7 @@ process prepare_genome {
* Normalise the VCF files
*/
process normalise_vcf {
label 'default_time', 'med_mem'
label 'long_time', 'med_mem'

input:
tuple val(vcf_filename), path(fasta), path(vcf_file), path(csi_file)
Expand Down
Binary file added tests/resources/brokering/metadata_sheet_ebi.xlsx
Binary file not shown.
Binary file not shown.
85 changes: 82 additions & 3 deletions tests/test_biosamples_submission.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import copy
import os
from copy import deepcopy
from unittest import TestCase
from unittest.mock import patch

import pytest
import yaml

from ebi_eva_common_pyutils.biosamples_communicators import AAPHALCommunicator, HALCommunicator, WebinHALCommunicator
Expand Down Expand Up @@ -166,7 +168,7 @@ def test_submit_to_bsd_curate_with_removal(self):
submitter_curate = BioSamplesSubmitter([self.communicator], ('curate',), allow_removal=True)
self._test_submit_to_bsd_with_update(submitter_curate, change_original=False, allow_removal=True)

# @pytest.mark.skip(reason='PUT function in dev server does not work')
@pytest.mark.skip(reason='PUT function in dev server does not work')
def test_submit_to_bsd_overwrite(self):
submitter_overwrite = BioSamplesSubmitter([self.communicator], ('overwrite',))
self._test_submit_to_bsd_with_update(submitter_overwrite, change_original=True, allow_removal=False)
Expand All @@ -182,7 +184,7 @@ def test_submit_to_bsd_no_change(self):
self.submitter.sample_name_to_accession.clear()
with patch.object(HALCommunicator, 'follows_link', wraps=self.submitter.default_communicator.follows_link) as mock_fl:
self.submitter.submit_biosamples_to_bsd([{'accession': accession}])
mock_fl.assert_called_once_with('samples', method='GET', join_url=accession)
mock_fl.assert_called_once_with('samples', method='GET', join_url=accession + '?curationdomain=')
self.assertEqual(self.submitter.sample_name_to_accession, {'LH1': accession})


Expand All @@ -203,8 +205,10 @@ def setUp(self) -> None:
def test_map_metadata_to_bsd_data(self):
now = '2020-07-06T19:09:29.090Z'
biosamples_submitters._now = now

expected_payload = [
{'name': 'S%s' % (i + 1), 'taxId': 9606, 'release': now,
'last_updated_by': 'EVA',
'contact': [{'LastName': 'John', 'FirstName': 'Doe', 'E-mail': 'john.doe@example.com'},
{'LastName': 'Jane', 'FirstName': 'Doe', 'E-mail': 'jane.doe@example.com'}],
'organization': [{'Name': 'GPE', 'Address': 'The place to be'},
Expand Down Expand Up @@ -235,6 +239,7 @@ def test_map_partial_metadata_to_bsd_data(self):
]
organizations = [{'Name': 'GPE', 'Address': 'The place to be'}, {'Name': 'GPE', 'Address': 'The place to be'}]
updated_samples = [{
'last_updated_by': 'EVA',
'accession': 'SAMD1234' + str(567 + i),
'name': 'S%s' % (i + 1), 'taxId': 9606, 'release': now,
'contact': contacts, 'organization': organizations,
Expand All @@ -245,12 +250,13 @@ def test_map_partial_metadata_to_bsd_data(self):
}
} for i in range(10)]
existing_samples = [{
'last_updated_by': 'EVA',
'accession': 'SAMD1234' + str(567 + i),
'contact': contacts, 'organization': organizations,
'characteristics': {},
'release': now
} for i in range(10, 20)]
new_samples = [{'name': 'S%s' % (i + 1), 'taxId': 9606, 'release': now,
new_samples = [{'last_updated_by': 'EVA', 'name': 'S%s' % (i + 1), 'taxId': 9606, 'release': now,
'contact': contacts, 'organization': organizations,
'characteristics': {
'Organism': [{'text': 'Homo sapiens'}],
Expand Down Expand Up @@ -284,3 +290,76 @@ def test_retrieve_biosamples(self):
{'name': 'FakeSample1', 'accession': 'SAME001', 'domain': 'self.ExampleDomain', 'externalReferences': [{'url': 'test_url'}, {'url': 'https://www.ebi.ac.uk/eva/?eva-study=PRJEB001'}]},
{'name': 'FakeSample2', 'accession': 'SAME002', 'domain': 'self.ExampleDomain', 'externalReferences': [{'url': 'https://www.ebi.ac.uk/eva/?eva-study=PRJEB001'}]}
]


class TestSampleMetadataOverrider(BSDTestCase):
samples = {
# NCBI samples
'SAMN1234567': {
'accession': 'SAMN1234567',
'name': 'Sample1',
'characteristics': {
'description': [{'text': 'Sample 1'}],
'scientific name': [{'text': 'Larimichthys polyactis'}],
},
'release': '2020-07-06T19:09:29.090Z'},
'SAMN1234568': {
'accession': 'SAMN1234568',
'name': 'Sample2',
'characteristics': {
'description': [{'text': 'Sample 2'}],
'scientific name': [{'text': 'Larimichthys polyactis'}],
},
'release': '2020-07-06T19:09:29.090Z'},
# EBI samples
'SAME1234567': {
'accession': 'SAMN1234567',
'name': 'Sample1',
'characteristics': {
'description': [{'text': 'Sample 1'}],
'scientific name': [{'text': 'Larimichthys polyactis'}],
},
'release': '2020-07-06T19:09:29.090Z'},
'SAME1234568': {
'accession': 'SAMN1234568',
'name': 'Sample2',
'characteristics': {
'description': [{'text': 'Sample 2'}],
'scientific name': [{'text': 'Larimichthys polyactis'}],
},
'release': '2020-07-06T19:09:29.090Z'}
}

@staticmethod
def _get_fake_sample(accession, include_curation=False):
return TestSampleMetadataOverrider.samples.get(accession)

def setUp(self) -> None:
brokering_folder = os.path.join(ROOT_DIR, 'tests', 'resources', 'brokering')
self.metadata_file_ncbi = os.path.join(brokering_folder, 'metadata_sheet_ncbi.xlsx')
self.metadata_file_ebi = os.path.join(brokering_folder, 'metadata_sheet_ebi.xlsx')

def test_override_samples(self):
with patch.object(BioSamplesSubmitter, '_get_existing_sample', side_effect=self._get_fake_sample), \
patch.object(HALCommunicator, 'follows_link') as m_follows_link:

sample_submitter = SampleMetadataSubmitter(self.metadata_file_ncbi, submit_type=('override',))
sample_submitter.submit_to_bioSamples()

sample1 = copy.deepcopy(self.samples.get('SAMN1234567'))
sample1['characteristics']['collection_date'] = [{'text': '2020-12-24'}]
sample1['characteristics']['geographic location (country and/or sea)'] = [{'text': 'United Kingdom'}]
sample2 = copy.deepcopy(self.samples.get('SAMN1234568'))
sample2['characteristics']['collection_date'] = [{'text': '1920-12-24'}]
sample2['characteristics']['geographic location (country and/or sea)'] = [{'text': 'USA'}]

m_follows_link.assert_any_call('samples', method='PUT', join_url='SAMN1234567', json=sample1)
m_follows_link.assert_any_call('samples', method='PUT', join_url='SAMN1234568', json=sample2)

def test_not_override_samples(self):
with patch.object(BioSamplesSubmitter, '_get_existing_sample', side_effect=self._get_fake_sample), \
patch.object(HALCommunicator, 'follows_link') as m_follows_link:
sample_submitter = SampleMetadataSubmitter(self.metadata_file_ebi, submit_type=('override',))
sample_submitter.submit_to_bioSamples()
m_follows_link.assert_not_called()

Loading

0 comments on commit 4cb6c27

Please sign in to comment.