Skip to content

Commit

Permalink
Sample Ownership (#204)
Browse files Browse the repository at this point in the history
* New script to check who owns existing samples from the Sample spreadsheet.
* Fix analysis location change
* Change download URL
* Update bin/check_existing_sample_ownership.py
Co-authored-by: April Shen <april.tuesday@gmail.com>
  • Loading branch information
tcezard authored Apr 23, 2024
1 parent 083da74 commit da26f74
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 66 deletions.
75 changes: 75 additions & 0 deletions bin/check_existing_sample_ownership.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python

# Copyright 2023 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import csv

from ebi_eva_common_pyutils.config import cfg
from ebi_eva_common_pyutils.logger import logging_config as log_cfg

from eva_submission.biosample_submission.biosamples_submitters import AAPHALCommunicator
from eva_submission.submission_config import load_config
from eva_submission.xlsx.xlsx_parser_eva import EvaXlsxReader


def _resolve_owner(domain, webin_account):
    """Classify a sample owner from its BioSamples domain and Webin submission account."""
    if domain == 'subs.team-31' or webin_account == 'Webin-1008':
        return 'EVA'
    if domain == 'self.BiosampleImportNCBI':
        return 'BioSamples'
    return 'Third party'


def main():
    """Write a CSV describing who owns each pre-existing sample of a metadata spreadsheet.

    Command line arguments:
      --metadata_file  spreadsheet read through EvaXlsxReader.
      --output         path of the CSV file to create.

    One row is written for every sample row carrying a non-blank 'Sample Accession'. The row
    holds the accession, name, domain, webinSubmissionAccountId and status returned by the
    communicator, plus an 'owner' column set to 'EVA', 'BioSamples' or 'Third party'.
    Accessions for which the lookup raises ValueError are reported with status 'PRIVATE'.
    """
    arg_parser = argparse.ArgumentParser(
        description='Check ownership of existing Biosamples accessions from a metadata file')
    arg_parser.add_argument('--metadata_file', required=True,
                            help='Spreadsheet file containing the sample information. '
                                 'It should contains some pre-existing BioSample accession')
    arg_parser.add_argument('--output', required=True,
                            help='CSV file containing the ownership information for all existing samples in the '
                                 'metadata spreadsheet')
    args = arg_parser.parse_args()

    log_cfg.add_stdout_handler()

    # Load the config_file from default location
    load_config()
    metadata_reader = EvaXlsxReader(args.metadata_file)
    communicator = AAPHALCommunicator(cfg.query('biosamples', 'aap_url'), cfg.query('biosamples', 'bsd_url'),
                                      cfg.query('biosamples', 'username'), cfg.query('biosamples', 'password'),
                                      cfg.query('biosamples', 'domain'))
    sample_attrs = ['accession', 'name', 'domain', 'webinSubmissionAccountId', 'status']
    # newline='' lets the csv module control line endings itself (avoids blank lines on some platforms).
    with open(args.output, 'w', newline='') as open_output:
        writer = csv.DictWriter(open_output, fieldnames=sample_attrs + ['owner'])
        writer.writeheader()
        for sample_row in metadata_reader.samples:
            sample_accession = (sample_row.get('Sample Accession') or '').strip()
            if not sample_accession:
                # Novel sample (no pre-existing accession): nothing to check.
                continue
            try:
                json_response = communicator.follows_link('samples', join_url=sample_accession)
            except ValueError:
                # NOTE(review): ValueError is presumably how the communicator signals an
                # unreachable sample - confirm against AAPHALCommunicator.
                print(f'{sample_accession} does not exist or is private')
                writer.writerow({'accession': sample_accession, 'name': '', 'domain': '',
                                 'webinSubmissionAccountId': '', 'status': 'PRIVATE', 'owner': ''})
                continue
            # Seed the row with the accession so an empty response is still traceable in the
            # output instead of producing a fully blank line.
            res = {'accession': sample_accession}
            if json_response:
                for attr in sample_attrs:
                    res[attr] = json_response.get(attr)
                res['owner'] = _resolve_owner(res['domain'], res['webinSubmissionAccountId'])
            writer.writerow(res)


if __name__ == "__main__":
    main()
53 changes: 0 additions & 53 deletions bin/check_sample_exist.py

This file was deleted.

25 changes: 23 additions & 2 deletions eva_submission/eload_backlog.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import urllib

import requests
from cached_property import cached_property
from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query

Expand Down Expand Up @@ -118,14 +119,34 @@ def find_local_file(self, fn):
raise FileNotFoundError(f'File not found: {full_path}')
return full_path

def _get_files_from_ena_analysis(self, analysis_accession):
    """Find the location of the files submitted with an analysis.

    Queries the ENA portal file report for the analysis and splits the ``submitted_ftp``
    field of the first returned record on ';'.

    :param analysis_accession: accession of the ENA analysis to look up
    :return: list of the submitted file locations; an empty list when the report contains
             no record or the record has no ``submitted_ftp`` value
    :raises requests.HTTPError: when the portal answers with an error status
    """
    analyses_url = (
        f"https://www.ebi.ac.uk/ena/portal/api/filereport?result=analysis&accession={analysis_accession}"
        f"&format=json&fields=submitted_ftp"
    )
    response = requests.get(analyses_url)
    response.raise_for_status()
    data = response.json()
    if not data:
        # Always hand back a list so callers can iterate without checking the type.
        return []
    submitted_ftp = data[0].get('submitted_ftp')
    if not submitted_ftp:
        # Field absent, null or empty: there is nothing to split.
        return []
    return submitted_ftp.split(';')

def find_file_on_ena(self, fn, analysis):
basename = os.path.basename(fn)
full_path = os.path.join(self._get_dir('vcf'), basename)
if not os.path.exists(full_path):
try:
self.info(f'Retrieve {basename} in {analysis} from ENA ftp')
url = f'https://ftp.sra.ebi.ac.uk/vol1/{analysis[:6]}/{analysis}/{basename}'
download_file(url, full_path)
ftp_urls = self._get_files_from_ena_analysis(analysis)
urls = [ftp_url for ftp_url in ftp_urls if ftp_url.endswith(fn)]
if len(urls) == 1:
url = 'https://' + urls[0]
download_file(url, full_path)
else:
self.error(f'Could find {fn} in analysis {analysis} on ENA: most likely does not exist')
raise FileNotFoundError(f'File not found: {full_path}')
except urllib.error.URLError:
self.error(f'Could not access {url} on ENA: most likely does not exist')
raise FileNotFoundError(f'File not found: {full_path}')
Expand Down
2 changes: 1 addition & 1 deletion eva_submission/eload_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def check_variant_db(self):
metadata_connection_handle=conn,
assembly_accession=assembly,
taxonomy_id=self.taxonomy,
ncbi_api_key=cfg['eutils_api_key']
ncbi_api_key=cfg.get('eutils_api_key')
)

for db_info in assembly_to_db_name.values():
Expand Down
6 changes: 3 additions & 3 deletions eva_submission/eload_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ def get_reference_fasta_and_report(species_name, reference_accession, output_dir
if NCBIAssembly.is_assembly_accession_format(reference_accession):
assembly = NCBIAssembly(
reference_accession, species_name, output_directory,
eutils_api_key=cfg['eutils_api_key']
eutils_api_key=cfg.get('eutils_api_key')
)
if not os.path.isfile(assembly.assembly_fasta_path) or not os.path.isfile(assembly.assembly_report_path) or overwrite:
assembly.download_or_construct(overwrite=overwrite)
return assembly.assembly_fasta_path, assembly.assembly_report_path
elif NCBISequence.is_genbank_accession_format(reference_accession):
reference = NCBISequence(reference_accession, species_name, output_directory,
eutils_api_key=cfg['eutils_api_key'])
eutils_api_key=cfg.get('eutils_api_key'))
if not os.path.isfile(reference.sequence_fasta_path) or overwrite:
reference.download_contig_sequence_from_ncbi(genbank_only=True)
return reference.sequence_fasta_path, None
Expand All @@ -54,7 +54,7 @@ def resolve_accession_from_text(reference_text):
if NCBIAssembly.is_assembly_accession_format(reference_text):
return [reference_text]
# Search for a reference genome that resolve this text
accession = retrieve_genbank_assembly_accessions_from_ncbi(reference_text, api_key=cfg['eutils_api_key'])
accession = retrieve_genbank_assembly_accessions_from_ncbi(reference_text, api_key=cfg.get('eutils_api_key'))
if accession:
return accession

Expand Down
4 changes: 2 additions & 2 deletions eva_submission/vep_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def get_species_and_assembly(assembly_acc):
Returns None if the taxonomy is not known.
"""
# We first need to search for the species associated with the assembly
assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc, api_key=cfg['eutils_api_key'])
assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc, api_key=cfg.get('eutils_api_key'))
taxid_and_assembly_name = set([
(assembly_dict.get('taxid'), assembly_dict.get('assemblyname'))
for assembly_dict in assembly_dicts
Expand Down Expand Up @@ -253,7 +253,7 @@ def recursive_nlst(ftp, root, pattern):

@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3), logger=logger)
def download_and_extract_vep_cache(ftp, vep_cache_file, taxonomy_id):
scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=cfg['eutils_api_key'])
scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=cfg.get('eutils_api_key'))
species_name = scientific_name.replace(' ', '_').lower()

tmp_dir = tempfile.TemporaryDirectory()
Expand Down
2 changes: 1 addition & 1 deletion eva_submission/xlsx/xlsx_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def check_reference_genome(self):
"""Check if the references can be retrieved"""
references = set([row['Reference'] for row in self.metadata['Analysis'] if row['Reference']])
for reference in references:
accessions = retrieve_genbank_assembly_accessions_from_ncbi(reference, api_key=cfg['eutils_api_key'])
accessions = retrieve_genbank_assembly_accessions_from_ncbi(reference, api_key=cfg.get('eutils_api_key'))
if len(accessions) == 0:
self.error_list.append(f'In Analysis, Reference {reference} did not resolve to any accession')
elif len(accessions) > 1:
Expand Down
2 changes: 0 additions & 2 deletions tests/resources/submission_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,3 @@ ena:
ftpport: 22
username: demo
password: password

eutils_api_key: test
8 changes: 6 additions & 2 deletions tests/test_eload_backlog.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,12 @@ def setUp(self):
self.eload = EloadBacklog(44)

def tearDown(self):
if os.path.exists(os.path.join(self.eload._get_dir('ena'), 'IRIS_313-8755.snp.vcf.gz.tbi')):
os.remove(os.path.join(self.eload._get_dir('ena'), 'IRIS_313-8755.snp.vcf.gz.tbi'))
for f in [
os.path.join(self.eload._get_dir('ena'), 'IRIS_313-8755.snp.vcf.gz.tbi'),
os.path.join(self.eload._get_dir('vcf'), 'IRIS_313-8755.snp.vcf.gz.tbi')
]:
if os.path.exists(f):
os.remove(f)
# necessary because test instances are retained during a run and content is a class variable
from eva_submission.submission_config import EloadConfig
EloadConfig.content = {}
Expand Down

0 comments on commit da26f74

Please sign in to comment.