Sample Ownership (#204)

* New script to check who owns existing samples from the Sample spreadsheet. * Fix analysis location change * Change download URL * Update bin/check_existing_sample_ownership.py Co-authored-by: April Shen <april.tuesday@gmail.com>
EBIvariation · Apr 23, 2024 · da26f74 · da26f74
1 parent 083da74
commit da26f74
Show file tree

Hide file tree

Showing 9 changed files with 111 additions and 66 deletions.
diff --git a/bin/check_existing_sample_ownership.py b/bin/check_existing_sample_ownership.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+# Copyright 2023 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import csv
+
+from ebi_eva_common_pyutils.config import cfg
+from ebi_eva_common_pyutils.logger import logging_config as log_cfg
+
+from eva_submission.biosample_submission.biosamples_submitters import AAPHALCommunicator
+from eva_submission.submission_config import load_config
+from eva_submission.xlsx.xlsx_parser_eva import EvaXlsxReader
+
+
+def main():
+    """Report who owns the pre-existing BioSamples accessions listed in a metadata spreadsheet.
+
+    Reads the sample rows of ``--metadata_file``, looks up every non-empty 'Sample Accession' through the
+    BioSamples communicator and writes one CSV row per accession to ``--output``. The 'owner' column is set
+    to 'EVA', 'BioSamples' or 'Third party' based on the sample's domain / Webin submission account.
+    Accessions whose lookup raises ValueError are written with status 'PRIVATE' and an empty owner.
+    """
+    arg_parser = argparse.ArgumentParser(
+        description='Check ownership of existing Biosamples accessions from a metadata file')
+    arg_parser.add_argument('--metadata_file', required=True,
+                            help='Spreadsheet file containing the sample information. '
+                                 'It should contains some pre-existing BioSample accession')
+    arg_parser.add_argument('--output', required=True,
+                            help='CSV file containing the ownership information for all existing samples in the '
+                                 'metadata spreadsheet')
+    args = arg_parser.parse_args()
+
+    log_cfg.add_stdout_handler()
+
+    # Load the config_file from default location
+    load_config()
+    metadata_reader = EvaXlsxReader(args.metadata_file)
+    communicator = AAPHALCommunicator(cfg.query('biosamples', 'aap_url'), cfg.query('biosamples', 'bsd_url'),
+                                      cfg.query('biosamples', 'username'), cfg.query('biosamples', 'password'),
+                                      cfg.query('biosamples', 'domain'))
+    sample_attrs = ['accession', 'name', 'domain', 'webinSubmissionAccountId', 'status']
+    # newline='' lets the csv module control line endings (avoids spurious blank lines on some platforms)
+    with open(args.output, 'w', newline='') as open_output:
+        writer = csv.DictWriter(open_output, fieldnames=sample_attrs + ['owner'])
+        writer.writeheader()
+        for sample_row in metadata_reader.samples:
+            # Only rows carrying an accession refer to existing samples; strip before testing so that a
+            # whitespace-only cell is not queried as an empty accession
+            sample_accession = (sample_row.get('Sample Accession') or '').strip()
+            if not sample_accession:
+                continue
+            # Seed the row with the accession so it stays identifiable even if the lookup returns nothing
+            res = {'accession': sample_accession}
+            try:
+                json_response = communicator.follows_link('samples', join_url=sample_accession)
+                if json_response:
+                    for attr in sample_attrs:
+                        res[attr] = json_response.get(attr)
+                    if res['domain'] == 'subs.team-31' or res['webinSubmissionAccountId'] == 'Webin-1008':
+                        res['owner'] = 'EVA'
+                    elif res['domain'] == 'self.BiosampleImportNCBI':
+                        res['owner'] = 'BioSamples'
+                    else:
+                        res['owner'] = 'Third party'
+            except ValueError:
+                # The lookup failed: record the accession as private/non-existent with no owner
+                print(f'{sample_accession} does not exist or is private')
+                res = {'accession': sample_accession, 'name': '', 'domain': '', 'webinSubmissionAccountId': '',
+                       'status': 'PRIVATE', 'owner': ''}
+            writer.writerow(res)
+
+
+# Run the ownership check only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()
diff --git a/bin/check_sample_exist.py b/bin/check_sample_exist.py
diff --git a/eva_submission/eload_backlog.py b/eva_submission/eload_backlog.py
@@ -1,6 +1,7 @@
 import os
 import urllib
 
+import requests
 from cached_property import cached_property
 from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query
 
@@ -118,14 +119,34 @@ def find_local_file(self, fn):
             raise FileNotFoundError(f'File not found: {full_path}')
         return full_path
 
+    def _get_files_from_ena_analysis(self, analysis_accession):
+        """Find the location of the files submitted with an analysis.
+
+        Queries the ENA portal filereport API for the ``submitted_ftp`` field of the given analysis.
+
+        :param analysis_accession: ENA analysis accession to look up
+        :return: list of file locations taken from ``submitted_ftp`` (split on ';'); empty list when ENA
+                 reports no record or no submitted file
+        :raises requests.HTTPError: when ENA answers with an error status
+        """
+        analyses_url = (
+            f"https://www.ebi.ac.uk/ena/portal/api/filereport?result=analysis&accession={analysis_accession}"
+            f"&format=json&fields=submitted_ftp"
+        )
+        response = requests.get(analyses_url)
+        response.raise_for_status()
+        data = response.json()
+        if not data:
+            # Always return a list so callers get a consistent type
+            return []
+        # submitted_ftp can be missing or empty: do not call split() on None and do not return ['']
+        submitted_ftp = data[0].get('submitted_ftp') or ''
+        return [ftp_url for ftp_url in submitted_ftp.split(';') if ftp_url]
+
     def find_file_on_ena(self, fn, analysis):
         basename = os.path.basename(fn)
         full_path = os.path.join(self._get_dir('vcf'), basename)
         if not os.path.exists(full_path):
             try:
                 self.info(f'Retrieve {basename} in {analysis} from ENA ftp')
-                url = f'https://ftp.sra.ebi.ac.uk/vol1/{analysis[:6]}/{analysis}/{basename}'
-                download_file(url, full_path)
+                ftp_urls = self._get_files_from_ena_analysis(analysis)
+                urls = [ftp_url for ftp_url in ftp_urls if ftp_url.endswith(fn)]
+                if len(urls) == 1:
+                    url = 'https://' + urls[0]
+                    download_file(url, full_path)
+                else:
+                    self.error(f'Could find {fn} in analysis {analysis} on ENA: most likely does not exist')
+                    raise FileNotFoundError(f'File not found: {full_path}')
             except urllib.error.URLError:
                 self.error(f'Could not access {url} on ENA: most likely does not exist')
                 raise FileNotFoundError(f'File not found: {full_path}')

diff --git a/eva_submission/eload_ingestion.py b/eva_submission/eload_ingestion.py
@@ -206,7 +206,7 @@ def check_variant_db(self):
                     metadata_connection_handle=conn,
                     assembly_accession=assembly,
                     taxonomy_id=self.taxonomy,
-                    ncbi_api_key=cfg['eutils_api_key']
+                    ncbi_api_key=cfg.get('eutils_api_key')
                 )
 
         for db_info in assembly_to_db_name.values():

diff --git a/eva_submission/eload_utils.py b/eva_submission/eload_utils.py
@@ -26,14 +26,14 @@ def get_reference_fasta_and_report(species_name, reference_accession, output_dir
     if NCBIAssembly.is_assembly_accession_format(reference_accession):
         assembly = NCBIAssembly(
             reference_accession, species_name, output_directory,
-            eutils_api_key=cfg['eutils_api_key']
+            eutils_api_key=cfg.get('eutils_api_key')
         )
         if not os.path.isfile(assembly.assembly_fasta_path) or not os.path.isfile(assembly.assembly_report_path) or overwrite:
             assembly.download_or_construct(overwrite=overwrite)
         return assembly.assembly_fasta_path, assembly.assembly_report_path
     elif NCBISequence.is_genbank_accession_format(reference_accession):
         reference = NCBISequence(reference_accession, species_name, output_directory,
-                                 eutils_api_key=cfg['eutils_api_key'])
+                                 eutils_api_key=cfg.get('eutils_api_key'))
         if not os.path.isfile(reference.sequence_fasta_path) or overwrite:
             reference.download_contig_sequence_from_ncbi(genbank_only=True)
         return reference.sequence_fasta_path, None
@@ -54,7 +54,7 @@ def resolve_accession_from_text(reference_text):
     if NCBIAssembly.is_assembly_accession_format(reference_text):
         return [reference_text]
     # Search for a reference genome that resolve this text
-    accession = retrieve_genbank_assembly_accessions_from_ncbi(reference_text, api_key=cfg['eutils_api_key'])
+    accession = retrieve_genbank_assembly_accessions_from_ncbi(reference_text, api_key=cfg.get('eutils_api_key'))
     if accession:
         return accession
 

diff --git a/eva_submission/vep_utils.py b/eva_submission/vep_utils.py
@@ -139,7 +139,7 @@ def get_species_and_assembly(assembly_acc):
     Returns None if the taxonomy is not known.
     """
     # We first need to search for the species associated with the assembly
-    assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc, api_key=cfg['eutils_api_key'])
+    assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc, api_key=cfg.get('eutils_api_key'))
     taxid_and_assembly_name = set([
         (assembly_dict.get('taxid'), assembly_dict.get('assemblyname'))
         for assembly_dict in assembly_dicts
@@ -253,7 +253,7 @@ def recursive_nlst(ftp, root, pattern):
 
 @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3), logger=logger)
 def download_and_extract_vep_cache(ftp, vep_cache_file, taxonomy_id):
-    scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=cfg['eutils_api_key'])
+    scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id, api_key=cfg.get('eutils_api_key'))
     species_name = scientific_name.replace(' ', '_').lower()
 
     tmp_dir = tempfile.TemporaryDirectory()

diff --git a/eva_submission/xlsx/xlsx_validation.py b/eva_submission/xlsx/xlsx_validation.py
@@ -107,7 +107,7 @@ def check_reference_genome(self):
         """Check if the references can be retrieved"""
         references = set([row['Reference'] for row in self.metadata['Analysis'] if row['Reference']])
         for reference in references:
-            accessions = retrieve_genbank_assembly_accessions_from_ncbi(reference, api_key=cfg['eutils_api_key'])
+            accessions = retrieve_genbank_assembly_accessions_from_ncbi(reference, api_key=cfg.get('eutils_api_key'))
             if len(accessions) == 0:
                 self.error_list.append(f'In Analysis, Reference {reference} did not resolve to any accession')
             elif len(accessions) > 1:

diff --git a/tests/resources/submission_config.yml b/tests/resources/submission_config.yml
@@ -45,5 +45,3 @@ ena:
   ftpport: 22
   username: demo
   password: password
-
-eutils_api_key: test
diff --git a/tests/test_eload_backlog.py b/tests/test_eload_backlog.py
@@ -23,8 +23,12 @@ def setUp(self):
         self.eload = EloadBacklog(44)
 
     def tearDown(self):
-        if os.path.exists(os.path.join(self.eload._get_dir('ena'), 'IRIS_313-8755.snp.vcf.gz.tbi')):
-            os.remove(os.path.join(self.eload._get_dir('ena'), 'IRIS_313-8755.snp.vcf.gz.tbi'))
+        for f in [
+            os.path.join(self.eload._get_dir('ena'), 'IRIS_313-8755.snp.vcf.gz.tbi'),
+            os.path.join(self.eload._get_dir('vcf'), 'IRIS_313-8755.snp.vcf.gz.tbi')
+        ]:
+            if os.path.exists(f):
+                os.remove(f)
         # necessary because test instances are retained during a run and content is a class variable
         from eva_submission.submission_config import EloadConfig
         EloadConfig.content = {}