Skip to content

Commit

Permalink
EVA-3147 Use Spring properties generator for generating properties fi…
Browse files Browse the repository at this point in the history
…les for accession and variant load (#146)

generate the properties file using springPropertiesGenerator
  • Loading branch information
nitin-ebi authored May 9, 2023
1 parent e011363 commit f8c6f1e
Show file tree
Hide file tree
Showing 14 changed files with 199 additions and 480 deletions.
106 changes: 50 additions & 56 deletions eva_submission/eload_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@
from cached_property import cached_property
from ebi_eva_common_pyutils import command_utils
from ebi_eva_common_pyutils.config import cfg
from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile, get_primary_mongo_creds_for_profile, \
get_accession_pg_creds_for_profile, get_count_service_creds_for_profile
from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile
from ebi_eva_common_pyutils.ena_utils import get_assembly_name_and_taxonomy_id
from ebi_eva_common_pyutils.metadata_utils import resolve_variant_warehouse_db_name, insert_new_assembly_and_taxonomy, \
get_assembly_set_from_metadata, add_to_supported_assemblies
from ebi_eva_common_pyutils.ncbi_utils import get_ncbi_assembly_dicts_from_term, \
retrieve_species_scientific_name_from_tax_id_ncbi
from ebi_eva_common_pyutils.ncbi_utils import get_species_name_from_ncbi
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query
from ebi_eva_common_pyutils.spring_properties import SpringPropertiesGenerator
from ebi_eva_common_pyutils.assembly.assembly import get_supported_asm_from_ensembl
Expand All @@ -25,7 +23,6 @@
from eva_submission.eload_utils import provision_new_database_for_variant_warehouse
from eva_submission.submission_config import EloadConfig
from eva_submission.vep_utils import get_vep_and_vep_cache_version
from eva_submission.ingestion_templates import accession_props_template, variant_load_props_template

project_dirs = {
'logs': '00_logs',
Expand Down Expand Up @@ -121,30 +118,11 @@ def fill_vep_versions(self, vep_cache_assembly_name=None):
assembly_accession,
vep_cache_assembly_name
)
vep_species = self.get_species_name_from_ncbi(assembly_accession)
vep_species = get_species_name_from_ncbi(assembly_accession)
self.eload_cfg.set(self.config_section, 'vep', assembly_accession, 'version', value=vep_version)
self.eload_cfg.set(self.config_section, 'vep', assembly_accession, 'cache_version', value=vep_cache_version)
self.eload_cfg.set(self.config_section, 'vep', assembly_accession, 'species', value=vep_species)

def get_species_name_from_ncbi(self, assembly_acc):
# We first need to search for the species associated with the assembly
assembly_dicts = get_ncbi_assembly_dicts_from_term(assembly_acc)
taxid_and_assembly_name = set([
(assembly_dict.get('taxid'), assembly_dict.get('assemblyname'))
for assembly_dict in assembly_dicts
if assembly_dict.get('assemblyaccession') == assembly_acc or
assembly_dict.get('synonym', {}).get('genbank') == assembly_acc
])
# This is a search so could retrieve multiple results
if len(taxid_and_assembly_name) != 1:
raise ValueError(f'Multiple assembly found for {assembly_acc}. '
f'Cannot resolve single assembly for assembly {assembly_acc} in NCBI.')

taxonomy_id, assembly_name = taxid_and_assembly_name.pop()

scientific_name = retrieve_species_scientific_name_from_tax_id_ncbi(taxonomy_id)
return scientific_name.replace(' ', '_').lower()

def _get_vcf_files_from_brokering(self):
vcf_files = []
analyses = self.eload_cfg.query('brokering', 'analyses')
Expand Down Expand Up @@ -324,56 +302,39 @@ def _generate_csv_mappings_to_ingest(self):
return vcf_files_to_ingest

def run_accession_workflow(self, vcf_files_to_ingest, resume):
mongo_host, mongo_user, mongo_pass = get_primary_mongo_creds_for_profile(self.maven_profile,
self.private_settings_file)
pg_url, pg_user, pg_pass = get_accession_pg_creds_for_profile(self.maven_profile, self.private_settings_file)
counts_url, counts_user, counts_pass = get_count_service_creds_for_profile(self.maven_profile,
self.private_settings_file)
job_props = accession_props_template(
taxonomy_id=self.taxonomy,
project_accession=self.project_accession,
instance_id=self.eload_cfg.query(self.config_section, 'accession', 'instance_id'),
mongo_host=mongo_host,
mongo_user=mongo_user,
mongo_pass=mongo_pass,
postgres_url=pg_url,
postgres_user=pg_user,
postgres_pass=pg_pass,
counts_url=counts_url,
counts_user=counts_user,
counts_pass=counts_pass
)
instance_id = self.eload_cfg.query(self.config_section, 'accession', 'instance_id')
output_dir = os.path.join(self.project_dir, project_dirs['accessions'])
accession_properties_file = self.create_accession_properties(instance_id=instance_id,
output_file_path=os.path.join(output_dir, 'accession.properties'))
accession_config = {
'valid_vcfs': vcf_files_to_ingest,
'project_accession': self.project_accession,
'instance_id': self.eload_cfg.query(self.config_section, 'accession', 'instance_id'),
'accession_job_props': job_props,
'instance_id': instance_id,
'accession_job_props': accession_properties_file,
'public_ftp_dir': cfg['public_ftp_dir'],
'accessions_dir': os.path.join(self.project_dir, project_dirs['accessions']),
'public_dir': os.path.join(self.project_dir, project_dirs['public']),
'logs_dir': os.path.join(self.project_dir, project_dirs['logs']),
'executable': cfg['executable'],
'jar': cfg['jar'],
'taxonomy': self.taxonomy
}
self.run_nextflow('accession', accession_config, resume)

def run_variant_load_workflow(self, vcf_files_to_ingest, annotation_only, resume):
job_props = variant_load_props_template(
project_accession=self.project_accession,
study_name=self.get_study_name(),
output_dir=self.project_dir.joinpath(project_dirs['transformed']),
annotation_dir=self.project_dir.joinpath(project_dirs['annotation']),
stats_dir=self.project_dir.joinpath(project_dirs['stats']),
)
variant_load_properties_file = self.create_variant_load_properties(
output_file_path=os.path.join(self.project_dir, 'variant_load.properties'))
accession_import_properties_file = self.create_accession_import_properties(
output_file_path=os.path.join(self.project_dir, 'accession_import.properties'))

load_config = {
'valid_vcfs': vcf_files_to_ingest,
'vep_path': cfg['vep_path'],
'load_job_props': job_props,
'acc_import_job_props': {'db.collections.variants.name': 'variants_2_0'},
'load_job_props': variant_load_properties_file,
'acc_import_job_props': accession_import_properties_file,
'project_accession': self.project_accession,
'project_dir': str(self.project_dir),
'logs_dir': os.path.join(self.project_dir, project_dirs['logs']),
'eva_pipeline_props': cfg['eva_pipeline_props'],
'executable': cfg['executable'],
'jar': cfg['jar'],
'annotation_only': annotation_only,
Expand Down Expand Up @@ -496,6 +457,39 @@ def create_clustering_properties(self, output_file_path, clustering_instance, ta
open_file.write(properties)
return output_file_path

def create_accession_properties(self, instance_id, output_file_path):
properties = self.properties_generator.get_accessioning_properties(
instance=instance_id,
target_assembly=self._get_target_assembly(),
project_accession=self.project_accession,
taxonomy_accession=self.taxonomy
)
with open(output_file_path, 'w') as open_file:
open_file.write(properties)
return output_file_path

def create_variant_load_properties(self, output_file_path):
properties = self.properties_generator.get_variant_load_properties(
project_accession=self.project_accession,
study_name=self.get_study_name(),
output_dir=self.project_dir.joinpath(project_dirs['transformed']),
annotation_dir=self.project_dir.joinpath(project_dirs['annotation']),
stats_dir=self.project_dir.joinpath(project_dirs['stats']),
vep_cache_path=cfg['vep_cache_path'],
opencga_path=cfg['opencga_path']
)
with open(output_file_path, 'w') as open_file:
open_file.write(properties)
return output_file_path

def create_accession_import_properties(self, output_file_path):
properties = self.properties_generator.get_accession_import_properties(
opencga_path=cfg['opencga_path']
)
with open(output_file_path, 'w') as open_file:
open_file.write(properties)
return output_file_path

def insert_browsable_files(self):
with self.metadata_connection_handle as conn:
# insert into browsable file table, if files not already there
Expand Down
1 change: 1 addition & 0 deletions eva_submission/etc/example_submission_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ eutils_api_key: 1234556lkxflk
eva_pipeline_props: '/path/to/pipeline/properties'
vep_path: '/path/to/vep'
vep_cache_path: '/path/to/vep/cache'
opencga_path: '/path/to/opencga'

maven:
environment: 'internal'
Expand Down
102 changes: 0 additions & 102 deletions eva_submission/ingestion_templates.py

This file was deleted.

Loading

0 comments on commit f8c6f1e

Please sign in to comment.