Skip to content

Commit

Permalink
Removed study name from pipeline arguments and added EVA version number
Browse files Browse the repository at this point in the history
  • Loading branch information
Cristina Yenyxe Gonzalez Garcia committed Jan 15, 2015
1 parent 0f2c620 commit 79e8e6a
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 25 deletions.
8 changes: 5 additions & 3 deletions accessioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ class VariantsAccessioning(luigi.Task):

# TODO Possibly implement a FileParameter or PathParameter class?
file = luigi.Parameter(description='Input VCF file to process and load')
version = luigi.Parameter(description='EVA version where the file is released')
vcf_dir = luigi.Parameter(description='Folder for storage of EVA VCF files')

def requires(self):
return []

def run(self):
# Get study prefix and its last accession
info = evapro_adaptor.get_variant_accessioning_info(os.path.basename(self.file))
info = evapro_adaptor.get_variant_accessioning_info(os.path.basename(self.file), self.version)
if not info:
raise evapro_adaptor.EvaproError('Filename not found in EVAPRO')
(study_id, study_prefix, last_accession) = info
Expand Down Expand Up @@ -79,10 +80,11 @@ class SaveLastAccession(luigi.Task):
"""

file = luigi.Parameter(description='Input VCF file to process and load')
version = luigi.Parameter(description='EVA version where the file is released')
vcf_dir = luigi.Parameter(description='Folder for storage of EVA VCF files')

def requires(self):
return VariantsAccessioning(self.file, self.vcf_dir)
return VariantsAccessioning(self.file, self.version, self.vcf_dir)

def run(self):
# Get the last lines in self.input()
Expand All @@ -105,7 +107,7 @@ def run(self):

print 'Last accession ID generated = ' + max_accession

evapro_adaptor.save_last_accession(os.path.basename(self.file), max_accession)
evapro_adaptor.save_last_accession(os.path.basename(self.file), self.version, max_accession)

def output(self):
return self.input()
Expand Down
14 changes: 7 additions & 7 deletions eva_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ class VariantsLoading(luigi.Task):
# TODO Possible FileParameter or PathParameter class?
file = luigi.Parameter(description='Input VCF file to process and load')
vcf_dir = luigi.Parameter(description='Folder for storage of EVA VCF files')
version = luigi.Parameter(description='EVA version where the file is released')
json_dir = luigi.Parameter(description='Folder for storage of EVA JSON files')

study_name = luigi.Parameter(description='Full name of the study of this input file')
aggregated = luigi.BooleanParameter(default=False)

def requires(self):
return VariantsTransformation(self.file, self.vcf_dir, self.json_dir, self.study_name, self.aggregated)
return VariantsTransformation(self.file, self.version, self.vcf_dir, self.json_dir, self.aggregated)

def run(self):
# Get input files root name (remove .gz, then .json, then .file)
Expand Down Expand Up @@ -96,21 +96,21 @@ class VariantsTransformation(luigi.Task):
"""

file = luigi.Parameter(description='Input VCF file to process and load')
version = luigi.Parameter(description='EVA version where the file is released')
vcf_dir = luigi.Parameter(description='Folder for storage of EVA VCF files')
json_dir = luigi.Parameter(description='Folder for storage of EVA JSON files')

study_name = luigi.Parameter(description='Full name of the study of this input file')
aggregated = luigi.BooleanParameter(default=False)

def requires(self):
return SaveLastAccession(self.file, self.vcf_dir)
return SaveLastAccession(self.file, self.version, self.vcf_dir)

def run(self):
# Get study and file ID
info = evapro_adaptor.get_study_and_file_id(os.path.basename(self.file))
info = evapro_adaptor.get_study_and_file_id(os.path.basename(self.file), self.version)
if not info:
raise evapro_adaptor.EvaproError('Filename not found in EVAPRO')
(study_alias, file_alias) = info
(study_alias, study_name, file_alias) = info

# TODO --include-effect when VEP is ready
config = configuration.get_opencga_config('pipeline_config.conf')
Expand All @@ -120,7 +120,7 @@ def run(self):
kwargs = {'opencga-root': config['root_folder'],
'input': self.input().fn,
'outdir': self.json_dir,
'study': self.study_name,
'study': study_name,
'file-alias': file_alias,
'study-alias': study_alias}

Expand Down
30 changes: 15 additions & 15 deletions evapro_adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def disconnect(connection):
connection.close()


def get_study_and_file_id(filename):
def get_study_and_file_id(filename, eva_version):
"""
Given a filename, returns the submission ID of the project where it is classified and the file submission ID in ENA.
Expand All @@ -30,20 +30,21 @@ def get_study_and_file_id(filename):
conn = connect()
cursor = conn.cursor()

cursor.execute('SELECT project.project_accession, file.ena_submission_file_id '
'FROM project, project_analysis, analysis_file, file '
cursor.execute('SELECT project.project_accession, project.title, browsable_file.ena_submission_file_id '
'FROM project, project_analysis, analysis_file, browsable_file '
'WHERE project.project_accession = project_analysis.project_accession '
'AND project_analysis.analysis_accession = analysis_file.analysis_accession '
'AND analysis_file.file_id = file.file_id AND file.filename = \'{fname}\''
.format(fname=filename))
'AND analysis_file.file_id = browsable_file.file_id AND browsable_file.filename = \'{fname}\' '
'AND browsable_file.eva_release = \'{version}\' '
.format(fname=filename, version=eva_version))

rows = tuple(cursor)
info = rows[0] if rows and rows[0] else None
disconnect(conn)
return info


def get_variant_accessioning_info(filename):
def get_variant_accessioning_info(filename, eva_version):
"""
Given a filename, returns the accession ID of the project where it is classified, its prefix for variant
accessioning and the last variant accession generated.
Expand All @@ -54,28 +55,27 @@ def get_variant_accessioning_info(filename):
conn = connect()
cursor = conn.cursor()

cursor.execute('SELECT project.project_accession, project_var_accession.project_prefix, '
'project_var_accession.last_used_accession '
'FROM project_var_accession, project, project_analysis, analysis_file, file '
# 'WHERE project_var_accession.project_accession_code = project.project_accession_code '
'WHERE project_var_accession.project_accession_code = project.project_accession '
cursor.execute('SELECT project.project_accession, pva.project_prefix, pva.last_used_accession '
'FROM project_var_accession as pva, project, project_analysis, analysis_file, browsable_file '
'WHERE pva.project_accession_code = project.project_accession_code '
'AND project.project_accession = project_analysis.project_accession '
'AND project_analysis.analysis_accession = analysis_file.analysis_accession '
'AND analysis_file.file_id = file.file_id AND file.filename = \'{fname}\''
.format(fname=filename))
'AND analysis_file.file_id = browsable_file.file_id AND browsable_file.filename = \'{fname}\' '
'AND browsable_file.eva_release = \'{version}\' '
.format(fname=filename, version=eva_version))

rows = tuple(cursor)
info = rows[0] if rows and rows[0] else None
disconnect(conn)
return info


def save_last_accession(filename, last_accession):
def save_last_accession(filename, eva_version, last_accession):
conn = connect()
cursor = conn.cursor()

# Get study prefix
info = get_variant_accessioning_info(filename)
info = get_variant_accessioning_info(filename, eva_version)
if not info:
raise EvaproError('Filename not found in EVAPRO')
study_prefix = info[1]
Expand Down

0 comments on commit 79e8e6a

Please sign in to comment.