EVA-3263 - Shallow validation #54

Merged: 8 commits, Sep 9, 2024
Changes from all commits
29 changes: 1 addition & 28 deletions eva_sub_cli/executables/check_fasta_insdc.py
@@ -10,6 +10,7 @@
from requests import HTTPError
from retry import retry

from eva_sub_cli.file_utils import fasta_iter
from eva_sub_cli.metadata_utils import get_files_per_analysis, get_analysis_for_vcf_file, \
get_reference_assembly_for_analysis

@@ -19,13 +20,6 @@
logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
if input_file.endswith('.gz'):
return gzip.open(input_file, 'rt')
else:
return open(input_file, 'r')


def write_result_yaml(output_yaml, results):
with open(output_yaml, 'w') as open_yaml:
yaml.safe_dump(data=results, stream=open_yaml)
@@ -34,27 +28,6 @@ def write_result_yaml(output_yaml, results):
def refget_md5_digest(sequence):
return hashlib.md5(sequence.upper().encode('utf-8')).hexdigest()


def fasta_iter(input_fasta):
"""
Given a fasta file. yield tuples of header, sequence
"""
# first open the file outside
with open(input_fasta, 'r') as open_file:

# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.
faiter = (x[1] for x in groupby(open_file, lambda line: line[0] == ">"))

for header in faiter:
# drop the ">"
headerStr = header.__next__()[1:].strip()

# join all sequence lines to one.
seq = "".join(s.strip() for s in faiter.__next__())
yield (headerStr, seq)


@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_refget_metadata(md5_digest):
response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata')
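
For orientation, a minimal sketch of how the pieces in this file fit together now that fasta_iter lives in file_utils. The loop and the not-found handling are illustrative assumptions; the full body of get_refget_metadata is elided above:

    from eva_sub_cli.file_utils import fasta_iter

    for header, sequence in fasta_iter('genome.fa'):  # hypothetical path
        md5 = refget_md5_digest(sequence)       # MD5 of the uppercased sequence
        metadata = get_refget_metadata(md5)     # GET {REFGET_SERVER}/sequence/<md5>/metadata
        if not metadata:                        # assuming a falsy return for unknown sequences
            print(f'{header.split()[0]} is not known to INSDC')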
19 changes: 13 additions & 6 deletions eva_sub_cli/executables/cli.py
@@ -35,8 +35,7 @@ def validate_command_line_arguments(args, argparser):
print(f"'{args.submission_dir}' does not have write permissions or is not a directory.")
sys.exit(1)


def main():
def parse_args(cmd_line_args):
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
@@ -67,18 +66,26 @@ def main():
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')
    argparser.add_argument('--shallow', action='store_true', default=False,
                           help='Set the validation to be performed on the first 10000 records of the VCF. '
                                'Only applies if the number of records exceeds 10000')
argparser.add_argument('--debug', action='store_true', default=False,
help='Set the script to output debug messages')
args = argparser.parse_args(cmd_line_args)
validate_command_line_arguments(args, argparser)
return args

args = argparser.parse_args()

validate_command_line_arguments(args, argparser)
def main():

args = parse_args(sys.argv[1:])

args.submission_dir = os.path.abspath(args.submission_dir)

if args.debug:
logging_config.add_stdout_handler(logging.DEBUG)
else:
logging_config.add_stdout_handler()
logging_config.add_stdout_handler(logging.INFO)
logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)

try:
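
Pulling parse_args out of main makes the argument handling testable in isolation. A minimal sketch of the kind of test this enables, assuming a writable submission_dir and that no other required flags sit in the elided hunks:

    from eva_sub_cli.executables.cli import parse_args

    def test_shallow_flag(tmp_path):
        args = parse_args(['--submission_dir', str(tmp_path), '--shallow'])
        assert args.shallow is True
        assert args.debug is False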
8 changes: 1 addition & 7 deletions eva_sub_cli/executables/samples_checker.py
@@ -7,18 +7,12 @@

import yaml

from eva_sub_cli.file_utils import open_gzip_if_required
from eva_sub_cli.metadata_utils import get_samples_per_analysis, get_files_per_analysis, get_analysis_for_vcf_file

logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
if input_file.endswith('.gz'):
return gzip.open(input_file, 'rt')
else:
return open(input_file, 'r')


def get_samples_from_vcf(vcf_file):
"""
Get the list of samples present in a single VCF file
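
The body of get_samples_from_vcf is elided above; a plausible sketch of how it would use the shared helper (the header parsing shown here is an assumption, not code from this diff):

    from eva_sub_cli.file_utils import open_gzip_if_required

    def get_samples_from_vcf(vcf_file):
        # sample IDs are the columns after FORMAT on the #CHROM header line
        with open_gzip_if_required(vcf_file) as vcf_in:
            for line in vcf_in:
                if line.startswith('#CHROM'):
                    return line.strip().split('\t')[9:]
        return []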
78 changes: 78 additions & 0 deletions eva_sub_cli/executables/trim_down.py
@@ -0,0 +1,78 @@
import argparse
import os

import yaml
from ebi_eva_common_pyutils.logger import logging_config
from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter

logger = logging_config.get_logger(__name__)


max_nb_lines = 10000


def trim_down_vcf(vcf_file, output_vcf):
"""
    Produce a smaller VCF file containing at most 10000 records.
"""
with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out:
line_count = 0
ref_seq_names = set()
for line in vcf_in:
if line.startswith('#') or line_count < max_nb_lines:
vcf_out.write(line)
if not line.startswith('#'):
line_count += 1
ref_seq_names.add(line.split('\t')[0])
else:
break
if line_count != max_nb_lines:
        logger.warning(f'Only {line_count} records found in the source VCF {vcf_file}')
return line_count, ref_seq_names


def trim_down_fasta(fasta_file, output_fasta, ref_seq_names):
"""
    Produce a smaller FASTA file containing only the reference sequences named in the VCF file.
"""
found_sequences = set()
with open(output_fasta, 'w') as fasta_out:
for header, sequence in fasta_iter(fasta_file):
name = header.split()[0]
if name in ref_seq_names:
found_sequences.add(name)
print(f'>{header}', file=fasta_out)
for i in range(0, len(sequence), 80):
print(sequence[i:i+80], file=fasta_out)
return found_sequences


def main():
arg_parser = argparse.ArgumentParser(
        description=f'Take a VCF file and keep only the first {max_nb_lines} records, removing unused sequences '
                    f'from the associated reference genome')
arg_parser.add_argument('--vcf_file', dest='vcf_file', required=True,
help='Path to the vcf file to be trimmed down')
arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', required=True,
help='Path to the output vcf file')
arg_parser.add_argument('--fasta_file', dest='fasta_file', required=True,
help='Path to the fasta file to be trimmed down')
arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', required=True,
help='Path to the output fasta file')
arg_parser.add_argument('--output_yaml_file', dest='output_yaml_file', required=True,
help='Path to the yaml file containing the trim down metrics')

args = arg_parser.parse_args()
logging_config.add_stdout_handler()

line_count, ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence)
    trim_down_metrics = {'trim_down_vcf_record': line_count, 'number_sequence_found': len(sequence_found),
                         'trim_down_required': line_count == max_nb_lines}
    if len(sequence_found) != len(ref_sequence):
        logger.warning('Not all reference sequences were found in the FASTA file; cancelling the FASTA trim-down')
        # the trimmed FASTA was already written, so remove it before hard-linking the original in its place
        os.remove(args.output_fasta_file)
        os.link(args.fasta_file, args.output_fasta_file)
        trim_down_metrics.pop('number_sequence_found')
    with open(args.output_yaml_file, 'w') as open_file:
        yaml.safe_dump(trim_down_metrics, open_file)
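
A quick sketch of driving the two functions above directly (paths are illustrative; in the pipeline the script is invoked by the Nextflow process further down):

    from eva_sub_cli.executables.trim_down import trim_down_vcf, trim_down_fasta

    # keep the first 10000 records and collect the reference sequence names they use
    line_count, ref_seq_names = trim_down_vcf('input.vcf.gz', 'trimmed.vcf')
    # keep only the FASTA entries those records actually reference
    found = trim_down_fasta('genome.fa', 'trimmed.fa', ref_seq_names)
    print(line_count, len(found))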

38 changes: 38 additions & 0 deletions eva_sub_cli/file_utils.py
@@ -1,5 +1,16 @@
import glob
import gzip
import os
import shutil
from itertools import groupby


def resolve_single_file_path(file_path):
files = glob.glob(file_path)
    if len(files) == 0:
        return None
    else:
        return files[0]


def is_submission_dir_writable(submission_dir):
@@ -32,3 +43,30 @@ def backup_file_or_directory(file_name, max_backups=None):
else:
os.rename(f'{file_name}.{i - 1}', f'{file_name}.{i}')
os.rename(file_name, file_name + '.1')


def open_gzip_if_required(input_file):
"""Open a file in read mode using gzip if the file extension says .gz"""
if input_file.endswith('.gz'):
return gzip.open(input_file, 'rt')
else:
return open(input_file, 'r')


def fasta_iter(input_fasta):
    """
    Given a FASTA file, yield tuples of (header, sequence).
    """
    # open transparently, whether the file is plain text or gzipped
    with open_gzip_if_required(input_fasta) as open_file:
        # ditch the boolean (x[0]) and just keep the header or sequence group,
        # since we know they alternate.
        faiter = (x[1] for x in groupby(open_file, lambda line: line[0] == ">"))

        for header in faiter:
            # drop the ">"
            header_str = next(header)[1:].strip()

            # join all sequence lines into one.
            seq = "".join(s.strip() for s in next(faiter))
            yield (header_str, seq)
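
A small usage sketch of the helpers now shared via file_utils (path hypothetical; gzipped input is handled transparently by open_gzip_if_required):

    from eva_sub_cli.file_utils import fasta_iter

    for header, seq in fasta_iter('genome.fa.gz'):
        # the first whitespace-delimited token of the header is the sequence name
        print(header.split()[0], len(seq))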
3 changes: 3 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
@@ -4,6 +4,7 @@
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}
{% from 'shallow_validation.html' import optional_shallow_validation_report %}

<html lang="EN">
<head>
@@ -46,6 +47,8 @@ <h6>eva-sub-cli v{{cli_version}}</h6>
</div>
</header>

{{ optional_shallow_validation_report(validation_results) }}

<section>
<h2>Project Summary</h2>
<div class="description">
2 changes: 2 additions & 0 deletions eva_sub_cli/jinja_templates/project_details.html
@@ -32,4 +32,6 @@
</div>
{% endif %}



{%- endmacro %}
29 changes: 29 additions & 0 deletions eva_sub_cli/jinja_templates/shallow_validation.html
@@ -0,0 +1,29 @@

{% macro optional_shallow_validation_report(validation_results) -%}
{% set results = validation_results.get('shallow_validation', {}) %}

{% if results.get('required') %}
<section>
<div class="report-section fail collapsible"> <span class="expand_icon">&#9654;</span>
        &#10060; <b>You requested shallow validation; please run full validation before submitting your data</b>
</div>
<div class="no-show">
<table>
<tr>
<th><strong>VCF File</strong></th>
                <th><strong>Variant records validated in VCF</strong></th>
                <th><strong>Reference sequences used from FASTA</strong></th>
</tr>
{% for vcf_file in results.get('metrics') %}
<tr>
<td>{{ vcf_file }}</td>
<td>{{ results.get('metrics').get(vcf_file).get('trim_down_vcf_record') }}</td>
<td>{{ results.get('metrics').get(vcf_file).get('number_sequence_found') }}</td>
</tr>
{% endfor %}
</table>
</div>
</section>
{% endif %}

{%- endmacro %}
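
For reference, the shape of validation_results this macro consumes, inferred from the lookups above (file name and values are illustrative):

    validation_results = {
        'shallow_validation': {
            'required': True,  # shallow validation ran, so the banner above is shown
            'metrics': {
                'example.vcf': {'trim_down_vcf_record': 10000, 'number_sequence_found': 3},
            },
        },
    }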
47 changes: 35 additions & 12 deletions eva_sub_cli/nextflow/validation.nf
@@ -30,12 +30,14 @@ params.python_scripts = [
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"semantic_checker": "check_metadata_semantics.py"
"semantic_checker": "check_metadata_semantics.py",
"trim_down": "trim_down.py"
]
// prefix to prepend to all provided paths
params.base_dir = ""
// help
params.help = null
params.shallow_validation = false

// Show help message
if (params.help) exit 0, helpMessage()
@@ -63,20 +65,23 @@ output_dir = joinBasePath(params.output_dir)

workflow {
// Prepare the file path
vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
vcf_and_ref_ch = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> tuple(
file(joinBasePath(row.vcf)),
file(joinBasePath(row.fasta)),
file(joinBasePath(row.report))
)}
vcf_files = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> file(joinBasePath(row.vcf))}

if (params.shallow_validation){
        // create a smaller VCF and FASTA, then replace the channel
trim_down_vcf(vcf_and_ref_ch)
vcf_and_ref_ch = trim_down_vcf.out.vcf_and_ref
}
vcf_files = vcf_and_ref_ch.map{row -> row[0]}
fasta_to_vcfs = vcf_and_ref_ch.map{row -> tuple(row[1], row[0])}.groupTuple(by:0)
// VCF checks
check_vcf_valid(vcf_channel)
check_vcf_reference(vcf_channel)
check_vcf_valid(vcf_and_ref_ch)
check_vcf_reference(vcf_and_ref_ch)

generate_file_size_and_md5_digests(vcf_files)
collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())
@@ -94,14 +99,32 @@ workflow {
metadata_json_validation(metadata_json)
metadata_semantic_check(metadata_json)
sample_name_concordance(metadata_json, vcf_files.collect())
fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> tuple(file(joinBasePath(row.fasta)), file(joinBasePath(row.vcf)))}
.groupTuple(by:0)
insdc_checker(metadata_json, fasta_to_vcfs)
}
}


process trim_down_vcf {
publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.log"
publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.yml"

input:
tuple path(vcf), path(fasta), path(report)

output:
tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref
path "${vcf.getBaseName()}_trim_down.log", emit: trim_down_log
path "${vcf.getBaseName()}_trim_down.yml", emit: trim_down_metric

"""
mkdir output
$params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta --output_yaml_file ${vcf.getBaseName()}_trim_down.yml > ${vcf.getBaseName()}_trim_down.log
# This is needed to ensure that a missing (NO_FILE) report can still be passed down to subsequent steps
touch $report
"""

}
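
Illustrative only: how the CLI might pass the new flag through to this workflow. The runner code is not part of this diff, so the command wiring below is an assumption:

    import subprocess

    cmd = ['nextflow', 'run', 'validation.nf',
           '--vcf_files_mapping', 'mapping.csv', '--output_dir', 'out']  # hypothetical paths
    if shallow:  # set from eva-sub-cli's --shallow argument
        cmd.append('--shallow_validation')
    subprocess.run(cmd, check=True)  # trim_down_vcf then feeds the trimmed files to the checks above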

/*
* Validate the VCF file format
*/