Skip to content

Commit

Permalink
New script and tests to check if sequences in fasta are from INSDC. A…
Browse files Browse the repository at this point in the history
…lso added report to HTML
  • Loading branch information
tcezard committed Feb 1, 2024
1 parent 1dc47a5 commit 5416002
Show file tree
Hide file tree
Showing 14 changed files with 4,104 additions and 32 deletions.
95 changes: 95 additions & 0 deletions bin/check_fasta_insdc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python

import argparse
import gzip
import hashlib

from itertools import groupby

import requests
from ebi_eva_common_pyutils.logger import logging_config

import yaml
from requests import HTTPError
from retry import retry

REFGET_SERVER = 'https://www.ebi.ac.uk/ena/cram'

logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
if input_file.endswith('.gz'):
return gzip.open(input_file, 'rt')
else:
return open(input_file, 'r')


def write_result_yaml(output_yaml, results):
with open(output_yaml, 'w') as open_yaml:
yaml.safe_dump(data=results, stream=open_yaml)


def refget_md5_digest(sequence):
return hashlib.md5(sequence.upper().encode('utf-8')).hexdigest()


def fasta_iter(input_fasta):
"""
Given a fasta file. yield tuples of header, sequence
"""
"first open the file outside "
fin = open(input_fasta, 'r')

# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.

faiter = (x[1] for x in groupby(fin, lambda line: line[0] == ">"))

for header in faiter:
# drop the ">"
headerStr = header.__next__()[1:].strip()

# join all sequence lines to one.
seq = "".join(s.strip() for s in faiter.__next__())
yield (headerStr, seq)


@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_refget_metadata(md5_digest):
response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata')
if 500 <= response.status_code < 600:
raise HTTPError(f"{response.status_code} Server Error: {response.reason} for url: {response.url}", response=response)
if 200 <= response.status_code < 300:
return response.json()
return None


def assess_fasta(input_fasta):
results = {'sequences': []}
all_insdc = True
for header, sequence in fasta_iter(input_fasta):
name = header.split()[0]
md5_digest = refget_md5_digest(sequence)
sequence_metadata = get_refget_metadata(md5_digest)
results['sequences'].append({'sequence_name': name, 'sequence_md5': md5_digest, 'insdc': bool(sequence_metadata)})
all_insdc = all_insdc and bool(sequence_metadata)
results['all_insdc']: all_insdc
return results


def main():
arg_parser = argparse.ArgumentParser(
description='Calculate the each sequence Refget MD5 identify and compare these against INSDC Refget server.')
arg_parser.add_argument('--input_fasta', required=True, dest='input_fasta',
help='Fasta file that contains the sequence to be checked')
arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
help='Path to the location of the results')
args = arg_parser.parse_args()
logging_config.add_stdout_handler()
results = assess_fasta(args.input_fasta)
write_result_yaml(args.output_yaml, results)


if __name__ == "__main__":
main()
19 changes: 6 additions & 13 deletions eva_sub_cli/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,22 @@ class DockerValidator(Reporter):

def __init__(self, mapping_file, output_dir, metadata_json=None,
metadata_xlsx=None, container_name=None, docker_path='docker', submission_config=None):
# validator write to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(mapping_file, os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
submission_config=submission_config)

self.docker_path = docker_path
self.mapping_file = mapping_file
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
self.container_name = container_name
if self.container_name is None:
self.container_name = container_image.split('/')[1] + '.' + container_tag
self.spreadsheet2json_conf = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
# validator write to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
super().__init__(self._find_vcf_file(), os.path.join(output_dir, VALIDATION_OUTPUT_DIR),
submission_config=submission_config)

def _validate(self):
self.run_docker_validator()

def _find_vcf_file(self):
vcf_files = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
vcf_files.append(row['vcf'])
return vcf_files

def get_docker_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
docker_cmd = (
Expand Down Expand Up @@ -97,6 +89,7 @@ def run_docker_validator(self):
self.copy_files_to_container()

docker_cmd = self.get_docker_validation_cmd()
print(docker_cmd)
# start validation
# FIXME: If nextflow fails in the docker exec still exit with error code 0
run_command_with_output("Run Validation using Nextflow", docker_cmd)
Expand Down
30 changes: 30 additions & 0 deletions eva_sub_cli/jinja_templates/fasta_check.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

{% macro fasta_check_report(validation_results) -%}
{% for fasta_file, results_for_fasta in validation_results.get('fasta_check', {}).items() %}
{% if results_for_fasta.get('all_insdc') %}
{% set icon = "&#10004;" %}
{% set row_class = "report-section pass" %}
{% set text = "sequences are all INSDC accessioned" %}
{% else %}
{% set icon = "&#10060;" %}
{% set row_class = "report-section fail collapsible" %}
{% set text = "sequences are not INSDC accessioned" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} {{ fasta_file }} {{ text }} </div>
{% if not results_for_fasta.get('all_insdc') %}
<div class="error-list">
<div class="error-description">First 10 sequences not in INSDC. <strong>Full report:</strong> {{ results_for_fasta.get('report_path', '') }}</div>
<table>
<tr>
<th>Sequence name</th><th>Refget md5</th><th>In INSDC</th>
</tr>
{% set sequence_info_list = results_for_fasta.get('sequences', [])|rejectattr("insdc")|list %}
{% for sequence_info in sequence_info_list[:10] %}
<tr>
<td><strong>{{sequence_info.get('sequence_name') }}</strong></td><td> {{ sequence_info.get('sequence_md5') }}</td><td> &#10060; </td>
</tr>
{% endfor %}

{% endif %}
{% endfor %}
{%- endmacro %}
10 changes: 10 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
<!DOCTYPE html>
{% from 'file_validation.html' import file_validation_report %}
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}

<html lang="EN">
<head>
<meta charset="UTF-8">
Expand Down Expand Up @@ -74,6 +76,14 @@ <h2>Sample name concordance check</h2>
{{ sample_name_check_report(validation_results)}}
</section>

<section>
<h2>Reference Genome INSDC check</h2>
<div class="description">
Checks that the reference sequences in the fasta file used to call the variant are accessioned in INSDC.
</div>
{{ fasta_check_report(validation_results)}}
</section>

<script>
let collapsibles = document.querySelectorAll('.collapsible');
for (let collapsible of collapsibles) {
Expand Down
29 changes: 28 additions & 1 deletion eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ params.executable = [
"vcf_validator": "vcf_validator",
"vcf_assembly_checker": "vcf_assembly_checker",
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"biovalidator": "biovalidator"
]
// validation tasks
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check"]
params.validation_tasks = [ "vcf_check", "assembly_check", "samples_check", "metadata_check", "genome_check"]
// container validation dir (prefix for vcf files)
params.container_validation_dir = "/opt/vcf_validation"
// help
Expand Down Expand Up @@ -79,6 +80,13 @@ workflow {
if ("samples_check" in params.validation_tasks) {
sample_name_concordance(metadata_json, vcf_files.collect())
}
if ("genome_check" in params.validation_tasks){
fasta_files = Channel.fromPath(params.vcf_files_mapping)
.splitCsv(header:true)
.map{row -> file(params.container_validation_dir+row.fasta)}
.unique()
genome_checker(fasta_files)
}
}

/*
Expand Down Expand Up @@ -187,3 +195,22 @@ process sample_name_concordance {
$params.executable.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
"""
}



process genome_checker {
publishDir "$params.output_dir",
overwrite: true,
mode: "copy"

input:
path(fasta_file)

output:
path "${fasta_file}_check.yml", emit: fasta_checker

script:
"""
$params.executable.fasta_checker --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
"""
}
25 changes: 24 additions & 1 deletion eva_sub_cli/reporter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
import csv
import datetime
import glob
import os
Expand Down Expand Up @@ -27,9 +28,12 @@ def resolve_single_file_path(file_path):

class Reporter:

def __init__(self, vcf_files, output_dir, submission_config: WritableConfig = None):
def __init__(self, mapping_file, output_dir, submission_config: WritableConfig = None):
self.output_dir = output_dir
self.mapping_file = mapping_file
vcf_files, fasta_files = self._find_vcf_and_fasta_files()
self.vcf_files = vcf_files
self.fasta_files = fasta_files
self.results = {}
self.project_title = None # TODO fill this from metadata?
self.validation_date = datetime.datetime.now()
Expand All @@ -46,6 +50,16 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.sub_config.backup()
self.sub_config.write()

def _find_vcf_and_fasta_files(self):
vcf_files = []
fasta_files = []
with open(self.mapping_file) as open_file:
reader = csv.DictReader(open_file, delimiter=',')
for row in reader:
vcf_files.append(row['vcf'])
fasta_files.append(row['fasta'])
return vcf_files, fasta_files

def validate(self):
self._validate()
self._collect_validation_workflow_results()
Expand Down Expand Up @@ -151,6 +165,7 @@ def _collect_validation_workflow_results(self, ):
self._collect_vcf_check_results()
self._collect_assembly_check_results()
self._load_sample_check_results()
self._load_fasta_check_results()
self._parse_biovalidator_validation_results()
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()
Expand Down Expand Up @@ -247,6 +262,14 @@ def _collect_assembly_check_results(self):
def _sample_check_yaml(self):
return resolve_single_file_path(os.path.join(self.output_dir, 'sample_checker.yml'))

def _load_fasta_check_results(self):
for fasta_file in self.fasta_files:
fasta_file_name = os.path.basename(fasta_file)
fasta_check = resolve_single_file_path(os.path.join(self.output_dir, f'{fasta_file_name}_check.yml'))
self.results['fasta_check'] = {}
with open(fasta_check) as open_yaml:
self.results['fasta_check'][fasta_file_name] = yaml.safe_load(open_yaml)

def _load_sample_check_results(self):
with open(self._sample_check_yaml) as open_yaml:
self.results['sample_check'] = yaml.safe_load(open_yaml)
Expand Down
Loading

0 comments on commit 5416002

Please sign in to comment.