From 42c42a9eedde7f916016f26dad4e07e293b43915 Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 30 Aug 2024 16:51:24 +0100 Subject: [PATCH 1/8] first batch of changes for adding shallow validation option --- eva_sub_cli/executables/check_fasta_insdc.py | 29 +--------- eva_sub_cli/executables/cli.py | 12 +++- eva_sub_cli/executables/samples_checker.py | 8 +-- eva_sub_cli/executables/trim_down.py | 61 ++++++++++++++++++++ eva_sub_cli/file_utils.py | 29 ++++++++++ eva_sub_cli/nextflow/validation.nf | 44 ++++++++++---- eva_sub_cli/validators/docker_validator.py | 6 +- eva_sub_cli/validators/native_validator.py | 9 ++- eva_sub_cli/validators/validator.py | 2 +- pyproject.toml | 1 + tests/test_cli.py | 14 +++++ 11 files changed, 158 insertions(+), 57 deletions(-) create mode 100644 eva_sub_cli/executables/trim_down.py create mode 100644 tests/test_cli.py diff --git a/eva_sub_cli/executables/check_fasta_insdc.py b/eva_sub_cli/executables/check_fasta_insdc.py index 6635dca..c05933c 100644 --- a/eva_sub_cli/executables/check_fasta_insdc.py +++ b/eva_sub_cli/executables/check_fasta_insdc.py @@ -10,6 +10,7 @@ from requests import HTTPError from retry import retry +from eva_sub_cli.file_utils import fasta_iter from eva_sub_cli.metadata_utils import get_files_per_analysis, get_analysis_for_vcf_file, \ get_reference_assembly_for_analysis @@ -19,13 +20,6 @@ logger = logging_config.get_logger(__name__) -def open_gzip_if_required(input_file): - if input_file.endswith('.gz'): - return gzip.open(input_file, 'rt') - else: - return open(input_file, 'r') - - def write_result_yaml(output_yaml, results): with open(output_yaml, 'w') as open_yaml: yaml.safe_dump(data=results, stream=open_yaml) @@ -34,27 +28,6 @@ def write_result_yaml(output_yaml, results): def refget_md5_digest(sequence): return hashlib.md5(sequence.upper().encode('utf-8')).hexdigest() - -def fasta_iter(input_fasta): - """ - Given a fasta file. 
yield tuples of header, sequence - """ - # first open the file outside - with open(input_fasta, 'r') as open_file: - - # ditch the boolean (x[0]) and just keep the header or sequence since - # we know they alternate. - faiter = (x[1] for x in groupby(open_file, lambda line: line[0] == ">")) - - for header in faiter: - # drop the ">" - headerStr = header.__next__()[1:].strip() - - # join all sequence lines to one. - seq = "".join(s.strip() for s in faiter.__next__()) - yield (headerStr, seq) - - @retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3)) def get_refget_metadata(md5_digest): response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata') diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py index c760de5..f55bed5 100755 --- a/eva_sub_cli/executables/cli.py +++ b/eva_sub_cli/executables/cli.py @@ -35,8 +35,7 @@ def validate_command_line_arguments(args, argparser): print(f"'{args.submission_dir}' does not have write permissions or is not a directory.") sys.exit(1) - -def main(): +def parse_args(args): argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA') argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}') argparser.add_argument('--submission_dir', required=True, type=str, @@ -69,9 +68,16 @@ def main(): credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account") argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages') + argparser.add_argument('--debug', action='store_true', default=False, + help='Set the script to output debug messages') args = argparser.parse_args() - validate_command_line_arguments(args, argparser) + return args + + +def main(): + + args = parse_args(sys.argv[1:]) args.submission_dir = os.path.abspath(args.submission_dir) diff --git 
a/eva_sub_cli/executables/samples_checker.py b/eva_sub_cli/executables/samples_checker.py index 0e99efd..a2e3ed2 100644 --- a/eva_sub_cli/executables/samples_checker.py +++ b/eva_sub_cli/executables/samples_checker.py @@ -7,18 +7,12 @@ import yaml +from eva_sub_cli.file_utils import open_gzip_if_required from eva_sub_cli.metadata_utils import get_samples_per_analysis, get_files_per_analysis, get_analysis_for_vcf_file logger = logging_config.get_logger(__name__) -def open_gzip_if_required(input_file): - if input_file.endswith('.gz'): - return gzip.open(input_file, 'rt') - else: - return open(input_file, 'r') - - def get_samples_from_vcf(vcf_file): """ Get the list of samples present in a single VCF file diff --git a/eva_sub_cli/executables/trim_down.py b/eva_sub_cli/executables/trim_down.py new file mode 100644 index 0000000..9fc5742 --- /dev/null +++ b/eva_sub_cli/executables/trim_down.py @@ -0,0 +1,61 @@ +import argparse +import os + +from ebi_eva_common_pyutils.logger import logging_config +from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter + +logger = logging_config.get_logger(__name__) + + +max_nb_lines = 10000 + + +def trim_down_vcf(vcf_file, output_vcf): + """ + Produce a smaller file + """ + with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out: + line_count = 0 + ref_seq_names = set() + for line in vcf_in: + if line.startswith('#') or line_count < max_nb_lines: + vcf_out.write(line) + if not line.startswith('#'): + line_count += 1 + ref_seq_names.add(line.split('\t')[0]) + else: + break + if line_count != max_nb_lines: + logger.warning(f'Only {line_count} found in the source VCF {vcf_file} ') + return ref_seq_names + + +def trim_down_fasta(fasta_file, output_fasta, ref_seq_names): + found_sequences = set() + with open(output_fasta, 'w') as fasta_out: + for header, sequence in fasta_iter(fasta_file): + name = header.split()[0] + if name in ref_seq_names: + found_sequences.add(name) + fasta_out.write(header) + 
fasta_out.write(sequence) + return found_sequences + + +def main(): + arg_parser = argparse.ArgumentParser( + description=f'Take a VCF file and only keep {max_nb_lines} lines and remove unused fasta sequence from the ' + f'associated reference genome') + arg_parser.add_argument('--vcf_file', dest='vcf_file', help='Path to the vcf file to be trimmed down') + arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', help='Path to the output vcf file') + arg_parser.add_argument('--fasta_file', dest='fasta_file', help='Path to the fasta file to be trimmed down') + arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', help='Path to the output fasta file') + + args = arg_parser.parse_args() + logging_config.add_stdout_handler() + + ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file) + sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence) + if len(sequence_found) != len(ref_sequence): + logger.warning(f'Not all sequences were found in the fasta file. Cancelling trimming down of fasta file') + os.link(args.fasta_file, args.output_fasta_file) diff --git a/eva_sub_cli/file_utils.py b/eva_sub_cli/file_utils.py index 6f8937f..62a9235 100644 --- a/eva_sub_cli/file_utils.py +++ b/eva_sub_cli/file_utils.py @@ -1,5 +1,7 @@ +import gzip import os import shutil +from itertools import groupby def is_submission_dir_writable(submission_dir): @@ -32,3 +34,30 @@ def backup_file_or_directory(file_name, max_backups=None): else: os.rename(f'{file_name}.{i - 1}', f'{file_name}.{i}') os.rename(file_name, file_name + '.1') + + +def open_gzip_if_required(input_file): + """Open a file in read mode using gzip if the file extension says .gz""" + if input_file.endswith('.gz'): + return gzip.open(input_file, 'rt') + else: + return open(input_file, 'r') + + +def fasta_iter(input_fasta): + """ + Given a fasta file. 
yield tuples of header, sequence + """ + # first open the file outside + with open_gzip_if_required(input_fasta) as open_file: + # ditch the boolean (x[0]) and just keep the header or sequence since + # we know they alternate. + faiter = (x[1] for x in groupby(open_file, lambda line: line[0] == ">")) + + for header in faiter: + # drop the ">" + headerStr = header.__next__()[1:].strip() + + # join all sequence lines to one. + seq = "".join(s.strip() for s in faiter.__next__()) + yield (headerStr, seq) diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 1b28e06..087dad9 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -30,12 +30,14 @@ params.python_scripts = [ "samples_checker": "samples_checker.py", "fasta_checker": "check_fasta_insdc.py", "xlsx2json": "xlsx2json.py", - "semantic_checker": "check_metadata_semantics.py" + "semantic_checker": "check_metadata_semantics.py", + "trim_down": "trim_down.py" ] // prefix to prepend to all provided path params.base_dir = "" // help params.help = null +params.shallow_validation = false // Show help message if (params.help) exit 0, helpMessage() @@ -63,20 +65,23 @@ output_dir = joinBasePath(params.output_dir) workflow { // Prepare the file path - vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping)) + vcf_and_ref_ch = Channel.fromPath(joinBasePath(params.vcf_files_mapping)) .splitCsv(header:true) .map{row -> tuple( file(joinBasePath(row.vcf)), file(joinBasePath(row.fasta)), file(joinBasePath(row.report)) )} - vcf_files = Channel.fromPath(joinBasePath(params.vcf_files_mapping)) - .splitCsv(header:true) - .map{row -> file(joinBasePath(row.vcf))} - + if (params.shallow_validation){ + // create a smaller vcf and fasta then replace the channel + trim_down_vcf(vcf_and_ref_ch) + vcf_and_ref_ch = trim_down_vcf.out.vcf_and_ref + } + vcf_files = vcf_and_ref_ch.map{row -> row[0]} + fasta_to_vcfs = vcf_and_ref_ch.map{row -> tuple(row[1], 
row[0])}.groupTuple(by:0) // VCF checks - check_vcf_valid(vcf_channel) - check_vcf_reference(vcf_channel) + check_vcf_valid(vcf_and_ref_ch) + check_vcf_reference(vcf_and_ref_ch) generate_file_size_and_md5_digests(vcf_files) collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect()) @@ -94,14 +99,29 @@ workflow { metadata_json_validation(metadata_json) metadata_semantic_check(metadata_json) sample_name_concordance(metadata_json, vcf_files.collect()) - fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping)) - .splitCsv(header:true) - .map{row -> tuple(file(joinBasePath(row.fasta)), file(joinBasePath(row.vcf)))} - .groupTuple(by:0) insdc_checker(metadata_json, fasta_to_vcfs) } } + +process trim_down_vcf { + publishDir output_dir, + overwrite: false, + mode: "copy", + pattern: "*.log" + input: + tuple path(vcf), path(fasta), path(report) + + output: + tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref + + """ + mkdir output + $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta > trim_down.log + """ + +} + /* * Validate the VCF file format */ diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index 6e2ec0b..40bae95 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -12,7 +12,7 @@ logger = logging_config.get_logger(__name__) container_image = 'ebivariation/eva-sub-cli' -container_tag = 'v0.0.1.dev15' +container_tag = 'v0.0.1.dev16' container_validation_dir = '/opt/vcf_validation' container_validation_output_dir = 'vcf_validation_output' @@ -20,10 +20,10 @@ class DockerValidator(Validator): def __init__(self, mapping_file, submission_dir, project_title, metadata_json=None, - metadata_xlsx=None, container_name=None, docker_path='docker', submission_config=None): + metadata_xlsx=None, 
shallow_validation=False, container_name=None, docker_path='docker', submission_config=None): super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json, metadata_xlsx=metadata_xlsx, - submission_config=submission_config) + shallow_validation=shallow_validation, submission_config=submission_config) self.docker_path = docker_path self.container_name = container_name if self.container_name is None: diff --git a/eva_sub_cli/validators/native_validator.py b/eva_sub_cli/validators/native_validator.py index eb95939..03c21bd 100644 --- a/eva_sub_cli/validators/native_validator.py +++ b/eva_sub_cli/validators/native_validator.py @@ -11,9 +11,11 @@ class NativeValidator(Validator): def __init__(self, mapping_file, submission_dir, project_title, metadata_json=None, metadata_xlsx=None, - vcf_validator_path='vcf_validator', assembly_checker_path='vcf_assembly_checker', - biovalidator_path='biovalidator', submission_config=None): - super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json, metadata_xlsx=metadata_xlsx, + shallow_validation=False, vcf_validator_path='vcf_validator', + assembly_checker_path='vcf_assembly_checker', biovalidator_path='biovalidator', + submission_config=None): + super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json, + metadata_xlsx=metadata_xlsx, shallow_validation=shallow_validation, submission_config=submission_config) self.vcf_validator_path = vcf_validator_path self.assembly_checker_path = assembly_checker_path @@ -46,6 +48,7 @@ def get_validation_cmd(self): f"--vcf_files_mapping {self.mapping_file} " f"{metadata_flag} " f"--output_dir {self.output_dir} " + f"--shallow_validation true " f"--executable.vcf_validator {self.vcf_validator_path} " f"--executable.vcf_assembly_checker {self.assembly_checker_path} " f"--executable.biovalidator {self.biovalidator_path}" diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py 
index c0c9681..210c970 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -35,7 +35,7 @@ def resolve_single_file_path(file_path): class Validator(AppLogger): def __init__(self, mapping_file, submission_dir, project_title=None, metadata_json=None, metadata_xlsx=None, - submission_config: WritableConfig = None): + shallow_validation=False, submission_config: WritableConfig = None): # validator write to the validation output directory # If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR self.submission_dir = submission_dir diff --git a/pyproject.toml b/pyproject.toml index c686ff7..6db5eae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ classifiers = [ 'check_metadata_semantics.py'='eva_sub_cli.executables.check_metadata_semantics:main' 'samples_checker.py'='eva_sub_cli.executables.samples_checker:main' 'xlsx2json.py'='eva_sub_cli.executables.xlsx2json:main' +'trim_down.py'='eva_sub_cli.executables.trim_down:main' [tool.setuptools] packages = ['eva_sub_cli', 'eva_sub_cli.exceptions', 'eva_sub_cli.executables', 'eva_sub_cli.validators'] diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..1308456 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,14 @@ +import sys +from unittest import TestCase + +from eva_sub_cli.executables import cli + + +class TestCli(TestCase): + + print(sys.argv) + args = ['--submission_dir', '.', '--metadata_xlsx', 'test.xlsx'] + sys.argv.extend(args) + + cli.main() + From cede9411719845883ec2be7d433d44359fcd43d8 Mon Sep 17 00:00:00 2001 From: tcezard Date: Sat, 31 Aug 2024 06:52:00 +0100 Subject: [PATCH 2/8] Add shallow validation to command line parameter --- eva_sub_cli/executables/cli.py | 5 +- eva_sub_cli/executables/trim_down.py | 5 +- eva_sub_cli/nextflow/validation.nf | 1 + eva_sub_cli/orchestrator.py | 6 +-- eva_sub_cli/validators/docker_validator.py | 53 ++++++---------------- 
eva_sub_cli/validators/native_validator.py | 18 ++++---- eva_sub_cli/validators/validator.py | 1 + 7 files changed, 35 insertions(+), 54 deletions(-) diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py index f55bed5..4c07496 100755 --- a/eva_sub_cli/executables/cli.py +++ b/eva_sub_cli/executables/cli.py @@ -66,8 +66,9 @@ def parse_args(args): 'upload to the EVA') credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account") credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account") - argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages') - + argparser.add_argument('--shallow', action='store_true', default=False, + help='Set the validaiotn to be perform on a the first 10000 record of the VCF. ' + 'Only applies if the number of record exceed 10000') argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages') args = argparser.parse_args() diff --git a/eva_sub_cli/executables/trim_down.py b/eva_sub_cli/executables/trim_down.py index 9fc5742..6d90c96 100644 --- a/eva_sub_cli/executables/trim_down.py +++ b/eva_sub_cli/executables/trim_down.py @@ -37,8 +37,9 @@ def trim_down_fasta(fasta_file, output_fasta, ref_seq_names): name = header.split()[0] if name in ref_seq_names: found_sequences.add(name) - fasta_out.write(header) - fasta_out.write(sequence) + print(f'>{header}', file=fasta_out) + for i in range(0, len(sequence), 80): + print(sequence[i:i+80], file=fasta_out) return found_sequences diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 087dad9..7571587 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -118,6 +118,7 @@ process trim_down_vcf { """ mkdir output $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file 
$fasta --output_fasta_file output/$fasta > trim_down.log + touch $report """ } diff --git a/eva_sub_cli/orchestrator.py b/eva_sub_cli/orchestrator.py index a14c31d..c02f26b 100755 --- a/eva_sub_cli/orchestrator.py +++ b/eva_sub_cli/orchestrator.py @@ -148,7 +148,7 @@ def check_validation_required(tasks, sub_config, username=None, password=None): def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx, - tasks, executor, username=None, password=None, **kwargs): + tasks, executor, username=None, password=None, shallow_validation=False, **kwargs): # load config config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE) sub_config = WritableConfig(config_file_path, version=__version__) @@ -174,11 +174,11 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso if VALIDATE in tasks: if executor == DOCKER: validator = DockerValidator(vcf_files_mapping, submission_dir, project_title, metadata_json, metadata_xlsx, - submission_config=sub_config) + shallow_validation=shallow_validation, submission_config=sub_config) # default to native execution else: validator = NativeValidator(vcf_files_mapping, submission_dir, project_title, metadata_json, metadata_xlsx, - submission_config=sub_config) + shallow_validation=shallow_validation, submission_config=sub_config) with validator: validator.validate_and_report() if not metadata_json: diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index 40bae95..71e16e2 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -38,21 +38,24 @@ def _validation_file_path_for(file_path): def get_docker_validation_cmd(self): if self.metadata_xlsx and not self.metadata_json: - docker_cmd = ( - f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf " - f"--base_dir {container_validation_dir} " - f"--vcf_files_mapping 
{self.mapping_file} " - f"--metadata_xlsx {self.metadata_xlsx} " + docker_cmd = ''.join([ + f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf ", + f"--base_dir {container_validation_dir} ", + f"--vcf_files_mapping {self.mapping_file} ", + f"--metadata_xlsx {self.metadata_xlsx} ", + f"--shallow_validation true " if self.shallow_validation else "", f"--output_dir {container_validation_output_dir}" - ) + ]) else: - docker_cmd = ( - f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf " - f"--base_dir {container_validation_dir} " - f"--vcf_files_mapping {self.mapping_file} " - f"--metadata_json {self.metadata_json} " + docker_cmd = ''.join([ + f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf ", + f"--base_dir {container_validation_dir} ", + f"--vcf_files_mapping {self.mapping_file} ", + f"--metadata_json {self.metadata_json} ", + f"--shallow_validation true " if self.shallow_validation else "", f"--output_dir {container_validation_output_dir}" - ) + ]) + print(docker_cmd) return docker_cmd def run_docker_validator(self): @@ -213,29 +216,3 @@ def _copy(file_description, file_path): # report is optional if row.get('report'): _copy('assembly report files', row['report']) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Run pre-submission validation checks on VCF files', add_help=False) - parser.add_argument("--docker_path", help="Full path to the docker installation, " - "not required if docker is available on path", required=False) - parser.add_argument("--container_name", help="Name of the docker container", required=False) - parser.add_argument("--vcf_files_mapping", - help="csv file with the mappings for vcf files, fasta and assembly report", required=True) - parser.add_argument("--output_dir", help="Directory where the validation output reports will be made available", - required=True) - group = 
parser.add_mutually_exclusive_group(required=True) - group.add_argument("--metadata_json", - help="Json file that describe the project, analysis, samples and files") - group.add_argument("--metadata_xlsx", - help="Excel spreadsheet that describe the project, analysis, samples and files") - args = parser.parse_args() - - docker_path = args.docker_path if args.docker_path else 'docker' - docker_container_name = args.container_name if args.container_name else container_image - - logging_config.add_stdout_handler() - validator = DockerValidator(args.vcf_files_mapping, args.output_dir, args.metadata_json, args.metadata_xlsx, - docker_container_name, docker_path) - validator.validate() - validator.create_reports() diff --git a/eva_sub_cli/validators/native_validator.py b/eva_sub_cli/validators/native_validator.py index 03c21bd..fc3ebb8 100644 --- a/eva_sub_cli/validators/native_validator.py +++ b/eva_sub_cli/validators/native_validator.py @@ -43,16 +43,16 @@ def get_validation_cmd(self): metadata_flag = f"--metadata_json {self.metadata_json}" path_to_workflow = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'nextflow/validation.nf') - return ( - f"nextflow run {path_to_workflow} " - f"--vcf_files_mapping {self.mapping_file} " - f"{metadata_flag} " - f"--output_dir {self.output_dir} " - f"--shallow_validation true " - f"--executable.vcf_validator {self.vcf_validator_path} " - f"--executable.vcf_assembly_checker {self.assembly_checker_path} " + return ''.join([ + f"nextflow run {path_to_workflow} ", + f"--vcf_files_mapping {self.mapping_file} ", + f"{metadata_flag} ", + f"--output_dir {self.output_dir} ", + f"--shallow_validation true " if self.shallow_validation else "", + f"--executable.vcf_validator {self.vcf_validator_path} ", + f"--executable.vcf_assembly_checker {self.assembly_checker_path} ", f"--executable.biovalidator {self.biovalidator_path}" - ) + ]) def verify_executables_installed(self): for name, path in [('vcf-validator', 
self.vcf_validator_path), diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 210c970..a5642b2 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -49,6 +49,7 @@ def __init__(self, mapping_file, submission_dir, project_title=None, metadata_js self.validation_date = datetime.datetime.now() self.metadata_json = metadata_json self.metadata_xlsx = metadata_xlsx + self.shallow_validation = shallow_validation if submission_config: self.sub_config = submission_config else: From 52d351b51f0f636478787b3ca777a49a1c888b36 Mon Sep 17 00:00:00 2001 From: tcezard Date: Sat, 31 Aug 2024 15:50:06 +0100 Subject: [PATCH 3/8] verify that validation passes before allowing submission --- eva_sub_cli/executables/cli.py | 6 ++--- eva_sub_cli/orchestrator.py | 24 +++++++++++++------- eva_sub_cli/validators/validator.py | 17 +++++++++++++-- tests/test_cli.py | 34 +++++++++++++++++++++++++---- tests/test_orchestrator.py | 14 ++++++------ 5 files changed, 71 insertions(+), 24 deletions(-) diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py index 4c07496..594002c 100755 --- a/eva_sub_cli/executables/cli.py +++ b/eva_sub_cli/executables/cli.py @@ -35,7 +35,7 @@ def validate_command_line_arguments(args, argparser): print(f"'{args.submission_dir}' does not have write permissions or is not a directory.") sys.exit(1) -def parse_args(args): +def parse_args(cmd_line_args): argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA') argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}') argparser.add_argument('--submission_dir', required=True, type=str, @@ -71,7 +71,7 @@ def parse_args(args): 'Only applies if the number of record exceed 10000') argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages') - args = argparser.parse_args() + 
args = argparser.parse_args(cmd_line_args) validate_command_line_arguments(args, argparser) return args @@ -85,7 +85,7 @@ def main(): if args.debug: logging_config.add_stdout_handler(logging.DEBUG) else: - logging_config.add_stdout_handler() + logging_config.add_stdout_handler(logging.INFO) logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG) try: diff --git a/eva_sub_cli/orchestrator.py b/eva_sub_cli/orchestrator.py index c02f26b..c4fb4f1 100755 --- a/eva_sub_cli/orchestrator.py +++ b/eva_sub_cli/orchestrator.py @@ -25,6 +25,7 @@ logger = logging_config.get_logger(__name__) + def get_vcf_files(mapping_file): vcf_files = [] with open(mapping_file) as open_file: @@ -58,6 +59,7 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re return project_title, mapping_file + def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_req=False): with open(metadata_json) as file: json_metadata = json.load(file) @@ -71,12 +73,15 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_ analysis_alias_dict = defaultdict(dict) for analysis in json_metadata['analysis']: analysis_alias_dict[analysis['analysisAlias']]['referenceFasta'] = analysis['referenceFasta'] - analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] if 'assemblyReport' in analysis else '' + analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] \ + if 'assemblyReport' in analysis else '' - for file in json_metadata['files']: - reference_fasta = analysis_alias_dict[file['analysisAlias']]['referenceFasta'] - assembly_report = analysis_alias_dict[file['analysisAlias']]['assemblyReport'] - vcf_fasta_report_mapping.append([os.path.abspath(file['fileName']), os.path.abspath(reference_fasta), os.path.abspath(assembly_report) if assembly_report else '']) + for file_dict in json_metadata['files']: + reference_fasta = 
analysis_alias_dict[file_dict['analysisAlias']]['referenceFasta'] + assembly_report = analysis_alias_dict[file_dict['analysisAlias']]['assemblyReport'] + vcf_fasta_report_mapping.append([os.path.abspath(file_dict['fileName']), + os.path.abspath(reference_fasta), + os.path.abspath(assembly_report) if assembly_report else '']) return project_title, vcf_fasta_report_mapping @@ -137,11 +142,14 @@ def check_validation_required(tasks, sub_config, username=None, password=None): except requests.HTTPError as ex: if ex.response.status_code == 404: logger.error( - f'Submission with id {submission_id} could not be found: statuc code: {ex.response.status_code} response: {ex.response.text}') + f'Submission with id {submission_id} could not be found: ' + f'status code: {ex.response.status_code} response: {ex.response.text}') raise SubmissionNotFoundException(f'Submission with id {submission_id} could not be found') else: - logger.error(f'Error occurred while getting status of the submission with Id {submission_id}: status code: {ex.response.status_code} response: {ex.response.text}') - raise SubmissionStatusException(f'Error occurred while getting status of the submission with Id {submission_id}') + logger.error(f'Error occurred while getting status of the submission with Id {submission_id}: ' + f'status code: {ex.response.status_code} response: {ex.response.text}') + raise SubmissionStatusException(f'Error occurred while getting status of the submission ' + f'with Id {submission_id}') logger.info(f'submission id not found in config. 
This might be the first time user is submitting') return False diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index a5642b2..69da34a 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -158,8 +158,18 @@ def update_config_with_validation_result(self): self.sub_config.set(READY_FOR_SUBMISSION_TO_EVA, value=self.verify_ready_for_submission_to_eva()) def verify_ready_for_submission_to_eva(self): - # TODO: check validation results and confirm if they are good enough for submitting to EVA - return True + return all(( + self.results.get('vcf_check', {}).get('critical_count', 1) == 0, + self.results.get('assembly_check', {}).get('nb_mismatch', 1) == 0, + self.results.get('assembly_check', {}).get('nb_error', 1) == 0, + all(( + fa_file_check.get('all_insdc', False) is True + for fa_file, fa_file_check in self.results.get('fasta_check', {}).items() + )), + self.results.get('sample_check', {}).get('overall_differences', True) is False, + len(self.results.get('metadata_check', {}).get('spreadsheet_errors', [])) == 0, + self.shallow_validation is False + )) def parse_assembly_check_log(self, assembly_check_log): error_list = [] @@ -317,6 +327,7 @@ def _assembly_check_text_report(self, vcf_name): os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*') ) + def _collect_assembly_check_results(self): # detect output files for assembly check self.results['assembly_check'] = {} @@ -360,6 +371,8 @@ def _load_fasta_check_results(self): with open(fasta_check) as open_yaml: self.results['fasta_check'][fasta_file_name] = yaml.safe_load(open_yaml) + + def _load_sample_check_results(self): self.results['sample_check'] = {} if not self._sample_check_yaml: diff --git a/tests/test_cli.py b/tests/test_cli.py index 1308456..2381cdc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,14 +1,40 @@ +import copy +import logging import sys from unittest import TestCase +from 
unittest.mock import patch, Mock + +from eva_sub_cli import orchestrator from eva_sub_cli.executables import cli class TestCli(TestCase): - print(sys.argv) - args = ['--submission_dir', '.', '--metadata_xlsx', 'test.xlsx'] - sys.argv.extend(args) + def test_main(self): + args = Mock(submission_dir='.', vcf_files=[], reference_fasta='', metadata_json=None, metadata_xlsx='', + tasks='validate', executor='native', debug=False) + with patch('eva_sub_cli.executables.cli.parse_args', return_value=args), \ + patch('eva_sub_cli.orchestrator.orchestrate_process'): + cli.main() + # Check that the debug message is shown + logger = orchestrator.logger + logger.debug('test') + + def test_validate_args(self): + cmd_args = [ + '--submission_dir', '.', + '--vcf_files', 'test.vcf', + '--reference_fasta', 'test.fasta', + '--metadata_json', 'test.json', + '--tasks', 'validate', + '--executor', 'native', + '--debug' + ] + args = cli.parse_args(cmd_args) + assert args.submission_dir == '.' - cli.main() + with patch('sys.exit') as m_exit: + cli.parse_args(cmd_args[:2]+cmd_args[4:]) + m_exit.assert_called_once_with(1) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index b11a856..22e6ec1 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -87,7 +87,7 @@ def test_orchestrate_validate(self): m_get_vcf.assert_called_once_with(self.mapping_file) m_docker_validator.assert_any_call( self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx, - submission_config=m_config.return_value + submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -108,7 +108,7 @@ def test_orchestrate_validate_submit(self): # Validate was run because the config show it was not run successfully before m_docker_validator.assert_any_call( self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx, - 
submission_config=m_config.return_value + submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -154,7 +154,7 @@ def test_orchestrate_with_vcf_files(self): assert row['report'] == None m_docker_validator.assert_any_call( self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx, - submission_config=m_config.return_value + submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -172,7 +172,7 @@ def test_orchestrate_with_metadata_json_without_asm_report(self): assert row['report'] == '' m_docker_validator.assert_any_call( self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None, - submission_config=m_config.return_value + submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -192,7 +192,7 @@ def test_orchestrate_with_metadata_json_with_asm_report(self): assert row['report'].__contains__('GCA_000001405.27_report.txt') m_docker_validator.assert_any_call( self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None, - submission_config=m_config.return_value + submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -212,7 +212,7 @@ def test_orchestrate_vcf_files_takes_precedence_over_metadata(self): assert row['report'] == None m_docker_validator.assert_any_call( self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None, - submission_config=m_config.return_value + submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() @@ -232,7 +232,7 @@ def test_orchestrate_with_metadata_xlsx(self): assert row['report'] == '' m_docker_validator.assert_any_call( 
self.mapping_file, self.test_sub_dir, self.project_title, None, self.metadata_xlsx, - submission_config=m_config.return_value + submission_config=m_config.return_value, shallow_validation=False ) m_docker_validator().validate_and_report.assert_called_once_with() From ac54c2f5e51537877d1b5610cbfb701f7023d3de Mon Sep 17 00:00:00 2001 From: tcezard Date: Sat, 31 Aug 2024 19:21:27 +0100 Subject: [PATCH 4/8] remove print statement --- eva_sub_cli/validators/docker_validator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index 71e16e2..2c862f3 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -55,7 +55,6 @@ def get_docker_validation_cmd(self): f"--shallow_validation true " if self.shallow_validation else "", f"--output_dir {container_validation_output_dir}" ]) - print(docker_cmd) return docker_cmd def run_docker_validator(self): From facfd7317684877130d2bbe751e34847ed619313 Mon Sep 17 00:00:00 2001 From: tcezard Date: Sun, 1 Sep 2024 15:23:32 +0100 Subject: [PATCH 5/8] Add shallow statement in report refactor validator.py --- eva_sub_cli/executables/trim_down.py | 33 ++- eva_sub_cli/file_utils.py | 9 + eva_sub_cli/jinja_templates/html_report.html | 5 + .../jinja_templates/project_details.html | 2 + .../jinja_templates/shallow_validation.html | 27 ++ eva_sub_cli/nextflow/validation.nf | 12 +- eva_sub_cli/report.py | 5 +- .../validators/validation_results_parsers.py | 191 +++++++++++++ eva_sub_cli/validators/validator.py | 270 ++++-------------- .../expected_report_metadata_json.html | 2 +- .../expected_report_metadata_xlsx.html | 2 +- ...expected_shallow_metadata_xlsx_report.html | 22 ++ tests/test_report.py | 52 ++-- tests/test_validaton_results_parsers.py | 32 +++ tests/test_validator.py | 27 +- 15 files changed, 423 insertions(+), 268 deletions(-) create mode 100644 eva_sub_cli/jinja_templates/shallow_validation.html 
create mode 100644 eva_sub_cli/validators/validation_results_parsers.py create mode 100644 tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html create mode 100644 tests/test_validaton_results_parsers.py diff --git a/eva_sub_cli/executables/trim_down.py b/eva_sub_cli/executables/trim_down.py index 6d90c96..8d4490d 100644 --- a/eva_sub_cli/executables/trim_down.py +++ b/eva_sub_cli/executables/trim_down.py @@ -1,6 +1,7 @@ import argparse import os +import yaml from ebi_eva_common_pyutils.logger import logging_config from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter @@ -12,7 +13,7 @@ def trim_down_vcf(vcf_file, output_vcf): """ - Produce a smaller file + Produce a smaller vcf files containing a maximum of 10000 records """ with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out: line_count = 0 @@ -27,10 +28,13 @@ def trim_down_vcf(vcf_file, output_vcf): break if line_count != max_nb_lines: logger.warning(f'Only {line_count} found in the source VCF {vcf_file} ') - return ref_seq_names + return line_count, ref_seq_names def trim_down_fasta(fasta_file, output_fasta, ref_seq_names): + """ + Produce a smaller fasta files containing only the reference sequences found in the VCF file + """ found_sequences = set() with open(output_fasta, 'w') as fasta_out: for header, sequence in fasta_iter(fasta_file): @@ -47,16 +51,31 @@ def main(): arg_parser = argparse.ArgumentParser( description=f'Take a VCF file and only keep {max_nb_lines} lines and remove unused fasta sequence from the ' f'associated reference genome') - arg_parser.add_argument('--vcf_file', dest='vcf_file', help='Path to the vcf file to be trimmed down') - arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', help='Path to the output vcf file') - arg_parser.add_argument('--fasta_file', dest='fasta_file', help='Path to the fasta file to be trimmed down') - arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', 
help='Path to the output fasta file') + arg_parser.add_argument('--vcf_file', dest='vcf_file', required=True, + help='Path to the vcf file to be trimmed down') + arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', required=True, + help='Path to the output vcf file') + arg_parser.add_argument('--fasta_file', dest='fasta_file', required=True, + help='Path to the fasta file to be trimmed down') + arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', required=True, + help='Path to the output fasta file') + arg_parser.add_argument('--output_yaml_file', dest='output_yaml_file', required=True, + help='Path to the yaml file containing the trim down metrics') args = arg_parser.parse_args() logging_config.add_stdout_handler() - ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file) + line_count, ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file) sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence) + trim_down_metrics = {'trim_down_vcf_record': line_count, 'number_sequence_found': sequence_found, + 'trim_down_required': line_count == max_nb_lines} if len(sequence_found) != len(ref_sequence): logger.warning(f'Not all sequences were found in the fasta file. 
Cancelling trimming down of fasta file') os.link(args.fasta_file, args.output_fasta_file) + trim_down_metrics.pop('number_sequence_found') + with open(args.output_yaml_file) as open_file: + yaml.safe_dump(trim_down_metrics, open_file) + + + + diff --git a/eva_sub_cli/file_utils.py b/eva_sub_cli/file_utils.py index 62a9235..b4ad203 100644 --- a/eva_sub_cli/file_utils.py +++ b/eva_sub_cli/file_utils.py @@ -1,9 +1,18 @@ +import glob import gzip import os import shutil from itertools import groupby +def resolve_single_file_path(file_path): + files = glob.glob(file_path) + if len(files) == 0: + return None + elif len(files) > 0: + return files[0] + + def is_submission_dir_writable(submission_dir): if not os.path.exists(submission_dir): os.makedirs(submission_dir) diff --git a/eva_sub_cli/jinja_templates/html_report.html b/eva_sub_cli/jinja_templates/html_report.html index dade5de..cc50889 100644 --- a/eva_sub_cli/jinja_templates/html_report.html +++ b/eva_sub_cli/jinja_templates/html_report.html @@ -4,6 +4,7 @@ {% from 'sample_name_check.html' import sample_name_check_report %} {% from 'fasta_check.html' import fasta_check_report %} {% from 'metadata_validation.html' import metadata_validation_report %} +{% from 'shallow_validation.html' import shallow_validation_report %} @@ -46,6 +47,10 @@
eva-sub-cli v{{cli_version}}
+
+ {{ shallow_validation_report(validation_results) }} +
+

Project Summary

diff --git a/eva_sub_cli/jinja_templates/project_details.html b/eva_sub_cli/jinja_templates/project_details.html index 45aa785..c220cf3 100644 --- a/eva_sub_cli/jinja_templates/project_details.html +++ b/eva_sub_cli/jinja_templates/project_details.html @@ -32,4 +32,6 @@
{% endif %} + + {%- endmacro %} \ No newline at end of file diff --git a/eva_sub_cli/jinja_templates/shallow_validation.html b/eva_sub_cli/jinja_templates/shallow_validation.html new file mode 100644 index 0000000..cf20851 --- /dev/null +++ b/eva_sub_cli/jinja_templates/shallow_validation.html @@ -0,0 +1,27 @@ + +{% macro shallow_validation_report(validation_results) -%} + {% set results = validation_results.get('shallow_validation', {}) %} + + {% if results.get('required') %} +
+ ❌ You requested to run the shallow validation, please run full validation before submitting the data +
+
+ + + + + + + {% for vcf_file in results.get('metrics') %} + + + + + + {% endfor %} +
VCF FileRecords validated in VCFRecords validated in Fasta
{{ vcf_file }}{{ results.get('metrics').get(vcf_file).get('trim_down_vcf_record') }}{{ results.get('metrics').get(vcf_file).get('number_sequence_found') }}
+
+ {% endif %} + +{%- endmacro %} \ No newline at end of file diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 7571587..3d54489 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -105,19 +105,21 @@ workflow { process trim_down_vcf { - publishDir output_dir, - overwrite: false, - mode: "copy", - pattern: "*.log" + publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.log" + publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.yml" + input: tuple path(vcf), path(fasta), path(report) output: tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref + path "${vcf.getBaseName()}_trim_down.log", emit: trim_down_log + path "${vcf.getBaseName()}_trim_down.yml", emit: trim_down_metric """ mkdir output - $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta > trim_down.log + $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta --output_yaml_file ${vcf.getBaseName()}_trim_down.yml > ${vcf.getBaseName()}_trim_down.log + # This is needed to ensure that a missing (NO_FILE) report can still be passed down to subsequent steps touch $report """ diff --git a/eva_sub_cli/report.py b/eva_sub_cli/report.py index f85b73f..f546b92 100644 --- a/eva_sub_cli/report.py +++ b/eva_sub_cli/report.py @@ -14,7 +14,8 @@ def get_logo_data(): return logo_data -def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping, project_title=None): +def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping, + project_title=None): vcf_files = sorted(set([file_name for check in validation_results if check in ["vcf_check", "assembly_check"] for file_name in validation_results[check] @@ -32,7 +33,7 @@ def 
generate_html_report(validation_results, validation_date, submission_dir, vc fasta_files=fasta_files, submission_dir=submission_dir, vcf_fasta_analysis_mapping=vcf_fasta_analysis_mapping, - validation_results=validation_results, + validation_results=validation_results ) try: diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py new file mode 100644 index 0000000..321abc4 --- /dev/null +++ b/eva_sub_cli/validators/validation_results_parsers.py @@ -0,0 +1,191 @@ +import re + +from ebi_eva_common_pyutils.logger import logging_config + +logger = logging_config.get_logger(__name__) + +def parse_assembly_check_log(assembly_check_log): + error_list = [] + nb_error, nb_mismatch = 0, 0 + match = total = None + with open(assembly_check_log) as open_file: + for line in open_file: + if line.startswith('[error]'): + nb_error += 1 + if nb_error < 11: + error_list.append(line.strip()[len('[error] '):]) + elif line.startswith('[info] Number of matches:'): + match, total = line.strip()[len('[info] Number of matches: '):].split('/') + match = int(match) + total = int(total) + return error_list, nb_error, match, total + + +def parse_assembly_check_report(assembly_check_report): + mismatch_list = [] + nb_mismatch = 0 + nb_error = 0 + error_list = [] + with open(assembly_check_report) as open_file: + for line in open_file: + if 'does not match the reference sequence' in line: + nb_mismatch += 1 + if nb_mismatch < 11: + mismatch_list.append(line.strip()) + elif 'Multiple synonyms' in line: + nb_error += 1 + if nb_error < 11: + error_list.append(line.strip()) + # Contig not found in FASTA is reported here rather than in logs when no assembly report is used. + # Count and report once per contig name rather than once per line, to avoid redundant errors. 
+ elif 'is not present in FASTA file' in line: + line_num, error_msg = line.split(': ') + error_msg = error_msg.strip() + if error_msg not in error_list: + nb_error += 1 + if nb_error < 11: + error_list.append(error_msg) + return mismatch_list, nb_mismatch, error_list, nb_error + + +def parse_vcf_check_report(vcf_check_report): + valid = True + max_error_reported = 10 + error_list, critical_list = [], [] + warning_count = error_count = critical_count = 0 + with open(vcf_check_report) as open_file: + for line in open_file: + if 'warning' in line: + warning_count = 1 + elif line.startswith('According to the VCF specification'): + if 'not' in line: + valid = False + elif vcf_check_errors_is_critical(line.strip()): + critical_count += 1 + if critical_count <= max_error_reported: + critical_list.append(line.strip()) + else: + error_count += 1 + if error_count <= max_error_reported: + error_list.append(line.strip()) + + return valid, warning_count, error_count, critical_count, error_list, critical_list + + +def vcf_check_errors_is_critical(error): + """ + This function identify VCF check errors that are not critical for the processing of the VCF within EVA. + They affect specific INFO or FORMAT fields that are used in the variant detection but less so in the downstream + analysis. + Critical: + Reference and alternate alleles must not be the same. + Requested evidence presence with --require-evidence. Please provide genotypes (GT field in FORMAT and samples), + or allele frequencies (AF field in INFO), or allele counts (AC and AN fields in INFO).. + Contig is not sorted by position. Contig chr10 position 41695506 found after 41883113. + Duplicated variant chr1A:1106203:A>G found. + Metadata description string is not valid. + + Error + Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive + its number of values from the ploidy of GT (if present), or assume diploidy. 
Contains 1 value(s), expected 2 + (derived from ploidy 1). + Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=.. + """ + non_critical_format_fields = ['PL', 'AD', 'AC'] + non_critical_info_fields = ['AC'] + regexes = { + r'^INFO (\w+) does not match the specification Number': non_critical_format_fields, + r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_info_fields + } + for regex in regexes: + match = re.match(regex, error) + if match: + field_affected = match.group(1) + if field_affected in regexes[regex]: + return False + return True + + +def parse_biovalidator_validation_results(metadata_check_file): + """ + Read the biovalidator's report and extract the list of validation errors + """ + ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') + + def clean_read(ifile): + l = ifile.readline() + if l: + return ansi_escape.sub('', l).strip() + + if not metadata_check_file: + return + + errors = [] + + with open(metadata_check_file) as open_file: + collect = False + while True: + line = clean_read(open_file) + if line is None: + break # EOF + elif not line: + continue # Empty line + if not collect: + if line.startswith('Validation failed with following error(s):'): + collect = True + else: + line2 = clean_read(open_file) + if line is None or line2 is None: + break # EOF + errors.append({'property': line, 'description': line2}) + return errors + + +def convert_metadata_sheet(json_attribute, xls2json_conf): + if json_attribute is None: + return None + for sheet_name in xls2json_conf['worksheets']: + if xls2json_conf['worksheets'][sheet_name] == json_attribute: + return sheet_name + + +def convert_metadata_row(sheet, json_row, xls2json_conf): + if json_row is None: + return '' + if 'header_row' in xls2json_conf[sheet]: + return int(json_row) + xls2json_conf[sheet]['header_row'] + else: + return int(json_row) + 2 + + +def convert_metadata_attribute(sheet, json_attribute, 
xls2json_conf): + if json_attribute is None: + return '' + attributes_dict = {} + attributes_dict.update(xls2json_conf[sheet].get('required', {})) + attributes_dict.update(xls2json_conf[sheet].get('optional', {})) + for attribute in attributes_dict: + if attributes_dict[attribute] == json_attribute: + return attribute + + +def parse_metadata_property(property_str): + if property_str.startswith('.'): + return property_str.strip('./'), None, None + # First attempt to parse as BioSample object + sheet, row, col = parse_sample_metadata_property(property_str) + if sheet is not None and row is not None and col is not None: + return sheet, row, col + match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str) + if match: + return match.group(1), match.group(3), match.group(5) + else: + logger.error(f'Cannot parse {property_str} in JSON metadata error') + return None, None, None + + +def parse_sample_metadata_property(property_str): + match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str) + if match: + return 'sample', match.group(1), match.group(2) + return None, None, None diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 69da34a..4a1984a 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -5,7 +5,6 @@ import json import logging import os -import re from functools import lru_cache, cached_property import yaml @@ -13,10 +12,14 @@ from ebi_eva_common_pyutils.config import WritableConfig from eva_sub_cli import ETC_DIR, SUB_CLI_CONFIG_FILE, __version__ -from eva_sub_cli.file_utils import backup_file_or_directory +from eva_sub_cli.file_utils import backup_file_or_directory, resolve_single_file_path from eva_sub_cli.report import generate_html_report from ebi_eva_common_pyutils.logger import logging_config, AppLogger +from eva_sub_cli.validators.validation_results_parsers import parse_assembly_check_log, parse_assembly_check_report, \ + 
parse_biovalidator_validation_results, convert_metadata_sheet, convert_metadata_row, convert_metadata_attribute, \ + parse_vcf_check_report, parse_metadata_property + VALIDATION_OUTPUT_DIR = "validation_output" VALIDATION_RESULTS = 'validation_results' READY_FOR_SUBMISSION_TO_EVA = 'ready_for_submission_to_eva' @@ -24,14 +27,6 @@ logger = logging_config.get_logger(__name__) -def resolve_single_file_path(file_path): - files = glob.glob(file_path) - if len(files) == 0: - return None - elif len(files) > 0: - return files[0] - - class Validator(AppLogger): def __init__(self, mapping_file, submission_dir, project_title=None, metadata_json=None, metadata_xlsx=None, @@ -44,7 +39,7 @@ def __init__(self, mapping_file, submission_dir, project_title=None, metadata_js vcf_files, fasta_files = self._find_vcf_and_fasta_files() self.vcf_files = vcf_files self.fasta_files = fasta_files - self.results = {} + self.results = {'shallow_validation': {'requested': shallow_validation}} self.project_title = project_title self.validation_date = datetime.datetime.now() self.metadata_json = metadata_json @@ -133,7 +128,7 @@ def verify_files_present(self): files_missing, missing_files_list = self.check_if_file_missing() if files_missing: raise FileNotFoundError(f"some files (vcf/fasta) mentioned in metadata file could not be found. " - f"Missing files list {missing_files_list}") + f"Missing files list {missing_files_list}") def check_if_file_missing(self): files_missing = False @@ -158,6 +153,10 @@ def update_config_with_validation_result(self): self.sub_config.set(READY_FOR_SUBMISSION_TO_EVA, value=self.verify_ready_for_submission_to_eva()) def verify_ready_for_submission_to_eva(self): + """ + Assess if the validation results are meeting expectations + It assumes all validation have been parsed already. 
+ """ return all(( self.results.get('vcf_check', {}).get('critical_count', 1) == 0, self.results.get('assembly_check', {}).get('nb_mismatch', 1) == 0, @@ -168,105 +167,16 @@ def verify_ready_for_submission_to_eva(self): )), self.results.get('sample_check', {}).get('overall_differences', True) is False, len(self.results.get('metadata_check', {}).get('spreadsheet_errors', [])) == 0, - self.shallow_validation is False + any(( + self.results['shallow_validation']['requested'] is False, + self.results['shallow_validation'].get('required', True) is False + )) )) - def parse_assembly_check_log(self, assembly_check_log): - error_list = [] - nb_error, nb_mismatch = 0, 0 - match = total = None - with open(assembly_check_log) as open_file: - for line in open_file: - if line.startswith('[error]'): - nb_error += 1 - if nb_error < 11: - error_list.append(line.strip()[len('[error] '):]) - elif line.startswith('[info] Number of matches:'): - match, total = line.strip()[len('[info] Number of matches: '):].split('/') - match = int(match) - total = int(total) - return error_list, nb_error, match, total - - def parse_assembly_check_report(self, assembly_check_report): - mismatch_list = [] - nb_mismatch = 0 - nb_error = 0 - error_list = [] - with open(assembly_check_report) as open_file: - for line in open_file: - if 'does not match the reference sequence' in line: - nb_mismatch += 1 - if nb_mismatch < 11: - mismatch_list.append(line.strip()) - elif 'Multiple synonyms' in line: - nb_error += 1 - if nb_error < 11: - error_list.append(line.strip()) - # Contig not found in FASTA is reported here rather than in logs when no assembly report is used. - # Count and report once per contig name rather than once per line, to avoid redundant errors. 
- elif 'is not present in FASTA file' in line: - line_num, error_msg = line.split(': ') - error_msg = error_msg.strip() - if error_msg not in error_list: - nb_error += 1 - if nb_error < 11: - error_list.append(error_msg) - return mismatch_list, nb_mismatch, error_list, nb_error - - def parse_vcf_check_report(self, vcf_check_report): - valid = True - max_error_reported = 10 - error_list, critical_list = [], [] - warning_count = error_count = critical_count = 0 - with open(vcf_check_report) as open_file: - for line in open_file: - if 'warning' in line: - warning_count = 1 - elif line.startswith('According to the VCF specification'): - if 'not' in line: - valid = False - elif self.vcf_check_errors_is_critical(line.strip()): - critical_count += 1 - if critical_count <= max_error_reported: - critical_list.append(line.strip()) - else: - error_count += 1 - if error_count <= max_error_reported: - error_list.append(line.strip()) - - return valid, warning_count, error_count, critical_count, error_list, critical_list - - def vcf_check_errors_is_critical(self, error): - """ - This function identify VCF check errors that are not critical for the processing of the VCF within EVA. - They affect specific INFO or FORMAT fields that are used in the variant detection but less so in the downstream analysis. - Critical: - Reference and alternate alleles must not be the same. - Requested evidence presence with --require-evidence. Please provide genotypes (GT field in FORMAT and samples), or allele frequencies (AF field in INFO), or allele counts (AC and AN fields in INFO).. - Contig is not sorted by position. Contig chr10 position 41695506 found after 41883113. - Duplicated variant chr1A:1106203:A>G found. - Metadata description string is not valid. - - Error - Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. 
Contains 1 value(s), expected 2 (derived from ploidy 1). - Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=.. - """ - non_critical_format_fields = ['PL', 'AD', 'AC'] - non_critical_info_fields = ['AC'] - regexes = { - r'^INFO (\w+) does not match the specification Number': non_critical_format_fields, - r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_info_fields - } - for regex in regexes: - match = re.match(regex, error) - if match: - field_affected = match.group(1) - if field_affected in regexes[regex]: - return False - return True - def _collect_validation_workflow_results(self): # Collect information from the output and summarise in the config + if self.shallow_validation: + self._collect_trim_down_metrics() self._collect_vcf_check_results() self._collect_assembly_check_results() self._load_sample_check_results() @@ -291,6 +201,22 @@ def _vcf_check_db_report(self, vcf_name): os.path.join(self.output_dir, 'vcf_format', vcf_name + '.*.db') ) + @lru_cache + def _assembly_check_log(self, vcf_name): + return resolve_single_file_path( + os.path.join(self.output_dir, 'assembly_check', vcf_name + '.assembly_check.log') + ) + + @lru_cache + def _assembly_check_text_report(self, vcf_name): + return resolve_single_file_path( + os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*') + ) + + @cached_property + def _sample_check_yaml(self): + return resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'sample_checker.yml')) + def _collect_vcf_check_results(self,): # detect output files for vcf check self.results['vcf_check'] = {} @@ -302,7 +228,7 @@ def _collect_vcf_check_results(self,): vcf_check_db_report = self._vcf_check_db_report(vcf_name) if vcf_check_log and vcf_check_text_report and vcf_check_db_report: - valid, warning_count, error_count, critical_count, error_list, critical_list = 
self.parse_vcf_check_report(vcf_check_text_report) + valid, warning_count, error_count, critical_count, error_list, critical_list = parse_vcf_check_report(vcf_check_text_report) else: valid, warning_count, error_count, critical_count, error_list, critical_list = (False, 0, 0, 1, [], ['Process failed']) self.results['vcf_check'][vcf_name] = { @@ -315,19 +241,6 @@ def _collect_vcf_check_results(self,): 'critical_list': critical_list } - @lru_cache - def _assembly_check_log(self, vcf_name): - return resolve_single_file_path( - os.path.join(self.output_dir, 'assembly_check', vcf_name + '.assembly_check.log') - ) - - @lru_cache - def _assembly_check_text_report(self, vcf_name): - return resolve_single_file_path( - os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*') - ) - - def _collect_assembly_check_results(self): # detect output files for assembly check self.results['assembly_check'] = {} @@ -339,9 +252,9 @@ def _collect_assembly_check_results(self): if assembly_check_log and assembly_check_text_report: error_list_from_log, nb_error_from_log, match, total = \ - self.parse_assembly_check_log(assembly_check_log) + parse_assembly_check_log(assembly_check_log) mismatch_list, nb_mismatch, error_list_from_report, nb_error_from_report = \ - self.parse_assembly_check_report(assembly_check_text_report) + parse_assembly_check_report(assembly_check_text_report) nb_error = nb_error_from_log + nb_error_from_report error_list = error_list_from_log + error_list_from_report else: @@ -356,10 +269,6 @@ def _collect_assembly_check_results(self): 'total': total } - @cached_property - def _sample_check_yaml(self): - return resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'sample_checker.yml')) - def _load_fasta_check_results(self): for fasta_file in self.fasta_files: fasta_file_name = os.path.basename(fasta_file) @@ -371,8 +280,6 @@ def _load_fasta_check_results(self): with open(fasta_check) as open_yaml: 
self.results['fasta_check'][fasta_file_name] = yaml.safe_load(open_yaml) - - def _load_sample_check_results(self): self.results['sample_check'] = {} if not self._sample_check_yaml: @@ -384,8 +291,8 @@ def _load_sample_check_results(self): def _collect_metadata_results(self): self.results['metadata_check'] = {} self._load_spreadsheet_conversion_errors() - self._parse_biovalidator_validation_results() - self._parse_semantic_metadata_results() + self.collect_biovalidator_validation_results() + self._collect_semantic_metadata_results() if self.metadata_xlsx: self._convert_biovalidator_validation_to_spreadsheet() self._write_spreadsheet_validation_results() @@ -399,64 +306,19 @@ def _load_spreadsheet_conversion_errors(self): with open(errors_file) as open_yaml: self.results['metadata_check']['spreadsheet_errors'] = yaml.safe_load(open_yaml) - def _parse_biovalidator_validation_results(self): + def collect_biovalidator_validation_results(self): """ Read the biovalidator's report and extract the list of validation errors """ metadata_check_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'metadata_validation.txt')) - ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') - - def clean_read(ifile): - l = ifile.readline() - if l: - return ansi_escape.sub('', l).strip() - - if not metadata_check_file: - return - with open(metadata_check_file) as open_file: - errors = [] - collect = False - while True: - line = clean_read(open_file) - if line is None: - break # EOF - elif not line: - continue # Empty line - if not collect: - if line.startswith('Validation failed with following error(s):'): - collect = True - else: - line2 = clean_read(open_file) - if line is None or line2 is None: - break # EOF - errors.append({'property': line, 'description': line2}) + errors = parse_biovalidator_validation_results(metadata_check_file) self.results['metadata_check'].update({ 'json_report_path': metadata_check_file, 'json_errors': errors }) - def 
_parse_metadata_property(self, property_str): - if property_str.startswith('.'): - return property_str.strip('./'), None, None - # First attempt to parse as BioSample object - sheet, row, col = self._parse_sample_metadata_property(property_str) - if sheet is not None and row is not None and col is not None: - return sheet, row, col - match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str) - if match: - return match.group(1), match.group(3), match.group(5) - else: - logger.error(f'Cannot parse {property_str} in JSON metadata error') - return None, None, None - - def _parse_sample_metadata_property(self, property_str): - match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str) - if match: - return 'sample', match.group(1), match.group(2) - return None, None, None - - def _parse_semantic_metadata_results(self): + def _collect_semantic_metadata_results(self): errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'metadata_semantic_check.yml')) if not errors_file: @@ -475,13 +337,13 @@ def _convert_biovalidator_validation_to_spreadsheet(self): if 'spreadsheet_errors' not in self.results['metadata_check']: self.results['metadata_check']['spreadsheet_errors'] = [] for error in self.results['metadata_check'].get('json_errors', {}): - sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property']) + sheet_json, row_json, attribute_json = parse_metadata_property(error['property']) # There should only be one Project but adding the row back means it's easier for users to find if sheet_json == 'project' and row_json is None: row_json = 0 - sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf) - row = self._convert_metadata_row(sheet, row_json, xls2json_conf) - column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf) + sheet = convert_metadata_sheet(sheet_json, xls2json_conf) + row = convert_metadata_row(sheet, row_json, xls2json_conf) + column = 
convert_metadata_attribute(sheet, attribute_json, xls2json_conf) if row_json is None and attribute_json is None: new_description = f'Sheet "{sheet}" is missing' elif row_json is None: @@ -517,31 +379,6 @@ def _write_spreadsheet_validation_results(self): open_file.write(error_dict.get('description') + '\n') self.results['metadata_check']['spreadsheet_report_path'] = spreadsheet_report_file - def _convert_metadata_sheet(self, json_attribute, xls2json_conf): - if json_attribute is None: - return None - for sheet_name in xls2json_conf['worksheets']: - if xls2json_conf['worksheets'][sheet_name] == json_attribute: - return sheet_name - - def _convert_metadata_row(self, sheet, json_row, xls2json_conf): - if json_row is None: - return '' - if 'header_row' in xls2json_conf[sheet]: - return int(json_row) + xls2json_conf[sheet]['header_row'] - else: - return int(json_row) + 2 - - def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf): - if json_attribute is None: - return '' - attributes_dict = {} - attributes_dict.update(xls2json_conf[sheet].get('required', {})) - attributes_dict.update(xls2json_conf[sheet].get('optional', {})) - for attribute in attributes_dict: - if attributes_dict[attribute] == json_attribute: - return attribute - def _collect_file_info_to_metadata(self): md5sum_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'file_info.txt')) file_path_2_md5 = {} @@ -590,6 +427,20 @@ def _collect_file_info_to_metadata(self): else: self.error(f'Cannot locate the metadata in JSON format in {os.path.join(self.output_dir, "metadata.json")}') + def _collect_trim_down_metrics(self): + self.results['shallow_validation']['metrics'] = {} + shallow_validation_required = False + for vcf_file in self.vcf_files: + basename = os.path.basename(vcf_file) + vcf_name, _ = os.path.splitext(basename) + trimmed_down_metrics = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', + f'{vcf_name}_trim_down.yml')) + 
with open(trimmed_down_metrics) as open_file: + metrics = yaml.safe_load(open_file) + shallow_validation_required = shallow_validation_required or metrics['trim_down_required'] + self.results['shallow_validation']['metrics'][vcf_file] = metrics + self.results['shallow_validation']['required'] = shallow_validation_required + def get_vcf_fasta_analysis_mapping(self): vcf_fasta_analysis_mapping = [] with open(self.mapping_file) as open_file: @@ -622,7 +473,8 @@ def get_vcf_fasta_analysis_mapping(self): def create_reports(self): report_html = generate_html_report(self.results, self.validation_date, self.submission_dir, - self.get_vcf_fasta_analysis_mapping(), self.project_title) + self.get_vcf_fasta_analysis_mapping(), + self.project_title) file_path = os.path.join(self.output_dir, 'report.html') with open(file_path, "w") as f: f.write(report_html) diff --git a/tests/resources/validation_reports/expected_report_metadata_json.html b/tests/resources/validation_reports/expected_report_metadata_json.html index 6ef2da7..01a2c35 100644 --- a/tests/resources/validation_reports/expected_report_metadata_json.html +++ b/tests/resources/validation_reports/expected_report_metadata_json.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/json/metadata/report
JSON PropertyError Description
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file + .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/json/metadata/report
JSON PropertyError Description
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/resources/validation_reports/expected_report_metadata_xlsx.html b/tests/resources/validation_reports/expected_report_metadata_xlsx.html index fa9e51d..a1576d8 100644 --- a/tests/resources/validation_reports/expected_report_metadata_xlsx.html +++ b/tests/resources/validation_reports/expected_report_metadata_xlsx.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file + .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html new file mode 100644 index 0000000..8f26e24 --- /dev/null +++ b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html @@ -0,0 +1,22 @@ +Validation Report

Validation Report

eva-sub-cli vcligeneratedversion
You requested to run the shallow validation, please run full validation before submitting the data
VCF FileRecords validated in VCFRecords validated in Fasta
input_fail.vcf1000024
input_passed.vcf1000024

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/test_report.py b/tests/test_report.py index 3f1e3f3..e2dbd3d 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -1,3 +1,4 @@ +import copy import os import datetime from unittest import TestCase @@ -316,6 +317,7 @@ class TestReport(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') expected_report_metadata_xlsx = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_xlsx.html') expected_report_metadata_json = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_json.html') + expected_report_metadata_xlsx_shallow = os.path.join(resource_dir, 'validation_reports', 'expected_shallow_metadata_xlsx_report.html') test_project_name = "My cool project" test_validation_date = datetime.datetime(2023, 8, 31, 12, 34, 56) test_submission_dir = "/test/submission/dir" @@ -324,34 +326,46 @@ class TestReport(TestCase): test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_pass.vcf', 'fasta_file': 'input_pass.fa', 'analysis': 'B'}) test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_test.vcf', 'fasta_file': 'input_test.fa', 'analysis': 'could not be linked'}) - def test_generate_html_report_metadata_xlsx(self): - report = generate_html_report(validation_results_xlsx, self.test_validation_date, self.test_submission_dir, + def check_report_vs_expected(self, validation_results, output_report, expected_report): + report = generate_html_report(validation_results, self.test_validation_date, self.test_submission_dir, self.test_vcf_fasta_analysis_mapping, self.test_project_name) - with open('metadata_xlsx_report.html', 'w') as open_file: + with open(output_report, 'w') as open_file: open_file.write(report) - with open(self.expected_report_metadata_xlsx) as open_html: + with open(expected_report) as open_html: expected_report_text = open_html.read() # Inject the version in the expected report expected_report_text = 
expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__) assert report == expected_report_text # Remove output file if assert passes - if os.path.exists('metadata_xlsx_report.html'): - os.remove('metadata_xlsx_report.html') + if os.path.exists(output_report): + os.remove(output_report) - def test_generate_html_report_metadata_json(self): - report = generate_html_report(validation_results_json, self.test_validation_date, self.test_submission_dir, - self.test_vcf_fasta_analysis_mapping, self.test_project_name) - with open('metadata_json_report.html', 'w') as open_file: - open_file.write(report) + def test_generate_html_report_metadata_xlsx(self): + self.check_report_vs_expected( + validation_results_xlsx, + 'metadata_xlsx_report.html', + self.expected_report_metadata_xlsx + ) - with open(self.expected_report_metadata_json) as open_html: - expected_report_text = open_html.read() - # Inject the version in the expected report - expected_report_text = expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__) - assert report == expected_report_text + def test_generate_html_report_metadata_json(self): + self.check_report_vs_expected( + validation_results_json, + 'metadata_json_report.html', + self.expected_report_metadata_json + ) - # Remove output file if assert passes - if os.path.exists('metadata_json_report.html'): - os.remove('metadata_json_report.html') + def test_generate_html_report_metadata_xlsx_shallow(self): + shallow_validation_results_xlsx = copy.deepcopy(validation_results_xlsx) + shallow_validation_results_xlsx['shallow_validation'] = { + 'required': True, 'requested': True, + 'metrics': { + 'input_fail.vcf': {'trim_down_vcf_record': 10000, 'number_sequence_found': 24, 'trim_down_required': True}, + 'input_passed.vcf': {'trim_down_vcf_record': 10000, 'number_sequence_found': 24, 'trim_down_required': True} + }} + self.check_report_vs_expected( + shallow_validation_results_xlsx, + 'shallow_metadata_xlsx_report.html', + 
self.expected_report_metadata_xlsx_shallow + ) diff --git a/tests/test_validaton_results_parsers.py b/tests/test_validaton_results_parsers.py new file mode 100644 index 0000000..388b44c --- /dev/null +++ b/tests/test_validaton_results_parsers.py @@ -0,0 +1,32 @@ +import os.path +from unittest import TestCase + +from eva_sub_cli.validators.validation_results_parsers import vcf_check_errors_is_critical, parse_assembly_check_log, \ + parse_assembly_check_report + + +class TestValidationParsers(TestCase): + resource_dir = os.path.join(os.path.dirname(__file__), 'resources') + + def test_vcf_check_errors_is_critical(self): + errors = [ + 'INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.', + 'Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2 (derived from ploidy 1).', + 'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..' 
+ ] + expected_return = [False, True, True] + for i, error in enumerate(errors): + assert vcf_check_errors_is_critical(error) == expected_return[i] + + def test_parse_assembly_check_log(self): + assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log') + error_list, nb_error, match, total = parse_assembly_check_log(assembly_check_log) + assert error_list == ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"] + + def test_parse_assembly_check_report(self): + assembly_check_report = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.text_assembly_report.txt') + mismatch_list, nb_mismatch, error_list, nb_error = parse_assembly_check_report(assembly_check_report) + assert mismatch_list[0] == "Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'" + assert nb_mismatch == 12 + assert error_list == ['Chromosome scaffold_chr1 is not present in FASTA file'] + assert nb_error == 1 diff --git a/tests/test_validator.py b/tests/test_validator.py index 6745965..a9031e3 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -36,6 +36,7 @@ def tearDown(self) -> None: def test__collect_validation_workflow_results_with_metadata_xlsx(self): expected_results = { + 'shallow_validation': {'requested': False}, 'vcf_check': { 'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, 'critical_count': 0, 'critical_list': []} }, @@ -120,6 +121,7 @@ def test__collect_validation_workflow_results_with_metadata_xlsx(self): def test__collect_validation_workflow_results_with_metadata_json(self): expected_results = { + 'shallow_validation': {'requested': False}, 'vcf_check': { 'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, 'critical_count': 0, 'critical_list': []} @@ -190,19 +192,9 @@ def test_create_report(self): report_path = 
self.validator.create_reports() assert os.path.exists(report_path) - def test_vcf_check_errors_is_critical(self): - errors = [ - 'INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.', - 'Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2 (derived from ploidy 1).', - 'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..' - ] - expected_return = [False, True, True] - for i, error in enumerate(errors): - assert self.validator.vcf_check_errors_is_critical(error) == expected_return[i] - def test_parse_biovalidator_validation_results(self): self.validator.results['metadata_check'] = {} - self.validator._parse_biovalidator_validation_results() + self.validator.collect_biovalidator_validation_results() assert self.validator.results['metadata_check']['json_errors'] == [ {'property': '/files', 'description': "should have required property 'files'"}, {'property': '/project/title', 'description': "should have required property 'title'"}, @@ -265,19 +257,6 @@ def test_convert_biovalidator_validation_to_spreadsheet(self): 'description': 'alias_1,alias_2 present in Samples not in Analysis'} ] - def test_parse_assembly_check_log(self): - assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log') - error_list, nb_error, match, total = self.validator.parse_assembly_check_log(assembly_check_log) - assert error_list == ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"] - - def test_parse_assembly_check_report(self): - assembly_check_report = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.text_assembly_report.txt') - mismatch_list, nb_mismatch, error_list, nb_error = self.validator.parse_assembly_check_report(assembly_check_report) - assert 
mismatch_list[0] == "Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'" - assert nb_mismatch == 12 - assert error_list == ['Chromosome scaffold_chr1 is not present in FASTA file'] - assert nb_error == 1 - def test_collect_conversion_errors(self): self.validator.results['metadata_check'] = {} self.validator._load_spreadsheet_conversion_errors() From 45e6d68cdea0677ea65d99e4a8b41e819e3ce1ba Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Mon, 9 Sep 2024 12:41:51 +0100 Subject: [PATCH 6/8] Apply suggestions from code review Co-authored-by: April Shen --- eva_sub_cli/executables/cli.py | 2 +- eva_sub_cli/executables/trim_down.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py index 594002c..e3d029c 100755 --- a/eva_sub_cli/executables/cli.py +++ b/eva_sub_cli/executables/cli.py @@ -67,7 +67,7 @@ def parse_args(cmd_line_args): credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account") credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account") argparser.add_argument('--shallow', action='store_true', default=False, - help='Set the validaiotn to be perform on a the first 10000 record of the VCF. ' + help='Set the validation to be performed on the first 10000 records of the VCF. 
' 'Only applies if the number of record exceed 10000') argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages') diff --git a/eva_sub_cli/executables/trim_down.py b/eva_sub_cli/executables/trim_down.py index 8d4490d..3cc31df 100644 --- a/eva_sub_cli/executables/trim_down.py +++ b/eva_sub_cli/executables/trim_down.py @@ -76,6 +76,3 @@ def main(): with open(args.output_yaml_file) as open_file: yaml.safe_dump(trim_down_metrics, open_file) - - - From aecf38662964282567c8615fbca65239fd179397 Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 9 Sep 2024 12:59:36 +0100 Subject: [PATCH 7/8] Address comments from review --- eva_sub_cli/jinja_templates/html_report.html | 6 ++--- .../jinja_templates/shallow_validation.html | 8 +++--- eva_sub_cli/validators/docker_validator.py | 27 +++++++------------ eva_sub_cli/validators/validator.py | 1 + .../expected_report_metadata_json.html | 2 +- .../expected_report_metadata_xlsx.html | 2 +- ...expected_shallow_metadata_xlsx_report.html | 2 +- 7 files changed, 20 insertions(+), 28 deletions(-) diff --git a/eva_sub_cli/jinja_templates/html_report.html b/eva_sub_cli/jinja_templates/html_report.html index cc50889..5398d00 100644 --- a/eva_sub_cli/jinja_templates/html_report.html +++ b/eva_sub_cli/jinja_templates/html_report.html @@ -4,7 +4,7 @@ {% from 'sample_name_check.html' import sample_name_check_report %} {% from 'fasta_check.html' import fasta_check_report %} {% from 'metadata_validation.html' import metadata_validation_report %} -{% from 'shallow_validation.html' import shallow_validation_report %} +{% from 'shallow_validation.html' import optional_shallow_validation_report %} @@ -47,9 +47,7 @@
eva-sub-cli v{{cli_version}}
-
- {{ shallow_validation_report(validation_results) }} -
+{{ optional_shallow_validation_report(validation_results) }}

Project Summary

diff --git a/eva_sub_cli/jinja_templates/shallow_validation.html b/eva_sub_cli/jinja_templates/shallow_validation.html index cf20851..e8dfe50 100644 --- a/eva_sub_cli/jinja_templates/shallow_validation.html +++ b/eva_sub_cli/jinja_templates/shallow_validation.html @@ -1,8 +1,9 @@ -{% macro shallow_validation_report(validation_results) -%} +{% macro optional_shallow_validation_report(validation_results) -%} {% set results = validation_results.get('shallow_validation', {}) %} {% if results.get('required') %} +
You requested to run the shallow validation, please run full validation before submitting the data
@@ -10,8 +11,8 @@ - - + + {% for vcf_file in results.get('metrics') %} @@ -22,6 +23,7 @@ {% endfor %}
VCF FileRecords validated in VCFRecords validated in FastaVariant lines validated in VCFEntries used in Fasta
+
{% endif %} {%- endmacro %} \ No newline at end of file diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index 2c862f3..6b01e49 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -37,24 +37,15 @@ def _validation_file_path_for(file_path): return f'{container_validation_dir}/{file_path}' def get_docker_validation_cmd(self): - if self.metadata_xlsx and not self.metadata_json: - docker_cmd = ''.join([ - f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf ", - f"--base_dir {container_validation_dir} ", - f"--vcf_files_mapping {self.mapping_file} ", - f"--metadata_xlsx {self.metadata_xlsx} ", - f"--shallow_validation true " if self.shallow_validation else "", - f"--output_dir {container_validation_output_dir}" - ]) - else: - docker_cmd = ''.join([ - f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf ", - f"--base_dir {container_validation_dir} ", - f"--vcf_files_mapping {self.mapping_file} ", - f"--metadata_json {self.metadata_json} ", - f"--shallow_validation true " if self.shallow_validation else "", - f"--output_dir {container_validation_output_dir}" - ]) + docker_cmd = ''.join([ + f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf ", + f"--base_dir {container_validation_dir} ", + f"--vcf_files_mapping {self.mapping_file} ", + f"--metadata_xlsx {self.metadata_xlsx} " if self.metadata_xlsx and not self.metadata_json + else f"--metadata_json {self.metadata_json} ", + f"--shallow_validation true " if self.shallow_validation else "", + f"--output_dir {container_validation_output_dir}" + ]) return docker_cmd def run_docker_validator(self): diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 4a1984a..f3e591d 100755 --- a/eva_sub_cli/validators/validator.py +++ 
b/eva_sub_cli/validators/validator.py @@ -167,6 +167,7 @@ def verify_ready_for_submission_to_eva(self): )), self.results.get('sample_check', {}).get('overall_differences', True) is False, len(self.results.get('metadata_check', {}).get('spreadsheet_errors', [])) == 0, + len(self.results.get('metadata_check', {}).get('json_errors', [])) == 0, any(( self.results['shallow_validation']['requested'] is False, self.results['shallow_validation'].get('required', True) is False diff --git a/tests/resources/validation_reports/expected_report_metadata_json.html b/tests/resources/validation_reports/expected_report_metadata_json.html index 01a2c35..9972434 100644 --- a/tests/resources/validation_reports/expected_report_metadata_json.html +++ b/tests/resources/validation_reports/expected_report_metadata_json.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/json/metadata/report
JSON PropertyError Description
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file + .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli v0.4.dev82+g45e6d68.d20240909

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/json/metadata/report
JSON PropertyError Description
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/resources/validation_reports/expected_report_metadata_xlsx.html b/tests/resources/validation_reports/expected_report_metadata_xlsx.html index a1576d8..fc7dbb1 100644 --- a/tests/resources/validation_reports/expected_report_metadata_xlsx.html +++ b/tests/resources/validation_reports/expected_report_metadata_xlsx.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file + .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli v0.4.dev82+g45e6d68.d20240909

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html index 8f26e24..cf1cf7f 100644 --- a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html +++ b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion
You requested to run the shallow validation, please run full validation before submitting the data
VCF FileRecords validated in VCFRecords validated in Fasta
input_fail.vcf1000024
input_passed.vcf1000024

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file + .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli v0.4.dev82+g45e6d68.d20240909
You requested to run the shallow validation, please run full validation before submitting the data
VCF FileVariant lines validated in VCFEntries used in Fasta
input_fail.vcf1000024
input_passed.vcf1000024

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file From f9a891eaae6706a7de2743c1190dc6a4b713386b Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 9 Sep 2024 13:44:54 +0100 Subject: [PATCH 8/8] Add back the test generated version in the report --- .../validation_reports/expected_report_metadata_json.html | 2 +- .../validation_reports/expected_report_metadata_xlsx.html | 2 +- .../expected_shallow_metadata_xlsx_report.html | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/resources/validation_reports/expected_report_metadata_json.html b/tests/resources/validation_reports/expected_report_metadata_json.html index 9972434..6ef2da7 100644 --- a/tests/resources/validation_reports/expected_report_metadata_json.html +++ b/tests/resources/validation_reports/expected_report_metadata_json.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli v0.4.dev82+g45e6d68.d20240909

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/json/metadata/report
JSON PropertyError Description
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file + .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/json/metadata/report
JSON PropertyError Description
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/resources/validation_reports/expected_report_metadata_xlsx.html b/tests/resources/validation_reports/expected_report_metadata_xlsx.html index fc7dbb1..fa9e51d 100644 --- a/tests/resources/validation_reports/expected_report_metadata_xlsx.html +++ b/tests/resources/validation_reports/expected_report_metadata_xlsx.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli v0.4.dev82+g45e6d68.d20240909

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file + .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html index cf1cf7f..5fc4e3a 100644 --- a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html +++ b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html @@ -19,4 +19,4 @@ .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } .info { background-color: #dadada; } - .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli v0.4.dev82+g45e6d68.d20240909
You requested to run the shallow validation, please run full validation before submitting the data
VCF FileVariant lines validated in VCFEntries used in Fasta
input_fail.vcf1000024
input_passed.vcf1000024

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
Project2Project TitleColumn "Project Title" is not populated
Project2DescriptionColumn "Description" is not populated
Project2Tax IDColumn "Tax ID" is not populated
Project2CenterColumn "Center" is not populated
Analysis2Analysis TitleColumn "Analysis Title" is not populated
Analysis2DescriptionColumn "Description" is not populated
Analysis2Experiment TypeColumn "Experiment Type" is not populated
Analysis2ReferenceColumn "Reference" is not populated
Sample3Sample AccessionColumn "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
Category | Accessions
Assembly accession found in metadata | Not found
Assembly accession(s) compatible with FASTA | GCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
Category | Accessions
Assembly accession found in metadata | GCA_2
Assembly accession(s) compatible with FASTA | GCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence name | Refget md5
2 | hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file
+ .error-list, .no-show { display: none; }

Validation Report

eva-sub-cli vcligeneratedversion
You requested the shallow validation. Please run the full validation before submitting the data.
VCF File | Variant lines validated in VCF | Entries used in Fasta
input_fail.vcf | 10000 | 24
input_passed.vcf | 10000 | 24

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF File | Fasta File | Analysis
input_fail.vcf | input_fail.fa | A
input_pass.vcf | input_pass.fa | B
input_test.vcf | input_test.fa | could not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
Sheet | Row | Column | Description
Files |  |  | Sheet "Files" is missing
Project | 2 | Project Title | Column "Project Title" is not populated
Project | 2 | Description | Column "Description" is not populated
Project | 2 | Tax ID | Column "Tax ID" is not populated
Project | 2 | Center | Column "Center" is not populated
Analysis | 2 | Analysis Title | Column "Analysis Title" is not populated
Analysis | 2 | Description | Column "Description" is not populated
Analysis | 2 | Experiment Type | Column "Experiment Type" is not populated
Analysis | 2 | Reference | Column "Reference" is not populated
Sample | 3 | Sample Accession | Column "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
Category | Error
Parsing Error | The assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch error | Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch error | Chromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch error | Chromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch error | Chromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch error | Chromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch error | Chromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch error | Chromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch error | Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch error | Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
Category | Error
critical error | Line 4: Error in meta-data section.
non-critical error | Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
Category | First 5 Errors For Category | Link To View All Errors
Samples described in the metadata but not in the VCF files | SampleA1, SampleA2 , SampleA3, SampleA4, SampleA5 | Show All Errors For Category
Samples in the VCF files but not described in the metadata | A1Sample , A2Sample, A3Sample, A4Sample, A5Sample | Show All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
Category | First 5 Errors For Category | Link To View All Errors
Samples described in the metadata but not in the VCF files | SampleC1 , SampleC2, SampleC3, SampleC4 | Show All Errors For Category
Samples in the VCF files but not described in the metadata | C1Sample , C2Sample, C3Sample, C4Sample | Show All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
Category | Accessions
Assembly accession found in metadata | Not found
Assembly accession(s) compatible with FASTA | GCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
Category | Accessions
Assembly accession found in metadata | GCA_2
Assembly accession(s) compatible with FASTA | GCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence name | Refget md5
2 | hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file