diff --git a/eva_sub_cli/executables/check_fasta_insdc.py b/eva_sub_cli/executables/check_fasta_insdc.py
index 6635dca..c05933c 100644
--- a/eva_sub_cli/executables/check_fasta_insdc.py
+++ b/eva_sub_cli/executables/check_fasta_insdc.py
@@ -10,6 +10,7 @@
from requests import HTTPError
from retry import retry
+from eva_sub_cli.file_utils import fasta_iter
from eva_sub_cli.metadata_utils import get_files_per_analysis, get_analysis_for_vcf_file, \
get_reference_assembly_for_analysis
@@ -19,13 +20,6 @@
logger = logging_config.get_logger(__name__)
-def open_gzip_if_required(input_file):
- if input_file.endswith('.gz'):
- return gzip.open(input_file, 'rt')
- else:
- return open(input_file, 'r')
-
-
def write_result_yaml(output_yaml, results):
with open(output_yaml, 'w') as open_yaml:
yaml.safe_dump(data=results, stream=open_yaml)
@@ -34,27 +28,6 @@ def write_result_yaml(output_yaml, results):
def refget_md5_digest(sequence):
return hashlib.md5(sequence.upper().encode('utf-8')).hexdigest()
-
-def fasta_iter(input_fasta):
- """
- Given a fasta file. yield tuples of header, sequence
- """
- # first open the file outside
- with open(input_fasta, 'r') as open_file:
-
- # ditch the boolean (x[0]) and just keep the header or sequence since
- # we know they alternate.
- faiter = (x[1] for x in groupby(open_file, lambda line: line[0] == ">"))
-
- for header in faiter:
- # drop the ">"
- headerStr = header.__next__()[1:].strip()
-
- # join all sequence lines to one.
- seq = "".join(s.strip() for s in faiter.__next__())
- yield (headerStr, seq)
-
-
@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_refget_metadata(md5_digest):
response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata')
diff --git a/eva_sub_cli/executables/cli.py b/eva_sub_cli/executables/cli.py
index c760de5..e3d029c 100755
--- a/eva_sub_cli/executables/cli.py
+++ b/eva_sub_cli/executables/cli.py
@@ -35,8 +35,7 @@ def validate_command_line_arguments(args, argparser):
print(f"'{args.submission_dir}' does not have write permissions or is not a directory.")
sys.exit(1)
-
-def main():
+def parse_args(cmd_line_args):
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
@@ -67,18 +66,26 @@ def main():
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
- argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')
+ argparser.add_argument('--shallow', action='store_true', default=False,
+ help='Set the validation to be performed on the first 10000 records of the VCF. '
+ 'Only applies if the number of record exceed 10000')
+ argparser.add_argument('--debug', action='store_true', default=False,
+ help='Set the script to output debug messages')
+ args = argparser.parse_args(cmd_line_args)
+ validate_command_line_arguments(args, argparser)
+ return args
- args = argparser.parse_args()
- validate_command_line_arguments(args, argparser)
+def main():
+
+ args = parse_args(sys.argv[1:])
args.submission_dir = os.path.abspath(args.submission_dir)
if args.debug:
logging_config.add_stdout_handler(logging.DEBUG)
else:
- logging_config.add_stdout_handler()
+ logging_config.add_stdout_handler(logging.INFO)
logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)
try:
diff --git a/eva_sub_cli/executables/samples_checker.py b/eva_sub_cli/executables/samples_checker.py
index 0e99efd..a2e3ed2 100644
--- a/eva_sub_cli/executables/samples_checker.py
+++ b/eva_sub_cli/executables/samples_checker.py
@@ -7,18 +7,12 @@
import yaml
+from eva_sub_cli.file_utils import open_gzip_if_required
from eva_sub_cli.metadata_utils import get_samples_per_analysis, get_files_per_analysis, get_analysis_for_vcf_file
logger = logging_config.get_logger(__name__)
-def open_gzip_if_required(input_file):
- if input_file.endswith('.gz'):
- return gzip.open(input_file, 'rt')
- else:
- return open(input_file, 'r')
-
-
def get_samples_from_vcf(vcf_file):
"""
Get the list of samples present in a single VCF file
diff --git a/eva_sub_cli/executables/trim_down.py b/eva_sub_cli/executables/trim_down.py
new file mode 100644
index 0000000..3cc31df
--- /dev/null
+++ b/eva_sub_cli/executables/trim_down.py
@@ -0,0 +1,78 @@
+import argparse
+import os
+
+import yaml
+from ebi_eva_common_pyutils.logger import logging_config
+from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter
+
+logger = logging_config.get_logger(__name__)
+
+
+max_nb_lines = 10000
+
+
+def trim_down_vcf(vcf_file, output_vcf):
+ """
+ Produce a smaller vcf files containing a maximum of 10000 records
+ """
+ with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out:
+ line_count = 0
+ ref_seq_names = set()
+ for line in vcf_in:
+ if line.startswith('#') or line_count < max_nb_lines:
+ vcf_out.write(line)
+ if not line.startswith('#'):
+ line_count += 1
+ ref_seq_names.add(line.split('\t')[0])
+ else:
+ break
+ if line_count != max_nb_lines:
+        logger.warning(f'Only {line_count} records found in the source VCF {vcf_file}')
+ return line_count, ref_seq_names
+
+
+def trim_down_fasta(fasta_file, output_fasta, ref_seq_names):
+ """
+ Produce a smaller fasta files containing only the reference sequences found in the VCF file
+ """
+ found_sequences = set()
+ with open(output_fasta, 'w') as fasta_out:
+ for header, sequence in fasta_iter(fasta_file):
+ name = header.split()[0]
+ if name in ref_seq_names:
+ found_sequences.add(name)
+ print(f'>{header}', file=fasta_out)
+ for i in range(0, len(sequence), 80):
+ print(sequence[i:i+80], file=fasta_out)
+ return found_sequences
+
+
+def main():
+ arg_parser = argparse.ArgumentParser(
+ description=f'Take a VCF file and only keep {max_nb_lines} lines and remove unused fasta sequence from the '
+ f'associated reference genome')
+ arg_parser.add_argument('--vcf_file', dest='vcf_file', required=True,
+ help='Path to the vcf file to be trimmed down')
+ arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', required=True,
+ help='Path to the output vcf file')
+ arg_parser.add_argument('--fasta_file', dest='fasta_file', required=True,
+ help='Path to the fasta file to be trimmed down')
+ arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', required=True,
+ help='Path to the output fasta file')
+ arg_parser.add_argument('--output_yaml_file', dest='output_yaml_file', required=True,
+ help='Path to the yaml file containing the trim down metrics')
+
+ args = arg_parser.parse_args()
+ logging_config.add_stdout_handler()
+
+ line_count, ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
+ sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence)
+    trim_down_metrics = {'trim_down_vcf_record': line_count, 'number_sequence_found': len(sequence_found),
+ 'trim_down_required': line_count == max_nb_lines}
+ if len(sequence_found) != len(ref_sequence):
+        logger.warning('Not all sequences were found in the fasta file. Cancelling trimming down of fasta file')
+        os.remove(args.output_fasta_file); os.link(args.fasta_file, args.output_fasta_file)
+ trim_down_metrics.pop('number_sequence_found')
+    with open(args.output_yaml_file, 'w') as open_file:
+ yaml.safe_dump(trim_down_metrics, open_file)
+
diff --git a/eva_sub_cli/file_utils.py b/eva_sub_cli/file_utils.py
index 6f8937f..b4ad203 100644
--- a/eva_sub_cli/file_utils.py
+++ b/eva_sub_cli/file_utils.py
@@ -1,5 +1,16 @@
+import glob
+import gzip
import os
import shutil
+from itertools import groupby
+
+
+def resolve_single_file_path(file_path):
+ files = glob.glob(file_path)
+ if len(files) == 0:
+ return None
+ elif len(files) > 0:
+ return files[0]
def is_submission_dir_writable(submission_dir):
@@ -32,3 +43,30 @@ def backup_file_or_directory(file_name, max_backups=None):
else:
os.rename(f'{file_name}.{i - 1}', f'{file_name}.{i}')
os.rename(file_name, file_name + '.1')
+
+
+def open_gzip_if_required(input_file):
+ """Open a file in read mode using gzip if the file extension says .gz"""
+ if input_file.endswith('.gz'):
+ return gzip.open(input_file, 'rt')
+ else:
+ return open(input_file, 'r')
+
+
+def fasta_iter(input_fasta):
+ """
+ Given a fasta file. yield tuples of header, sequence
+ """
+ # first open the file outside
+ with open_gzip_if_required(input_fasta) as open_file:
+ # ditch the boolean (x[0]) and just keep the header or sequence since
+ # we know they alternate.
+ faiter = (x[1] for x in groupby(open_file, lambda line: line[0] == ">"))
+
+ for header in faiter:
+ # drop the ">"
+ headerStr = header.__next__()[1:].strip()
+
+ # join all sequence lines to one.
+ seq = "".join(s.strip() for s in faiter.__next__())
+ yield (headerStr, seq)
diff --git a/eva_sub_cli/jinja_templates/html_report.html b/eva_sub_cli/jinja_templates/html_report.html
index dade5de..5398d00 100644
--- a/eva_sub_cli/jinja_templates/html_report.html
+++ b/eva_sub_cli/jinja_templates/html_report.html
@@ -4,6 +4,7 @@
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}
+{% from 'shallow_validation.html' import optional_shallow_validation_report %}
@@ -46,6 +47,8 @@ eva-sub-cli v{{cli_version}}
+{{ optional_shallow_validation_report(validation_results) }}
+
Project Summary
diff --git a/eva_sub_cli/jinja_templates/project_details.html b/eva_sub_cli/jinja_templates/project_details.html
index 45aa785..c220cf3 100644
--- a/eva_sub_cli/jinja_templates/project_details.html
+++ b/eva_sub_cli/jinja_templates/project_details.html
@@ -32,4 +32,6 @@
{% endif %}
+
+
{%- endmacro %}
\ No newline at end of file
diff --git a/eva_sub_cli/jinja_templates/shallow_validation.html b/eva_sub_cli/jinja_templates/shallow_validation.html
new file mode 100644
index 0000000..e8dfe50
--- /dev/null
+++ b/eva_sub_cli/jinja_templates/shallow_validation.html
@@ -0,0 +1,29 @@
+
+{% macro optional_shallow_validation_report(validation_results) -%}
+ {% set results = validation_results.get('shallow_validation', {}) %}
+
+ {% if results.get('required') %}
+
+ ▶
+ ❌ You requested to run the shallow validation, please run full validation before submitting the data
+
+
+
+
+ VCF File |
+ Variant lines validated in VCF |
+ Entries used in Fasta |
+
+ {% for vcf_file in results.get('metrics') %}
+
+ {{ vcf_file }} |
+ {{ results.get('metrics').get(vcf_file).get('trim_down_vcf_record') }} |
+ {{ results.get('metrics').get(vcf_file).get('number_sequence_found') }} |
+
+ {% endfor %}
+
+
+
+ {% endif %}
+
+{%- endmacro %}
\ No newline at end of file
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
index 1b28e06..3d54489 100644
--- a/eva_sub_cli/nextflow/validation.nf
+++ b/eva_sub_cli/nextflow/validation.nf
@@ -30,12 +30,14 @@ params.python_scripts = [
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
- "semantic_checker": "check_metadata_semantics.py"
+ "semantic_checker": "check_metadata_semantics.py",
+ "trim_down": "trim_down.py"
]
// prefix to prepend to all provided path
params.base_dir = ""
// help
params.help = null
+params.shallow_validation = false
// Show help message
if (params.help) exit 0, helpMessage()
@@ -63,20 +65,23 @@ output_dir = joinBasePath(params.output_dir)
workflow {
// Prepare the file path
- vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
+ vcf_and_ref_ch = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> tuple(
file(joinBasePath(row.vcf)),
file(joinBasePath(row.fasta)),
file(joinBasePath(row.report))
)}
- vcf_files = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
- .splitCsv(header:true)
- .map{row -> file(joinBasePath(row.vcf))}
-
+ if (params.shallow_validation){
+ // create a smaller vcf and fasta then replace the channel
+ trim_down_vcf(vcf_and_ref_ch)
+ vcf_and_ref_ch = trim_down_vcf.out.vcf_and_ref
+ }
+ vcf_files = vcf_and_ref_ch.map{row -> row[0]}
+ fasta_to_vcfs = vcf_and_ref_ch.map{row -> tuple(row[1], row[0])}.groupTuple(by:0)
// VCF checks
- check_vcf_valid(vcf_channel)
- check_vcf_reference(vcf_channel)
+ check_vcf_valid(vcf_and_ref_ch)
+ check_vcf_reference(vcf_and_ref_ch)
generate_file_size_and_md5_digests(vcf_files)
collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())
@@ -94,14 +99,32 @@ workflow {
metadata_json_validation(metadata_json)
metadata_semantic_check(metadata_json)
sample_name_concordance(metadata_json, vcf_files.collect())
- fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
- .splitCsv(header:true)
- .map{row -> tuple(file(joinBasePath(row.fasta)), file(joinBasePath(row.vcf)))}
- .groupTuple(by:0)
insdc_checker(metadata_json, fasta_to_vcfs)
}
}
+
+process trim_down_vcf {
+ publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.log"
+ publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.yml"
+
+ input:
+ tuple path(vcf), path(fasta), path(report)
+
+ output:
+ tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref
+ path "${vcf.getBaseName()}_trim_down.log", emit: trim_down_log
+ path "${vcf.getBaseName()}_trim_down.yml", emit: trim_down_metric
+
+ """
+ mkdir output
+ $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta --output_yaml_file ${vcf.getBaseName()}_trim_down.yml > ${vcf.getBaseName()}_trim_down.log
+ # This is needed to ensure that a missing (NO_FILE) report can still be passed down to subsequent steps
+ touch $report
+ """
+
+}
+
/*
* Validate the VCF file format
*/
diff --git a/eva_sub_cli/orchestrator.py b/eva_sub_cli/orchestrator.py
index a14c31d..c4fb4f1 100755
--- a/eva_sub_cli/orchestrator.py
+++ b/eva_sub_cli/orchestrator.py
@@ -25,6 +25,7 @@
logger = logging_config.get_logger(__name__)
+
def get_vcf_files(mapping_file):
vcf_files = []
with open(mapping_file) as open_file:
@@ -58,6 +59,7 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re
return project_title, mapping_file
+
def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_req=False):
with open(metadata_json) as file:
json_metadata = json.load(file)
@@ -71,12 +73,15 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_
analysis_alias_dict = defaultdict(dict)
for analysis in json_metadata['analysis']:
analysis_alias_dict[analysis['analysisAlias']]['referenceFasta'] = analysis['referenceFasta']
- analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] if 'assemblyReport' in analysis else ''
+ analysis_alias_dict[analysis['analysisAlias']]['assemblyReport'] = analysis['assemblyReport'] \
+ if 'assemblyReport' in analysis else ''
- for file in json_metadata['files']:
- reference_fasta = analysis_alias_dict[file['analysisAlias']]['referenceFasta']
- assembly_report = analysis_alias_dict[file['analysisAlias']]['assemblyReport']
- vcf_fasta_report_mapping.append([os.path.abspath(file['fileName']), os.path.abspath(reference_fasta), os.path.abspath(assembly_report) if assembly_report else ''])
+ for file_dict in json_metadata['files']:
+ reference_fasta = analysis_alias_dict[file_dict['analysisAlias']]['referenceFasta']
+ assembly_report = analysis_alias_dict[file_dict['analysisAlias']]['assemblyReport']
+ vcf_fasta_report_mapping.append([os.path.abspath(file_dict['fileName']),
+ os.path.abspath(reference_fasta),
+ os.path.abspath(assembly_report) if assembly_report else ''])
return project_title, vcf_fasta_report_mapping
@@ -137,18 +142,21 @@ def check_validation_required(tasks, sub_config, username=None, password=None):
except requests.HTTPError as ex:
if ex.response.status_code == 404:
logger.error(
- f'Submission with id {submission_id} could not be found: statuc code: {ex.response.status_code} response: {ex.response.text}')
+ f'Submission with id {submission_id} could not be found: '
+ f'status code: {ex.response.status_code} response: {ex.response.text}')
raise SubmissionNotFoundException(f'Submission with id {submission_id} could not be found')
else:
- logger.error(f'Error occurred while getting status of the submission with Id {submission_id}: status code: {ex.response.status_code} response: {ex.response.text}')
- raise SubmissionStatusException(f'Error occurred while getting status of the submission with Id {submission_id}')
+ logger.error(f'Error occurred while getting status of the submission with Id {submission_id}: '
+ f'status code: {ex.response.status_code} response: {ex.response.text}')
+ raise SubmissionStatusException(f'Error occurred while getting status of the submission '
+ f'with Id {submission_id}')
logger.info(f'submission id not found in config. This might be the first time user is submitting')
return False
def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx,
- tasks, executor, username=None, password=None, **kwargs):
+ tasks, executor, username=None, password=None, shallow_validation=False, **kwargs):
# load config
config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE)
sub_config = WritableConfig(config_file_path, version=__version__)
@@ -174,11 +182,11 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso
if VALIDATE in tasks:
if executor == DOCKER:
validator = DockerValidator(vcf_files_mapping, submission_dir, project_title, metadata_json, metadata_xlsx,
- submission_config=sub_config)
+ shallow_validation=shallow_validation, submission_config=sub_config)
# default to native execution
else:
validator = NativeValidator(vcf_files_mapping, submission_dir, project_title, metadata_json, metadata_xlsx,
- submission_config=sub_config)
+ shallow_validation=shallow_validation, submission_config=sub_config)
with validator:
validator.validate_and_report()
if not metadata_json:
diff --git a/eva_sub_cli/report.py b/eva_sub_cli/report.py
index f85b73f..f546b92 100644
--- a/eva_sub_cli/report.py
+++ b/eva_sub_cli/report.py
@@ -14,7 +14,8 @@ def get_logo_data():
return logo_data
-def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping, project_title=None):
+def generate_html_report(validation_results, validation_date, submission_dir, vcf_fasta_analysis_mapping,
+ project_title=None):
vcf_files = sorted(set([file_name
for check in validation_results if check in ["vcf_check", "assembly_check"]
for file_name in validation_results[check]
@@ -32,7 +33,7 @@ def generate_html_report(validation_results, validation_date, submission_dir, vc
fasta_files=fasta_files,
submission_dir=submission_dir,
vcf_fasta_analysis_mapping=vcf_fasta_analysis_mapping,
- validation_results=validation_results,
+ validation_results=validation_results
)
try:
diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
index 6e2ec0b..6b01e49 100644
--- a/eva_sub_cli/validators/docker_validator.py
+++ b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)
container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev15'
+container_tag = 'v0.0.1.dev16'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'
@@ -20,10 +20,10 @@
class DockerValidator(Validator):
def __init__(self, mapping_file, submission_dir, project_title, metadata_json=None,
- metadata_xlsx=None, container_name=None, docker_path='docker', submission_config=None):
+ metadata_xlsx=None, shallow_validation=False, container_name=None, docker_path='docker', submission_config=None):
super().__init__(mapping_file, submission_dir, project_title,
metadata_json=metadata_json, metadata_xlsx=metadata_xlsx,
- submission_config=submission_config)
+ shallow_validation=shallow_validation, submission_config=submission_config)
self.docker_path = docker_path
self.container_name = container_name
if self.container_name is None:
@@ -37,22 +37,15 @@ def _validation_file_path_for(file_path):
return f'{container_validation_dir}/{file_path}'
def get_docker_validation_cmd(self):
- if self.metadata_xlsx and not self.metadata_json:
- docker_cmd = (
- f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
- f"--base_dir {container_validation_dir} "
- f"--vcf_files_mapping {self.mapping_file} "
- f"--metadata_xlsx {self.metadata_xlsx} "
- f"--output_dir {container_validation_output_dir}"
- )
- else:
- docker_cmd = (
- f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf "
- f"--base_dir {container_validation_dir} "
- f"--vcf_files_mapping {self.mapping_file} "
- f"--metadata_json {self.metadata_json} "
- f"--output_dir {container_validation_output_dir}"
- )
+ docker_cmd = ''.join([
+ f"{self.docker_path} exec {self.container_name} nextflow run eva_sub_cli/nextflow/validation.nf ",
+ f"--base_dir {container_validation_dir} ",
+ f"--vcf_files_mapping {self.mapping_file} ",
+ f"--metadata_xlsx {self.metadata_xlsx} " if self.metadata_xlsx and not self.metadata_json
+ else f"--metadata_json {self.metadata_json} ",
+ f"--shallow_validation true " if self.shallow_validation else "",
+ f"--output_dir {container_validation_output_dir}"
+ ])
return docker_cmd
def run_docker_validator(self):
@@ -213,29 +206,3 @@ def _copy(file_description, file_path):
# report is optional
if row.get('report'):
_copy('assembly report files', row['report'])
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description='Run pre-submission validation checks on VCF files', add_help=False)
- parser.add_argument("--docker_path", help="Full path to the docker installation, "
- "not required if docker is available on path", required=False)
- parser.add_argument("--container_name", help="Name of the docker container", required=False)
- parser.add_argument("--vcf_files_mapping",
- help="csv file with the mappings for vcf files, fasta and assembly report", required=True)
- parser.add_argument("--output_dir", help="Directory where the validation output reports will be made available",
- required=True)
- group = parser.add_mutually_exclusive_group(required=True)
- group.add_argument("--metadata_json",
- help="Json file that describe the project, analysis, samples and files")
- group.add_argument("--metadata_xlsx",
- help="Excel spreadsheet that describe the project, analysis, samples and files")
- args = parser.parse_args()
-
- docker_path = args.docker_path if args.docker_path else 'docker'
- docker_container_name = args.container_name if args.container_name else container_image
-
- logging_config.add_stdout_handler()
- validator = DockerValidator(args.vcf_files_mapping, args.output_dir, args.metadata_json, args.metadata_xlsx,
- docker_container_name, docker_path)
- validator.validate()
- validator.create_reports()
diff --git a/eva_sub_cli/validators/native_validator.py b/eva_sub_cli/validators/native_validator.py
index eb95939..fc3ebb8 100644
--- a/eva_sub_cli/validators/native_validator.py
+++ b/eva_sub_cli/validators/native_validator.py
@@ -11,9 +11,11 @@
class NativeValidator(Validator):
def __init__(self, mapping_file, submission_dir, project_title, metadata_json=None, metadata_xlsx=None,
- vcf_validator_path='vcf_validator', assembly_checker_path='vcf_assembly_checker',
- biovalidator_path='biovalidator', submission_config=None):
- super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json, metadata_xlsx=metadata_xlsx,
+ shallow_validation=False, vcf_validator_path='vcf_validator',
+ assembly_checker_path='vcf_assembly_checker', biovalidator_path='biovalidator',
+ submission_config=None):
+ super().__init__(mapping_file, submission_dir, project_title, metadata_json=metadata_json,
+ metadata_xlsx=metadata_xlsx, shallow_validation=shallow_validation,
submission_config=submission_config)
self.vcf_validator_path = vcf_validator_path
self.assembly_checker_path = assembly_checker_path
@@ -41,15 +43,16 @@ def get_validation_cmd(self):
metadata_flag = f"--metadata_json {self.metadata_json}"
path_to_workflow = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'nextflow/validation.nf')
- return (
- f"nextflow run {path_to_workflow} "
- f"--vcf_files_mapping {self.mapping_file} "
- f"{metadata_flag} "
- f"--output_dir {self.output_dir} "
- f"--executable.vcf_validator {self.vcf_validator_path} "
- f"--executable.vcf_assembly_checker {self.assembly_checker_path} "
+ return ''.join([
+ f"nextflow run {path_to_workflow} ",
+ f"--vcf_files_mapping {self.mapping_file} ",
+ f"{metadata_flag} ",
+ f"--output_dir {self.output_dir} ",
+ f"--shallow_validation true " if self.shallow_validation else "",
+ f"--executable.vcf_validator {self.vcf_validator_path} ",
+ f"--executable.vcf_assembly_checker {self.assembly_checker_path} ",
f"--executable.biovalidator {self.biovalidator_path}"
- )
+ ])
def verify_executables_installed(self):
for name, path in [('vcf-validator', self.vcf_validator_path),
diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py
new file mode 100644
index 0000000..321abc4
--- /dev/null
+++ b/eva_sub_cli/validators/validation_results_parsers.py
@@ -0,0 +1,191 @@
+import re
+
+from ebi_eva_common_pyutils.logger import logging_config
+
+logger = logging_config.get_logger(__name__)
+
+def parse_assembly_check_log(assembly_check_log):
+ error_list = []
+ nb_error, nb_mismatch = 0, 0
+ match = total = None
+ with open(assembly_check_log) as open_file:
+ for line in open_file:
+ if line.startswith('[error]'):
+ nb_error += 1
+ if nb_error < 11:
+ error_list.append(line.strip()[len('[error] '):])
+ elif line.startswith('[info] Number of matches:'):
+ match, total = line.strip()[len('[info] Number of matches: '):].split('/')
+ match = int(match)
+ total = int(total)
+ return error_list, nb_error, match, total
+
+
+def parse_assembly_check_report(assembly_check_report):
+ mismatch_list = []
+ nb_mismatch = 0
+ nb_error = 0
+ error_list = []
+ with open(assembly_check_report) as open_file:
+ for line in open_file:
+ if 'does not match the reference sequence' in line:
+ nb_mismatch += 1
+ if nb_mismatch < 11:
+ mismatch_list.append(line.strip())
+ elif 'Multiple synonyms' in line:
+ nb_error += 1
+ if nb_error < 11:
+ error_list.append(line.strip())
+ # Contig not found in FASTA is reported here rather than in logs when no assembly report is used.
+ # Count and report once per contig name rather than once per line, to avoid redundant errors.
+ elif 'is not present in FASTA file' in line:
+ line_num, error_msg = line.split(': ')
+ error_msg = error_msg.strip()
+ if error_msg not in error_list:
+ nb_error += 1
+ if nb_error < 11:
+ error_list.append(error_msg)
+ return mismatch_list, nb_mismatch, error_list, nb_error
+
+
+def parse_vcf_check_report(vcf_check_report):
+ valid = True
+ max_error_reported = 10
+ error_list, critical_list = [], []
+ warning_count = error_count = critical_count = 0
+ with open(vcf_check_report) as open_file:
+ for line in open_file:
+ if 'warning' in line:
+                warning_count += 1
+ elif line.startswith('According to the VCF specification'):
+ if 'not' in line:
+ valid = False
+ elif vcf_check_errors_is_critical(line.strip()):
+ critical_count += 1
+ if critical_count <= max_error_reported:
+ critical_list.append(line.strip())
+ else:
+ error_count += 1
+ if error_count <= max_error_reported:
+ error_list.append(line.strip())
+
+ return valid, warning_count, error_count, critical_count, error_list, critical_list
+
+
+def vcf_check_errors_is_critical(error):
+ """
+ This function identify VCF check errors that are not critical for the processing of the VCF within EVA.
+ They affect specific INFO or FORMAT fields that are used in the variant detection but less so in the downstream
+ analysis.
+ Critical:
+ Reference and alternate alleles must not be the same.
+ Requested evidence presence with --require-evidence. Please provide genotypes (GT field in FORMAT and samples),
+ or allele frequencies (AF field in INFO), or allele counts (AC and AN fields in INFO)..
+ Contig is not sorted by position. Contig chr10 position 41695506 found after 41883113.
+ Duplicated variant chr1A:1106203:A>G found.
+ Metadata description string is not valid.
+
+ Error
+ Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive
+ its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2
+ (derived from ploidy 1).
+ Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..
+ """
+ non_critical_format_fields = ['PL', 'AD', 'AC']
+ non_critical_info_fields = ['AC']
+ regexes = {
+ r'^INFO (\w+) does not match the specification Number': non_critical_format_fields,
+ r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_info_fields
+ }
+ for regex in regexes:
+ match = re.match(regex, error)
+ if match:
+ field_affected = match.group(1)
+ if field_affected in regexes[regex]:
+ return False
+ return True
+
+
+def parse_biovalidator_validation_results(metadata_check_file):
+ """
+ Read the biovalidator's report and extract the list of validation errors
+ """
+ ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+
+ def clean_read(ifile):
+ l = ifile.readline()
+ if l:
+ return ansi_escape.sub('', l).strip()
+
+ if not metadata_check_file:
+ return
+
+ errors = []
+
+ with open(metadata_check_file) as open_file:
+ collect = False
+ while True:
+ line = clean_read(open_file)
+ if line is None:
+ break # EOF
+ elif not line:
+ continue # Empty line
+ if not collect:
+ if line.startswith('Validation failed with following error(s):'):
+ collect = True
+ else:
+ line2 = clean_read(open_file)
+ if line is None or line2 is None:
+ break # EOF
+ errors.append({'property': line, 'description': line2})
+ return errors
+
+
+def convert_metadata_sheet(json_attribute, xls2json_conf):
+ if json_attribute is None:
+ return None
+ for sheet_name in xls2json_conf['worksheets']:
+ if xls2json_conf['worksheets'][sheet_name] == json_attribute:
+ return sheet_name
+
+
+def convert_metadata_row(sheet, json_row, xls2json_conf):
+ if json_row is None:
+ return ''
+ if 'header_row' in xls2json_conf[sheet]:
+ return int(json_row) + xls2json_conf[sheet]['header_row']
+ else:
+ return int(json_row) + 2
+
+
+def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
+ if json_attribute is None:
+ return ''
+ attributes_dict = {}
+ attributes_dict.update(xls2json_conf[sheet].get('required', {}))
+ attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
+ for attribute in attributes_dict:
+ if attributes_dict[attribute] == json_attribute:
+ return attribute
+
+
+def parse_metadata_property(property_str):
+ if property_str.startswith('.'):
+ return property_str.strip('./'), None, None
+ # First attempt to parse as BioSample object
+ sheet, row, col = parse_sample_metadata_property(property_str)
+ if sheet is not None and row is not None and col is not None:
+ return sheet, row, col
+ match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str)
+ if match:
+ return match.group(1), match.group(3), match.group(5)
+ else:
+ logger.error(f'Cannot parse {property_str} in JSON metadata error')
+ return None, None, None
+
+
+def parse_sample_metadata_property(property_str):
+ match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
+ if match:
+ return 'sample', match.group(1), match.group(2)
+ return None, None, None
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
index c0c9681..f3e591d 100755
--- a/eva_sub_cli/validators/validator.py
+++ b/eva_sub_cli/validators/validator.py
@@ -5,7 +5,6 @@
import json
import logging
import os
-import re
from functools import lru_cache, cached_property
import yaml
@@ -13,10 +12,14 @@
from ebi_eva_common_pyutils.config import WritableConfig
from eva_sub_cli import ETC_DIR, SUB_CLI_CONFIG_FILE, __version__
-from eva_sub_cli.file_utils import backup_file_or_directory
+from eva_sub_cli.file_utils import backup_file_or_directory, resolve_single_file_path
from eva_sub_cli.report import generate_html_report
from ebi_eva_common_pyutils.logger import logging_config, AppLogger
+from eva_sub_cli.validators.validation_results_parsers import parse_assembly_check_log, parse_assembly_check_report, \
+ parse_biovalidator_validation_results, convert_metadata_sheet, convert_metadata_row, convert_metadata_attribute, \
+ parse_vcf_check_report, parse_metadata_property
+
VALIDATION_OUTPUT_DIR = "validation_output"
VALIDATION_RESULTS = 'validation_results'
READY_FOR_SUBMISSION_TO_EVA = 'ready_for_submission_to_eva'
@@ -24,18 +27,10 @@
logger = logging_config.get_logger(__name__)
-def resolve_single_file_path(file_path):
- files = glob.glob(file_path)
- if len(files) == 0:
- return None
- elif len(files) > 0:
- return files[0]
-
-
class Validator(AppLogger):
def __init__(self, mapping_file, submission_dir, project_title=None, metadata_json=None, metadata_xlsx=None,
- submission_config: WritableConfig = None):
+ shallow_validation=False, submission_config: WritableConfig = None):
# validator write to the validation output directory
# If the submission_config is not set it will also be written to the VALIDATION_OUTPUT_DIR
self.submission_dir = submission_dir
@@ -44,11 +39,12 @@ def __init__(self, mapping_file, submission_dir, project_title=None, metadata_js
vcf_files, fasta_files = self._find_vcf_and_fasta_files()
self.vcf_files = vcf_files
self.fasta_files = fasta_files
- self.results = {}
+ self.results = {'shallow_validation': {'requested': shallow_validation}}
self.project_title = project_title
self.validation_date = datetime.datetime.now()
self.metadata_json = metadata_json
self.metadata_xlsx = metadata_xlsx
+ self.shallow_validation = shallow_validation
if submission_config:
self.sub_config = submission_config
else:
@@ -132,7 +128,7 @@ def verify_files_present(self):
files_missing, missing_files_list = self.check_if_file_missing()
if files_missing:
raise FileNotFoundError(f"some files (vcf/fasta) mentioned in metadata file could not be found. "
- f"Missing files list {missing_files_list}")
+ f"Missing files list {missing_files_list}")
def check_if_file_missing(self):
files_missing = False
@@ -157,105 +153,31 @@ def update_config_with_validation_result(self):
self.sub_config.set(READY_FOR_SUBMISSION_TO_EVA, value=self.verify_ready_for_submission_to_eva())
def verify_ready_for_submission_to_eva(self):
- # TODO: check validation results and confirm if they are good enough for submitting to EVA
- return True
-
- def parse_assembly_check_log(self, assembly_check_log):
- error_list = []
- nb_error, nb_mismatch = 0, 0
- match = total = None
- with open(assembly_check_log) as open_file:
- for line in open_file:
- if line.startswith('[error]'):
- nb_error += 1
- if nb_error < 11:
- error_list.append(line.strip()[len('[error] '):])
- elif line.startswith('[info] Number of matches:'):
- match, total = line.strip()[len('[info] Number of matches: '):].split('/')
- match = int(match)
- total = int(total)
- return error_list, nb_error, match, total
-
- def parse_assembly_check_report(self, assembly_check_report):
- mismatch_list = []
- nb_mismatch = 0
- nb_error = 0
- error_list = []
- with open(assembly_check_report) as open_file:
- for line in open_file:
- if 'does not match the reference sequence' in line:
- nb_mismatch += 1
- if nb_mismatch < 11:
- mismatch_list.append(line.strip())
- elif 'Multiple synonyms' in line:
- nb_error += 1
- if nb_error < 11:
- error_list.append(line.strip())
- # Contig not found in FASTA is reported here rather than in logs when no assembly report is used.
- # Count and report once per contig name rather than once per line, to avoid redundant errors.
- elif 'is not present in FASTA file' in line:
- line_num, error_msg = line.split(': ')
- error_msg = error_msg.strip()
- if error_msg not in error_list:
- nb_error += 1
- if nb_error < 11:
- error_list.append(error_msg)
- return mismatch_list, nb_mismatch, error_list, nb_error
-
- def parse_vcf_check_report(self, vcf_check_report):
- valid = True
- max_error_reported = 10
- error_list, critical_list = [], []
- warning_count = error_count = critical_count = 0
- with open(vcf_check_report) as open_file:
- for line in open_file:
- if 'warning' in line:
- warning_count = 1
- elif line.startswith('According to the VCF specification'):
- if 'not' in line:
- valid = False
- elif self.vcf_check_errors_is_critical(line.strip()):
- critical_count += 1
- if critical_count <= max_error_reported:
- critical_list.append(line.strip())
- else:
- error_count += 1
- if error_count <= max_error_reported:
- error_list.append(line.strip())
-
- return valid, warning_count, error_count, critical_count, error_list, critical_list
-
- def vcf_check_errors_is_critical(self, error):
"""
- This function identify VCF check errors that are not critical for the processing of the VCF within EVA.
- They affect specific INFO or FORMAT fields that are used in the variant detection but less so in the downstream analysis.
- Critical:
- Reference and alternate alleles must not be the same.
- Requested evidence presence with --require-evidence. Please provide genotypes (GT field in FORMAT and samples), or allele frequencies (AF field in INFO), or allele counts (AC and AN fields in INFO)..
- Contig is not sorted by position. Contig chr10 position 41695506 found after 41883113.
- Duplicated variant chr1A:1106203:A>G found.
- Metadata description string is not valid.
-
- Error
- Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2 (derived from ploidy 1).
- Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..
+ Assess if the validation results are meeting expectations
+ It assumes all validation have been parsed already.
"""
- non_critical_format_fields = ['PL', 'AD', 'AC']
- non_critical_info_fields = ['AC']
- regexes = {
- r'^INFO (\w+) does not match the specification Number': non_critical_format_fields,
- r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_info_fields
- }
- for regex in regexes:
- match = re.match(regex, error)
- if match:
- field_affected = match.group(1)
- if field_affected in regexes[regex]:
- return False
- return True
+ return all((
+ self.results.get('vcf_check', {}).get('critical_count', 1) == 0,
+ self.results.get('assembly_check', {}).get('nb_mismatch', 1) == 0,
+ self.results.get('assembly_check', {}).get('nb_error', 1) == 0,
+ all((
+ fa_file_check.get('all_insdc', False) is True
+ for fa_file, fa_file_check in self.results.get('fasta_check', {}).items()
+ )),
+ self.results.get('sample_check', {}).get('overall_differences', True) is False,
+ len(self.results.get('metadata_check', {}).get('spreadsheet_errors', [])) == 0,
+ len(self.results.get('metadata_check', {}).get('json_errors', [])) == 0,
+ any((
+ self.results['shallow_validation']['requested'] is False,
+ self.results['shallow_validation'].get('required', True) is False
+ ))
+ ))
def _collect_validation_workflow_results(self):
# Collect information from the output and summarise in the config
+ if self.shallow_validation:
+ self._collect_trim_down_metrics()
self._collect_vcf_check_results()
self._collect_assembly_check_results()
self._load_sample_check_results()
@@ -280,6 +202,22 @@ def _vcf_check_db_report(self, vcf_name):
os.path.join(self.output_dir, 'vcf_format', vcf_name + '.*.db')
)
    @lru_cache
    def _assembly_check_log(self, vcf_name):
        # Path to the assembly_check log produced for this VCF, or None when
        # the file is absent (resolve_single_file_path globs and may find nothing).
        # NOTE(review): lru_cache on an instance method keeps the Validator
        # instance alive for the cache's lifetime (flake8-bugbear B019);
        # acceptable for a short-lived CLI run, but worth confirming.
        return resolve_single_file_path(
            os.path.join(self.output_dir, 'assembly_check', vcf_name + '.assembly_check.log')
        )

    @lru_cache
    def _assembly_check_text_report(self, vcf_name):
        # Glob for the text assembly report produced by assembly_check for this VCF.
        return resolve_single_file_path(
            os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*')
        )

    @cached_property
    def _sample_check_yaml(self):
        # YAML output of the sample-name concordance check (single file per run,
        # hence cached_property rather than a per-VCF lru_cache).
        return resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'sample_checker.yml'))
+
def _collect_vcf_check_results(self,):
# detect output files for vcf check
self.results['vcf_check'] = {}
@@ -291,7 +229,7 @@ def _collect_vcf_check_results(self,):
vcf_check_db_report = self._vcf_check_db_report(vcf_name)
if vcf_check_log and vcf_check_text_report and vcf_check_db_report:
- valid, warning_count, error_count, critical_count, error_list, critical_list = self.parse_vcf_check_report(vcf_check_text_report)
+ valid, warning_count, error_count, critical_count, error_list, critical_list = parse_vcf_check_report(vcf_check_text_report)
else:
valid, warning_count, error_count, critical_count, error_list, critical_list = (False, 0, 0, 1, [], ['Process failed'])
self.results['vcf_check'][vcf_name] = {
@@ -304,18 +242,6 @@ def _collect_vcf_check_results(self,):
'critical_list': critical_list
}
- @lru_cache
- def _assembly_check_log(self, vcf_name):
- return resolve_single_file_path(
- os.path.join(self.output_dir, 'assembly_check', vcf_name + '.assembly_check.log')
- )
-
- @lru_cache
- def _assembly_check_text_report(self, vcf_name):
- return resolve_single_file_path(
- os.path.join(self.output_dir, 'assembly_check', vcf_name + '*text_assembly_report*')
- )
-
def _collect_assembly_check_results(self):
# detect output files for assembly check
self.results['assembly_check'] = {}
@@ -327,9 +253,9 @@ def _collect_assembly_check_results(self):
if assembly_check_log and assembly_check_text_report:
error_list_from_log, nb_error_from_log, match, total = \
- self.parse_assembly_check_log(assembly_check_log)
+ parse_assembly_check_log(assembly_check_log)
mismatch_list, nb_mismatch, error_list_from_report, nb_error_from_report = \
- self.parse_assembly_check_report(assembly_check_text_report)
+ parse_assembly_check_report(assembly_check_text_report)
nb_error = nb_error_from_log + nb_error_from_report
error_list = error_list_from_log + error_list_from_report
else:
@@ -344,10 +270,6 @@ def _collect_assembly_check_results(self):
'total': total
}
- @cached_property
- def _sample_check_yaml(self):
- return resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'sample_checker.yml'))
-
def _load_fasta_check_results(self):
for fasta_file in self.fasta_files:
fasta_file_name = os.path.basename(fasta_file)
@@ -370,8 +292,8 @@ def _load_sample_check_results(self):
def _collect_metadata_results(self):
self.results['metadata_check'] = {}
self._load_spreadsheet_conversion_errors()
- self._parse_biovalidator_validation_results()
- self._parse_semantic_metadata_results()
+ self.collect_biovalidator_validation_results()
+ self._collect_semantic_metadata_results()
if self.metadata_xlsx:
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()
@@ -385,64 +307,19 @@ def _load_spreadsheet_conversion_errors(self):
with open(errors_file) as open_yaml:
self.results['metadata_check']['spreadsheet_errors'] = yaml.safe_load(open_yaml)
- def _parse_biovalidator_validation_results(self):
+ def collect_biovalidator_validation_results(self):
"""
Read the biovalidator's report and extract the list of validation errors
"""
metadata_check_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
'metadata_validation.txt'))
- ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
-
- def clean_read(ifile):
- l = ifile.readline()
- if l:
- return ansi_escape.sub('', l).strip()
-
- if not metadata_check_file:
- return
- with open(metadata_check_file) as open_file:
- errors = []
- collect = False
- while True:
- line = clean_read(open_file)
- if line is None:
- break # EOF
- elif not line:
- continue # Empty line
- if not collect:
- if line.startswith('Validation failed with following error(s):'):
- collect = True
- else:
- line2 = clean_read(open_file)
- if line is None or line2 is None:
- break # EOF
- errors.append({'property': line, 'description': line2})
+ errors = parse_biovalidator_validation_results(metadata_check_file)
self.results['metadata_check'].update({
'json_report_path': metadata_check_file,
'json_errors': errors
})
- def _parse_metadata_property(self, property_str):
- if property_str.startswith('.'):
- return property_str.strip('./'), None, None
- # First attempt to parse as BioSample object
- sheet, row, col = self._parse_sample_metadata_property(property_str)
- if sheet is not None and row is not None and col is not None:
- return sheet, row, col
- match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str)
- if match:
- return match.group(1), match.group(3), match.group(5)
- else:
- logger.error(f'Cannot parse {property_str} in JSON metadata error')
- return None, None, None
-
- def _parse_sample_metadata_property(self, property_str):
- match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
- if match:
- return 'sample', match.group(1), match.group(2)
- return None, None, None
-
- def _parse_semantic_metadata_results(self):
+ def _collect_semantic_metadata_results(self):
errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
'metadata_semantic_check.yml'))
if not errors_file:
@@ -461,13 +338,13 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
if 'spreadsheet_errors' not in self.results['metadata_check']:
self.results['metadata_check']['spreadsheet_errors'] = []
for error in self.results['metadata_check'].get('json_errors', {}):
- sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property'])
+ sheet_json, row_json, attribute_json = parse_metadata_property(error['property'])
# There should only be one Project but adding the row back means it's easier for users to find
if sheet_json == 'project' and row_json is None:
row_json = 0
- sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf)
- row = self._convert_metadata_row(sheet, row_json, xls2json_conf)
- column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
+ sheet = convert_metadata_sheet(sheet_json, xls2json_conf)
+ row = convert_metadata_row(sheet, row_json, xls2json_conf)
+ column = convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
if row_json is None and attribute_json is None:
new_description = f'Sheet "{sheet}" is missing'
elif row_json is None:
@@ -503,31 +380,6 @@ def _write_spreadsheet_validation_results(self):
open_file.write(error_dict.get('description') + '\n')
self.results['metadata_check']['spreadsheet_report_path'] = spreadsheet_report_file
- def _convert_metadata_sheet(self, json_attribute, xls2json_conf):
- if json_attribute is None:
- return None
- for sheet_name in xls2json_conf['worksheets']:
- if xls2json_conf['worksheets'][sheet_name] == json_attribute:
- return sheet_name
-
- def _convert_metadata_row(self, sheet, json_row, xls2json_conf):
- if json_row is None:
- return ''
- if 'header_row' in xls2json_conf[sheet]:
- return int(json_row) + xls2json_conf[sheet]['header_row']
- else:
- return int(json_row) + 2
-
- def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf):
- if json_attribute is None:
- return ''
- attributes_dict = {}
- attributes_dict.update(xls2json_conf[sheet].get('required', {}))
- attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
- for attribute in attributes_dict:
- if attributes_dict[attribute] == json_attribute:
- return attribute
-
def _collect_file_info_to_metadata(self):
md5sum_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'file_info.txt'))
file_path_2_md5 = {}
@@ -576,6 +428,20 @@ def _collect_file_info_to_metadata(self):
else:
self.error(f'Cannot locate the metadata in JSON format in {os.path.join(self.output_dir, "metadata.json")}')
    def _collect_trim_down_metrics(self):
        """Load the per-VCF trim-down metrics produced during shallow validation
        and record whether a full validation would still be required."""
        self.results['shallow_validation']['metrics'] = {}
        shallow_validation_required = False
        for vcf_file in self.vcf_files:
            basename = os.path.basename(vcf_file)
            # NOTE(review): splitext strips only the last extension, so
            # 'file.vcf.gz' yields 'file.vcf' — confirm this matches the
            # naming convention used by the trim_down step's output.
            vcf_name, _ = os.path.splitext(basename)
            trimmed_down_metrics = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
                                                                         f'{vcf_name}_trim_down.yml'))
            # assumes the metrics file exists whenever shallow validation ran;
            # resolve_single_file_path returns None otherwise — TODO confirm
            with open(trimmed_down_metrics) as open_file:
                metrics = yaml.safe_load(open_file)
            # one VCF needing trimming is enough to flag the whole submission
            shallow_validation_required = shallow_validation_required or metrics['trim_down_required']
            self.results['shallow_validation']['metrics'][vcf_file] = metrics
        self.results['shallow_validation']['required'] = shallow_validation_required
+
def get_vcf_fasta_analysis_mapping(self):
vcf_fasta_analysis_mapping = []
with open(self.mapping_file) as open_file:
@@ -608,7 +474,8 @@ def get_vcf_fasta_analysis_mapping(self):
def create_reports(self):
report_html = generate_html_report(self.results, self.validation_date, self.submission_dir,
- self.get_vcf_fasta_analysis_mapping(), self.project_title)
+ self.get_vcf_fasta_analysis_mapping(),
+ self.project_title)
file_path = os.path.join(self.output_dir, 'report.html')
with open(file_path, "w") as f:
f.write(report_html)
diff --git a/pyproject.toml b/pyproject.toml
index c686ff7..6db5eae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ classifiers = [
'check_metadata_semantics.py'='eva_sub_cli.executables.check_metadata_semantics:main'
'samples_checker.py'='eva_sub_cli.executables.samples_checker:main'
'xlsx2json.py'='eva_sub_cli.executables.xlsx2json:main'
+'trim_down.py'='eva_sub_cli.executables.trim_down:main'
[tool.setuptools]
packages = ['eva_sub_cli', 'eva_sub_cli.exceptions', 'eva_sub_cli.executables', 'eva_sub_cli.validators']
diff --git a/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html
new file mode 100644
index 0000000..5fc4e3a
--- /dev/null
+++ b/tests/resources/validation_reports/expected_shallow_metadata_xlsx_report.html
@@ -0,0 +1,22 @@
+Validation Report▶ ❌ You requested to run the shallow validation, please run full validation before submitting the data
VCF File | Variant lines validated in VCF | Entries used in Fasta |
---|
input_fail.vcf | 10000 | 24 |
input_passed.vcf | 10000 | 24 |
Project Summary
General details about the project
Project Title: My cool project
Validation Date: 2023-08-31 12:34:56
Submission Directory: /test/submission/dir
▶ Files mapping
VCF File | Fasta File | Analysis |
---|
input_fail.vcf | input_fail.fa | A |
input_pass.vcf | input_pass.fa | B |
input_test.vcf | input_test.fa | could not be linked |
Metadata validation results
Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the
EVA website.
▶ ❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
Sheet | Row | Column | Description |
---|
Files | | | Sheet "Files" is missing |
Project | 2 | Project Title | Column "Project Title" is not populated |
Project | 2 | Description | Column "Description" is not populated |
Project | 2 | Tax ID | Column "Tax ID" is not populated |
Project | 2 | Center | Column "Center" is not populated |
Analysis | 2 | Analysis Title | Column "Analysis Title" is not populated |
Analysis | 2 | Description | Column "Description" is not populated |
Analysis | 2 | Experiment Type | Column "Experiment Type" is not populated |
Analysis | 2 | Reference | Column "Reference" is not populated |
Sample | 3 | Sample Accession | Column "Sample Accession" is not populated |
VCF validation results
Checks whether each file is compliant with the
VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.
input_fail.vcf
▶ ❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
Category | Error |
---|
Parsing Error | The assembly checking could not be completed: Contig 'chr23' not found in assembly report |
mismatch error | Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a' |
mismatch error | Chromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a' |
mismatch error | Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c' |
▶ ❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
Category | Error |
---|
critical error | Line 4: Error in meta-data section. |
non-critical error | Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=.. |
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors
Sample name concordance check
Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
▶ ❌ Analysis A: Sample names in metadata do not match with those in VCF files
Category | First 5 Errors For Category | Link To View All Errors |
---|
Samples described in the metadata but not in the VCF files | SampleA1, SampleA2 , SampleA3, SampleA4, SampleA5 | Show All Errors For Category |
Samples in the VCF files but not described in the metadata | A1Sample , A2Sample, A3Sample, A4Sample, A5Sample | Show All Errors For Category |
All Errors For Category - Samples in the VCF files but not described in the metadata:
- A1Sample•
- •A2Sample
- A3Sample
- A4Sample
- A5Sample
- A6Sample
- A7Sample
- A8Sample
- A9Sample
- A10Sample
Hide ✔ Analysis B: Sample names in metadata match with those in VCF files
▶ ❌ Analysis C: Sample names in metadata do not match with those in VCF files
Category | First 5 Errors For Category | Link To View All Errors |
---|
Samples described in the metadata but not in the VCF files | SampleC1 , SampleC2, SampleC3, SampleC4 | Show All Errors For Category |
Samples in the VCF files but not described in the metadata | C1Sample , C2Sample, C3Sample, C4Sample | Show All Errors For Category |
All Errors For Category - Samples in the VCF files but not described in the metadata:
- C1Sample•
- •C2Sample
- C3Sample
- C4Sample
HideReference genome INSDC check
Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.
metadata_asm_match.fa
✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible
metadata_asm_not_found.fa
✔ All sequences are INSDC accessioned
▶ ❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
Category | Accessions |
---|
Assembly accession found in metadata | Not found |
Assembly accession(s) compatible with FASTA | GCA_1 |
metadata_asm_not_match.fa
✔ All sequences are INSDC accessioned
▶ ❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
Category | Accessions |
---|
Assembly accession found in metadata | GCA_2 |
Assembly accession(s) compatible with FASTA | GCA_1 |
metadata_error.fa
Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible
not_all_insdc.fa
▶ ❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence name | Refget md5 |
---|
2 | hjfdoijsfc47hfg0gh9qwjrve |
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..2381cdc
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,40 @@
+import copy
+import logging
+import sys
+from unittest import TestCase
+from unittest.mock import patch, Mock
+
+
+from eva_sub_cli import orchestrator
+from eva_sub_cli.executables import cli
+
+
class TestCli(TestCase):
    """Tests for the eva-sub-cli command-line entry point."""

    def test_main(self):
        # Simulate a fully-populated, valid argument set and check that main()
        # runs the orchestrator without raising (orchestrate_process is mocked).
        args = Mock(submission_dir='.', vcf_files=[], reference_fasta='', metadata_json=None, metadata_xlsx='',
                    tasks='validate', executor='native', debug=False)
        with patch('eva_sub_cli.executables.cli.parse_args', return_value=args), \
                patch('eva_sub_cli.orchestrator.orchestrate_process'):
            cli.main()
        # Check that the debug message is shown
        logger = orchestrator.logger
        logger.debug('test')

    def test_validate_args(self):
        # A complete, consistent argument list parses without exiting.
        cmd_args = [
            '--submission_dir', '.',
            '--vcf_files', 'test.vcf',
            '--reference_fasta', 'test.fasta',
            '--metadata_json', 'test.json',
            '--tasks', 'validate',
            '--executor', 'native',
            '--debug'
        ]
        args = cli.parse_args(cmd_args)
        assert args.submission_dir == '.'

        # Dropping the '--vcf_files test.vcf' pair (cmd_args[2:4]) makes the
        # combination invalid; the CLI is expected to exit with status 1
        # (presumably via validate_command_line_arguments — sys.exit is mocked
        # so parsing continues past the failure).
        with patch('sys.exit') as m_exit:
            cli.parse_args(cmd_args[:2]+cmd_args[4:])
            m_exit.assert_called_once_with(1)
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index b11a856..22e6ec1 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -87,7 +87,7 @@ def test_orchestrate_validate(self):
m_get_vcf.assert_called_once_with(self.mapping_file)
m_docker_validator.assert_any_call(
self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx,
- submission_config=m_config.return_value
+ submission_config=m_config.return_value, shallow_validation=False
)
m_docker_validator().validate_and_report.assert_called_once_with()
@@ -108,7 +108,7 @@ def test_orchestrate_validate_submit(self):
# Validate was run because the config show it was not run successfully before
m_docker_validator.assert_any_call(
self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx,
- submission_config=m_config.return_value
+ submission_config=m_config.return_value, shallow_validation=False
)
m_docker_validator().validate_and_report.assert_called_once_with()
@@ -154,7 +154,7 @@ def test_orchestrate_with_vcf_files(self):
assert row['report'] == None
m_docker_validator.assert_any_call(
self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, self.metadata_xlsx,
- submission_config=m_config.return_value
+ submission_config=m_config.return_value, shallow_validation=False
)
m_docker_validator().validate_and_report.assert_called_once_with()
@@ -172,7 +172,7 @@ def test_orchestrate_with_metadata_json_without_asm_report(self):
assert row['report'] == ''
m_docker_validator.assert_any_call(
self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None,
- submission_config=m_config.return_value
+ submission_config=m_config.return_value, shallow_validation=False
)
m_docker_validator().validate_and_report.assert_called_once_with()
@@ -192,7 +192,7 @@ def test_orchestrate_with_metadata_json_with_asm_report(self):
assert row['report'].__contains__('GCA_000001405.27_report.txt')
m_docker_validator.assert_any_call(
self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None,
- submission_config=m_config.return_value
+ submission_config=m_config.return_value, shallow_validation=False
)
m_docker_validator().validate_and_report.assert_called_once_with()
@@ -212,7 +212,7 @@ def test_orchestrate_vcf_files_takes_precedence_over_metadata(self):
assert row['report'] == None
m_docker_validator.assert_any_call(
self.mapping_file, self.test_sub_dir, self.project_title, self.metadata_json, None,
- submission_config=m_config.return_value
+ submission_config=m_config.return_value, shallow_validation=False
)
m_docker_validator().validate_and_report.assert_called_once_with()
@@ -232,7 +232,7 @@ def test_orchestrate_with_metadata_xlsx(self):
assert row['report'] == ''
m_docker_validator.assert_any_call(
self.mapping_file, self.test_sub_dir, self.project_title, None, self.metadata_xlsx,
- submission_config=m_config.return_value
+ submission_config=m_config.return_value, shallow_validation=False
)
m_docker_validator().validate_and_report.assert_called_once_with()
diff --git a/tests/test_report.py b/tests/test_report.py
index 3f1e3f3..e2dbd3d 100644
--- a/tests/test_report.py
+++ b/tests/test_report.py
@@ -1,3 +1,4 @@
+import copy
import os
import datetime
from unittest import TestCase
@@ -316,6 +317,7 @@ class TestReport(TestCase):
resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
expected_report_metadata_xlsx = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_xlsx.html')
expected_report_metadata_json = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_json.html')
+ expected_report_metadata_xlsx_shallow = os.path.join(resource_dir, 'validation_reports', 'expected_shallow_metadata_xlsx_report.html')
test_project_name = "My cool project"
test_validation_date = datetime.datetime(2023, 8, 31, 12, 34, 56)
test_submission_dir = "/test/submission/dir"
@@ -324,34 +326,46 @@ class TestReport(TestCase):
test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_pass.vcf', 'fasta_file': 'input_pass.fa', 'analysis': 'B'})
test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_test.vcf', 'fasta_file': 'input_test.fa', 'analysis': 'could not be linked'})
- def test_generate_html_report_metadata_xlsx(self):
- report = generate_html_report(validation_results_xlsx, self.test_validation_date, self.test_submission_dir,
+ def check_report_vs_expected(self, validation_results, output_report, expected_report):
+ report = generate_html_report(validation_results, self.test_validation_date, self.test_submission_dir,
self.test_vcf_fasta_analysis_mapping, self.test_project_name)
- with open('metadata_xlsx_report.html', 'w') as open_file:
+ with open(output_report, 'w') as open_file:
open_file.write(report)
- with open(self.expected_report_metadata_xlsx) as open_html:
+ with open(expected_report) as open_html:
expected_report_text = open_html.read()
# Inject the version in the expected report
expected_report_text = expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__)
assert report == expected_report_text
# Remove output file if assert passes
- if os.path.exists('metadata_xlsx_report.html'):
- os.remove('metadata_xlsx_report.html')
+ if os.path.exists(output_report):
+ os.remove(output_report)
- def test_generate_html_report_metadata_json(self):
- report = generate_html_report(validation_results_json, self.test_validation_date, self.test_submission_dir,
- self.test_vcf_fasta_analysis_mapping, self.test_project_name)
- with open('metadata_json_report.html', 'w') as open_file:
- open_file.write(report)
+ def test_generate_html_report_metadata_xlsx(self):
+ self.check_report_vs_expected(
+ validation_results_xlsx,
+ 'metadata_xlsx_report.html',
+ self.expected_report_metadata_xlsx
+ )
- with open(self.expected_report_metadata_json) as open_html:
- expected_report_text = open_html.read()
- # Inject the version in the expected report
- expected_report_text = expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__)
- assert report == expected_report_text
+ def test_generate_html_report_metadata_json(self):
+ self.check_report_vs_expected(
+ validation_results_json,
+ 'metadata_json_report.html',
+ self.expected_report_metadata_json
+ )
- # Remove output file if assert passes
- if os.path.exists('metadata_json_report.html'):
- os.remove('metadata_json_report.html')
+ def test_generate_html_report_metadata_xlsx_shallow(self):
+ shallow_validation_results_xlsx = copy.deepcopy(validation_results_xlsx)
+ shallow_validation_results_xlsx['shallow_validation'] = {
+ 'required': True, 'requested': True,
+ 'metrics': {
+ 'input_fail.vcf': {'trim_down_vcf_record': 10000, 'number_sequence_found': 24, 'trim_down_required': True},
+ 'input_passed.vcf': {'trim_down_vcf_record': 10000, 'number_sequence_found': 24, 'trim_down_required': True}
+ }}
+ self.check_report_vs_expected(
+ shallow_validation_results_xlsx,
+ 'shallow_metadata_xlsx_report.html',
+ self.expected_report_metadata_xlsx_shallow
+ )
diff --git a/tests/test_validaton_results_parsers.py b/tests/test_validaton_results_parsers.py
new file mode 100644
index 0000000..388b44c
--- /dev/null
+++ b/tests/test_validaton_results_parsers.py
@@ -0,0 +1,32 @@
+import os.path
+from unittest import TestCase
+
+from eva_sub_cli.validators.validation_results_parsers import vcf_check_errors_is_critical, parse_assembly_check_log, \
+ parse_assembly_check_report
+
+
+class TestValidationParsers(TestCase):
+ resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
+
+ def test_vcf_check_errors_is_critical(self):
+ errors = [
+ 'INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.',
+ 'Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2 (derived from ploidy 1).',
+ 'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..'
+ ]
+ expected_return = [False, True, True]
+ for i, error in enumerate(errors):
+ assert vcf_check_errors_is_critical(error) == expected_return[i]
+
+ def test_parse_assembly_check_log(self):
+ assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log')
+ error_list, nb_error, match, total = parse_assembly_check_log(assembly_check_log)
+ assert error_list == ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"]
+
+ def test_parse_assembly_check_report(self):
+ assembly_check_report = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.text_assembly_report.txt')
+ mismatch_list, nb_mismatch, error_list, nb_error = parse_assembly_check_report(assembly_check_report)
+ assert mismatch_list[0] == "Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'"
+ assert nb_mismatch == 12
+ assert error_list == ['Chromosome scaffold_chr1 is not present in FASTA file']
+ assert nb_error == 1
diff --git a/tests/test_validator.py b/tests/test_validator.py
index 6745965..a9031e3 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -36,6 +36,7 @@ def tearDown(self) -> None:
def test__collect_validation_workflow_results_with_metadata_xlsx(self):
expected_results = {
+ 'shallow_validation': {'requested': False},
'vcf_check': {
'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, 'critical_count': 0, 'critical_list': []}
},
@@ -120,6 +121,7 @@ def test__collect_validation_workflow_results_with_metadata_xlsx(self):
def test__collect_validation_workflow_results_with_metadata_json(self):
expected_results = {
+ 'shallow_validation': {'requested': False},
'vcf_check': {
'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0,
'critical_count': 0, 'critical_list': []}
@@ -190,19 +192,9 @@ def test_create_report(self):
report_path = self.validator.create_reports()
assert os.path.exists(report_path)
- def test_vcf_check_errors_is_critical(self):
- errors = [
- 'INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.',
- 'Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2 (derived from ploidy 1).',
- 'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..'
- ]
- expected_return = [False, True, True]
- for i, error in enumerate(errors):
- assert self.validator.vcf_check_errors_is_critical(error) == expected_return[i]
-
def test_parse_biovalidator_validation_results(self):
self.validator.results['metadata_check'] = {}
- self.validator._parse_biovalidator_validation_results()
+ self.validator.collect_biovalidator_validation_results()
assert self.validator.results['metadata_check']['json_errors'] == [
{'property': '/files', 'description': "should have required property 'files'"},
{'property': '/project/title', 'description': "should have required property 'title'"},
@@ -265,19 +257,6 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):
'description': 'alias_1,alias_2 present in Samples not in Analysis'}
]
- def test_parse_assembly_check_log(self):
- assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log')
- error_list, nb_error, match, total = self.validator.parse_assembly_check_log(assembly_check_log)
- assert error_list == ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"]
-
- def test_parse_assembly_check_report(self):
- assembly_check_report = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.text_assembly_report.txt')
- mismatch_list, nb_mismatch, error_list, nb_error = self.validator.parse_assembly_check_report(assembly_check_report)
- assert mismatch_list[0] == "Line 43: Chromosome chr1, position 955679, reference allele 'T' does not match the reference sequence, expected 'C'"
- assert nb_mismatch == 12
- assert error_list == ['Chromosome scaffold_chr1 is not present in FASTA file']
- assert nb_error == 1
-
def test_collect_conversion_errors(self):
self.validator.results['metadata_check'] = {}
self.validator._load_spreadsheet_conversion_errors()