From 353ca0d3d341d012d10c0fafb5191c865a0ad198 Mon Sep 17 00:00:00 2001 From: tcezard Date: Tue, 1 Oct 2024 15:39:06 +0100 Subject: [PATCH] Add more error sorting as non-critical --- .../validators/validation_results_parsers.py | 13 +++++++--- tests/test_validaton_results_parsers.py | 25 ++++++++++++++----- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py index 274b7a6..f38344e 100644 --- a/eva_sub_cli/validators/validation_results_parsers.py +++ b/eva_sub_cli/validators/validation_results_parsers.py @@ -91,15 +91,22 @@ def vcf_check_errors_is_critical(error): (derived from ploidy 1). Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=.. """ - non_critical_format_fields = ['PL', 'AD', 'AC'] + non_critical_format_fields = ['PL', 'AD', 'AC', 'GQ'] non_critical_info_fields = ['AC'] regexes = { - r'^INFO (\w+) does not match the specification Number': non_critical_format_fields, - r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_info_fields + r'^INFO (\w+) does not match the specification Number': non_critical_info_fields, + r'^INFO (\w+) metadata Number is not ': non_critical_info_fields, + r'^Line \d+: Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_format_fields, + r'^Line \d+: FORMAT (\w+) metadata Type is not ': non_critical_format_fields, + r'^Line \d+: FORMAT (\w+) metadata Number is not ': non_critical_format_fields, + r'^Line \d+: INFO SVLEN must be equal to "length of ALT - length of REF" for non-symbolic alternate alleles. 
SVLEN=': None } for regex in regexes: match = re.match(regex, error) if match: + if regexes[regex] is None: + # No list of values to match against + return False field_affected = match.group(1) if field_affected in regexes[regex]: return False diff --git a/tests/test_validaton_results_parsers.py b/tests/test_validaton_results_parsers.py index 388b44c..ee44ed7 100644 --- a/tests/test_validaton_results_parsers.py +++ b/tests/test_validaton_results_parsers.py @@ -10,13 +10,26 @@ class TestValidationParsers(TestCase): def test_vcf_check_errors_is_critical(self): errors = [ - 'INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.', - 'Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2 (derived from ploidy 1).', - 'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..' + ('INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.', False), + ('Line 124385: Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). ' + 'PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. ' + 'Contains 1 value(s), expected 2 (derived from ploidy 1).', False), + ('Line 124384: Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). 
AD=..', False), + ('Line 8: SAMPLE metadata Genomes is not a valid string (maybe it contains quotes?).', True), + ('Line 6: FORMAT GQ metadata Type is not Integer.', False), + ('Line 7: FORMAT PL metadata Number is not G.', False), + ('Line 10: INFO AF metadata Number is not A.', True), + ('Line 4039: FORMAT GQ metadata Type is not Integer.', False), + ('Line 1525: Duplicated variant NA:5:C>T found.', True), + ('Line 8: Metadata ID contains a character different from alphanumeric, dot, underscore and dash.', True), + ('Line 14: FORMAT metadata Number is not a number, A, G or dot.', True), + ('Line 13: Contig is not sorted by position. Contig 1 position 5600263 found after 12313283.', True), + ('Line 1067: INFO SVLEN must be equal to "length of ALT - length of REF" for non-symbolic alternate ' + 'alleles. SVLEN=31, expected value=33.', False), ] - expected_return = [False, True, True] - for i, error in enumerate(errors): - assert vcf_check_errors_is_critical(error) == expected_return[i] + for error, is_critical in errors: + print(error, vcf_check_errors_is_critical(error)) + assert vcf_check_errors_is_critical(error) == is_critical, error def test_parse_assembly_check_log(self): assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log')