From 353ca0d3d341d012d10c0fafb5191c865a0ad198 Mon Sep 17 00:00:00 2001
From: tcezard <tcezard@ebi.ac.uk>
Date: Tue, 1 Oct 2024 15:39:06 +0100
Subject: [PATCH 1/2] Classify more VCF check errors as non-critical

---
 .../validators/validation_results_parsers.py  | 13 +++++++---
 tests/test_validaton_results_parsers.py       | 25 ++++++++++++++-----
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py
index 274b7a6..f38344e 100644
--- a/eva_sub_cli/validators/validation_results_parsers.py
+++ b/eva_sub_cli/validators/validation_results_parsers.py
@@ -91,15 +91,22 @@ def vcf_check_errors_is_critical(error):
     (derived from ploidy 1).
     Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..
     """
-    non_critical_format_fields = ['PL', 'AD', 'AC']
+    non_critical_format_fields = ['PL', 'AD', 'AC', 'GQ']
     non_critical_info_fields = ['AC']
     regexes = {
-        r'^INFO (\w+) does not match the specification Number': non_critical_format_fields,
-        r'^Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_info_fields
+        r'^INFO (\w+) does not match the specification Number': non_critical_info_fields,
+        r'^INFO (\w+) metadata Number is not ': non_critical_info_fields,
+        r'^Line \d+: Sample #\d+, field (\w+) does not match the meta specification Number=': non_critical_format_fields,
+        r'^Line \d+: FORMAT (\w+) metadata Type is not ': non_critical_format_fields,
+        r'^Line \d+: FORMAT (\w+) metadata Number is not ': non_critical_format_fields,
+        r'^Line \d+: INFO SVLEN must be equal to "length of ALT - length of REF" for non-symbolic alternate alleles. SVLEN=': None
     }
     for regex in regexes:
         match = re.match(regex, error)
         if match:
+            if regexes[regex] is None:
+                # No list of values to match against: any match is non-critical
+                return False
             field_affected = match.group(1)
             if field_affected in regexes[regex]:
                 return False
diff --git a/tests/test_validaton_results_parsers.py b/tests/test_validaton_results_parsers.py
index 388b44c..ee44ed7 100644
--- a/tests/test_validaton_results_parsers.py
+++ b/tests/test_validaton_results_parsers.py
@@ -10,13 +10,26 @@ class TestValidationParsers(TestCase):
 
     def test_vcf_check_errors_is_critical(self):
         errors = [
-            'INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.',
-            'Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. Contains 1 value(s), expected 2 (derived from ploidy 1).',
-            'Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..'
+            ('INFO AC does not match the specification Number=A (expected 1 value(s)). AC=100,37.', False),
+            ('Line 124385: Sample #10, field PL does not match the meta specification Number=G (expected 2 value(s)). '
+             'PL=.. It must derive its number of values from the ploidy of GT (if present), or assume diploidy. '
+             'Contains 1 value(s), expected 2 (derived from ploidy 1).', False),
+            ('Line 124384: Sample #102, field AD does not match the meta specification Number=R (expected 3 value(s)). AD=..', False),
+            ('Line 8: SAMPLE metadata Genomes is not a valid string (maybe it contains quotes?).', True),
+            ('Line 6: FORMAT GQ metadata Type is not Integer.', False),
+            ('Line 7: FORMAT PL metadata Number is not G.', False),
+            ('Line 10: INFO AF metadata Number is not A.', True),
+            ('Line 4039: FORMAT GQ metadata Type is not Integer.', False),
+            ('Line 1525: Duplicated variant NA:5:C>T found.', True),
+            ('Line 8: Metadata ID contains a character different from alphanumeric, dot, underscore and dash.', True),
+            ('Line 14: FORMAT metadata Number is not a number, A, G or dot.', True),
+            ('Line 13: Contig is not sorted by position. Contig 1 position 5600263 found after 12313283.', True),
+            ('Line 1067: INFO SVLEN must be equal to "length of ALT - length of REF" for non-symbolic alternate '
+             'alleles. SVLEN=31, expected value=33.', False),
         ]
-        expected_return = [False, True, True]
-        for i, error in enumerate(errors):
-            assert vcf_check_errors_is_critical(error) == expected_return[i]
+        for error, is_critical in errors:
+            print(error, vcf_check_errors_is_critical(error))
+            assert vcf_check_errors_is_critical(error) == is_critical, error
 
     def test_parse_assembly_check_log(self):
         assembly_check_log = os.path.join(self.resource_dir, 'assembly_check', 'invalid.vcf.assembly_check.log')

From 645151eee1558029e3ad56858f79f153f878b01a Mon Sep 17 00:00:00 2001
From: tcezard <tcezard@ebi.ac.uk>
Date: Tue, 1 Oct 2024 15:50:10 +0100
Subject: [PATCH 2/2] Remove unused print

---
 tests/test_validaton_results_parsers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_validaton_results_parsers.py b/tests/test_validaton_results_parsers.py
index ee44ed7..29fec21 100644
--- a/tests/test_validaton_results_parsers.py
+++ b/tests/test_validaton_results_parsers.py
@@ -28,7 +28,6 @@ def test_vcf_check_errors_is_critical(self):
              'alleles. SVLEN=31, expected value=33.', False),
         ]
         for error, is_critical in errors:
-            print(error, vcf_check_errors_is_critical(error))
             assert vcf_check_errors_is_critical(error) == is_critical, error
 
     def test_parse_assembly_check_log(self):