Merge pull request #10 from tcezard/EVA-3226

EVA-3226 - Map errors to Spreadsheet location
EBIvariation · Sep 11, 2023 · 5505724 · 5505724
2 parents 550c293 + a74b04f
commit 5505724
Show file tree

Hide file tree

Showing 5 changed files with 153 additions and 16 deletions.
diff --git a/cli/jinja_templates/metadata_validation.html b/cli/jinja_templates/metadata_validation.html
@@ -1,25 +1,28 @@
 
 {% macro metadata_validation_report(validation_results) -%}
     {% set results = validation_results.get('metadata_check', {}) %}
-    {% set json_errors = results.get('json_errors', []) %}
-    {% if json_errors %}
+    {% set spreadsheet_errors = results.get('spreadsheet_errors', []) %}
+    {% if spreadsheet_errors %}
         {% set icon = "&#10060;" %}
         {% set row_class = "report-section fail collapsible" %}
     {% else %}
         {% set icon = "&#10004;" %}
         {% set row_class = "report-section pass" %}
     {% endif %}
     <div class='{{ row_class }}'>{{ icon }} Metadata validation check </div>
-    {% if json_errors %}
+    {% if spreadsheet_errors %}
         <div class="error-list">
-            <div class="error-description"><strong>Full report:</strong> {{ results.get('report_path', '') }}</div>
+            <div class="error-description"><strong>Full report:</strong> {{ results.get('spreadsheet_report_path', '') }}</div>
             <table>
                 <tr>
-                    <th>Property</th><th>Error</th>
+                    <th>Sheet</th><th>Row</th><th>Column</th><th>Description</th>
                 </tr>
-                {% for error in json_errors %}
+                {% for error in spreadsheet_errors %}
                     <tr>
-                        <td><strong>{{ error.get('property') }}</strong></td><td> {{ error.get('description') }}</td>
+                        <td><strong>{{ error.get('sheet') }}</strong></td>
+                        <td><strong>{{ error.get('row') }}</strong></td>
+                        <td><strong>{{ error.get('column') }}</strong></td>
+                        <td> {{ error.get('description') }}</td>
                     </tr>
                 {% endfor %}
             </table>

diff --git a/cli/reporter.py b/cli/reporter.py
@@ -6,9 +6,13 @@
 
 import yaml
 
+from cli import ETC_DIR
 from cli.report import generate_html_report
+from ebi_eva_common_pyutils.logger import logging_config
 
 
+logger = logging_config.get_logger(__name__)
+
 def resolve_single_file_path(file_path):
     files = glob.glob(file_path)
     if len(files) == 0:
@@ -123,7 +127,9 @@ def _collect_validation_workflow_results(self, ):
         self._collect_vcf_check_results()
         self._collect_assembly_check_results()
         self._load_sample_check_results()
-        self._parse_metadata_validation_results()
+        self._parse_biovalidator_validation_results()
+        self._convert_biovalidator_validation_to_spreadsheet()
+        self._write_spreadsheet_validation_results()
 
     def _collect_vcf_check_results(self,):
         # detect output files for vcf check
@@ -196,7 +202,7 @@ def _load_sample_check_results(self):
             self.results['sample_check'] = yaml.safe_load(open_yaml)
         self.results['sample_check']['report_path'] = sample_check_yaml
 
-    def _parse_metadata_validation_results(self):
+    def _parse_biovalidator_validation_results(self):
         """
         Read the biovalidator's report and extract the list of validation errors
         """
@@ -226,10 +232,84 @@ def clean_read(ifile):
                         break  # EOF
                     errors.append({'property': line, 'description': line2})
         self.results['metadata_check'] = {
-            'report_path': metadata_check_file,
+            'json_report_path': metadata_check_file,
             'json_errors': errors
         }
 
+    def _parse_metadata_property(self, property_str):
+        if property_str.startswith('.'):
+            return property_str.strip('.'), None, None
+        match = re.match(r'/(\w+)(/(\d+))?(\.(\w+))?', property_str)
+        if match:
+            return match.group(1), match.group(3), match.group(5)
+        else:
+            logger.error(f'Cannot parse {property_str} in JSON metadata error')
+            return None, None, None
+
+    def _convert_biovalidator_validation_to_spreadsheet(self):
+        config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
+        with open(config_file) as open_file:
+            xls2json_conf = yaml.safe_load(open_file)
+
+        self.results['metadata_check']['spreadsheet_errors'] = []
+        for error in self.results['metadata_check']['json_errors']:
+            sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property'])
+            sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf)
+            row = self._convert_metadata_row(sheet, row_json, xls2json_conf)
+            column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
+            if row_json is None and attribute_json is None:
+                new_description = f'Sheet "{sheet}" is missing'
+            elif row_json is None:
+                new_description = f'In sheet "{sheet}", column "{column}" is not populated'
+            elif attribute_json and column:
+                new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
+            else:
+                new_description = error["description"].replace(sheet_json, sheet)
+            if column is None:
+                # We do not know this attribute. It's most likely about bioSampleObject
+                continue
+            if 'schema' in new_description:
+                # This is an error specific to json schema
+                continue
+            self.results['metadata_check']['spreadsheet_errors'].append({
+                'sheet': sheet, 'row': row, 'column': column,
+                'description': new_description
+            })
+
+    def _write_spreadsheet_validation_results(self):
+        if 'spreadsheet_errors' in self.results['metadata_check']:
+            spreadsheet_report_file = os.path.join(os.path.dirname(self.results['metadata_check']['json_report_path']),
+                                                   'metadata_spreadsheet_validation.txt')
+            with open(spreadsheet_report_file, 'w') as open_file:
+                for error_dict in self.results['metadata_check']['spreadsheet_errors']:
+                    open_file.write(error_dict.get('description') + '\n')
+            self.results['metadata_check']['spreadsheet_report_path'] = spreadsheet_report_file
+
+    def _convert_metadata_sheet(self, json_attribute, xls2json_conf):
+        if json_attribute is None:
+            return None
+        for sheet_name in xls2json_conf['worksheets']:
+            if xls2json_conf['worksheets'][sheet_name] == json_attribute:
+                return sheet_name
+
+    def _convert_metadata_row(self, sheet, json_row, xls2json_conf):
+        if json_row is None:
+            return ''
+        if 'header_row' in xls2json_conf[sheet]:
+            return int(json_row) + xls2json_conf[sheet]['header_row']
+        else:
+            return int(json_row) + 2
+
+    def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf):
+        if json_attribute is None:
+            return ''
+        attributes_dict = {}
+        attributes_dict.update(xls2json_conf[sheet].get('required', {}))
+        attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
+        for attribute in attributes_dict:
+            if attributes_dict[attribute] == json_attribute:
+                return attribute
+
     def create_reports(self):
         report_html = generate_html_report(self.results, self.validation_date, self.project_title)
         file_path = 'report.html'

diff --git a/tests/resources/validation_reports/expected_report.html b/tests/resources/validation_reports/expected_report.html
diff --git a/tests/test_report.py b/tests/test_report.py
@@ -4,6 +4,8 @@
 from unittest import TestCase
 
 from cli.report import generate_html_report
+from cli.reporter import Reporter
+
 validation_results = {
     "assembly_check": {
         "input_passed.vcf": {
@@ -80,11 +82,26 @@
             {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
             {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
         ],
-        'report_path': '/path/to/metadata/report'
+        'json_report_path': '/path/to/metadata/report',
+        'spreadsheet_errors': [
+            {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+            {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
+            {'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
+            {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
+            {'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+            {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
+        ],
+        'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
     }
 }
 
 
+
+
 class TestReport(TestCase):
     resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
     expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html')

diff --git a/tests/test_reporter.py b/tests/test_reporter.py
@@ -45,13 +45,25 @@ def test__collect_validation_workflow_results(self):
                     {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
                     {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
                     {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
+                ],
+                'spreadsheet_errors': [
+                    {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+                    {'sheet': 'Project', 'row': '', 'column': 'Project Title',
+                     'description': 'In sheet "Project", column "Project Title" is not populated'},
+                    {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
+                     'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+                    {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
+                     'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+                    {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
+                     'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
                 ]
             }
         }
-        self.reporter._collect_validation_workflow_results()
 
+        self.reporter._collect_validation_workflow_results()
         # Drop report paths from comparison (test will fail if missing)
-        del self.reporter.results['metadata_check']['report_path']
+        del self.reporter.results['metadata_check']['json_report_path']
+        del self.reporter.results['metadata_check']['spreadsheet_report_path']
         del self.reporter.results['sample_check']['report_path']
         for file in self.reporter.results['vcf_check'].values():
             del file['report_path']
@@ -75,8 +87,8 @@ def test_vcf_check_errors_is_critical(self):
         for i, error in enumerate(errors):
             assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i]
 
-    def test_parse_metadata_validation_results(self):
-        self.reporter._parse_metadata_validation_results()
+    def test_parse_biovalidator_validation_results(self):
+        self.reporter._parse_biovalidator_validation_results()
         assert self.reporter.results['metadata_check']['json_errors'] == [
             {'property': '.files', 'description': "should have required property 'files'"},
             {'property': '/project.title', 'description': "should have required property 'title'"},
@@ -87,3 +99,28 @@ def test_parse_metadata_validation_results(self):
             {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
         ]
 
+    def test_convert_biovalidator_validation_to_spreadsheet(self):
+        self.reporter.results['metadata_check'] = {
+            'json_errors': [
+                {'property': '.files', 'description': "should have required property 'files'"},
+                {'property': '/project.title', 'description': "should have required property 'title'"},
+                {'property': '/analysis/0.description',
+                 'description': "should have required property 'description'"},
+                {'property': '/analysis/0.referenceGenome',
+                 'description': "should have required property 'referenceGenome'"},
+                {'property': '/sample/0.bioSampleAccession',
+                 'description': "should have required property 'bioSampleAccession'"},
+                {'property': '/sample/0.bioSampleObject',
+                 'description': "should have required property 'bioSampleObject'"},
+                {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
+            ]
+        }
+        self.reporter._convert_biovalidator_validation_to_spreadsheet()
+
+        assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [
+            {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+            {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+            {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+            {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
+        ]