Skip to content

Commit

Permalink
Merge pull request #10 from tcezard/EVA-3226
Browse files Browse the repository at this point in the history
EVA-3226 - Map errors to Spreadsheet location
  • Loading branch information
tcezard authored Sep 11, 2023
2 parents 550c293 + a74b04f commit 5505724
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 16 deletions.
17 changes: 10 additions & 7 deletions cli/jinja_templates/metadata_validation.html
Original file line number Diff line number Diff line change
@@ -1,25 +1,28 @@

{% macro metadata_validation_report(validation_results) -%}
{% set results = validation_results.get('metadata_check', {}) %}
{% set json_errors = results.get('json_errors', []) %}
{% if json_errors %}
{% set spreadsheet_errors = results.get('spreadsheet_errors', []) %}
{% if spreadsheet_errors %}
{% set icon = "❌" %}
{% set row_class = "report-section fail collapsible" %}
{% else %}
{% set icon = "✔" %}
{% set row_class = "report-section pass" %}
{% endif %}
<div class='{{ row_class }}'>{{ icon }} Metadata validation check </div>
{% if json_errors %}
{% if spreadsheet_errors %}
<div class="error-list">
<div class="error-description"><strong>Full report:</strong> {{ results.get('report_path', '') }}</div>
<div class="error-description"><strong>Full report:</strong> {{ results.get('spreadsheet_report_path', '') }}</div>
<table>
<tr>
<th>Property</th><th>Error</th>
<th>Sheet</th><th>Row</th><th>Column</th><th>Description</th>
</tr>
{% for error in json_errors %}
{% for error in spreadsheet_errors %}
<tr>
<td><strong>{{ error.get('property') }}</strong></td><td> {{ error.get('description') }}</td>
<td><strong>{{ error.get('sheet') }}</strong></td>
<td><strong>{{ error.get('row') }}</strong></td>
<td><strong>{{ error.get('column') }}</strong></td>
<td> {{ error.get('description') }}</td>
</tr>
{% endfor %}
</table>
Expand Down
86 changes: 83 additions & 3 deletions cli/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@

import yaml

from cli import ETC_DIR
from cli.report import generate_html_report
from ebi_eva_common_pyutils.logger import logging_config


logger = logging_config.get_logger(__name__)

def resolve_single_file_path(file_path):
files = glob.glob(file_path)
if len(files) == 0:
Expand Down Expand Up @@ -123,7 +127,9 @@ def _collect_validation_workflow_results(self, ):
self._collect_vcf_check_results()
self._collect_assembly_check_results()
self._load_sample_check_results()
self._parse_metadata_validation_results()
self._parse_biovalidator_validation_results()
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()

def _collect_vcf_check_results(self,):
# detect output files for vcf check
Expand Down Expand Up @@ -196,7 +202,7 @@ def _load_sample_check_results(self):
self.results['sample_check'] = yaml.safe_load(open_yaml)
self.results['sample_check']['report_path'] = sample_check_yaml

def _parse_metadata_validation_results(self):
def _parse_biovalidator_validation_results(self):
"""
Read the biovalidator's report and extract the list of validation errors
"""
Expand Down Expand Up @@ -226,10 +232,84 @@ def clean_read(ifile):
break # EOF
errors.append({'property': line, 'description': line2})
self.results['metadata_check'] = {
'report_path': metadata_check_file,
'json_report_path': metadata_check_file,
'json_errors': errors
}

def _parse_metadata_property(self, property_str):
if property_str.startswith('.'):
return property_str.strip('.'), None, None
match = re.match(r'/(\w+)(/(\d+))?(\.(\w+))?', property_str)
if match:
return match.group(1), match.group(3), match.group(5)
else:
logger.error(f'Cannot parse {property_str} in JSON metadata error')
return None, None, None

def _convert_biovalidator_validation_to_spreadsheet(self):
config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
with open(config_file) as open_file:
xls2json_conf = yaml.safe_load(open_file)

self.results['metadata_check']['spreadsheet_errors'] = []
for error in self.results['metadata_check']['json_errors']:
sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property'])
sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf)
row = self._convert_metadata_row(sheet, row_json, xls2json_conf)
column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
if row_json is None and attribute_json is None:
new_description = f'Sheet "{sheet}" is missing'
elif row_json is None:
new_description = f'In sheet "{sheet}", column "{column}" is not populated'
elif attribute_json and column:
new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
else:
new_description = error["description"].replace(sheet_json, sheet)
if column is None:
# We do not know this attribute. It's most likely about bioSampleObject
continue
if 'schema' in new_description:
# This is an error specific to json schema
continue
self.results['metadata_check']['spreadsheet_errors'].append({
'sheet': sheet, 'row': row, 'column': column,
'description': new_description
})

def _write_spreadsheet_validation_results(self):
if 'spreadsheet_errors' in self.results['metadata_check']:
spreadsheet_report_file = os.path.join(os.path.dirname(self.results['metadata_check']['json_report_path']),
'metadata_spreadsheet_validation.txt')
with open(spreadsheet_report_file, 'w') as open_file:
for error_dict in self.results['metadata_check']['spreadsheet_errors']:
open_file.write(error_dict.get('description') + '\n')
self.results['metadata_check']['spreadsheet_report_path'] = spreadsheet_report_file

def _convert_metadata_sheet(self, json_attribute, xls2json_conf):
if json_attribute is None:
return None
for sheet_name in xls2json_conf['worksheets']:
if xls2json_conf['worksheets'][sheet_name] == json_attribute:
return sheet_name

def _convert_metadata_row(self, sheet, json_row, xls2json_conf):
if json_row is None:
return ''
if 'header_row' in xls2json_conf[sheet]:
return int(json_row) + xls2json_conf[sheet]['header_row']
else:
return int(json_row) + 2

def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf):
if json_attribute is None:
return ''
attributes_dict = {}
attributes_dict.update(xls2json_conf[sheet].get('required', {}))
attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
for attribute in attributes_dict:
if attributes_dict[attribute] == json_attribute:
return attribute

def create_reports(self):
report_html = generate_html_report(self.results, self.validation_date, self.project_title)
file_path = 'report.html'
Expand Down
2 changes: 1 addition & 1 deletion tests/resources/validation_reports/expected_report.html

Large diffs are not rendered by default.

19 changes: 18 additions & 1 deletion tests/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from unittest import TestCase

from cli.report import generate_html_report
from cli.reporter import Reporter

validation_results = {
"assembly_check": {
"input_passed.vcf": {
Expand Down Expand Up @@ -80,11 +82,26 @@
{'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
],
'report_path': '/path/to/metadata/report'
'json_report_path': '/path/to/metadata/report',
'spreadsheet_errors': [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
],
'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
}
}




class TestReport(TestCase):
resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html')
Expand Down
45 changes: 41 additions & 4 deletions tests/test_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,25 @@ def test__collect_validation_workflow_results(self):
{'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
{'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
],
'spreadsheet_errors': [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title',
'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description',
'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
]
}
}
self.reporter._collect_validation_workflow_results()

self.reporter._collect_validation_workflow_results()
# Drop report paths from comparison (test will fail if missing)
del self.reporter.results['metadata_check']['report_path']
del self.reporter.results['metadata_check']['json_report_path']
del self.reporter.results['metadata_check']['spreadsheet_report_path']
del self.reporter.results['sample_check']['report_path']
for file in self.reporter.results['vcf_check'].values():
del file['report_path']
Expand All @@ -75,8 +87,8 @@ def test_vcf_check_errors_is_critical(self):
for i, error in enumerate(errors):
assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i]

def test_parse_metadata_validation_results(self):
self.reporter._parse_metadata_validation_results()
def test_parse_biovalidator_validation_results(self):
self.reporter._parse_biovalidator_validation_results()
assert self.reporter.results['metadata_check']['json_errors'] == [
{'property': '.files', 'description': "should have required property 'files'"},
{'property': '/project.title', 'description': "should have required property 'title'"},
Expand All @@ -87,3 +99,28 @@ def test_parse_metadata_validation_results(self):
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
]

def test_convert_biovalidator_validation_to_spreadsheet(self):
self.reporter.results['metadata_check'] = {
'json_errors': [
{'property': '.files', 'description': "should have required property 'files'"},
{'property': '/project.title', 'description': "should have required property 'title'"},
{'property': '/analysis/0.description',
'description': "should have required property 'description'"},
{'property': '/analysis/0.referenceGenome',
'description': "should have required property 'referenceGenome'"},
{'property': '/sample/0.bioSampleAccession',
'description': "should have required property 'bioSampleAccession'"},
{'property': '/sample/0.bioSampleObject',
'description': "should have required property 'bioSampleObject'"},
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
]
}
self.reporter._convert_biovalidator_validation_to_spreadsheet()

assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
]

0 comments on commit 5505724

Please sign in to comment.