diff --git a/cli/jinja_templates/metadata_validation.html b/cli/jinja_templates/metadata_validation.html
index 1dffd44..da5c284 100644
--- a/cli/jinja_templates/metadata_validation.html
+++ b/cli/jinja_templates/metadata_validation.html
@@ -1,8 +1,8 @@
{% macro metadata_validation_report(validation_results) -%}
{% set results = validation_results.get('metadata_check', {}) %}
- {% set json_errors = results.get('json_errors', []) %}
- {% if json_errors %}
+ {% set spreadsheet_errors = results.get('spreadsheet_errors', []) %}
+ {% if spreadsheet_errors %}
{% set icon = "❌" %}
{% set row_class = "report-section fail collapsible" %}
{% else %}
@@ -10,16 +10,19 @@
{% set row_class = "report-section pass" %}
{% endif %}
-
Full report: {{ results.get('report_path', '') }}
+
Full report: {{ results.get('spreadsheet_report_path', '') }}
- Property | Error |
+ Sheet | Row | Column | Description |
- {% for error in json_errors %}
+ {% for error in spreadsheet_errors %}
- {{ error.get('property') }} | {{ error.get('description') }} |
+ {{ error.get('sheet') }} |
+ {{ error.get('row') }} |
+ {{ error.get('column') }} |
+ {{ error.get('description') }} |
{% endfor %}
diff --git a/cli/reporter.py b/cli/reporter.py
index 368526e..2214e77 100755
--- a/cli/reporter.py
+++ b/cli/reporter.py
@@ -6,9 +6,13 @@
import yaml
+from cli import ETC_DIR
from cli.report import generate_html_report
+from ebi_eva_common_pyutils.logger import logging_config
+logger = logging_config.get_logger(__name__)
+
def resolve_single_file_path(file_path):
files = glob.glob(file_path)
if len(files) == 0:
@@ -123,7 +127,9 @@ def _collect_validation_workflow_results(self, ):
self._collect_vcf_check_results()
self._collect_assembly_check_results()
self._load_sample_check_results()
- self._parse_metadata_validation_results()
+ self._parse_biovalidator_validation_results()
+ self._convert_biovalidator_validation_to_spreadsheet()
+ self._write_spreadsheet_validation_results()
def _collect_vcf_check_results(self,):
# detect output files for vcf check
@@ -196,7 +202,7 @@ def _load_sample_check_results(self):
self.results['sample_check'] = yaml.safe_load(open_yaml)
self.results['sample_check']['report_path'] = sample_check_yaml
- def _parse_metadata_validation_results(self):
+ def _parse_biovalidator_validation_results(self):
"""
Read the biovalidator's report and extract the list of validation errors
"""
@@ -226,10 +232,84 @@ def clean_read(ifile):
break # EOF
errors.append({'property': line, 'description': line2})
self.results['metadata_check'] = {
- 'report_path': metadata_check_file,
+ 'json_report_path': metadata_check_file,
'json_errors': errors
}
+ def _parse_metadata_property(self, property_str):
+ if property_str.startswith('.'):
+ return property_str.strip('.'), None, None
+ match = re.match(r'/(\w+)(/(\d+))?(\.(\w+))?', property_str)
+ if match:
+ return match.group(1), match.group(3), match.group(5)
+ else:
+ logger.error(f'Cannot parse {property_str} in JSON metadata error')
+ return None, None, None
+
+ def _convert_biovalidator_validation_to_spreadsheet(self):
+ config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
+ with open(config_file) as open_file:
+ xls2json_conf = yaml.safe_load(open_file)
+
+ self.results['metadata_check']['spreadsheet_errors'] = []
+ for error in self.results['metadata_check']['json_errors']:
+ sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property'])
+ sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf)
+ row = self._convert_metadata_row(sheet, row_json, xls2json_conf)
+ column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
+ if row_json is None and attribute_json is None:
+ new_description = f'Sheet "{sheet}" is missing'
+ elif row_json is None:
+ new_description = f'In sheet "{sheet}", column "{column}" is not populated'
+ elif attribute_json and column:
+ new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
+ else:
+ new_description = error["description"].replace(sheet_json, sheet)
+ if column is None:
+ # We do not know this attribute. It's most likely about bioSampleObject
+ continue
+ if 'schema' in new_description:
+ # This is an error specific to json schema
+ continue
+ self.results['metadata_check']['spreadsheet_errors'].append({
+ 'sheet': sheet, 'row': row, 'column': column,
+ 'description': new_description
+ })
+
+ def _write_spreadsheet_validation_results(self):
+ if 'spreadsheet_errors' in self.results['metadata_check']:
+ spreadsheet_report_file = os.path.join(os.path.dirname(self.results['metadata_check']['json_report_path']),
+ 'metadata_spreadsheet_validation.txt')
+ with open(spreadsheet_report_file, 'w') as open_file:
+ for error_dict in self.results['metadata_check']['spreadsheet_errors']:
+ open_file.write(error_dict.get('description') + '\n')
+ self.results['metadata_check']['spreadsheet_report_path'] = spreadsheet_report_file
+
+ def _convert_metadata_sheet(self, json_attribute, xls2json_conf):
+ if json_attribute is None:
+ return None
+ for sheet_name in xls2json_conf['worksheets']:
+ if xls2json_conf['worksheets'][sheet_name] == json_attribute:
+ return sheet_name
+
+ def _convert_metadata_row(self, sheet, json_row, xls2json_conf):
+ if json_row is None:
+ return ''
+ if 'header_row' in xls2json_conf[sheet]:
+ return int(json_row) + xls2json_conf[sheet]['header_row']
+ else:
+ return int(json_row) + 2
+
+ def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf):
+ if json_attribute is None:
+ return ''
+ attributes_dict = {}
+ attributes_dict.update(xls2json_conf[sheet].get('required', {}))
+ attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
+ for attribute in attributes_dict:
+ if attributes_dict[attribute] == json_attribute:
+ return attribute
+
def create_reports(self):
report_html = generate_html_report(self.results, self.validation_date, self.project_title)
file_path = 'report.html'
diff --git a/tests/resources/validation_reports/expected_report.html b/tests/resources/validation_reports/expected_report.html
index d901fe3..901d111 100644
--- a/tests/resources/validation_reports/expected_report.html
+++ b/tests/resources/validation_reports/expected_report.html
@@ -17,4 +17,4 @@
th { background-color: lightgrey; }
.fail { background-color: #FFB6C1; }
.pass { background-color: #90EE90; }
- .error-list { display: none; }
Metadata validation results
Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the
EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/report
Property | Error |
---|
.files | should have required property 'files' |
/project.title | should have required property 'title' |
/project.description | should have required property 'description' |
/project.taxId | should have required property 'taxId' |
/project.centre | should have required property 'centre' |
/analysis/0.analysisTitle | should have required property 'analysisTitle' |
/analysis/0.description | should have required property 'description' |
/analysis/0.experimentType | should have required property 'experimentType' |
/analysis/0.referenceGenome | should have required property 'referenceGenome' |
/sample/0.bioSampleAccession | should have required property 'bioSampleAccession' |
/sample/0.bioSampleObject | should have required property 'bioSampleObject' |
/sample/0 | should match exactly one schema in oneOf |
VCF validation results
Checks whether each file is compliant with the
VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.
input_fail.vcf
❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
Category | Error |
---|
mismatch error | Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a' |
mismatch error | Chromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a' |
mismatch error | Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c' |
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
Category | Error |
---|
critical error | Line 4: Error in meta-data section. |
non-critical error | Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=.. |
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings
Sample name concordance check
Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
Category | Error |
---|
Samples described in the metadata but not in the VCF files | Sample1 |
Samples in the VCF files but not described in the metadata | 1Sample |
\ No newline at end of file
+ .error-list { display: none; }
Metadata validation results
Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the
EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
Sheet | Row | Column | Description |
---|
Files | | | Sheet "Files" is missing |
Project | | Project Title | In sheet "Project", column "Project Title" is not populated |
Project | | Description | In sheet "Project", column "Description" is not populated |
Project | | Tax ID | In sheet "Project", column "Tax ID" is not populated |
Project | | Center | In sheet "Project", column "Center" is not populated |
Analysis | 2 | Analysis Title | In sheet "Analysis", row "2", column "Analysis Title" is not populated |
Analysis | 2 | Description | In sheet "Analysis", row "2", column "Description" is not populated |
Analysis | 2 | Experiment Type | In sheet "Analysis", row "2", column "Experiment Type" is not populated |
Analysis | 2 | Reference | In sheet "Analysis", row "2", column "Reference" is not populated |
Sample | 3 | Sample Accession | In sheet "Sample", row "3", column "Sample Accession" is not populated |
VCF validation results
Checks whether each file is compliant with the
VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.
input_fail.vcf
❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
Category | Error |
---|
mismatch error | Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a' |
mismatch error | Chromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c' |
mismatch error | Chromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g' |
mismatch error | Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a' |
mismatch error | Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c' |
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
Category | Error |
---|
critical error | Line 4: Error in meta-data section. |
non-critical error | Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=.. |
input_passed.vcf
✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings
Sample name concordance check
Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
Category | Error |
---|
Samples described in the metadata but not in the VCF files | Sample1 |
Samples in the VCF files but not described in the metadata | 1Sample |
\ No newline at end of file
diff --git a/tests/test_report.py b/tests/test_report.py
index be24c66..0d99755 100644
--- a/tests/test_report.py
+++ b/tests/test_report.py
@@ -4,6 +4,8 @@
from unittest import TestCase
from cli.report import generate_html_report
+from cli.reporter import Reporter
+
validation_results = {
"assembly_check": {
"input_passed.vcf": {
@@ -80,11 +82,26 @@
{'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
],
- 'report_path': '/path/to/metadata/report'
+ 'json_report_path': '/path/to/metadata/report',
+ 'spreadsheet_errors': [
+ {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+ {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
+ {'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
+ {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
+ {'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+ {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
+ ],
+ 'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
}
}
+
+
class TestReport(TestCase):
resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html')
diff --git a/tests/test_reporter.py b/tests/test_reporter.py
index fc47322..6878549 100644
--- a/tests/test_reporter.py
+++ b/tests/test_reporter.py
@@ -45,13 +45,25 @@ def test__collect_validation_workflow_results(self):
{'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
{'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
+ ],
+ 'spreadsheet_errors': [
+ {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+ {'sheet': 'Project', 'row': '', 'column': 'Project Title',
+ 'description': 'In sheet "Project", column "Project Title" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
+ 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
+ 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+ {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
+ 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
]
}
}
- self.reporter._collect_validation_workflow_results()
+ self.reporter._collect_validation_workflow_results()
# Drop report paths from comparison (test will fail if missing)
- del self.reporter.results['metadata_check']['report_path']
+ del self.reporter.results['metadata_check']['json_report_path']
+ del self.reporter.results['metadata_check']['spreadsheet_report_path']
del self.reporter.results['sample_check']['report_path']
for file in self.reporter.results['vcf_check'].values():
del file['report_path']
@@ -75,8 +87,8 @@ def test_vcf_check_errors_is_critical(self):
for i, error in enumerate(errors):
assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i]
- def test_parse_metadata_validation_results(self):
- self.reporter._parse_metadata_validation_results()
+ def test_parse_biovalidator_validation_results(self):
+ self.reporter._parse_biovalidator_validation_results()
assert self.reporter.results['metadata_check']['json_errors'] == [
{'property': '.files', 'description': "should have required property 'files'"},
{'property': '/project.title', 'description': "should have required property 'title'"},
@@ -87,3 +99,28 @@ def test_parse_metadata_validation_results(self):
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
]
+ def test_convert_biovalidator_validation_to_spreadsheet(self):
+ self.reporter.results['metadata_check'] = {
+ 'json_errors': [
+ {'property': '.files', 'description': "should have required property 'files'"},
+ {'property': '/project.title', 'description': "should have required property 'title'"},
+ {'property': '/analysis/0.description',
+ 'description': "should have required property 'description'"},
+ {'property': '/analysis/0.referenceGenome',
+ 'description': "should have required property 'referenceGenome'"},
+ {'property': '/sample/0.bioSampleAccession',
+ 'description': "should have required property 'bioSampleAccession'"},
+ {'property': '/sample/0.bioSampleObject',
+ 'description': "should have required property 'bioSampleObject'"},
+ {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
+ ]
+ }
+ self.reporter._convert_biovalidator_validation_to_spreadsheet()
+
+ assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [
+ {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
+ {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
+ {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
+ {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
+ ]