diff --git a/cli/jinja_templates/metadata_validation.html b/cli/jinja_templates/metadata_validation.html index 1dffd44..da5c284 100644 --- a/cli/jinja_templates/metadata_validation.html +++ b/cli/jinja_templates/metadata_validation.html @@ -1,8 +1,8 @@ {% macro metadata_validation_report(validation_results) -%} {% set results = validation_results.get('metadata_check', {}) %} - {% set json_errors = results.get('json_errors', []) %} - {% if json_errors %} + {% set spreadsheet_errors = results.get('spreadsheet_errors', []) %} + {% if spreadsheet_errors %} {% set icon = "❌" %} {% set row_class = "report-section fail collapsible" %} {% else %} @@ -10,16 +10,19 @@ {% set row_class = "report-section pass" %} {% endif %}
{{ icon }} Metadata validation check
- {% if json_errors %} + {% if spreadsheet_errors %}
-
Full report: {{ results.get('report_path', '') }}
+
Full report: {{ results.get('spreadsheet_report_path', '') }}
- + - {% for error in json_errors %} + {% for error in spreadsheet_errors %} - + + + + {% endfor %}
PropertyErrorSheetRowColumnDescription
{{ error.get('property') }} {{ error.get('description') }}{{ error.get('sheet') }}{{ error.get('row') }}{{ error.get('column') }} {{ error.get('description') }}
diff --git a/cli/reporter.py b/cli/reporter.py index 368526e..2214e77 100755 --- a/cli/reporter.py +++ b/cli/reporter.py @@ -6,9 +6,13 @@ import yaml +from cli import ETC_DIR from cli.report import generate_html_report +from ebi_eva_common_pyutils.logger import logging_config +logger = logging_config.get_logger(__name__) + def resolve_single_file_path(file_path): files = glob.glob(file_path) if len(files) == 0: @@ -123,7 +127,9 @@ def _collect_validation_workflow_results(self, ): self._collect_vcf_check_results() self._collect_assembly_check_results() self._load_sample_check_results() - self._parse_metadata_validation_results() + self._parse_biovalidator_validation_results() + self._convert_biovalidator_validation_to_spreadsheet() + self._write_spreadsheet_validation_results() def _collect_vcf_check_results(self,): # detect output files for vcf check @@ -196,7 +202,7 @@ def _load_sample_check_results(self): self.results['sample_check'] = yaml.safe_load(open_yaml) self.results['sample_check']['report_path'] = sample_check_yaml - def _parse_metadata_validation_results(self): + def _parse_biovalidator_validation_results(self): """ Read the biovalidator's report and extract the list of validation errors """ @@ -226,10 +232,84 @@ def clean_read(ifile): break # EOF errors.append({'property': line, 'description': line2}) self.results['metadata_check'] = { - 'report_path': metadata_check_file, + 'json_report_path': metadata_check_file, 'json_errors': errors } + def _parse_metadata_property(self, property_str): + if property_str.startswith('.'): + return property_str.strip('.'), None, None + match = re.match(r'/(\w+)(/(\d+))?(\.(\w+))?', property_str) + if match: + return match.group(1), match.group(3), match.group(5) + else: + logger.error(f'Cannot parse {property_str} in JSON metadata error') + return None, None, None + + def _convert_biovalidator_validation_to_spreadsheet(self): + config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml") + with open(config_file) as open_file: + xls2json_conf = yaml.safe_load(open_file) + + self.results['metadata_check']['spreadsheet_errors'] = [] + for error in self.results['metadata_check']['json_errors']: + sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property']) + sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf) + row = self._convert_metadata_row(sheet, row_json, xls2json_conf) + column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf) + if row_json is None and attribute_json is None: + new_description = f'Sheet "{sheet}" is missing' + elif row_json is None: + new_description = f'In sheet "{sheet}", column "{column}" is not populated' + elif attribute_json and column: + new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated' + else: + new_description = error["description"].replace(sheet_json, sheet) + if column is None: + # We do not know this attribute. It's most likely about bioSampleObject + continue + if 'schema' in new_description: + # This is an error specific to json schema + continue + self.results['metadata_check']['spreadsheet_errors'].append({ + 'sheet': sheet, 'row': row, 'column': column, + 'description': new_description + }) + + def _write_spreadsheet_validation_results(self): + if 'spreadsheet_errors' in self.results['metadata_check']: + spreadsheet_report_file = os.path.join(os.path.dirname(self.results['metadata_check']['json_report_path']), + 'metadata_spreadsheet_validation.txt') + with open(spreadsheet_report_file, 'w') as open_file: + for error_dict in self.results['metadata_check']['spreadsheet_errors']: + open_file.write(error_dict.get('description') + '\n') + self.results['metadata_check']['spreadsheet_report_path'] = spreadsheet_report_file + + def _convert_metadata_sheet(self, json_attribute, xls2json_conf): + if json_attribute is None: + return None + for sheet_name in xls2json_conf['worksheets']: + if xls2json_conf['worksheets'][sheet_name] == json_attribute: + return sheet_name + + def _convert_metadata_row(self, sheet, json_row, xls2json_conf): + if json_row is None: + return '' + if 'header_row' in xls2json_conf[sheet]: + return int(json_row) + xls2json_conf[sheet]['header_row'] + else: + return int(json_row) + 2 + + def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf): + if json_attribute is None: + return '' + attributes_dict = {} + attributes_dict.update(xls2json_conf[sheet].get('required', {})) + attributes_dict.update(xls2json_conf[sheet].get('optional', {})) + for attribute in attributes_dict: + if attributes_dict[attribute] == json_attribute: + return attribute + def create_reports(self): report_html = generate_html_report(self.results, self.validation_date, self.project_title) file_path = 'report.html' diff --git a/tests/resources/validation_reports/expected_report.html b/tests/resources/validation_reports/expected_report.html index d901fe3..901d111 100644 --- a/tests/resources/validation_reports/expected_report.html +++ b/tests/resources/validation_reports/expected_report.html @@ -17,4 +17,4 @@ th { background-color: lightgrey; } .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } - .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/report
PropertyError
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file + .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
ProjectProject TitleIn sheet "Project", column "Project Title" is not populated
ProjectDescriptionIn sheet "Project", column "Description" is not populated
ProjectTax IDIn sheet "Project", column "Tax ID" is not populated
ProjectCenterIn sheet "Project", column "Center" is not populated
Analysis2Analysis TitleIn sheet "Analysis", row "2", column "Analysis Title" is not populated
Analysis2DescriptionIn sheet "Analysis", row "2", column "Description" is not populated
Analysis2Experiment TypeIn sheet "Analysis", row "2", column "Experiment Type" is not populated
Analysis2ReferenceIn sheet "Analysis", row "2", column "Reference" is not populated
Sample3Sample AccessionIn sheet "Sample", row "3", column "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file diff --git a/tests/test_report.py b/tests/test_report.py index be24c66..0d99755 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -4,6 +4,8 @@ from unittest import TestCase from cli.report import generate_html_report +from cli.reporter import Reporter + validation_results = { "assembly_check": { "input_passed.vcf": { @@ -80,11 +82,26 @@ {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ], - 'report_path': '/path/to/metadata/report' + 'json_report_path': '/path/to/metadata/report', + 'spreadsheet_errors': [ + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} + ], + 'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt', } } + + class TestReport(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html') diff --git a/tests/test_reporter.py b/tests/test_reporter.py index fc47322..6878549 100644 --- a/tests/test_reporter.py +++ b/tests/test_reporter.py @@ -45,13 +45,25 @@ def test__collect_validation_workflow_results(self): {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + ], + 'spreadsheet_errors': [ + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': '', 'column': 'Project Title', + 'description': 'In sheet "Project", column "Project Title" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', + 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', + 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', + 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}, ] } } - self.reporter._collect_validation_workflow_results() + self.reporter._collect_validation_workflow_results() # Drop report paths from comparison (test will fail if missing) - del self.reporter.results['metadata_check']['report_path'] + del self.reporter.results['metadata_check']['json_report_path'] + del self.reporter.results['metadata_check']['spreadsheet_report_path'] del self.reporter.results['sample_check']['report_path'] for file in self.reporter.results['vcf_check'].values(): del file['report_path'] @@ -75,8 +87,8 @@ def test_vcf_check_errors_is_critical(self): for i, error in enumerate(errors): assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i] - def test_parse_metadata_validation_results(self): - self.reporter._parse_metadata_validation_results() + def test_parse_biovalidator_validation_results(self): + self.reporter._parse_biovalidator_validation_results() assert self.reporter.results['metadata_check']['json_errors'] == [ {'property': '.files', 'description': "should have required property 'files'"}, {'property': '/project.title', 'description': "should have required property 'title'"}, @@ -87,3 +99,28 @@ def test_parse_metadata_validation_results(self): {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ] + def test_convert_biovalidator_validation_to_spreadsheet(self): + self.reporter.results['metadata_check'] = { + 'json_errors': [ + {'property': '.files', 'description': "should have required property 'files'"}, + {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/analysis/0.description', + 'description': "should have required property 'description'"}, + {'property': '/analysis/0.referenceGenome', + 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0.bioSampleAccession', + 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0.bioSampleObject', + 'description': "should have required property 'bioSampleObject'"}, + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + ] + } + self.reporter._convert_biovalidator_validation_to_spreadsheet() + + assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [ + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} + ]