From 705ebc5bed4781b873d937510acf91754935d653 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 7 Sep 2023 14:50:06 +0100 Subject: [PATCH 1/4] Map errors to Spreadsheet location Adapt report to use the spreadsheet location --- cli/jinja_templates/metadata_validation.html | 17 ++-- cli/reporter.py | 79 ++++++++++++++++++- .../validation_reports/expected_report.html | 2 +- tests/test_report.py | 7 ++ tests/test_reporter.py | 28 +++++++ 5 files changed, 124 insertions(+), 9 deletions(-) diff --git a/cli/jinja_templates/metadata_validation.html b/cli/jinja_templates/metadata_validation.html index 1dffd44..da5c284 100644 --- a/cli/jinja_templates/metadata_validation.html +++ b/cli/jinja_templates/metadata_validation.html @@ -1,8 +1,8 @@ {% macro metadata_validation_report(validation_results) -%} {% set results = validation_results.get('metadata_check', {}) %} - {% set json_errors = results.get('json_errors', []) %} - {% if json_errors %} + {% set spreadsheet_errors = results.get('spreadsheet_errors', []) %} + {% if spreadsheet_errors %} {% set icon = "❌" %} {% set row_class = "report-section fail collapsible" %} {% else %} @@ -10,16 +10,19 @@ {% set row_class = "report-section pass" %} {% endif %}
{{ icon }} Metadata validation check
- {% if json_errors %} + {% if spreadsheet_errors %}
-
Full report: {{ results.get('report_path', '') }}
+
Full report: {{ results.get('spreadsheet_report_path', '') }}
- + - {% for error in json_errors %} + {% for error in spreadsheet_errors %} - + + + + {% endfor %}
PropertyErrorSheetRowColumnDescription
{{ error.get('property') }} {{ error.get('description') }}{{ error.get('sheet') }}{{ error.get('row') }}{{ error.get('column') }} {{ error.get('description') }}
diff --git a/cli/reporter.py b/cli/reporter.py index 368526e..70fe5f7 100755 --- a/cli/reporter.py +++ b/cli/reporter.py @@ -6,9 +6,13 @@ import yaml +from cli import ETC_DIR from cli.report import generate_html_report +from ebi_eva_common_pyutils.logger import logging_config +logger = logging_config.get_logger(__name__) + def resolve_single_file_path(file_path): files = glob.glob(file_path) if len(files) == 0: @@ -226,9 +230,82 @@ def clean_read(ifile): break # EOF errors.append({'property': line, 'description': line2}) self.results['metadata_check'] = { - 'report_path': metadata_check_file, + 'json_report_path': metadata_check_file, 'json_errors': errors } + self.convert_metadata_validation_results() + self.write_converted_metadata_reslts() + + + + def _parse_metadata_property(self, property_str): + if property_str.startswith('.'): + return property_str.strip('.'), None, None + match = re.match(r'/(\w+)(/(\d+))?(\.(\w+))?', property_str) + if match: + return match.group(1), match.group(3), match.group(5) + else: + logger.error(f'Cannot parse {property_str} in JSON metadata error') + return None, None, None + + def convert_metadata_validation_results(self): + config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml") + with open(config_file) as open_file: + xls2json_conf = yaml.safe_load(open_file) + + self.results['metadata_check']['spreadsheet_errors'] = [] + for error in self.results['metadata_check']['json_errors']: + sheet_json, row_json, attribute_json = self._parse_metadata_property(error['property']) + sheet = self._convert_metadata_sheet(sheet_json, xls2json_conf) + row = self._convert_metadata_row(sheet, row_json, xls2json_conf) + column = self._convert_metadata_attribute(sheet, attribute_json, xls2json_conf) + if row_json is None and attribute_json is None: + new_description = f'Sheet "{sheet}" is missing' + elif row_json is None: + new_description = f'In sheet "{sheet}", column "{column}" is not populated' + elif attribute_json and column: + 
new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated' + else: + new_description = error["description"].replace(sheet_json, sheet) + self.results['metadata_check']['spreadsheet_errors'].append({ + 'sheet': sheet, 'row': row, 'column': column, + 'description': new_description + }) + + def write_converted_metadata_reslts(self): + if 'spreadsheet_errors' in self.results['metadata_check']: + spreadsheet_report_file = os.path.join(os.path.dirname(self.results['metadata_check']['json_report_path']), + 'metadata_spreadsheet_validation.txt') + with open(spreadsheet_report_file, 'w') as open_file: + for error_dict in self.results['metadata_check']['spreadsheet_errors']: + open_file.write(error_dict.get('description') + '\n') + self.results['metadata_check']['spreadsheet_report_path'] = spreadsheet_report_file + + def _convert_metadata_sheet(self, json_attribute, xls2json_conf): + if json_attribute is None: + return None + for sheet_name in xls2json_conf['worksheets']: + if xls2json_conf['worksheets'][sheet_name] == json_attribute: + return sheet_name + + def _convert_metadata_row(self, sheet, json_row, xls2json_conf): + if json_row is None: + # This is for Sheet that can only have a single entry (Project) + json_row = 0 + if 'header_row' in xls2json_conf[sheet]: + return int(json_row) + xls2json_conf[sheet]['header_row'] + else: + return int(json_row) + 2 + + def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf): + if json_attribute is None: + return None + attributes_dict = {} + attributes_dict.update(xls2json_conf[sheet].get('required', {})) + attributes_dict.update(xls2json_conf[sheet].get('optional', {})) + for attribute in attributes_dict: + if attributes_dict[attribute] == json_attribute: + return attribute def create_reports(self): report_html = generate_html_report(self.results, self.validation_date, self.project_title) diff --git a/tests/resources/validation_reports/expected_report.html 
b/tests/resources/validation_reports/expected_report.html index d901fe3..4e7b357 100644 --- a/tests/resources/validation_reports/expected_report.html +++ b/tests/resources/validation_reports/expected_report.html @@ -17,4 +17,4 @@ th { background-color: lightgrey; } .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } - .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/report
PropertyError
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file + .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report:
SheetRowColumnDescription
Files2NoneSheet "Files" is missing
Project2Project TitleIn sheet "Project", column "Project Title" is not populated
Project2DescriptionIn sheet "Project", column "Description" is not populated
Project2Tax IDIn sheet "Project", column "Tax ID" is not populated
Project2CenterIn sheet "Project", column "Center" is not populated
Analysis2Analysis TitleIn sheet "Analysis", row "2", column "Analysis Title" is not populated
Analysis2DescriptionIn sheet "Analysis", row "2", column "Description" is not populated
Analysis2Experiment TypeIn sheet "Analysis", row "2", column "Experiment Type" is not populated
Analysis2ReferenceIn sheet "Analysis", row "2", column "Reference" is not populated
Sample3Sample AccessionIn sheet "Sample", row "3", column "Sample Accession" is not populated
Sample3Noneshould have required property 'bioSampleObject'
Sample3Noneshould match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file diff --git a/tests/test_report.py b/tests/test_report.py index be24c66..e589144 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -4,6 +4,8 @@ from unittest import TestCase from cli.report import generate_html_report +from cli.reporter import Reporter + validation_results = { "assembly_check": { "input_passed.vcf": { @@ -90,6 +92,11 @@ class TestReport(TestCase): expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html') def test_generate_html_report(self): + reporter = Reporter(['input_passed.vcf'], '') + global validation_results + reporter.results = validation_results + reporter.convert_metadata_validation_results() + validation_results = reporter.results report = generate_html_report(validation_results, datetime.datetime(2023, 8, 31, 12, 34, 56), "My cool project") with open(self.expected_report) as open_html: assert report == open_html.read() diff --git a/tests/test_reporter.py b/tests/test_reporter.py index fc47322..3097f26 100644 --- a/tests/test_reporter.py +++ b/tests/test_reporter.py @@ -87,3 +87,31 @@ def test_parse_metadata_validation_results(self): {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ] + def test_convert_metadata_validation_results(self): + self.reporter.results['metadata_check'] = { + 'json_errors': [ + {'property': '.files', 'description': "should have required property 'files'"}, + {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/analysis/0.description', + 'description': "should have required property 'description'"}, + {'property': '/analysis/0.referenceGenome', + 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0.bioSampleAccession', + 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0.bioSampleObject', + 'description': "should have required property 'bioSampleObject'"}, + {'property': 
'/sample/0', 'description': 'should match exactly one schema in oneOf'} + ] + } + self.reporter.convert_metadata_validation_results() + for error in self.reporter.results['metadata_check']['spreadsheet_errors']: + print(error) + + self.reporter.results['metadata_check']['spreadsheet_errors'] = { + "required sheet 'Files' is missing", + "In Sheet 'Project', required column 'Project Title' is missing.", + "In Sheet 'Analysis', in row number 3 required column 'Description' is missing.", + "In Sheet 'Analysis', in row number 3 required column 'Reference' is missing.", + + } + From fbf3bc30485249437840ff4da3254dc32e942ef6 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 7 Sep 2023 15:22:39 +0100 Subject: [PATCH 2/4] Fix test --- tests/test_reporter.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/test_reporter.py b/tests/test_reporter.py index 3097f26..a481109 100644 --- a/tests/test_reporter.py +++ b/tests/test_reporter.py @@ -45,13 +45,30 @@ def test__collect_validation_workflow_results(self): {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + ], + 'spreadsheet_errors': [ + {'sheet': 'Files', 'row': 2, 'column': None, 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': 2, 'column': 'Project Title', + 'description': 'In sheet "Project", column "Project Title" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', + 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', + 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', + 'description': 
'In sheet "Sample", row "3", column "Sample Accession" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': None, + 'description': "should have required property 'bioSampleObject'"}, + {'sheet': 'Sample', 'row': 3, 'column': None, + 'description': 'should match exactly one schema in oneOf'} ] } } - self.reporter._collect_validation_workflow_results() + self.reporter._collect_validation_workflow_results() + print(self.reporter.results) # Drop report paths from comparison (test will fail if missing) - del self.reporter.results['metadata_check']['report_path'] + del self.reporter.results['metadata_check']['json_report_path'] + del self.reporter.results['metadata_check']['spreadsheet_report_path'] del self.reporter.results['sample_check']['report_path'] for file in self.reporter.results['vcf_check'].values(): del file['report_path'] From 1806cebe6b4fb50de78e0ee290a18943a57bfb16 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 7 Sep 2023 15:50:37 +0100 Subject: [PATCH 3/4] Remove entries that have no relevance to spreadsheet --- cli/reporter.py | 11 ++++++++--- .../resources/validation_reports/expected_report.html | 2 +- tests/test_report.py | 6 +++--- tests/test_reporter.py | 8 ++------ 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cli/reporter.py b/cli/reporter.py index 70fe5f7..b49026c 100755 --- a/cli/reporter.py +++ b/cli/reporter.py @@ -267,6 +267,12 @@ def convert_metadata_validation_results(self): new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated' else: new_description = error["description"].replace(sheet_json, sheet) + if column is None: + # We do not know this attribute. 
It's most likely about bioSampleObject + continue + if 'schema' in new_description: + # This is an error specific to json schema + continue self.results['metadata_check']['spreadsheet_errors'].append({ 'sheet': sheet, 'row': row, 'column': column, 'description': new_description @@ -290,8 +296,7 @@ def _convert_metadata_sheet(self, json_attribute, xls2json_conf): def _convert_metadata_row(self, sheet, json_row, xls2json_conf): if json_row is None: - # This is for Sheet that can only have a single entry (Project) - json_row = 0 + return '' if 'header_row' in xls2json_conf[sheet]: return int(json_row) + xls2json_conf[sheet]['header_row'] else: @@ -299,7 +304,7 @@ def _convert_metadata_row(self, sheet, json_row, xls2json_conf): def _convert_metadata_attribute(self, sheet, json_attribute, xls2json_conf): if json_attribute is None: - return None + return '' attributes_dict = {} attributes_dict.update(xls2json_conf[sheet].get('required', {})) attributes_dict.update(xls2json_conf[sheet].get('optional', {})) diff --git a/tests/resources/validation_reports/expected_report.html b/tests/resources/validation_reports/expected_report.html index 4e7b357..d069a3e 100644 --- a/tests/resources/validation_reports/expected_report.html +++ b/tests/resources/validation_reports/expected_report.html @@ -17,4 +17,4 @@ th { background-color: lightgrey; } .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } - .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report:
SheetRowColumnDescription
Files2NoneSheet "Files" is missing
Project2Project TitleIn sheet "Project", column "Project Title" is not populated
Project2DescriptionIn sheet "Project", column "Description" is not populated
Project2Tax IDIn sheet "Project", column "Tax ID" is not populated
Project2CenterIn sheet "Project", column "Center" is not populated
Analysis2Analysis TitleIn sheet "Analysis", row "2", column "Analysis Title" is not populated
Analysis2DescriptionIn sheet "Analysis", row "2", column "Description" is not populated
Analysis2Experiment TypeIn sheet "Analysis", row "2", column "Experiment Type" is not populated
Analysis2ReferenceIn sheet "Analysis", row "2", column "Reference" is not populated
Sample3Sample AccessionIn sheet "Sample", row "3", column "Sample Accession" is not populated
Sample3Noneshould have required property 'bioSampleObject'
Sample3Noneshould match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file + .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report:
SheetRowColumnDescription
FilesSheet "Files" is missing
ProjectProject TitleIn sheet "Project", column "Project Title" is not populated
ProjectDescriptionIn sheet "Project", column "Description" is not populated
ProjectTax IDIn sheet "Project", column "Tax ID" is not populated
ProjectCenterIn sheet "Project", column "Center" is not populated
Analysis2Analysis TitleIn sheet "Analysis", row "2", column "Analysis Title" is not populated
Analysis2DescriptionIn sheet "Analysis", row "2", column "Description" is not populated
Analysis2Experiment TypeIn sheet "Analysis", row "2", column "Experiment Type" is not populated
Analysis2ReferenceIn sheet "Analysis", row "2", column "Reference" is not populated
Sample3Sample AccessionIn sheet "Sample", row "3", column "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file diff --git a/tests/test_report.py b/tests/test_report.py index e589144..9ca2b2a 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -78,9 +78,7 @@ {'property': '/analysis/0.description', 'description': "should have required property 'description'"}, {'property': '/analysis/0.experimentType', 'description': "should have required property 'experimentType'"}, {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, - {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"} ], 'report_path': '/path/to/metadata/report' } @@ -98,5 +96,7 @@ def test_generate_html_report(self): reporter.convert_metadata_validation_results() validation_results = reporter.results report = generate_html_report(validation_results, datetime.datetime(2023, 8, 31, 12, 34, 56), "My cool project") + with open('report.html', 'w') as open_html: + assert open_html.write(report) with open(self.expected_report) as open_html: assert report == open_html.read() diff --git a/tests/test_reporter.py b/tests/test_reporter.py index a481109..09e3da9 100644 --- a/tests/test_reporter.py +++ b/tests/test_reporter.py @@ -47,8 +47,8 @@ def test__collect_validation_workflow_results(self): {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ], 'spreadsheet_errors': [ - {'sheet': 'Files', 'row': 2, 'column': None, 'description': 'Sheet "Files" is missing'}, - {'sheet': 'Project', 'row': 2, 'column': 'Project Title', + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 
'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, @@ -56,10 +56,6 @@ def test__collect_validation_workflow_results(self): 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}, - {'sheet': 'Sample', 'row': 3, 'column': None, - 'description': "should have required property 'bioSampleObject'"}, - {'sheet': 'Sample', 'row': 3, 'column': None, - 'description': 'should match exactly one schema in oneOf'} ] } } From a74b04f0b811c697ab28f2ae5e60bd4261c6de2e Mon Sep 17 00:00:00 2001 From: tcezard Date: Mon, 11 Sep 2023 10:27:08 +0100 Subject: [PATCH 4/4] Address review comments --- cli/reporter.py | 14 ++++------ .../validation_reports/expected_report.html | 2 +- tests/test_report.py | 28 +++++++++++++------ tests/test_reporter.py | 26 ++++++++--------- 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/cli/reporter.py b/cli/reporter.py index b49026c..2214e77 100755 --- a/cli/reporter.py +++ b/cli/reporter.py @@ -127,7 +127,9 @@ def _collect_validation_workflow_results(self, ): self._collect_vcf_check_results() self._collect_assembly_check_results() self._load_sample_check_results() - self._parse_metadata_validation_results() + self._parse_biovalidator_validation_results() + self._convert_biovalidator_validation_to_spreadsheet() + self._write_spreadsheet_validation_results() def _collect_vcf_check_results(self,): # detect output files for vcf check @@ -200,7 +202,7 @@ def _load_sample_check_results(self): self.results['sample_check'] = yaml.safe_load(open_yaml) self.results['sample_check']['report_path'] = sample_check_yaml - def _parse_metadata_validation_results(self): + def 
_parse_biovalidator_validation_results(self): """ Read the biovalidator's report and extract the list of validation errors """ @@ -233,10 +235,6 @@ def clean_read(ifile): 'json_report_path': metadata_check_file, 'json_errors': errors } - self.convert_metadata_validation_results() - self.write_converted_metadata_reslts() - - def _parse_metadata_property(self, property_str): if property_str.startswith('.'): @@ -248,7 +246,7 @@ def _parse_metadata_property(self, property_str): logger.error(f'Cannot parse {property_str} in JSON metadata error') return None, None, None - def convert_metadata_validation_results(self): + def _convert_biovalidator_validation_to_spreadsheet(self): config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml") with open(config_file) as open_file: xls2json_conf = yaml.safe_load(open_file) @@ -278,7 +276,7 @@ def convert_metadata_validation_results(self): 'description': new_description }) - def write_converted_metadata_reslts(self): + def _write_spreadsheet_validation_results(self): if 'spreadsheet_errors' in self.results['metadata_check']: spreadsheet_report_file = os.path.join(os.path.dirname(self.results['metadata_check']['json_report_path']), 'metadata_spreadsheet_validation.txt') diff --git a/tests/resources/validation_reports/expected_report.html b/tests/resources/validation_reports/expected_report.html index d069a3e..901d111 100644 --- a/tests/resources/validation_reports/expected_report.html +++ b/tests/resources/validation_reports/expected_report.html @@ -17,4 +17,4 @@ th { background-color: lightgrey; } .fail { background-color: #FFB6C1; } .pass { background-color: #90EE90; } - .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report:
SheetRowColumnDescription
FilesSheet "Files" is missing
ProjectProject TitleIn sheet "Project", column "Project Title" is not populated
ProjectDescriptionIn sheet "Project", column "Description" is not populated
ProjectTax IDIn sheet "Project", column "Tax ID" is not populated
ProjectCenterIn sheet "Project", column "Center" is not populated
Analysis2Analysis TitleIn sheet "Analysis", row "2", column "Analysis Title" is not populated
Analysis2DescriptionIn sheet "Analysis", row "2", column "Description" is not populated
Analysis2Experiment TypeIn sheet "Analysis", row "2", column "Experiment Type" is not populated
Analysis2ReferenceIn sheet "Analysis", row "2", column "Reference" is not populated
Sample3Sample AccessionIn sheet "Sample", row "3", column "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file + .error-list { display: none; }

Validation Report: My cool project

Generated at 2023-08-31 12:34:56

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/metadata/metadata_spreadsheet_validation.txt
SheetRowColumnDescription
FilesSheet "Files" is missing
ProjectProject TitleIn sheet "Project", column "Project Title" is not populated
ProjectDescriptionIn sheet "Project", column "Description" is not populated
ProjectTax IDIn sheet "Project", column "Tax ID" is not populated
ProjectCenterIn sheet "Project", column "Center" is not populated
Analysis2Analysis TitleIn sheet "Analysis", row "2", column "Analysis Title" is not populated
Analysis2DescriptionIn sheet "Analysis", row "2", column "Description" is not populated
Analysis2Experiment TypeIn sheet "Analysis", row "2", column "Experiment Type" is not populated
Analysis2ReferenceIn sheet "Analysis", row "2", column "Reference" is not populated
Sample3Sample AccessionIn sheet "Sample", row "3", column "Sample Accession" is not populated

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors, 0 warnings
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors, 0 warnings

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
❌ AA: Sample names concordance check
First 10 errors per category are below. Full report: /path/to/sample/report
CategoryError
Samples described in the metadata but not in the VCF filesSample1
Samples in the VCF files but not described in the metadata1Sample
\ No newline at end of file diff --git a/tests/test_report.py b/tests/test_report.py index 9ca2b2a..0d99755 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -78,25 +78,35 @@ {'property': '/analysis/0.description', 'description': "should have required property 'description'"}, {'property': '/analysis/0.experimentType', 'description': "should have required property 'experimentType'"}, {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"} + {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ], - 'report_path': '/path/to/metadata/report' + 'json_report_path': '/path/to/metadata/report', + 'spreadsheet_errors': [ + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, + {'sheet': 
'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} + ], + 'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt', } } + + class TestReport(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html') def test_generate_html_report(self): - reporter = Reporter(['input_passed.vcf'], '') - global validation_results - reporter.results = validation_results - reporter.convert_metadata_validation_results() - validation_results = reporter.results report = generate_html_report(validation_results, datetime.datetime(2023, 8, 31, 12, 34, 56), "My cool project") - with open('report.html', 'w') as open_html: - assert open_html.write(report) with open(self.expected_report) as open_html: assert report == open_html.read() diff --git a/tests/test_reporter.py b/tests/test_reporter.py index 09e3da9..6878549 100644 --- a/tests/test_reporter.py +++ b/tests/test_reporter.py @@ -61,7 +61,6 @@ def test__collect_validation_workflow_results(self): } self.reporter._collect_validation_workflow_results() - print(self.reporter.results) # Drop report paths from comparison (test will fail if missing) del self.reporter.results['metadata_check']['json_report_path'] del self.reporter.results['metadata_check']['spreadsheet_report_path'] @@ -88,8 +87,8 @@ def test_vcf_check_errors_is_critical(self): for i, error in enumerate(errors): assert self.reporter.vcf_check_errors_is_critical(error) == expected_return[i] - def test_parse_metadata_validation_results(self): - 
self.reporter._parse_metadata_validation_results() + def test_parse_biovalidator_validation_results(self): + self.reporter._parse_biovalidator_validation_results() assert self.reporter.results['metadata_check']['json_errors'] == [ {'property': '.files', 'description': "should have required property 'files'"}, {'property': '/project.title', 'description': "should have required property 'title'"}, @@ -100,7 +99,7 @@ def test_parse_metadata_validation_results(self): {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ] - def test_convert_metadata_validation_results(self): + def test_convert_biovalidator_validation_to_spreadsheet(self): self.reporter.results['metadata_check'] = { 'json_errors': [ {'property': '.files', 'description': "should have required property 'files'"}, @@ -116,15 +115,12 @@ def test_convert_metadata_validation_results(self): {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ] } - self.reporter.convert_metadata_validation_results() - for error in self.reporter.results['metadata_check']['spreadsheet_errors']: - print(error) - - self.reporter.results['metadata_check']['spreadsheet_errors'] = { - "required sheet 'Files' is missing", - "In Sheet 'Project', required column 'Project Title' is missing.", - "In Sheet 'Analysis', in row number 3 required column 'Description' is missing.", - "In Sheet 'Analysis', in row number 3 required column 'Reference' is missing.", - - } + self.reporter._convert_biovalidator_validation_to_spreadsheet() + assert self.reporter.results['metadata_check']['spreadsheet_errors'] == [ + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, + 
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} + ]