Skip to content

Commit

Permalink
EVA-3620 Report metadata errors for Json directly when no spreadsheet…
Browse files Browse the repository at this point in the history
… used (#48)

* Report metadata errors for JSON directly when no spreadsheet is used
  • Loading branch information
nitin-ebi authored Jul 30, 2024
1 parent bf150ed commit 51a14a6
Show file tree
Hide file tree
Showing 6 changed files with 295 additions and 27 deletions.
22 changes: 21 additions & 1 deletion eva_sub_cli/jinja_templates/metadata_validation.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
{% macro metadata_validation_report(validation_results) -%}
{% set results = validation_results.get('metadata_check', {}) %}
{% set spreadsheet_errors = results.get('spreadsheet_errors', []) %}
{% if spreadsheet_errors %}
{% set json_errors = results.get('json_errors', []) %}

{% set has_errors = spreadsheet_errors or json_errors %}
{% if has_errors %}
{% set expand_icon = "▶" %}
{% set icon = "❌" %}
{% set row_class = "report-section fail collapsible" %}
Expand All @@ -29,4 +32,21 @@
</table>
</div>
{% endif %}

{% if json_errors %}
<div class="error-list">
<div class="error-description"><strong>Full report:</strong> {{ results.get('json_report_path', '') }}</div>
<table>
<tr>
<th>JSON Property</th><th>Error Description</th>
</tr>
{% for error in json_errors %}
<tr>
<td><strong>{{ error.get('property') }}</strong></td>
<td> {{ error.get('description') }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endif %}
{%- endmacro %}
5 changes: 3 additions & 2 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,8 +361,9 @@ def _collect_metadata_results(self):
self._load_spreadsheet_conversion_errors()
self._parse_biovalidator_validation_results()
self._parse_semantic_metadata_results()
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()
if self.metadata_xlsx:
self._convert_biovalidator_validation_to_spreadsheet()
self._write_spreadsheet_validation_results()
self._collect_file_info_to_metadata()

def _load_spreadsheet_conversion_errors(self):
Expand Down

Large diffs are not rendered by default.

200 changes: 178 additions & 22 deletions tests/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,159 @@
import eva_sub_cli
from eva_sub_cli.report import generate_html_report

validation_results = {
# Fixture passed to generate_html_report() by test_generate_html_report_metadata_xlsx.
# Its 'metadata_check' section holds only spreadsheet-level errors ('spreadsheet_errors' and
# 'spreadsheet_report_path'); there is no 'json_errors' key.
validation_results_xlsx = {
"assembly_check": {
"input_passed.vcf": {
"report_path": "/path/to/assembly_passed/report",
"error_list": [],
"match": 247,
"mismatch_list": [],
"nb_error": 0,
"nb_mismatch": 0,
"total": 247,
},
"input_fail.vcf": {
"report_path": "/path/to/assembly_failed/report",
"error_list": ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"],
"match": 26,
# NOTE(review): 9 mismatches are listed here while "nb_mismatch" below is 10 -- presumably the list is
# deliberately shorter than the count; confirm against the expected report before changing either value.
"mismatch_list": [
"Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'",
"Chromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'",
"Chromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'",
"Chromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'",
"Chromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'",
"Chromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'",
"Chromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'",
"Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'",
"Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'",
],
"nb_error": 1,
"nb_mismatch": 10,
"total": 36,
},
},
"vcf_check": {
"input_passed.vcf": {
'report_path': '/path/to/vcf_passed/report',
"error_count": 0,
"error_list": [],
"valid": True,
"warning_count": 0,
},
"input_fail.vcf": {
'report_path': '/path/to/vcf_failed/report',
"critical_count": 1,
"critical_list": ["Line 4: Error in meta-data section."],
"error_count": 1,
"error_list": ["Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=.."],
"valid": False,
"warning_count": 0,
},
},
"sample_check": {
'report_path': '/path/to/sample/report',
'overall_differences': True,
'results_per_analysis': {
# Some sample names below carry leading/trailing spaces.
# NOTE(review): presumably intentional, to exercise whitespace handling in the report -- confirm.
'Analysis A': {
'difference': True,
'more_metadata_submitted_files': [' SampleA1', 'SampleA2 ','SampleA3', 'SampleA4', 'SampleA5', 'SampleA6', 'SampleA7','SampleA8', 'SampleA9', 'SampleA10'],
'more_per_submitted_files_metadata': {},
'more_submitted_files_metadata': ['A1Sample ', ' A2Sample', 'A3Sample', 'A4Sample', 'A5Sample', 'A6Sample', 'A7Sample', 'A8Sample', 'A9Sample', 'A10Sample']
},
'Analysis B': {
'difference': False,
'more_metadata_submitted_files': [],
'more_per_submitted_files_metadata': {},
'more_submitted_files_metadata': []
},
'Analysis C': {
'difference': True,
'more_metadata_submitted_files': ['SampleC1 ', ' SampleC2', 'SampleC3', 'SampleC4'],
'more_per_submitted_files_metadata': {},
'more_submitted_files_metadata': ['C1Sample ', ' C2Sample', 'C3Sample', 'C4Sample']
}
}
},
# NB: the fasta_check entries below do not make sense for the number of analyses in this report; they are
# included to demonstrate the possible outputs of this check.
"fasta_check": {
'not_all_insdc.fa': {
'report_path': '/path/to/not_all_insdc_check.yml',
'all_insdc': False,
'sequences': [
{'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
{'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': False}
],
'metadata_assembly_compatible': True,
'possible_assemblies': {'GCA_1'},
'assembly_in_metadata': 'GCA_1',
'associated_analyses': ['Analysis A']
},
# This entry has no 'metadata_assembly_compatible', 'assembly_in_metadata' or 'associated_analyses' keys.
'metadata_asm_not_found.fa': {
'report_path': '/path/to/metadata_asm_not_found.yml',
'all_insdc': True,
'sequences': [
{'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
{'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
],
'possible_assemblies': {'GCA_1'}
},
# 'assembly_in_metadata' (GCA_2) is not among 'possible_assemblies' (GCA_1).
'metadata_asm_not_match.fa': {
'report_path': '/path/to/metadata_asm_not_match.yml',
'all_insdc': True,
'sequences': [
{'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
{'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
],
'metadata_assembly_compatible': False,
'possible_assemblies': {'GCA_1'},
'assembly_in_metadata': 'GCA_2',
'associated_analyses': ['Analysis B']
},
'metadata_asm_match.fa': {
'report_path': '/path/to/metadata_asm_match.yml',
'all_insdc': True,
'sequences': [
{'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
{'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
],
'metadata_assembly_compatible': True,
'possible_assemblies': {'GCA_1'},
'assembly_in_metadata': 'GCA_1',
'associated_analyses': ['Analysis A']
},
# Same shape as the matching case above, plus a 'connection_error' message.
'metadata_error.fa': {
'report_path': '/path/to/metadata_error.yml',
'all_insdc': True,
'sequences': [
{'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True},
{'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True}
],
'metadata_assembly_compatible': True,
'possible_assemblies': {'GCA_1'},
'assembly_in_metadata': 'GCA_1',
'associated_analyses': ['Analysis C'],
'connection_error': '500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve'
}
},
# Spreadsheet-only metadata errors: each entry names the sheet, row and column alongside a description.
'metadata_check': {
'spreadsheet_errors': [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
],
'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
}
}

validation_results_json = {
"assembly_check": {
"input_passed.vcf": {
"report_path": "/path/to/assembly_passed/report",
Expand Down Expand Up @@ -155,27 +307,15 @@
{'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
{'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
],
'json_report_path': '/path/to/metadata/report',
'spreadsheet_errors': [
{'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'},
{'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'},
{'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'},
{'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
{'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
],
'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt',
'json_report_path': '/path/to/json/metadata/report'
}
}


class TestReport(TestCase):
resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html')
expected_report_metadata_xlsx = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_xlsx.html')
expected_report_metadata_json = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_json.html')
test_project_name = "My cool project"
test_validation_date = datetime.datetime(2023, 8, 31, 12, 34, 56)
test_submission_dir = "/test/submission/dir"
Expand All @@ -184,18 +324,34 @@ class TestReport(TestCase):
test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_pass.vcf', 'fasta_file': 'input_pass.fa', 'analysis': 'B'})
test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_test.vcf', 'fasta_file': 'input_test.fa', 'analysis': 'could not be linked'})

def test_generate_html_report(self):
report = generate_html_report(validation_results, self.test_validation_date, self.test_submission_dir,
def test_generate_html_report_metadata_xlsx(self):
report = generate_html_report(validation_results_xlsx, self.test_validation_date, self.test_submission_dir,
self.test_vcf_fasta_analysis_mapping, self.test_project_name)
with open('metadata_xlsx_report.html', 'w') as open_file:
open_file.write(report)

with open(self.expected_report_metadata_xlsx) as open_html:
expected_report_text = open_html.read()
# Inject the version in the expected report
expected_report_text = expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__)
assert report == expected_report_text

# Remove output file if assert passes
if os.path.exists('metadata_xlsx_report.html'):
os.remove('metadata_xlsx_report.html')

def test_generate_html_report_metadata_json(self):
report = generate_html_report(validation_results_json, self.test_validation_date, self.test_submission_dir,
self.test_vcf_fasta_analysis_mapping, self.test_project_name)
with open('report.html', 'w') as open_file:
with open('metadata_json_report.html', 'w') as open_file:
open_file.write(report)

with open(self.expected_report) as open_html:
with open(self.expected_report_metadata_json) as open_html:
expected_report_text = open_html.read()
# Inject the version in the expected report
expected_report_text = expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__)
assert report == expected_report_text

# Remove output file if assert passes
if os.path.exists('report.html'):
os.remove('report.html')
if os.path.exists('metadata_json_report.html'):
os.remove('metadata_json_report.html')
Loading

0 comments on commit 51a14a6

Please sign in to comment.