From 6892adf68e1403970243a2d815c64f9e72c66fae Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Tue, 23 Jul 2024 16:14:30 +0100 Subject: [PATCH] report metadata errors for json directly when no spreadsheet used --- .../jinja_templates/metadata_validation.html | 22 +- eva_sub_cli/validators/validator.py | 5 +- .../expected_report_metadata_json.html | 22 ++ ...tml => expected_report_metadata_xlsx.html} | 0 tests/test_docker_validator.py | 5 - tests/test_report.py | 194 ++++++++++++++++-- tests/test_validator.py | 73 ++++++- 7 files changed, 292 insertions(+), 29 deletions(-) create mode 100644 tests/resources/validation_reports/expected_report_metadata_json.html rename tests/resources/validation_reports/{expected_report.html => expected_report_metadata_xlsx.html} (100%) diff --git a/eva_sub_cli/jinja_templates/metadata_validation.html b/eva_sub_cli/jinja_templates/metadata_validation.html index 0c5e960..c3f29ab 100644 --- a/eva_sub_cli/jinja_templates/metadata_validation.html +++ b/eva_sub_cli/jinja_templates/metadata_validation.html @@ -2,7 +2,10 @@ {% macro metadata_validation_report(validation_results) -%} {% set results = validation_results.get('metadata_check', {}) %} {% set spreadsheet_errors = results.get('spreadsheet_errors', []) %} - {% if spreadsheet_errors %} + {% set json_errors = results.get('json_errors', []) %} + + {% set has_errors = spreadsheet_errors or json_errors %} + {% if has_errors %} {% set expand_icon = "▶" %} {% set icon = "❌" %} {% set row_class = "report-section fail collapsible" %} @@ -29,4 +32,21 @@ {% endif %} + + {% if json_errors %} +
+
Full report: {{ results.get('json_report_path', '') }}
+ + + + + {% for error in json_errors %} + + + + + {% endfor %} +
Json PropertyError Description
{{ error.get('property') }} {{ error.get('description') }}
+
+ {% endif %} {%- endmacro %} \ No newline at end of file diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 9795531..a99d329 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -361,8 +361,9 @@ def _collect_metadata_results(self): self._load_spreadsheet_conversion_errors() self._parse_biovalidator_validation_results() self._parse_semantic_metadata_results() - self._convert_biovalidator_validation_to_spreadsheet() - self._write_spreadsheet_validation_results() + if self.metadata_xlsx: + self._convert_biovalidator_validation_to_spreadsheet() + self._write_spreadsheet_validation_results() self._collect_md5sum_to_metadata() def _load_spreadsheet_conversion_errors(self): diff --git a/tests/resources/validation_reports/expected_report_metadata_json.html b/tests/resources/validation_reports/expected_report_metadata_json.html new file mode 100644 index 0000000..53b45f7 --- /dev/null +++ b/tests/resources/validation_reports/expected_report_metadata_json.html @@ -0,0 +1,22 @@ +Validation Report

Validation Report

eva-sub-cli v0.4.dev38+g5ffe5ee

Project Summary

General details about the project

Project Title: My cool project

Validation Date: 2023-08-31 12:34:56

Submission Directory: /test/submission/dir

Files mapping
VCF FileFasta FileAnalysis
input_fail.vcfinput_fail.faA
input_pass.vcfinput_pass.faB
input_test.vcfinput_test.facould not be linked

Metadata validation results

Ensures that required fields are present and values are formatted correctly. For requirements, please refer to the EVA website.
❌ Metadata validation check
Full report: /path/to/json/metadata/report
Json PropertyError Description
.filesshould have required property 'files'
/project.titleshould have required property 'title'
/project.descriptionshould have required property 'description'
/project.taxIdshould have required property 'taxId'
/project.centreshould have required property 'centre'
/analysis/0.analysisTitleshould have required property 'analysisTitle'
/analysis/0.descriptionshould have required property 'description'
/analysis/0.experimentTypeshould have required property 'experimentType'
/analysis/0.referenceGenomeshould have required property 'referenceGenome'
/sample/0.bioSampleAccessionshould have required property 'bioSampleAccession'
/sample/0.bioSampleObjectshould have required property 'bioSampleObject'
/sample/0should match exactly one schema in oneOf

VCF validation results

Checks whether each file is compliant with the VCF specification. Also checks whether the variants' reference alleles match against the reference assembly.

input_fail.vcf

❌ Assembly check: 26/36 (72.22%)
First 10 errors per category are below. Full report: /path/to/assembly_failed/report
CategoryError
Parsing ErrorThe assembly checking could not be completed: Contig 'chr23' not found in assembly report
mismatch errorChromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'
mismatch errorChromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'
mismatch errorChromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'
mismatch errorChromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'
❌ VCF check: 1 critical errors, 1 non-critical errors
First 10 errors per category are below. Full report: /path/to/vcf_failed/report
CategoryError
critical errorLine 4: Error in meta-data section.
non-critical errorSample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=..

input_passed.vcf

✔ Assembly check: 247/247 (100.0%)
✔ VCF check: 0 critical errors, 0 non-critical errors

Sample name concordance check

Checks whether information in the metadata is concordant with that contained in the VCF files, in particular sample names.
Analysis A: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleA1, SampleA2 , SampleA3, SampleA4, SampleA5Show All Errors For Category
Samples in the VCF files but not described in the metadataA1Sample , A2Sample, A3Sample, A4Sample, A5SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. •SampleA1
  2. SampleA2•
  3. SampleA3
  4. SampleA4
  5. SampleA5
  6. SampleA6
  7. SampleA7
  8. SampleA8
  9. SampleA9
  10. SampleA10
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. A1Sample•
  2. •A2Sample
  3. A3Sample
  4. A4Sample
  5. A5Sample
  6. A6Sample
  7. A7Sample
  8. A8Sample
  9. A9Sample
  10. A10Sample
Hide
Analysis B: Sample names in metadata match with those in VCF files
Analysis C: Sample names in metadata do not match with those in VCF files
CategoryFirst 5 Errors For CategoryLink To View All Errors
Samples described in the metadata but not in the VCF filesSampleC1 , SampleC2, SampleC3, SampleC4Show All Errors For Category
Samples in the VCF files but not described in the metadataC1Sample , C2Sample, C3Sample, C4SampleShow All Errors For Category
All Errors For Category - Samples described in the metadata but not in the VCF files:
  1. SampleC1•
  2. •SampleC2
  3. SampleC3
  4. SampleC4
Hide
All Errors For Category - Samples in the VCF files but not described in the metadata:
  1. C1Sample•
  2. •C2Sample
  3. C3Sample
  4. C4Sample
Hide

Reference genome INSDC check

Checks that the reference sequences in the FASTA file used to call the variants are accessioned in INSDC. Also checks if the reference assembly accession in the metadata matches the one determined from the FASTA file.

metadata_asm_match.fa

✔ All sequences are INSDC accessioned
✔ Analysis A: Assembly accession in metadata is compatible

metadata_asm_not_found.fa

✔ All sequences are INSDC accessioned
❌ No assembly accession found in metadata
Full report: /path/to/metadata_asm_not_found.yml
CategoryAccessions
Assembly accession found in metadataNot found
Assembly accession(s) compatible with FASTAGCA_1

metadata_asm_not_match.fa

✔ All sequences are INSDC accessioned
❌ Analysis B: Assembly accession in metadata is not compatible
Full report: /path/to/metadata_asm_not_match.yml
CategoryAccessions
Assembly accession found in metadataGCA_2
Assembly accession(s) compatible with FASTAGCA_1

metadata_error.fa

Warning: The following results may be incomplete due to problems with external services. Please try again later for complete results.
Error message: 500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve
✔ All sequences are INSDC accessioned
✔ Analysis C: Assembly accession in metadata is compatible

not_all_insdc.fa

❌ Some sequences are not INSDC accessioned
First 10 sequences not in INSDC. Full report: /path/to/not_all_insdc_check.yml
Sequence nameRefget md5
2hjfdoijsfc47hfg0gh9qwjrve
✔ Analysis A: Assembly accession in metadata is compatible
\ No newline at end of file diff --git a/tests/resources/validation_reports/expected_report.html b/tests/resources/validation_reports/expected_report_metadata_xlsx.html similarity index 100% rename from tests/resources/validation_reports/expected_report.html rename to tests/resources/validation_reports/expected_report_metadata_xlsx.html diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index 72aba00..a065485 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -133,11 +133,6 @@ def test_validate(self): 'md5': '96a80c9368cc3c37095c86fbe6044fb2'} ] - # Check metadata errors - with open(os.path.join(self.validator.output_dir, 'other_validations', 'metadata_validation.txt')) as open_file: - metadata_val_lines = {l.strip() for l in open_file.readlines()} - assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines - def test_validate_from_excel(self): self.validator_from_excel.validate() self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml)) diff --git a/tests/test_report.py b/tests/test_report.py index 4b55ee1..5553752 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -5,7 +5,159 @@ import eva_sub_cli from eva_sub_cli.report import generate_html_report -validation_results = { +validation_results_xlsx = { + "assembly_check": { + "input_passed.vcf": { + "report_path": "/path/to/assembly_passed/report", + "error_list": [], + "match": 247, + "mismatch_list": [], + "nb_error": 0, + "nb_mismatch": 0, + "total": 247, + }, + "input_fail.vcf": { + "report_path": "/path/to/assembly_failed/report", + "error_list": ["The assembly checking could not be completed: Contig 'chr23' not found in assembly report"], + "match": 26, + "mismatch_list": [ + "Chromosome 1, position 35549, reference allele 'G' does not match the reference sequence, expected 'c'", + "Chromosome 1, position 35595, reference allele 'G' does not match the reference sequence, expected 'a'", + "Chromosome 1, position 35618, reference allele 'G' does not match the reference sequence, expected 'c'", + "Chromosome 1, position 35626, reference allele 'A' does not match the reference sequence, expected 'g'", + "Chromosome 1, position 35639, reference allele 'T' does not match the reference sequence, expected 'c'", + "Chromosome 1, position 35643, reference allele 'T' does not match the reference sequence, expected 'g'", + "Chromosome 1, position 35717, reference allele 'T' does not match the reference sequence, expected 'g'", + "Chromosome 1, position 35819, reference allele 'T' does not match the reference sequence, expected 'a'", + "Chromosome 1, position 35822, reference allele 'T' does not match the reference sequence, expected 'c'", + ], + "nb_error": 1, + "nb_mismatch": 10, + "total": 36, + }, + }, + "vcf_check": { + "input_passed.vcf": { + 'report_path': '/path/to/vcf_passed/report', + "error_count": 0, + "error_list": [], + "valid": True, + "warning_count": 0, + }, + "input_fail.vcf": { + 'report_path': '/path/to/vcf_failed/report', + "critical_count": 1, + "critical_list": ["Line 4: Error in meta-data section."], + "error_count": 1, + "error_list": ["Sample #11, field AD does not match the meta specification Number=R (expected 2 value(s)). AD=.."], + "valid": False, + "warning_count": 0, + }, + }, + "sample_check": { + 'report_path': '/path/to/sample/report', + 'overall_differences': True, + 'results_per_analysis': { + 'Analysis A': { + 'difference': True, + 'more_metadata_submitted_files': [' SampleA1', 'SampleA2 ','SampleA3', 'SampleA4', 'SampleA5', 'SampleA6', 'SampleA7','SampleA8', 'SampleA9', 'SampleA10'], + 'more_per_submitted_files_metadata': {}, + 'more_submitted_files_metadata': ['A1Sample ', ' A2Sample', 'A3Sample', 'A4Sample', 'A5Sample', 'A6Sample', 'A7Sample', 'A8Sample', 'A9Sample', 'A10Sample'] + }, + 'Analysis B': { + 'difference': False, + 'more_metadata_submitted_files': [], + 'more_per_submitted_files_metadata': {}, + 'more_submitted_files_metadata': [] + }, + 'Analysis C': { + 'difference': True, + 'more_metadata_submitted_files': ['SampleC1 ', ' SampleC2', 'SampleC3', 'SampleC4'], + 'more_per_submitted_files_metadata': {}, + 'more_submitted_files_metadata': ['C1Sample ', ' C2Sample', 'C3Sample', 'C4Sample'] + } + } + }, + # NB. obviously this doesn't make sense for the number of analyses in this report, but demonstrates the possible + # outputs for this check. + "fasta_check": { + 'not_all_insdc.fa': { + 'report_path': '/path/to/not_all_insdc_check.yml', + 'all_insdc': False, + 'sequences': [ + {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True}, + {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': False} + ], + 'metadata_assembly_compatible': True, + 'possible_assemblies': {'GCA_1'}, + 'assembly_in_metadata': 'GCA_1', + 'associated_analyses': ['Analysis A'] + }, + 'metadata_asm_not_found.fa': { + 'report_path': '/path/to/metadata_asm_not_found.yml', + 'all_insdc': True, + 'sequences': [ + {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True}, + {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True} + ], + 'possible_assemblies': {'GCA_1'} + }, + 'metadata_asm_not_match.fa': { + 'report_path': '/path/to/metadata_asm_not_match.yml', + 'all_insdc': True, + 'sequences': [ + {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True}, + {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True} + ], + 'metadata_assembly_compatible': False, + 'possible_assemblies': {'GCA_1'}, + 'assembly_in_metadata': 'GCA_2', + 'associated_analyses': ['Analysis B'] + }, + 'metadata_asm_match.fa': { + 'report_path': '/path/to/metadata_asm_match.yml', + 'all_insdc': True, + 'sequences': [ + {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True}, + {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True} + ], + 'metadata_assembly_compatible': True, + 'possible_assemblies': {'GCA_1'}, + 'assembly_in_metadata': 'GCA_1', + 'associated_analyses': ['Analysis A'] + }, + 'metadata_error.fa': { + 'report_path': '/path/to/metadata_error.yml', + 'all_insdc': True, + 'sequences': [ + {'sequence_name': '1', 'sequence_md5': 'hsjvchdhdo3ate83jdfd76rp2', 'insdc': True}, + {'sequence_name': '2', 'sequence_md5': 'hjfdoijsfc47hfg0gh9qwjrve', 'insdc': True} + ], + 'metadata_assembly_compatible': True, + 'possible_assemblies': {'GCA_1'}, + 'assembly_in_metadata': 'GCA_1', + 'associated_analyses': ['Analysis C'], + 'connection_error': '500 Server Error: Internal Server Error for url: https://www.ebi.ac.uk/eva/webservices/contig-alias/v1/chromosomes/md5checksum/hjfdoijsfc47hfg0gh9qwjrve' + } + }, + 'metadata_check': { + 'spreadsheet_errors': [ + {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, + {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'}, + {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, + {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} + ], + 'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt', + } +} + +validation_results_json = { "assembly_check": { "input_passed.vcf": { "report_path": "/path/to/assembly_passed/report", @@ -155,27 +307,15 @@ {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ], - 'json_report_path': '/path/to/metadata/report', - 'spreadsheet_errors': [ - {'sheet': 'Files', 'row': '', 'column': '', 'description': 'Sheet "Files" is missing'}, - {'sheet': 'Project', 'row': '', 'column': 'Project Title', 'description': 'In sheet "Project", column "Project Title" is not populated'}, - {'sheet': 'Project', 'row': '', 'column': 'Description', 'description': 'In sheet "Project", column "Description" is not populated'}, - {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, - {'sheet': 'Project', 'row': '', 'column': 'Center', 'description': 'In sheet "Project", column "Center" is not populated'}, - {'sheet': 'Analysis', 'row': 2, 'column': 'Analysis Title', 'description': 'In sheet "Analysis", row "2", column "Analysis Title" is not populated'}, - {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, - {'sheet': 'Analysis', 'row': 2, 'column': 'Experiment Type', 'description': 'In sheet "Analysis", row "2", column "Experiment Type" is not populated'}, - {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, - {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} - ], - 'spreadsheet_report_path': '/path/to/metadata/metadata_spreadsheet_validation.txt', + 'json_report_path': '/path/to/json/metadata/report' } } class TestReport(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') - expected_report = os.path.join(resource_dir, 'validation_reports', 'expected_report.html') + expected_report_metadata_xlsx = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_xlsx.html') + expected_report_metadata_json = os.path.join(resource_dir, 'validation_reports', 'expected_report_metadata_json.html') test_project_name = "My cool project" test_validation_date = datetime.datetime(2023, 8, 31, 12, 34, 56) test_submission_dir = "/test/submission/dir" @@ -184,13 +324,29 @@ class TestReport(TestCase): test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_pass.vcf', 'fasta_file': 'input_pass.fa', 'analysis': 'B'}) test_vcf_fasta_analysis_mapping.append({'vcf_file': 'input_test.vcf', 'fasta_file': 'input_test.fa', 'analysis': 'could not be linked'}) - def test_generate_html_report(self): - report = generate_html_report(validation_results, self.test_validation_date, self.test_submission_dir, + def test_generate_html_report_metadata_xlsx(self): + report = generate_html_report(validation_results_xlsx, self.test_validation_date, self.test_submission_dir, + self.test_vcf_fasta_analysis_mapping, self.test_project_name) + with open('report.html', 'w') as open_file: + open_file.write(report) + + with open(self.expected_report_metadata_xlsx) as open_html: + expected_report_text = open_html.read() + # Inject the version in the expected report + expected_report_text = expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__) + assert report == expected_report_text + + # Remove output file if assert passes + if os.path.exists('report.html'): + os.remove('report.html') + + def test_generate_html_report_metadata_json(self): + report = generate_html_report(validation_results_json, self.test_validation_date, self.test_submission_dir, self.test_vcf_fasta_analysis_mapping, self.test_project_name) with open('report.html', 'w') as open_file: open_file.write(report) - with open(self.expected_report) as open_html: + with open(self.expected_report_metadata_json) as open_html: expected_report_text = open_html.read() # Inject the version in the expected report expected_report_text = expected_report_text.replace('cligeneratedversion', eva_sub_cli.__version__) diff --git a/tests/test_validator.py b/tests/test_validator.py index f01b4e1..9a15f04 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -12,6 +12,7 @@ class TestValidator(TestCase): assembly_reports = os.path.join(resource_dir, 'assembly_reports') output_dir = os.path.join(resource_dir, 'validation_reports') mapping_file = os.path.join(output_dir, 'vcf_files_mapping.csv') + metadata_xlsx_file = os.path.join(output_dir, 'EVA_Submission_test.xlsx') def setUp(self) -> None: # create vcf mapping file @@ -20,7 +21,8 @@ def setUp(self) -> None: [os.path.join(self.vcf_files, 'input_passed.vcf')], [os.path.join(self.fasta_files, 'input_passed.fa')], [os.path.join(self.assembly_reports, 'input_passed.txt')]) - self.validator = Validator(self.mapping_file, self.output_dir) + self.validator = Validator(self.mapping_file, self.output_dir, metadata_xlsx=self.metadata_xlsx_file) + self.validator_json = Validator(self.mapping_file, self.output_dir) def tearDown(self) -> None: files_from_tests = [ @@ -32,7 +34,7 @@ def tearDown(self) -> None: if os.path.exists(f): os.remove(f) - def test__collect_validation_workflow_results(self): + def test__collect_validation_workflow_results_with_metadata_xlsx(self): expected_results = { 'vcf_check': { 'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, 'critical_count': 0, 'critical_list': []} @@ -116,6 +118,73 @@ def test__collect_validation_workflow_results(self): assert self.validator.results == expected_results + def test__collect_validation_workflow_results_with_metadata_json(self): + expected_results = { + 'vcf_check': { + 'input_passed.vcf': {'valid': True, 'error_list': [], 'error_count': 0, 'warning_count': 0, + 'critical_count': 0, 'critical_list': []} + }, + 'assembly_check': { + 'input_passed.vcf': {'error_list': [], 'mismatch_list': [], 'nb_mismatch': 0, 'nb_error': 0, + 'match': 247, 'total': 247} + }, + 'sample_check': { + 'overall_differences': False, + 'results_per_analysis': { + 'AA': { + 'difference': False, + 'more_metadata_submitted_files': [], + 'more_per_submitted_files_metadata': {}, + 'more_submitted_files_metadata': [] + } + } + }, + 'fasta_check': { + 'input_passed.fa': {'all_insdc': False, 'sequences': [ + {'sequence_name': 1, 'insdc': True, 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524'}, + {'sequence_name': 2, 'insdc': False, 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d'} + ]}, + }, + 'metadata_check': { + 'json_errors': [ + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, + {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, + {'property': '/project/holdDate', 'description': 'must match format "date"'}, + {'property': '/analysis/0/description', + 'description': "should have required property 'description'"}, + {'property': '/analysis/0/referenceGenome', + 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0/bioSampleAccession', + 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0/bioSampleObject', + 'description': "should have required property 'bioSampleObject'"}, + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, + {'property': '/sample/2/bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code'}, + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'}, + ], + 'spreadsheet_errors': [ + {'sheet': 'Project', 'row': '', 'column': 'Tax ID', + 'description': 'Worksheet Project is missing required header Tax ID'} + ] + } + } + + self.validator_json._collect_validation_workflow_results() + # Drop report paths from comparison (test will fail if missing) + del self.validator_json.results['metadata_check']['json_report_path'] + del self.validator_json.results['sample_check']['report_path'] + for file in self.validator_json.results['vcf_check'].values(): + del file['report_path'] + for file in self.validator_json.results['assembly_check'].values(): + del file['report_path'] + + assert self.validator_json.results == expected_results + def test_create_report(self): self.validator._collect_validation_workflow_results() report_path = self.validator.create_reports()