diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9dca22e..901f7e7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,4 +33,4 @@ jobs: flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - PYTHONPATH=. pytest tests + PYTHONPATH=. pytest tests -k "not integration" diff --git a/bin/check_fasta_insdc.py b/bin/check_fasta_insdc.py index fb44a4c..8b2c727 100644 --- a/bin/check_fasta_insdc.py +++ b/bin/check_fasta_insdc.py @@ -123,7 +123,7 @@ def assess_fasta(input_fasta, analyses, assembly_in_metadata): possible_assemblies = containing_assemblies else: possible_assemblies &= containing_assemblies - except HTTPError as e: + except (ConnectionError, HTTPError) as e: # Server errors from either ENA refget or EVA contig alias will halt the check prematurely. # Report the error but do not return from the method, so that incomplete results can be reported # (i.e. any sequences found to be INSDC and any compatible assemblies so far) diff --git a/bin/check_metadata_semantics.py b/bin/check_metadata_semantics.py new file mode 100644 index 0000000..07705ec --- /dev/null +++ b/bin/check_metadata_semantics.py @@ -0,0 +1,22 @@ +import argparse +import json + +from eva_sub_cli.semantic_metadata import SemanticMetadataChecker + + +def main(): + arg_parser = argparse.ArgumentParser(description='Perform semantic checks on the metadata') + arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file') + arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml', + help='Path to the location of the results') + args = arg_parser.parse_args() + + with open(args.metadata_json) as open_json: + metadata = json.load(open_json) + checker = SemanticMetadataChecker(metadata) + checker.check_all() + checker.write_result_yaml(args.output_yaml) + + +if __name__ == "__main__": + main() diff --git a/bin/xlsx2json.py b/bin/xlsx2json.py index 826cc8d..767adaa 100644 --- a/bin/xlsx2json.py +++ b/bin/xlsx2json.py @@ -198,7 +198,7 @@ def get_project_json_data(self): project_data = self.get_rows() if len(project_data) > 1: logger.warning(f"{PROJECT} worksheet expects a single row of info but more than one found in the file. " - f"Only the first row's data will be taken into consideration") + f"Only the first row's data will be taken into consideration") first_row = project_data[0] first_row.pop('row_num') @@ -213,43 +213,45 @@ def get_biosample_object(self, data): # BioSample expects any of organism or species field data[SPECIES] = data[scientific_name] - # For some of the fields, BioSample expects value in arrays - data = {k: [v] if k in ['title', 'description', 'taxId', 'scientificName', 'commonName', SPECIES] else v - for k, v in data.items()} + # BioSample name goes in its own attribute, not part of characteristics + biosample_name = data.pop(sample_name) + # For all characteristics, BioSample expects value in arrays of objects + data = {k: [{'text': self.serialize(v)}] for k, v in data.items()} biosample_object = { - "name": data[sample_name], + "name": biosample_name, "characteristics": data } return biosample_object + @staticmethod + def serialize(value): + """Create a text representation of the value provided""" + if isinstance(value, datetime.date): + return value.strftime('%Y-%m-%d') + return str(value) + def get_sample_data_with_split_analysis_alias(self, data, analysis_alias, sample_name_in_vcf): analysis_alias_list = data[analysis_alias].split(',') sample_in_vcf_val = data[sample_name_in_vcf] - sample_data = [] - for aa in analysis_alias_list: - sample_data.append({analysis_alias: aa, sample_name_in_vcf: sample_in_vcf_val}) - - return sample_data + return {analysis_alias: analysis_alias_list, sample_name_in_vcf: sample_in_vcf_val} def get_sample_json_data(self): json_key = self.xlsx_conf[WORKSHEETS_KEY_NAME][SAMPLE] sample_json = {json_key: []} for row in self.get_rows(): - row.pop('row_num') + row_num = row.pop('row_num') json_value = {self.translate_header(SAMPLE, k): v for k, v in row.items() if v is not None} bio_sample_acc = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_ACCESSION_KEY] analysis_alias = self.xlsx_conf[SAMPLE][REQUIRED_HEADERS_KEY_NAME][ANALYSIS_ALIAS_KEY] sample_name_in_vcf = self.xlsx_conf[SAMPLE][REQUIRED_HEADERS_KEY_NAME][SAMPLE_NAME_IN_VCF_KEY] - sample_data_with_split_aa = self.get_sample_data_with_split_analysis_alias( - json_value, analysis_alias, sample_name_in_vcf) + sample_data = self.get_sample_data_with_split_analysis_alias(json_value, analysis_alias, sample_name_in_vcf) if bio_sample_acc in json_value and json_value[bio_sample_acc]: - sample_data_with_biosample_acc = [dict(item, bioSampleAccession=json_value[bio_sample_acc]) for item in - sample_data_with_split_aa] - sample_json[json_key].extend(sample_data_with_biosample_acc) + sample_data.update(bioSampleAccession=json_value[bio_sample_acc]) + sample_json[json_key].append(sample_data) else: json_value.pop(analysis_alias) json_value.pop(sample_name_in_vcf) @@ -260,18 +262,17 @@ def get_sample_json_data(self): if sample_name not in json_value: self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have ' f'{SAMPLE_NAME_KEY} populated', - sheet=SAMPLE, column=SAMPLE_NAME_KEY) + sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY) return None if scientific_name not in json_value: self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have ' f'{SCIENTIFIC_NAME_KEY} populated', - sheet=SAMPLE, column=SCIENTIFIC_NAME_KEY) + sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY) return None biosample_obj = self.get_biosample_object(json_value) - sample_data_with_biosample_obj = [dict(item, bioSampleObject=biosample_obj) for item in - sample_data_with_split_aa] - sample_json[json_key].extend(sample_data_with_biosample_obj) + sample_data.update(bioSampleObject=biosample_obj) + sample_json[json_key].append(sample_data) return sample_json diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json index e069fee..e4e3028 100644 --- a/eva_sub_cli/etc/eva_schema.json +++ b/eva_sub_cli/etc/eva_schema.json @@ -94,18 +94,18 @@ "items": true }, "parentProject": { - "type": "string" + "$ref": "#/definitions/projectAccession" }, "childProjects": { "type": "array", "items": { - "type": "string" + "$ref": "#/definitions/projectAccession" } }, "peerProjects": { "type": "array", "items": { - "type": "string" + "$ref": "#/definitions/projectAccession" } }, "links": { @@ -296,8 +296,12 @@ ], "properties": { "analysisAlias": { - "type": "string", - "description": "Alias of the analysis performed on this sample." + "type": "array", + "description": "Aliases of the analyses performed on this sample", + "minItems": 1, + "items": { + "type": "string" + } }, "sampleInVCF": { "type": "string", @@ -349,5 +353,12 @@ } } } + }, + "definitions": { + "projectAccession": { + "type": "string", + "description": "ENA project accession", + "pattern": "^PRJ(EB|NA)\\d+$" + } } } \ No newline at end of file diff --git a/eva_sub_cli/metadata_utils.py b/eva_sub_cli/metadata_utils.py index 77a8a7c..ce967ae 100644 --- a/eva_sub_cli/metadata_utils.py +++ b/eva_sub_cli/metadata_utils.py @@ -10,7 +10,8 @@ def get_samples_per_analysis(metadata): """Returns mapping of analysis alias to sample names, based on metadata.""" samples_per_analysis = defaultdict(list) for sample_info in metadata.get('sample', []): - samples_per_analysis[sample_info.get('analysisAlias')].append(sample_info.get('sampleInVCF')) + for analysis_alias in sample_info.get('analysisAlias', []): + samples_per_analysis[analysis_alias].append(sample_info.get('sampleInVCF')) return { analysis_alias: set(samples) for analysis_alias, samples in samples_per_analysis.items() diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 2b6d5d1..a382150 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -29,7 +29,8 @@ params.executable = [ params.python_scripts = [ "samples_checker": "samples_checker.py", "fasta_checker": "check_fasta_insdc.py", - "xlsx2json": "xlsx2json.py" + "xlsx2json": "xlsx2json.py", + "semantic_checker": "check_metadata_semantics.py" ] // prefix to prepend to all provided path params.base_dir = "" @@ -255,3 +256,21 @@ process insdc_checker { $params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml """ } + + +process metadata_semantic_check { + publishDir output_dir, + overwrite: true, + mode: "copy" + + input: + path(metadata_json) + + output: + path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml + + script: + """ + $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml + """ +} diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py new file mode 100644 index 0000000..a11d370 --- /dev/null +++ b/eva_sub_cli/semantic_metadata.py @@ -0,0 +1,126 @@ +import yaml + +from retry import retry +from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena +from ebi_eva_common_pyutils.logger import AppLogger + + +PROJECT_KEY = 'project' +ANALYSIS_KEY = 'analysis' +SAMPLE_KEY = 'sample' +FILES_KEY = 'files' +PARENT_PROJECT_KEY = 'parentProject' +CHILD_PROJECTS_KEY = 'childProjects' +PEER_PROJECTS_KEY = 'peerProjects' +BIOSAMPLE_OBJECT_KEY = 'bioSampleObject' +CHARACTERISTICS_KEY = 'characteristics' +TAX_ID_KEY = 'taxId' +ANALYSIS_ALIAS_KEY = 'analysisAlias' + + +def cast_list(l, type_to_cast=str): + for e in l: + yield type_to_cast(e) + + +class SemanticMetadataChecker(AppLogger): + + def __init__(self, metadata): + self.metadata = metadata + self.errors = [] + # Caches whether taxonomy code is valid or not + self.taxonomy_valid = {} + + def write_result_yaml(self, output_path): + with open(output_path, 'w') as open_yaml: + yaml.safe_dump(data=self.errors, stream=open_yaml) + + def check_all(self): + self.check_all_project_accessions() + self.check_all_taxonomy_codes() + self.check_analysis_alias_coherence() + + def check_all_project_accessions(self): + """Check that ENA project accessions exist and are public.""" + project = self.metadata[PROJECT_KEY] + if PARENT_PROJECT_KEY in project: + self.check_project_accession(project[PARENT_PROJECT_KEY], f'/{PROJECT_KEY}/{PARENT_PROJECT_KEY}') + for idx, accession in enumerate(project.get(CHILD_PROJECTS_KEY, [])): + self.check_project_accession(accession, f'/{PROJECT_KEY}/{CHILD_PROJECTS_KEY}/{idx}') + for idx, accession in enumerate(project.get(PEER_PROJECTS_KEY, [])): + self.check_project_accession(accession, f'/{PROJECT_KEY}/{PEER_PROJECTS_KEY}/{idx}') + + def check_all_taxonomy_codes(self): + """Check that taxonomy IDs are valid according to ENA.""" + project = self.metadata[PROJECT_KEY] + self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}') + # Check sample taxonomies for novel samples + for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): + if BIOSAMPLE_OBJECT_KEY in sample: + self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'], + f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') + + @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) + def check_project_accession(self, project_acc, json_path): + try: + download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_acc}') + except Exception: + self.add_error(json_path, f'{project_acc} does not exist or is private') + + @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) + def check_taxonomy_code(self, taxonomy_code, json_path): + taxonomy_code = int(taxonomy_code) + if taxonomy_code in self.taxonomy_valid: + if self.taxonomy_valid[taxonomy_code] is False: + self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') + else: + try: + download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}') + self.taxonomy_valid[taxonomy_code] = True + except Exception: + self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') + self.taxonomy_valid[taxonomy_code] = False + + def add_error(self, property, description): + """ + Add an error, conforming to the format of biovalidator errors. + + :param property: JSON property of the error. This will be converted to sheet/row/column in spreadsheet if needed. + :param description: description of the error. + """ + self.errors.append({'property': property, 'description': description}) + + def check_analysis_alias_coherence(self): + """Check that the same analysis aliases are used in analysis, sample, and files.""" + analysis_aliases = [analysis[ANALYSIS_ALIAS_KEY] for analysis in self.metadata[ANALYSIS_KEY]] + aliases_in_samples = [alias for sample in self.metadata[SAMPLE_KEY] for alias in sample[ANALYSIS_ALIAS_KEY]] + aliases_in_files = [file[ANALYSIS_ALIAS_KEY] for file in self.metadata[FILES_KEY]] + + self.same_set(analysis_aliases, aliases_in_samples, 'Analysis', 'Samples', + f'/{SAMPLE_KEY}/{ANALYSIS_ALIAS_KEY}') + self.same_set(analysis_aliases, aliases_in_files, 'Analysis', 'Files', + f'/{FILES_KEY}/{ANALYSIS_ALIAS_KEY}') + + def same_set(self, list1, list2, list1_desc, list2_desc, json_path): + """ + Compare contents of two lists and add error messages. + + :param list1: first list to compare + :param list2: second list to compare + :param list1_desc: text description of first list, used only in error message + :param list2_desc: text description of second list, used only in error message + :param json_path: property where the error message will appear + """ + if not set(list1) == set(list2): + list1_list2 = sorted(cast_list(set(list1).difference(list2))) + list2_list1 = sorted(cast_list(set(list2).difference(list1))) + if list1_list2: + self.add_error( + property=json_path, + description=f'{",".join(list1_list2)} present in {list1_desc} not in {list2_desc}' + ) + if list2_list1: + self.add_error( + property=json_path, + description=f'{",".join(list2_list1)} present in {list2_desc} not in {list1_desc}' + ) diff --git a/eva_sub_cli/submit.py b/eva_sub_cli/submit.py index c67bb93..8f71128 100644 --- a/eva_sub_cli/submit.py +++ b/eva_sub_cli/submit.py @@ -95,4 +95,3 @@ def submit(self): # Complete the submission self.info(f'Complete submission') self._complete_submission() - diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index fbea681..3eb4b1f 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -12,7 +12,7 @@ logger = logging_config.get_logger(__name__) container_image = 'ebivariation/eva-sub-cli' -container_tag = 'v0.0.1.dev8' +container_tag = 'v0.0.1.dev9' container_validation_dir = '/opt/vcf_validation' container_validation_output_dir = 'vcf_validation_output' diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index e35fd00..1e2a973 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -360,6 +360,7 @@ def _collect_metadata_results(self): self.results['metadata_check'] = {} self._load_spreadsheet_conversion_errors() self._parse_biovalidator_validation_results() + self._parse_semantic_metadata_results() self._convert_biovalidator_validation_to_spreadsheet() self._write_spreadsheet_validation_results() self._collect_md5sum_to_metadata() @@ -411,7 +412,11 @@ def clean_read(ifile): def _parse_metadata_property(self, property_str): if property_str.startswith('.'): - return property_str.strip('.'), None, None + return property_str.strip('./'), None, None + # First attempt to parse as BioSample object + sheet, row, col = self._parse_sample_metadata_property(property_str) + if sheet is not None and row is not None and col is not None: + return sheet, row, col match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str) if match: return match.group(1), match.group(3), match.group(5) @@ -419,6 +424,23 @@ def _parse_metadata_property(self, property_str): logger.error(f'Cannot parse {property_str} in JSON metadata error') return None, None, None + def _parse_sample_metadata_property(self, property_str): + match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str) + if match: + return 'sample', match.group(1), match.group(2) + return None, None, None + + def _parse_semantic_metadata_results(self): + errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', + 'metadata_semantic_check.yml')) + if not errors_file: + return + with open(errors_file) as open_yaml: + # errors is a list of dicts matching format of biovalidator errors + errors = yaml.safe_load(open_yaml) + # biovalidator error parsing always places a list here, even if no errors + self.results['metadata_check']['json_errors'] += errors + def _convert_biovalidator_validation_to_spreadsheet(self): config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml") with open(config_file) as open_file: @@ -434,13 +456,19 @@ def _convert_biovalidator_validation_to_spreadsheet(self): if row_json is None and attribute_json is None: new_description = f'Sheet "{sheet}" is missing' elif row_json is None: - new_description = f'In sheet "{sheet}", column "{column}" is not populated' + if 'have required' not in error['description']: + new_description = error['description'] + else: + new_description = f'In sheet "{sheet}", column "{column}" is not populated' elif attribute_json and column: - new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated' + if 'have required' not in error['description']: + new_description = error['description'] + else: + new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated' else: new_description = error["description"].replace(sheet_json, sheet) if column is None: - # We do not know this attribute. It's most likely about bioSampleObject + # We do not know this attribute. continue if 'schema' in new_description: # This is an error specific to json schema diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..183f1c9 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + integration: integration test diff --git a/requirements.txt b/requirements.txt index 0746216..5ee6871 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ -pyyaml +ebi_eva_common_pyutils==0.6.3 jinja2 +jsonschema minify_html==0.11.1 openpyxl +pyyaml requests -jsonschema -ebi_eva_common_pyutils==0.6.3 \ No newline at end of file +retry \ No newline at end of file diff --git a/tests/resources/EVA_Submission_test.json b/tests/resources/EVA_Submission_test.json index 2015d31..3be0ce3 100644 --- a/tests/resources/EVA_Submission_test.json +++ b/tests/resources/EVA_Submission_test.json @@ -3,7 +3,6 @@ { "lastName": "Smith", "firstName": "John", - "telephone": "+1234567890", "email": "john.smith@example.com", "laboratory": "Genomics Lab", "centre": "University of Example", @@ -12,7 +11,6 @@ { "lastName": "Doe", "firstName": "Jane", - "telephone": "+1234567890", "email": "jane.doe@example.com", "laboratory": "Bioinformatics Lab", "centre": "University of Example", @@ -21,7 +19,6 @@ ], "project": { "title": "Example Project", - "projectAlias": "EP", "description": "An example project for demonstration purposes", "centre": "University of Example", "taxId": 9606, @@ -32,119 +29,99 @@ "analysisTitle": "Variant Detection 1", "analysisAlias": "VD1", "description": "An example analysis for demonstration purposes", - "ProjectTitle": "Example Project", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27", - "referenceFasta": "GCA_000001405.27_fasta.fa", + "referenceFasta": "GCA_000001405.27_fasta.fa", "platform": "BGISEQ-500" }, { "analysisTitle": "Variant Detection 2", "analysisAlias": "VD2", "description": "An example analysis for demonstration purposes", - "ProjectTitle": "Example Project", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27", - "referenceFasta": "GCA_000001405.27_fasta.fa", + "referenceFasta": "GCA_000001405.27_fasta.fa", "platform": "BGISEQ-500" }, { "analysisTitle": "Variant Detection 3", "analysisAlias": "VD3", "description": "An example analysis for demonstration purposes", - "ProjectTitle": "Example Project", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27", - "referenceFasta": "GCA_000001405.27_fasta.fa", + "referenceFasta": "GCA_000001405.27_fasta.fa", "platform": "BGISEQ-500" } ], "sample": [ { - "analysisAlias": "VD1", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD2", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD3", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD3" + ], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003" }, { - "analysisAlias": "VD4", + "analysisAlias": [ + "VD4", + "VD5" + ], "sampleInVCF": "sample4", "bioSampleObject": { "name": "Lm_17_S8", "characteristics": { - "bioSampleName": "Lm_17_S8", "title": [ - "Bastet normal sample" + { + "text": "Bastet normal sample" + } ], "description": [ - "Test Description" + { + "text": "Test Description" + } ], "taxId": [ - 9447 + { + "text": "9447" + } ], "scientificName": [ - "Lemur catta" + { + "text": "Lemur catta" + } ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } - } - }, - { - "analysisAlias": "VD5", - "sampleInVCF": "sample4", - "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" + "sex": [ + { + "text": "Female" + } ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" + "tissueType": [ + { + "text": "skin" + } ], - "sex": "Female", - "tissueType": "skin", "species": [ - "Lemur catta" + { + "text": "Lemur catta" + } ] } } diff --git a/tests/resources/EVA_Submission_test_with_asm_report.json b/tests/resources/EVA_Submission_test_with_asm_report.json index 892a0c2..5c3ede3 100644 --- a/tests/resources/EVA_Submission_test_with_asm_report.json +++ b/tests/resources/EVA_Submission_test_with_asm_report.json @@ -64,90 +64,73 @@ ], "sample": [ { - "analysisAlias": "VD1", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD2", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD3", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD3" + ], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003" }, { - "analysisAlias": "VD4", + "analysisAlias": [ + "VD4", + "VD5" + ], "sampleInVCF": "sample4", "bioSampleObject": { "name": "Lm_17_S8", "characteristics": { - "bioSampleName": "Lm_17_S8", "title": [ - "Bastet normal sample" + { + "text": "Bastet normal sample" + } ], "description": [ - "Test Description" + { + "text": "Test Description" + } ], "taxId": [ - 9447 + { + "text": "9447" + } ], "scientificName": [ - "Lemur catta" + { + "text": "Lemur catta" + } ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } - } - }, - { - "analysisAlias": "VD5", - "sampleInVCF": "sample4", - "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" + "sex": [ + { + "text": "Female" + } ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" + "tissueType": [ + { + "text": "skin" + } ], - "sex": "Female", - "tissueType": "skin", "species": [ - "Lemur catta" + { + "text": "Lemur catta" + } ] } } diff --git a/tests/resources/sample_checker/metadata.json b/tests/resources/sample_checker/metadata.json index 8854cde..20a9d0b 100644 --- a/tests/resources/sample_checker/metadata.json +++ b/tests/resources/sample_checker/metadata.json @@ -23,37 +23,17 @@ }, "sample": [ { - "analysisAlias": "VD1", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample1", "BioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample2", "BioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "BioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample2", - "BioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", - "sampleInVCF": "sample1", - "BioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", - "sampleInVCF": "sample2", - "BioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": ["VD3"], "sampleInVCF": "sample3", "BioSampleAccession": "SAME00003" } diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml new file mode 100644 index 0000000..c0c130c --- /dev/null +++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml @@ -0,0 +1,8 @@ +- description: PRJEBNA does not exist or is private + property: /project/childProjects/1 +- description: 1234 is not a valid taxonomy code + property: /sample/2/bioSampleObject/characteristics/taxId +- description: alias1 present in Analysis not in Samples + property: /sample/analysisAlias +- description: alias_1,alias_2 present in Samples not in Analysis + property: /sample/analysisAlias diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt index d3229f0..3b805ec 100644 --- a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt +++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt @@ -8,21 +8,21 @@ 2023-06-29T10:03:45.008Z [info] Validation failed with errors: data should have required property 'files', data/project should have required property 'title', data/project should have required property 'description', data/project should have required property 'taxId', data/project should have required property 'centre', data/analysis/0 should have required property 'analysisTitle', data/analysis/0 should have required property 'description', data/analysis/0 should have required property 'experimentType', data/analysis/0 should have required property 'referenceGenome', data/sample/0 should have required property 'bioSampleAccession', data/sample/0 should have required property 'bioSampleObject', data/sample/0 should match exactly one schema in oneOf  Validation failed with following error(s):  - .files + /files should have required property 'files' -/project.title +/project/title should have required property 'title' /project/taxId must have required property 'taxId' /project/holdDate must match format "date" -/analysis/0.description +/analysis/0/description should have required property 'description' -/analysis/0.referenceGenome +/analysis/0/referenceGenome should have required property 'referenceGenome' -/sample/0.bioSampleAccession +/sample/0/bioSampleAccession should have required property 'bioSampleAccession' -/sample/0.bioSampleObject +/sample/0/bioSampleObject should have required property 'bioSampleObject' /sample/0 should match exactly one schema in oneOf diff --git a/tests/test_check_fasta_insdc.py b/tests/test_check_fasta_insdc.py index a1d34ff..116e6cc 100644 --- a/tests/test_check_fasta_insdc.py +++ b/tests/test_check_fasta_insdc.py @@ -2,6 +2,7 @@ from unittest import TestCase from unittest.mock import patch +import pytest import requests as requests from bin.check_fasta_insdc import assess_fasta, get_analyses_and_reference_genome_from_metadata @@ -10,14 +11,77 @@ class TestFastaChecker(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') + def test_get_analysis_and_reference_genome_from_metadata(self): + metadata_json = os.path.join(self.resource_dir, 'sample_checker', 'metadata.json') + vcf_file = os.path.join(self.resource_dir, 'sample_checker', 'example1.vcf.gz') + analyses, reference = get_analyses_and_reference_genome_from_metadata([vcf_file], metadata_json) + assert analyses == {'VD1'} + assert reference == 'GCA_000001405.27' + def test_assess_fasta_is_insdc(self): input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') - results = assess_fasta(input_fasta, ['analysis'], None) - assert results == { - 'all_insdc': True, - 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], - 'possible_assemblies': {'GCA_000146045.2'} - } + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = {'sequence_name': 'chr1'} + m_get_assemblies.return_value = {'GCA_000146045.2'} + results = assess_fasta(input_fasta, ['analysis'], None) + assert results == { + 'all_insdc': True, + 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'possible_assemblies': {'GCA_000146045.2'} + } + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = None + m_get_assemblies.return_value = set() + results = assess_fasta(input_fasta, ['analysis'], None) + assert results == { + 'all_insdc': False, + 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': False}] + } + + def test_assess_fasta_matches_metadata(self): + input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = {'sequence_name': 'I'} + m_get_assemblies.return_value = {'GCA_000146045.2'} + results = assess_fasta(input_fasta, ['analysis'], 'GCA_000146045.2') + assert results == { + 'all_insdc': True, + 'sequences': [ + {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'possible_assemblies': {'GCA_000146045.2'}, + 'metadata_assembly_compatible': True, + 'associated_analyses': ['analysis'], + 'assembly_in_metadata': 'GCA_000146045.2' + } + results = assess_fasta(input_fasta, ['analysis'], 'GCA_002915635.1') + assert results == { + 'all_insdc': True, + 'sequences': [ + {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'possible_assemblies': {'GCA_000146045.2'}, + 'metadata_assembly_compatible': False, + 'associated_analyses': ['analysis'], + 'assembly_in_metadata': 'GCA_002915635.1' + } + + def test_assess_fasta_http_error(self): + input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = {'sequence_name': 'I'} + m_get_assemblies.side_effect = requests.HTTPError('500 Internal Server Error') + results = assess_fasta(input_fasta, ['analysis'], None) + assert results == { + 'all_insdc': True, + 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'connection_error': '500 Internal Server Error' + } + + @pytest.mark.integration + def test_assess_fasta_is_insdc_integration(self): input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'input_passed.fa') results = assess_fasta(input_fasta, ['analysis'], None) assert results == { @@ -25,7 +89,8 @@ def test_assess_fasta_is_insdc(self): 'sequences': [{'sequence_name': 'chr1', 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d', 'insdc': False}] } - def test_assess_fasta_matches_metadata(self): + @pytest.mark.integration + def test_assess_fasta_matches_metadata_integration(self): input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') results = assess_fasta(input_fasta, ['analysis'], 'GCA_000146045.2') assert results == { @@ -37,31 +102,3 @@ def test_assess_fasta_matches_metadata(self): 'associated_analyses': ['analysis'], 'assembly_in_metadata': 'GCA_000146045.2' } - results = assess_fasta(input_fasta, ['analysis'], 'GCA_002915635.1') - assert results == { - 'all_insdc': True, - 'sequences': [ - {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], - 'possible_assemblies': {'GCA_000146045.2'}, - 'metadata_assembly_compatible': False, - 'associated_analyses': ['analysis'], - 'assembly_in_metadata': 'GCA_002915635.1' - } - - def test_get_analysis_and_reference_genome_from_metadata(self): - metadata_json = os.path.join(self.resource_dir, 'sample_checker', 'metadata.json') - vcf_file = os.path.join(self.resource_dir, 'sample_checker', 'example1.vcf.gz') - analyses, reference = get_analyses_and_reference_genome_from_metadata([vcf_file], metadata_json) - assert analyses == {'VD1'} - assert reference == 'GCA_000001405.27' - - def test_assess_fasta_http_error(self): - input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') - with patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: - m_get_assemblies.side_effect = requests.HTTPError('500 Internal Server Error') - results = assess_fasta(input_fasta, ['analysis'], None) - assert results == { - 'all_insdc': True, - 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], - 'connection_error': '500 Internal Server Error' - } diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index 5d2b000..72aba00 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -33,9 +33,11 @@ def setUp(self): [os.path.join(self.assembly_reports, 'input_passed.txt')]) sub_metadata = { "submitterDetails": [], - "project": {}, + "project": { + "parentProject": "PRJ_INVALID" + }, "sample": [ - {"analysisAlias": "AA", "sampleInVCF": "HG00096", "BioSampleAccession": "SAME0000096"} + {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"} ], "analysis": [ {"analysisAlias": "AA"} @@ -131,6 +133,11 @@ def test_validate(self): 'md5': '96a80c9368cc3c37095c86fbe6044fb2'} ] + # Check metadata errors + with open(os.path.join(self.validator.output_dir, 'other_validations', 'metadata_validation.txt')) as open_file: + metadata_val_lines = {l.strip() for l in open_file.readlines()} + assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines + def test_validate_from_excel(self): self.validator_from_excel.validate() self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml)) diff --git a/tests/test_main.py b/tests/test_main.py index b281388..2a19196 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -33,7 +33,6 @@ def setUp(self) -> None: os.makedirs(self.test_sub_dir) shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test.json'), self.metadata_json) - def tearDown(self) -> None: shutil.rmtree(self.test_sub_dir) diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py new file mode 100644 index 0000000..ab61f7b --- /dev/null +++ b/tests/test_semantic_metadata.py @@ -0,0 +1,93 @@ +from unittest import TestCase +from unittest.mock import patch + +from eva_sub_cli.semantic_metadata import SemanticMetadataChecker + + +class TestSemanticMetadata(TestCase): + + def test_check_all_project_accessions(self): + metadata = { + "project": { + "parentProject": "PRJEB123", + "childProjects": ["PRJEB456", "PRJEBNA"] + }, + } + checker = SemanticMetadataChecker(metadata) + with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: + m_ena_download.side_effect = [True, True, Exception('problem downloading')] + checker.check_all_project_accessions() + self.assertEqual(checker.errors, [ + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'} + ]) + + def test_check_all_taxonomy_codes(self): + metadata = { + "project": { + "taxId": 9606, + }, + "sample": [ + { + "bioSampleAccession": "SAME00003" + }, + { + "bioSampleObject": { + "characteristics": { + "taxId": [{"text": "9606"}] + } + } + }, + { + "bioSampleObject": { + "characteristics": { + "taxId": [{"text": "1234"}] + } + } + } + ] + } + checker = SemanticMetadataChecker(metadata) + with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: + # Mock should only be called once per taxonomy code + m_ena_download.side_effect = [True, Exception('problem downloading')] + checker.check_all_taxonomy_codes() + self.assertEqual(checker.errors, [ + { + 'property': '/sample/2/bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code' + } + ]) + + def test_check_analysis_alias_coherence(self): + metadata = { + "analysis": [ + {"analysisAlias": "alias1"}, + {"analysisAlias": "alias2"} + ], + "sample": [ + { + "bioSampleAccession": "SAME00003", + "analysisAlias": ["alias_1", "alias_2"] + }, + { + "bioSampleAccession": "SAME00004", + "analysisAlias": ["alias2"] + } + ], + "files": [ + { + "analysisAlias": "alias1", + "fileName": "example1.vcf.gz" + }, + { + "analysisAlias": "alias2", + "fileName": "example2.vcf.gz" + } + ] + } + checker = SemanticMetadataChecker(metadata) + checker.check_analysis_alias_coherence() + self.assertEqual(checker.errors, [ + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', 'description': 'alias_1,alias_2 present in Samples not in Analysis'} + ]) diff --git a/tests/test_validator.py b/tests/test_validator.py index 292ffa6..f01b4e1 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -59,15 +59,21 @@ def test__collect_validation_workflow_results(self): }, 'metadata_check': { 'json_errors': [ - {'property': '.files', 'description': "should have required property 'files'"}, - {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0.description', 'description': "should have required property 'description'"}, - {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, - {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + {'property': '/analysis/0/description', 'description': "should have required property 'description'"}, + {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, + {'property': '/sample/2/bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code'}, + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'}, ], 'spreadsheet_errors': [ # NB. Wouldn't normally get conversion error + validation errors together, but it is supported. @@ -79,13 +85,21 @@ def test__collect_validation_workflow_results(self): {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, {'sheet': 'Project', 'row': '', 'column': 'Hold Date', - 'description': 'In sheet "Project", column "Hold Date" is not populated'}, + 'description': 'must match format "date"'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)', + 'description': 'PRJEBNA does not exist or is private'}, + {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', + 'description': '1234 is not a valid taxonomy code'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias1 present in Analysis not in Samples'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'} ] } } @@ -121,33 +135,40 @@ def test_parse_biovalidator_validation_results(self): self.validator.results['metadata_check'] = {} self.validator._parse_biovalidator_validation_results() assert self.validator.results['metadata_check']['json_errors'] == [ - {'property': '.files', 'description': "should have required property 'files'"}, - {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0.description', 'description': "should have required property 'description'"}, - {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, + {'property': '/analysis/0/description', 'description': "should have required property 'description'"}, + {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ] def test_convert_biovalidator_validation_to_spreadsheet(self): self.validator.results['metadata_check'] = { 'json_errors': [ - {'property': '.files', 'description': "should have required property 'files'"}, - {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0.description', + {'property': '/analysis/0/description', 'description': "should have required property 'description'"}, - {'property': '/analysis/0.referenceGenome', + {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', + {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0.bioSampleObject', + {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, - {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, + # Semantic checks + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, + {'property': '/sample/2/bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code'}, + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'} ] } self.validator._convert_biovalidator_validation_to_spreadsheet() @@ -159,13 +180,20 @@ def test_convert_biovalidator_validation_to_spreadsheet(self): {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, {'sheet': 'Project', 'row': '', 'column': 'Hold Date', - 'description': 'In sheet "Project", column "Hold Date" is not populated'}, + 'description': 'must match format "date"'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', - 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} + 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)', + 'description': 'PRJEBNA does not exist or is private'}, + {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', 'description': '1234 is not a valid taxonomy code'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias1 present in Analysis not in Samples'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'} ] def test_parse_assembly_check_log(self): diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py index f292d06..84a6c63 100644 --- a/tests/test_xlsx2json.py +++ b/tests/test_xlsx2json.py @@ -29,14 +29,20 @@ def test_conversion_2_json(self) -> None: xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx') self.parser = XlsxParser(xls_filename, self.conf_filename) output_json = os.path.join(self.resource_dir, 'EVA_Submission_test_output.json') + errors_yaml = os.path.join(self.resource_dir, 'EVA_Submission_test_errors.yml') self.parser.json(output_json) + self.parser.save_errors(errors_yaml) + + # confirm no errors + with open(errors_yaml) as open_file: + errors_data = yaml.safe_load(open_file) + assert errors_data == [] with open(output_json) as open_file: json_data = json.load(open_file) # assert json file is created with expected data assert sorted(json_data.keys()) == ['analysis', 'files', 'project', 'sample', 'submitterDetails'] - print(self.get_expected_json()) - self.assertTrue(self.get_expected_json() == json_data) + self.assertEqual(self.get_expected_json(), json_data) # assert json schema with open(self.eva_schema) as eva_schema_file: @@ -45,7 +51,7 @@ def test_conversion_2_json(self) -> None: biosample_json_schema = json.load(biosample_schema_file) # assert created json file sample field conforms to eva-biosamples schema - jsonschema.validate(json_data['sample'][7]['bioSampleObject'], biosample_json_schema) + jsonschema.validate(json_data['sample'][3]['bioSampleObject'], biosample_json_schema) # assert created json file conform to eva_schema resolver = jsonschema.RefResolver.from_schema(eva_json_schema) @@ -133,92 +139,48 @@ def get_expected_json(self): ], "sample": [ { - "analysisAlias": "VD1", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD3", + "analysisAlias": ["VD3"], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003" }, { - "analysisAlias": "VD4", - "sampleInVCF": "sample4", - "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" - ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" - ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } - } - }, - { - "analysisAlias": "VD5", + "analysisAlias": ["VD4", "VD5"], "sampleInVCF": "sample4", "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" - ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" - ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } + "name": "Lm_17_S8", + "characteristics": { + "title": [ + {"text": "Bastet normal sample"} + ], + "description": [ + {"text": "Test Description"} + ], + "taxId": [ + {"text": "9447"} + ], + "scientificName": [ + {"text": "Lemur catta"} + ], + "sex": [ + {"text": "Female"} + ], + "tissueType": [ + {"text": "skin"} + ], + "species": [ + {"text": "Lemur catta"} + ] + } } } ],