From 80931a8594f297c480901402eead2e026a517e6e Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 4 Jul 2024 15:42:18 +0100 Subject: [PATCH 01/11] semantic metadata checks WIP --- bin/check_metadata_semantics.py | 70 +++++++++++++++++++++++++++++ eva_sub_cli/etc/eva_schema.json | 13 ++++-- eva_sub_cli/nextflow/validation.nf | 21 ++++++++- eva_sub_cli/validators/validator.py | 1 + requirements.txt | 7 +-- 5 files changed, 105 insertions(+), 7 deletions(-) create mode 100644 bin/check_metadata_semantics.py diff --git a/bin/check_metadata_semantics.py b/bin/check_metadata_semantics.py new file mode 100644 index 0000000..4091038 --- /dev/null +++ b/bin/check_metadata_semantics.py @@ -0,0 +1,70 @@ +import argparse +import json +import re + +import requests +import yaml + +from retry import retry +from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena +from ebi_eva_common_pyutils.logger import logging_config + + +logger = logging_config.get_logger(__name__) + +PARENT_PROJECT_KEY = 'parentProject' +CHILD_PROJECTS_KEY = 'childProjects' +PEER_PROJECTS_KEY = 'peerProjects' + + +@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) +def check_existing_project_in_ena(project_accession): + """Check if a project accession exists and is public in ENA""" + try: + download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_accession}') + except requests.exceptions.HTTPError: + return False + return True + + +def check_project_accession(errors, project_acc, key_name, idx=None): + if not check_existing_project_in_ena(project_acc): + field_name = f'/project/{key_name}' + if idx is not None: + field_name += f'/{idx}' + errors[field_name] = f'{project_acc} does not exist or is private' + + +def write_result_yaml(output_yaml, results): + with open(output_yaml, 'w') as open_yaml: + yaml.safe_dump(data=results, stream=open_yaml) + + +def check_all_project_accessions(metadata): + """Check that ENA project accessions exist and are public""" + errors = {} + project = metadata['project'] + check_project_accession(errors, project[PARENT_PROJECT_KEY], PARENT_PROJECT_KEY) + for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]): + check_project_accession(errors, accession, CHILD_PROJECTS_KEY, idx) + for idx, accession in enumerate(project[PEER_PROJECTS_KEY]): + check_project_accession(errors, accession, PEER_PROJECTS_KEY, idx) + return errors + + +def main(): + arg_parser = argparse.ArgumentParser(description='Perform semantic checks on the metadata') + arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file') + arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml', + help='Path to the location of the results') + args = arg_parser.parse_args() + logging_config.add_stdout_handler() + with open(args.metadata_json) as open_json: + metadata = json.load(open_json) + errors = check_all_project_accessions(metadata) + # TODO other errors + write_result_yaml(args.output_yaml, errors) + + +if __name__ == "__main__": + main() diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json index e069fee..78c2c22 100644 --- a/eva_sub_cli/etc/eva_schema.json +++ b/eva_sub_cli/etc/eva_schema.json @@ -94,18 +94,18 @@ "items": true }, "parentProject": { - "type": "string" + "$ref": "#/definitions/projectAccession" }, "childProjects": { "type": "array", "items": { - "type": "string" + "$ref": "#/definitions/projectAccession" } }, "peerProjects": { "type": "array", "items": { - "type": "string" + "$ref": "#/definitions/projectAccession" } }, "links": { @@ -349,5 +349,12 @@ } } } + }, + "definitions": { + "projectAccession": { + "type": "string", + "description": "ENA project accession", + "pattern": "^PRJ(EB|NA)\\d+$" + } } } \ No newline at end of file diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 2b6d5d1..a382150 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -29,7 +29,8 @@ params.executable = [ params.python_scripts = [ "samples_checker": "samples_checker.py", "fasta_checker": "check_fasta_insdc.py", - "xlsx2json": "xlsx2json.py" + "xlsx2json": "xlsx2json.py", + "semantic_checker": "check_metadata_semantics.py" ] // prefix to prepend to all provided path params.base_dir = "" @@ -255,3 +256,21 @@ process insdc_checker { $params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml """ } + + +process metadata_semantic_check { + publishDir output_dir, + overwrite: true, + mode: "copy" + + input: + path(metadata_json) + + output: + path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml + + script: + """ + $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml + """ +} diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index e35fd00..c2fd996 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -363,6 +363,7 @@ def _collect_metadata_results(self): self._convert_biovalidator_validation_to_spreadsheet() self._write_spreadsheet_validation_results() self._collect_md5sum_to_metadata() + # TODO add semantic metadata checks def _load_spreadsheet_conversion_errors(self): errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', diff --git a/requirements.txt b/requirements.txt index 0746216..5ee6871 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ -pyyaml +ebi_eva_common_pyutils==0.6.3 jinja2 +jsonschema minify_html==0.11.1 openpyxl +pyyaml requests -jsonschema -ebi_eva_common_pyutils==0.6.3 \ No newline at end of file +retry \ No newline at end of file From 71b28bcef66642d495cf9808fb9fee09e0f29ed9 Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 5 Jul 2024 11:54:16 +0100 Subject: [PATCH 02/11] WIP - refactoring --- bin/check_metadata_semantics.py | 58 ++----------- eva_sub_cli/semantic_metadata.py | 128 ++++++++++++++++++++++++++++ eva_sub_cli/validators/validator.py | 13 ++- 3 files changed, 145 insertions(+), 54 deletions(-) create mode 100644 eva_sub_cli/semantic_metadata.py diff --git a/bin/check_metadata_semantics.py b/bin/check_metadata_semantics.py index 4091038..07705ec 100644 --- a/bin/check_metadata_semantics.py +++ b/bin/check_metadata_semantics.py @@ -1,55 +1,7 @@ import argparse import json -import re -import requests -import yaml - -from retry import retry -from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena -from ebi_eva_common_pyutils.logger import logging_config - - -logger = logging_config.get_logger(__name__) - -PARENT_PROJECT_KEY = 'parentProject' -CHILD_PROJECTS_KEY = 'childProjects' -PEER_PROJECTS_KEY = 'peerProjects' - - -@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) -def check_existing_project_in_ena(project_accession): - """Check if a project accession exists and is public in ENA""" - try: - download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_accession}') - except requests.exceptions.HTTPError: - return False - return True - - -def check_project_accession(errors, project_acc, key_name, idx=None): - if not check_existing_project_in_ena(project_acc): - field_name = f'/project/{key_name}' - if idx is not None: - field_name += f'/{idx}' - errors[field_name] = f'{project_acc} does not exist or is private' - - -def write_result_yaml(output_yaml, results): - with open(output_yaml, 'w') as open_yaml: - yaml.safe_dump(data=results, stream=open_yaml) - - -def check_all_project_accessions(metadata): - """Check that ENA project accessions exist and are public""" - errors = {} - project = metadata['project'] - check_project_accession(errors, project[PARENT_PROJECT_KEY], PARENT_PROJECT_KEY) - for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]): - check_project_accession(errors, accession, CHILD_PROJECTS_KEY, idx) - for idx, accession in enumerate(project[PEER_PROJECTS_KEY]): - check_project_accession(errors, accession, PEER_PROJECTS_KEY, idx) - return errors +from eva_sub_cli.semantic_metadata import SemanticMetadataChecker def main(): @@ -58,12 +10,12 @@ def main(): arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml', help='Path to the location of the results') args = arg_parser.parse_args() - logging_config.add_stdout_handler() + with open(args.metadata_json) as open_json: metadata = json.load(open_json) - errors = check_all_project_accessions(metadata) - # TODO other errors - write_result_yaml(args.output_yaml, errors) + checker = SemanticMetadataChecker(metadata) + checker.check_all() + checker.write_result_yaml(args.output_yaml) if __name__ == "__main__": diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py new file mode 100644 index 0000000..a224b74 --- /dev/null +++ b/eva_sub_cli/semantic_metadata.py @@ -0,0 +1,128 @@ +import yaml + +from retry import retry +from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena +from ebi_eva_common_pyutils.logger import AppLogger + + +PROJECT_KEY = 'project' +PARENT_PROJECT_KEY = 'parentProject' +CHILD_PROJECTS_KEY = 'childProjects' +PEER_PROJECTS_KEY = 'peerProjects' +SAMPLE_KEY = 'sample' +BIOSAMPLE_OBJECT_KEY = 'bioSampleObject' +CHARACTERISTICS_KEY = 'characteristics' +TAX_ID_KEY = 'taxId' + + +def cast_list(l, type_to_cast=str): + for e in l: + yield type_to_cast(e) + + +class SemanticMetadataChecker(AppLogger): + + def __init__(self, metadata): + self.metadata = metadata + self.errors = [] + + def write_result_yaml(self, output_path): + with open(output_path, 'w') as open_yaml: + yaml.safe_dump(data=self.errors, stream=open_yaml) + + def check_all(self): + self.check_all_project_accessions() + self.check_all_taxonomy_codes() + + def check_all_project_accessions(self): + """Check that ENA project accessions exist and are public""" + project = self.metadata[PROJECT_KEY] + self.check_project_accession(project[PARENT_PROJECT_KEY], f'/{PROJECT_KEY}/{PARENT_PROJECT_KEY}') + for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]): + self.check_project_accession(accession, f'/{PROJECT_KEY}/{CHILD_PROJECTS_KEY}/{idx}') + for idx, accession in enumerate(project[PEER_PROJECTS_KEY]): + self.check_project_accession(accession, f'/{PROJECT_KEY}/{PEER_PROJECTS_KEY}/{idx}') + + def check_all_taxonomy_codes(self): + """Check that taxonomy IDs are valid according to ENA""" + project = self.metadata[PROJECT_KEY] + self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}') + # Check sample taxonomies for novel samples + for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): + if BIOSAMPLE_OBJECT_KEY in sample: + self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY], + f'/{SAMPLE_KEY}/{idx}.{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') + + @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) + def check_project_accession(self, project_acc, json_path): + try: + download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_acc}') + except Exception: + self.add_error(json_path, f'{project_acc} does not exist or is private') + + @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) + def check_taxonomy_code(self, taxonomy_code, json_path): + try: + download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}') + except Exception: + self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') + + def add_error(self, property, description): + # Ensure that errors match the format of biovalidator errors + self.errors.append({'property': property, 'description': description}) + + def check_analysis_alias_coherence(self): + # TODO modify these two methods + """Check that the same analysis aliases are used in analysis, sample, and files""" + analysis_aliases = [analysis_row['Analysis Alias'] for analysis_row in self.metadata['Analysis']] + self.same_set( + analysis_aliases, + [analysis_alias.strip() for sample_row in self.metadata['Sample'] for analysis_alias in + sample_row['Analysis Alias'].split(',')], + 'Analysis Alias', 'Samples' + ) + self.same_set(analysis_aliases, [file_row['Analysis Alias'] for file_row in self.metadata['Files']], + 'Analysis Alias', 'Files') + + def same_set(self, list1, list2, list1_desc, list2_desc): + if not set(list1) == set(list2): + list1_list2 = sorted(cast_list(set(list1).difference(list2))) + list2_list1 = sorted(cast_list(set(list2).difference(list1))) + errors = [] + if list1_list2: + errors.append('%s present in %s not in %s' % (','.join(list1_list2), list1_desc, list2_desc)) + if list2_list1: + errors.append('%s present in %s not in %s' % (','.join(list2_list1), list2_desc, list1_desc)) + self.error_list.append('Check %s vs %s: %s' % (list1_desc, list2_desc, ' -- '.join(errors))) + # + # def _validate_existing_biosample(sample_data, row_num, accession): + # """This function only check if the existing sample has the expected fields present""" + # found_collection_date = False + # for key in ['collection_date', 'collection date']: + # if key in sample_data['characteristics'] and \ + # # TODO date just needs to exist, format should be checked in json schema (?) + # self._check_date(sample_data['characteristics'][key][0]['text']): + # found_collection_date = True + # if not found_collection_date: + # self.error_list.append( + # f'In row {row_num}, existing sample accession {accession} does not have a valid collection date') + # found_geo_loc = False + # for key in ['geographic location (country and/or sea)']: + # if key in sample_data['characteristics'] and sample_data['characteristics'][key][0]['text']: + # found_geo_loc = True + # if not found_geo_loc: + # self.error_list.append( + # f'In row {row_num}, existing sample accession {accession} does not have a valid geographic location') + # + # def check_all_sample_accessions(metadata): + # """Check that BioSample accessions exist and are public""" + # for row in self.metadata['Sample']: + # if row.get('Sample Accession'): + # sample_accession = row.get('Sample Accession').strip() + # try: + # sample_data = self.communicator.follows_link('samples', join_url=sample_accession) + # _validate_existing_biosample(sample_data, row.get('row_num'), sample_accession) + # except ValueError: + # self.error_list.append( + # f'In Sample, row {row.get("row_num")} BioSamples accession {sample_accession} ' + # f'does not exist or is private') diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index c2fd996..004215b 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -360,10 +360,10 @@ def _collect_metadata_results(self): self.results['metadata_check'] = {} self._load_spreadsheet_conversion_errors() self._parse_biovalidator_validation_results() + self._parse_semantic_metadata_results() self._convert_biovalidator_validation_to_spreadsheet() self._write_spreadsheet_validation_results() self._collect_md5sum_to_metadata() - # TODO add semantic metadata checks def _load_spreadsheet_conversion_errors(self): errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', @@ -420,6 +420,17 @@ def _parse_metadata_property(self, property_str): logger.error(f'Cannot parse {property_str} in JSON metadata error') return None, None, None + def _parse_semantic_metadata_results(self): + errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', + 'metadata_semantic_check.yml')) + if not errors_file: + return + with open(errors_file) as open_yaml: + # errors is a list of dicts matching format of biovalidator errors + errors = yaml.safe_load(open_yaml) + # biovalidator error parsing always places a list here, even if no errors + self.results['metadata_check']['json_errors'] += errors + def _convert_biovalidator_validation_to_spreadsheet(self): config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml") with open(config_file) as open_file: From bf35bb48e7633cffcb10a157968039d28633a4e1 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 8 Jul 2024 14:45:36 +0100 Subject: [PATCH 03/11] add analysis alias coherence, tests --- eva_sub_cli/etc/eva_schema.json | 8 ++- eva_sub_cli/semantic_metadata.py | 103 ++++++++++++++----------------- eva_sub_cli/submit.py | 1 - tests/test_semantic_metadata.py | 93 ++++++++++++++++++++++++++++ 4 files changed, 146 insertions(+), 59 deletions(-) create mode 100644 tests/test_semantic_metadata.py diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json index 78c2c22..e4e3028 100644 --- a/eva_sub_cli/etc/eva_schema.json +++ b/eva_sub_cli/etc/eva_schema.json @@ -296,8 +296,12 @@ ], "properties": { "analysisAlias": { - "type": "string", - "description": "Alias of the analysis performed on this sample." + "type": "array", + "description": "Aliases of the analyses performed on this sample", + "minItems": 1, + "items": { + "type": "string" + } }, "sampleInVCF": { "type": "string", diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index a224b74..3f7de66 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -5,14 +5,18 @@ from ebi_eva_common_pyutils.logger import AppLogger +# TODO check for a nicer way to do this PROJECT_KEY = 'project' +ANALYSIS_KEY = 'analysis' +SAMPLE_KEY = 'sample' +FILES_KEY = 'files' PARENT_PROJECT_KEY = 'parentProject' CHILD_PROJECTS_KEY = 'childProjects' PEER_PROJECTS_KEY = 'peerProjects' -SAMPLE_KEY = 'sample' BIOSAMPLE_OBJECT_KEY = 'bioSampleObject' CHARACTERISTICS_KEY = 'characteristics' TAX_ID_KEY = 'taxId' +ANALYSIS_ALIAS_KEY = 'analysisAlias' def cast_list(l, type_to_cast=str): @@ -33,25 +37,27 @@ def write_result_yaml(self, output_path): def check_all(self): self.check_all_project_accessions() self.check_all_taxonomy_codes() + self.check_analysis_alias_coherence() def check_all_project_accessions(self): - """Check that ENA project accessions exist and are public""" + """Check that ENA project accessions exist and are public.""" project = self.metadata[PROJECT_KEY] - self.check_project_accession(project[PARENT_PROJECT_KEY], f'/{PROJECT_KEY}/{PARENT_PROJECT_KEY}') - for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]): + if PARENT_PROJECT_KEY in project: + self.check_project_accession(project[PARENT_PROJECT_KEY], f'/{PROJECT_KEY}/{PARENT_PROJECT_KEY}') + for idx, accession in enumerate(project.get(CHILD_PROJECTS_KEY, [])): self.check_project_accession(accession, f'/{PROJECT_KEY}/{CHILD_PROJECTS_KEY}/{idx}') - for idx, accession in enumerate(project[PEER_PROJECTS_KEY]): + for idx, accession in enumerate(project.get(PEER_PROJECTS_KEY, [])): self.check_project_accession(accession, f'/{PROJECT_KEY}/{PEER_PROJECTS_KEY}/{idx}') def check_all_taxonomy_codes(self): - """Check that taxonomy IDs are valid according to ENA""" + """Check that taxonomy IDs are valid according to ENA.""" project = self.metadata[PROJECT_KEY] self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}') # Check sample taxonomies for novel samples for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): if BIOSAMPLE_OBJECT_KEY in sample: self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY], - f'/{SAMPLE_KEY}/{idx}.{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') + f'/{SAMPLE_KEY}/{idx}.{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) def check_project_accession(self, project_acc, json_path): @@ -68,61 +74,46 @@ def check_taxonomy_code(self, taxonomy_code, json_path): self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') def add_error(self, property, description): + """ + Add an error, conforming to the format of biovalidator errors. + + :param property: JSON property of the error. This will be converted to sheet/row/column in spreadsheet if needed. + :param description: description of the error. + """ # Ensure that errors match the format of biovalidator errors self.errors.append({'property': property, 'description': description}) def check_analysis_alias_coherence(self): - # TODO modify these two methods - """Check that the same analysis aliases are used in analysis, sample, and files""" - analysis_aliases = [analysis_row['Analysis Alias'] for analysis_row in self.metadata['Analysis']] - self.same_set( - analysis_aliases, - [analysis_alias.strip() for sample_row in self.metadata['Sample'] for analysis_alias in - sample_row['Analysis Alias'].split(',')], - 'Analysis Alias', 'Samples' - ) - self.same_set(analysis_aliases, [file_row['Analysis Alias'] for file_row in self.metadata['Files']], - 'Analysis Alias', 'Files') - - def same_set(self, list1, list2, list1_desc, list2_desc): + """Check that the same analysis aliases are used in analysis, sample, and files.""" + analysis_aliases = [analysis[ANALYSIS_ALIAS_KEY] for analysis in self.metadata[ANALYSIS_KEY]] + aliases_in_samples = [alias for sample in self.metadata[SAMPLE_KEY] for alias in sample[ANALYSIS_ALIAS_KEY]] + aliases_in_files = [file[ANALYSIS_ALIAS_KEY] for file in self.metadata[FILES_KEY]] + + self.same_set(analysis_aliases, aliases_in_samples, 'Analysis', 'Samples', + f'/{SAMPLE_KEY}/{ANALYSIS_ALIAS_KEY}') + self.same_set(analysis_aliases, aliases_in_files, 'Analysis', 'Files', + f'/{FILES_KEY}/{ANALYSIS_ALIAS_KEY}') + + def same_set(self, list1, list2, list1_desc, list2_desc, json_path): + """ + Compare contents of two lists and add error messages. + + :param list1: first list to compare + :param list2: second list to compare + :param list1_desc: test description of first list, used only in error message + :param list2_desc: text description of second list, used only in error message + :param json_path: property where the error message will appear + """ if not set(list1) == set(list2): list1_list2 = sorted(cast_list(set(list1).difference(list2))) list2_list1 = sorted(cast_list(set(list2).difference(list1))) - errors = [] if list1_list2: - errors.append('%s present in %s not in %s' % (','.join(list1_list2), list1_desc, list2_desc)) + self.add_error( + property=json_path, + description=f'{",".join(list1_list2)} present in {list1_desc} not in {list2_desc}' + ) if list2_list1: - errors.append('%s present in %s not in %s' % (','.join(list2_list1), list2_desc, list1_desc)) - self.error_list.append('Check %s vs %s: %s' % (list1_desc, list2_desc, ' -- '.join(errors))) - # - # def _validate_existing_biosample(sample_data, row_num, accession): - # """This function only check if the existing sample has the expected fields present""" - # found_collection_date = False - # for key in ['collection_date', 'collection date']: - # if key in sample_data['characteristics'] and \ - # # TODO date just needs to exist, format should be checked in json schema (?) - # self._check_date(sample_data['characteristics'][key][0]['text']): - # found_collection_date = True - # if not found_collection_date: - # self.error_list.append( - # f'In row {row_num}, existing sample accession {accession} does not have a valid collection date') - # found_geo_loc = False - # for key in ['geographic location (country and/or sea)']: - # if key in sample_data['characteristics'] and sample_data['characteristics'][key][0]['text']: - # found_geo_loc = True - # if not found_geo_loc: - # self.error_list.append( - # f'In row {row_num}, existing sample accession {accession} does not have a valid geographic location') - # - # def check_all_sample_accessions(metadata): - # """Check that BioSample accessions exist and are public""" - # for row in self.metadata['Sample']: - # if row.get('Sample Accession'): - # sample_accession = row.get('Sample Accession').strip() - # try: - # sample_data = self.communicator.follows_link('samples', join_url=sample_accession) - # _validate_existing_biosample(sample_data, row.get('row_num'), sample_accession) - # except ValueError: - # self.error_list.append( - # f'In Sample, row {row.get("row_num")} BioSamples accession {sample_accession} ' - # f'does not exist or is private') + self.add_error( + property=json_path, + description=f'{",".join(list2_list1)} present in {list2_desc} not in {list1_desc}' + ) diff --git a/eva_sub_cli/submit.py b/eva_sub_cli/submit.py index c67bb93..8f71128 100644 --- a/eva_sub_cli/submit.py +++ b/eva_sub_cli/submit.py @@ -95,4 +95,3 @@ def submit(self): # Complete the submission self.info(f'Complete submission') self._complete_submission() - diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py new file mode 100644 index 0000000..c329bc5 --- /dev/null +++ b/tests/test_semantic_metadata.py @@ -0,0 +1,93 @@ +from unittest import TestCase +from unittest.mock import patch + +from eva_sub_cli.semantic_metadata import SemanticMetadataChecker + + +class TestSemanticMetadata(TestCase): + + def test_check_all_project_accessions(self): + metadata = { + "project": { + "parentProject": "PRJEB123", + "childProjects": ["PRJEB456", "PRJEBNA"] + }, + } + checker = SemanticMetadataChecker(metadata) + with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: + m_ena_download.side_effect = [True, True, Exception('problem downloading')] + checker.check_all_project_accessions() + self.assertEqual(checker.errors, [ + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'} + ]) + + def test_check_all_taxonomy_codes(self): + metadata = { + "project": { + "taxId": 9606, + }, + "sample": [ + { + "bioSampleAccession": "SAME00003" + }, + { + "bioSampleObject": { + "characteristics": { + # TODO these might need to be lists of strings + "taxId": 9447 + } + } + }, + { + "bioSampleObject": { + "characteristics": { + "taxId": 1234 + } + } + } + ] + } + checker = SemanticMetadataChecker(metadata) + with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: + m_ena_download.side_effect = [True, True, Exception('problem downloading')] + checker.check_all_taxonomy_codes() + self.assertEqual(checker.errors, [ + { + 'property': '/sample/2.bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code' + } + ]) + + def test_check_analysis_alias_coherence(self): + metadata = { + "analysis": [ + {"analysisAlias": "alias1"}, + {"analysisAlias": "alias2"} + ], + "sample": [ + { + "bioSampleAccession": "SAME00003", + "analysisAlias": ["alias_1", "alias_2"] + }, + { + "bioSampleAccession": "SAME00004", + "analysisAlias": ["alias2"] + } + ], + "files": [ + { + "analysisAlias": "alias1", + "fileName": "example1.vcf.gz" + }, + { + "analysisAlias": "alias2", + "fileName": "example2.vcf.gz" + } + ] + } + checker = SemanticMetadataChecker(metadata) + checker.check_analysis_alias_coherence() + self.assertEqual(checker.errors, [ + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', 'description': 'alias_1,alias_2 present in Samples not in Analysis'} + ]) From 10bebd365b3973321830b3fb4822f11a69ca5485 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 8 Jul 2024 17:09:18 +0100 Subject: [PATCH 04/11] update tests --- eva_sub_cli/semantic_metadata.py | 2 +- eva_sub_cli/validators/validator.py | 24 +++++- .../metadata_semantic_check.yml | 8 ++ .../other_validations/metadata_validation.txt | 12 +-- tests/test_docker_validator.py | 2 +- tests/test_main.py | 1 - tests/test_semantic_metadata.py | 2 +- tests/test_validator.py | 74 +++++++++++++------ 8 files changed, 88 insertions(+), 37 deletions(-) create mode 100644 tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index 3f7de66..7716750 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -57,7 +57,7 @@ def check_all_taxonomy_codes(self): for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): if BIOSAMPLE_OBJECT_KEY in sample: self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY], - f'/{SAMPLE_KEY}/{idx}.{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') + f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) def check_project_accession(self, project_acc, json_path): diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py index 004215b..1e2a973 100755 --- a/eva_sub_cli/validators/validator.py +++ b/eva_sub_cli/validators/validator.py @@ -412,7 +412,11 @@ def clean_read(ifile): def _parse_metadata_property(self, property_str): if property_str.startswith('.'): - return property_str.strip('.'), None, None + return property_str.strip('./'), None, None + # First attempt to parse as BioSample object + sheet, row, col = self._parse_sample_metadata_property(property_str) + if sheet is not None and row is not None and col is not None: + return sheet, row, col match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str) if match: return match.group(1), match.group(3), match.group(5) @@ -420,6 +424,12 @@ def _parse_metadata_property(self, property_str): logger.error(f'Cannot parse {property_str} in JSON metadata error') return None, None, None + def _parse_sample_metadata_property(self, property_str): + match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str) + if match: + return 'sample', match.group(1), match.group(2) + return None, None, None + def _parse_semantic_metadata_results(self): errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations', 'metadata_semantic_check.yml')) @@ -446,13 +456,19 @@ def _convert_biovalidator_validation_to_spreadsheet(self): if row_json is None and attribute_json is None: new_description = f'Sheet "{sheet}" is missing' elif row_json is None: - new_description = f'In sheet "{sheet}", column "{column}" is not populated' + if 'have required' not in error['description']: + new_description = error['description'] + else: + new_description = f'In sheet "{sheet}", column "{column}" is not populated' elif attribute_json and column: - new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated' + if 'have required' not in error['description']: + new_description = error['description'] + else: + new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated' else: new_description = error["description"].replace(sheet_json, sheet) if column is None: - # We do not know this attribute. It's most likely about bioSampleObject + # We do not know this attribute. continue if 'schema' in new_description: # This is an error specific to json schema diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml new file mode 100644 index 0000000..c0c130c --- /dev/null +++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml @@ -0,0 +1,8 @@ +- description: PRJEBNA does not exist or is private + property: /project/childProjects/1 +- description: 1234 is not a valid taxonomy code + property: /sample/2/bioSampleObject/characteristics/taxId +- description: alias1 present in Analysis not in Samples + property: /sample/analysisAlias +- description: alias_1,alias_2 present in Samples not in Analysis + property: /sample/analysisAlias diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt index d3229f0..3b805ec 100644 --- a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt +++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt @@ -8,21 +8,21 @@ 2023-06-29T10:03:45.008Z [info] Validation failed with errors: data should have required property 'files', data/project should have required property 'title', data/project should have required property 'description', data/project should have required property 'taxId', data/project should have required property 'centre', data/analysis/0 should have required property 'analysisTitle', data/analysis/0 should have required property 'description', data/analysis/0 should have required property 'experimentType', data/analysis/0 should have required property 'referenceGenome', data/sample/0 should have required property 'bioSampleAccession', data/sample/0 should have required property 'bioSampleObject', data/sample/0 should match exactly one schema in oneOf  Validation failed with following error(s):  - .files + /files should have required property 'files' -/project.title +/project/title should have required property 'title' /project/taxId must have required property 'taxId' /project/holdDate must match format "date" -/analysis/0.description +/analysis/0/description should have required property 'description' -/analysis/0.referenceGenome +/analysis/0/referenceGenome should have required property 'referenceGenome' -/sample/0.bioSampleAccession +/sample/0/bioSampleAccession should have required property 'bioSampleAccession' -/sample/0.bioSampleObject +/sample/0/bioSampleObject should have required property 'bioSampleObject' /sample/0 should match exactly one schema in oneOf diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index 5d2b000..d736235 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -35,7 +35,7 @@ def setUp(self): "submitterDetails": [], "project": {}, "sample": [ - {"analysisAlias": "AA", "sampleInVCF": "HG00096", "BioSampleAccession": "SAME0000096"} + {"analysisAlias": "AA", "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"} ], "analysis": [ {"analysisAlias": "AA"} diff --git a/tests/test_main.py b/tests/test_main.py index b281388..2a19196 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -33,7 +33,6 @@ def setUp(self) -> None: os.makedirs(self.test_sub_dir) shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test.json'), self.metadata_json) - def tearDown(self) -> None: shutil.rmtree(self.test_sub_dir) diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py index c329bc5..fc28216 100644 --- a/tests/test_semantic_metadata.py +++ b/tests/test_semantic_metadata.py @@ -53,7 +53,7 @@ def test_check_all_taxonomy_codes(self): checker.check_all_taxonomy_codes() self.assertEqual(checker.errors, [ { - 'property': '/sample/2.bioSampleObject/characteristics/taxId', + 'property': '/sample/2/bioSampleObject/characteristics/taxId', 'description': '1234 is not a valid taxonomy code' } ]) diff --git a/tests/test_validator.py b/tests/test_validator.py index 292ffa6..f01b4e1 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -59,15 +59,21 @@ def test__collect_validation_workflow_results(self): }, 'metadata_check': { 'json_errors': [ - {'property': '.files', 'description': "should have required property 'files'"}, - {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0.description', 'description': "should have required property 'description'"}, - {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, - {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + {'property': '/analysis/0/description', 'description': "should have required property 'description'"}, + {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, + {'property': '/sample/2/bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code'}, + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'}, ], 'spreadsheet_errors': [ # NB. Wouldn't normally get conversion error + validation errors together, but it is supported. @@ -79,13 +85,21 @@ def test__collect_validation_workflow_results(self): {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, {'sheet': 'Project', 'row': '', 'column': 'Hold Date', - 'description': 'In sheet "Project", column "Hold Date" is not populated'}, + 'description': 'must match format "date"'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)', + 'description': 'PRJEBNA does not exist or is private'}, + {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', + 'description': '1234 is not a valid taxonomy code'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias1 present in Analysis not in Samples'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'} ] } } @@ -121,33 +135,40 @@ def test_parse_biovalidator_validation_results(self): self.validator.results['metadata_check'] = {} self.validator._parse_biovalidator_validation_results() assert self.validator.results['metadata_check']['json_errors'] == [ - {'property': '.files', 'description': "should have required property 'files'"}, - {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0.description', 'description': "should have required property 'description'"}, - {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, + {'property': '/analysis/0/description', 'description': "should have required property 'description'"}, + {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"}, + {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, + {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} ] def test_convert_biovalidator_validation_to_spreadsheet(self): self.validator.results['metadata_check'] = { 'json_errors': [ - {'property': '.files', 'description': "should have required property 'files'"}, - {'property': '/project.title', 'description': "should have required property 'title'"}, + {'property': '/files', 'description': "should have required property 'files'"}, + {'property': '/project/title', 'description': "should have required property 'title'"}, {'property': '/project/taxId', 'description': "must have required property 'taxId'"}, {'property': '/project/holdDate', 'description': 'must match format "date"'}, - {'property': '/analysis/0.description', + {'property': '/analysis/0/description', 'description': "should have required property 'description'"}, - {'property': '/analysis/0.referenceGenome', + {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"}, - {'property': '/sample/0.bioSampleAccession', + {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"}, - {'property': '/sample/0.bioSampleObject', + {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"}, - {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'} + {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}, + # Semantic checks + {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}, + {'property': '/sample/2/bioSampleObject/characteristics/taxId', + 'description': '1234 is not a valid taxonomy code'}, + {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, + {'property': '/sample/analysisAlias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'} ] } self.validator._convert_biovalidator_validation_to_spreadsheet() @@ -159,13 +180,20 @@ def test_convert_biovalidator_validation_to_spreadsheet(self): {'sheet': 'Project', 'row': '', 'column': 'Tax ID', 'description': 'In sheet "Project", column "Tax ID" is not populated'}, {'sheet': 'Project', 'row': '', 'column': 'Hold Date', - 'description': 'In sheet "Project", column "Hold Date" is not populated'}, + 'description': 'must match format "date"'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Description', 'description': 'In sheet "Analysis", row "2", column "Description" is not populated'}, {'sheet': 'Analysis', 'row': 2, 'column': 'Reference', 'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'}, {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession', - 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'} + 'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}, + {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)', + 'description': 'PRJEBNA does not exist or is private'}, + {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', 'description': '1234 is not a valid taxonomy code'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias1 present in Analysis not in Samples'}, + {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias', + 'description': 'alias_1,alias_2 present in Samples not in Analysis'} ] def test_parse_assembly_check_log(self): From 2930ffaa528764a8be9d87fa5519a2a0f2744164 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 9 Jul 2024 13:17:47 +0100 Subject: [PATCH 05/11] fix biosamples characteristic conversion --- bin/xlsx2json.py | 37 +++---- eva_sub_cli/semantic_metadata.py | 6 +- eva_sub_cli/validators/docker_validator.py | 2 +- tests/resources/EVA_Submission_test.json | 109 ++++++++------------- tests/test_semantic_metadata.py | 5 +- tests/test_xlsx2json.py | 105 ++++++-------------- 6 files changed, 97 insertions(+), 167 deletions(-) diff --git a/bin/xlsx2json.py b/bin/xlsx2json.py index 826cc8d..250d10a 100644 --- a/bin/xlsx2json.py +++ b/bin/xlsx2json.py @@ -198,7 +198,7 @@ def get_project_json_data(self): project_data = self.get_rows() if len(project_data) > 1: logger.warning(f"{PROJECT} worksheet expects a single row of info but more than one found in the file. " - f"Only the first row's data will be taken into consideration") + f"Only the first row's data will be taken into consideration") first_row = project_data[0] first_row.pop('row_num') @@ -213,25 +213,29 @@ def get_biosample_object(self, data): # BioSample expects any of organism or species field data[SPECIES] = data[scientific_name] - # For some of the fields, BioSample expects value in arrays - data = {k: [v] if k in ['title', 'description', 'taxId', 'scientificName', 'commonName', SPECIES] else v - for k, v in data.items()} + # BioSample name goes in its own attribute, not part of characteristics + biosample_name = data.pop(sample_name) + # For all characteristics, BioSample expects value in arrays of objects + data = {k: [{'text': self.serialize(v)}] for k, v in data.items()} biosample_object = { - "name": data[sample_name], + "name": biosample_name, "characteristics": data } return biosample_object + @staticmethod + def serialize(value): + """Create a text representation of the value provided""" + if isinstance(value, datetime.date): + return value.strftime('%Y-%m-%d') + return str(value) + def get_sample_data_with_split_analysis_alias(self, data, analysis_alias, sample_name_in_vcf): analysis_alias_list = data[analysis_alias].split(',') sample_in_vcf_val = data[sample_name_in_vcf] - sample_data = [] - for aa in analysis_alias_list: - sample_data.append({analysis_alias: aa, sample_name_in_vcf: sample_in_vcf_val}) - - return sample_data + return {analysis_alias: analysis_alias_list, sample_name_in_vcf: sample_in_vcf_val} def get_sample_json_data(self): json_key = self.xlsx_conf[WORKSHEETS_KEY_NAME][SAMPLE] @@ -243,13 +247,11 @@ def get_sample_json_data(self): analysis_alias = self.xlsx_conf[SAMPLE][REQUIRED_HEADERS_KEY_NAME][ANALYSIS_ALIAS_KEY] sample_name_in_vcf = self.xlsx_conf[SAMPLE][REQUIRED_HEADERS_KEY_NAME][SAMPLE_NAME_IN_VCF_KEY] - sample_data_with_split_aa = self.get_sample_data_with_split_analysis_alias( - json_value, analysis_alias, sample_name_in_vcf) + sample_data = self.get_sample_data_with_split_analysis_alias(json_value, analysis_alias, sample_name_in_vcf) if bio_sample_acc in json_value and json_value[bio_sample_acc]: - sample_data_with_biosample_acc = [dict(item, bioSampleAccession=json_value[bio_sample_acc]) for item in - sample_data_with_split_aa] - sample_json[json_key].extend(sample_data_with_biosample_acc) + sample_data.update(bioSampleAccession=json_value[bio_sample_acc]) + sample_json[json_key].append(sample_data) else: json_value.pop(analysis_alias) json_value.pop(sample_name_in_vcf) @@ -269,9 +271,8 @@ def get_sample_json_data(self): return None biosample_obj = self.get_biosample_object(json_value) - sample_data_with_biosample_obj = [dict(item, bioSampleObject=biosample_obj) for item in - sample_data_with_split_aa] - sample_json[json_key].extend(sample_data_with_biosample_obj) + sample_data.update(bioSampleObject=biosample_obj) + sample_json[json_key].append(sample_data) return sample_json diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index 7716750..69db577 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -5,7 +5,6 @@ from ebi_eva_common_pyutils.logger import AppLogger -# TODO check for a nicer way to do this PROJECT_KEY = 'project' ANALYSIS_KEY = 'analysis' SAMPLE_KEY = 'sample' @@ -56,7 +55,7 @@ def check_all_taxonomy_codes(self): # Check sample taxonomies for novel samples for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): if BIOSAMPLE_OBJECT_KEY in sample: - self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY], + self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'], f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}') @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) @@ -80,7 +79,6 @@ def add_error(self, property, description): :param property: JSON property of the error. This will be converted to sheet/row/column in spreadsheet if needed. :param description: description of the error. """ - # Ensure that errors match the format of biovalidator errors self.errors.append({'property': property, 'description': description}) def check_analysis_alias_coherence(self): @@ -100,7 +98,7 @@ def same_set(self, list1, list2, list1_desc, list2_desc, json_path): :param list1: first list to compare :param list2: second list to compare - :param list1_desc: test description of first list, used only in error message + :param list1_desc: text description of first list, used only in error message :param list2_desc: text description of second list, used only in error message :param json_path: property where the error message will appear """ diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py index fbea681..3eb4b1f 100644 --- a/eva_sub_cli/validators/docker_validator.py +++ b/eva_sub_cli/validators/docker_validator.py @@ -12,7 +12,7 @@ logger = logging_config.get_logger(__name__) container_image = 'ebivariation/eva-sub-cli' -container_tag = 'v0.0.1.dev8' +container_tag = 'v0.0.1.dev9' container_validation_dir = '/opt/vcf_validation' container_validation_output_dir = 'vcf_validation_output' diff --git a/tests/resources/EVA_Submission_test.json b/tests/resources/EVA_Submission_test.json index 2015d31..3be0ce3 100644 --- a/tests/resources/EVA_Submission_test.json +++ b/tests/resources/EVA_Submission_test.json @@ -3,7 +3,6 @@ { "lastName": "Smith", "firstName": "John", - "telephone": "+1234567890", "email": "john.smith@example.com", "laboratory": "Genomics Lab", "centre": "University of Example", @@ -12,7 +11,6 @@ { "lastName": "Doe", "firstName": "Jane", - "telephone": "+1234567890", "email": "jane.doe@example.com", "laboratory": "Bioinformatics Lab", "centre": "University of Example", @@ -21,7 +19,6 @@ ], "project": { "title": "Example Project", - "projectAlias": "EP", "description": "An example project for demonstration purposes", "centre": "University of Example", "taxId": 9606, @@ -32,119 +29,99 @@ "analysisTitle": "Variant Detection 1", "analysisAlias": "VD1", "description": "An example analysis for demonstration purposes", - "ProjectTitle": "Example Project", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27", - "referenceFasta": "GCA_000001405.27_fasta.fa", + "referenceFasta": "GCA_000001405.27_fasta.fa", "platform": "BGISEQ-500" }, { "analysisTitle": "Variant Detection 2", "analysisAlias": "VD2", "description": "An example analysis for demonstration purposes", - "ProjectTitle": "Example Project", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27", - "referenceFasta": "GCA_000001405.27_fasta.fa", + "referenceFasta": "GCA_000001405.27_fasta.fa", "platform": "BGISEQ-500" }, { "analysisTitle": "Variant Detection 3", "analysisAlias": "VD3", "description": "An example analysis for demonstration purposes", - "ProjectTitle": "Example Project", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27", - "referenceFasta": "GCA_000001405.27_fasta.fa", + "referenceFasta": "GCA_000001405.27_fasta.fa", "platform": "BGISEQ-500" } ], "sample": [ { - "analysisAlias": "VD1", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD2", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD3", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD3" + ], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003" }, { - "analysisAlias": "VD4", + "analysisAlias": [ + "VD4", + "VD5" + ], "sampleInVCF": "sample4", "bioSampleObject": { "name": "Lm_17_S8", "characteristics": { - "bioSampleName": "Lm_17_S8", "title": [ - "Bastet normal sample" + { + "text": "Bastet normal sample" + } ], "description": [ - "Test Description" + { + "text": "Test Description" + } ], "taxId": [ - 9447 + { + "text": "9447" + } ], "scientificName": [ - "Lemur catta" + { + "text": "Lemur catta" + } ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } - } - }, - { - "analysisAlias": "VD5", - "sampleInVCF": "sample4", - "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" + "sex": [ + { + "text": "Female" + } ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" + "tissueType": [ + { + "text": "skin" + } ], - "sex": "Female", - "tissueType": "skin", "species": [ - "Lemur catta" + { + "text": "Lemur catta" + } ] } } diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py index fc28216..c6ab042 100644 --- a/tests/test_semantic_metadata.py +++ b/tests/test_semantic_metadata.py @@ -33,15 +33,14 @@ def test_check_all_taxonomy_codes(self): { "bioSampleObject": { "characteristics": { - # TODO these might need to be lists of strings - "taxId": 9447 + "taxId": [{"text": "9447"}] } } }, { "bioSampleObject": { "characteristics": { - "taxId": 1234 + "taxId": [{"text": "1234"}] } } } diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py index f292d06..22dd80f 100644 --- a/tests/test_xlsx2json.py +++ b/tests/test_xlsx2json.py @@ -35,8 +35,7 @@ def test_conversion_2_json(self) -> None: json_data = json.load(open_file) # assert json file is created with expected data assert sorted(json_data.keys()) == ['analysis', 'files', 'project', 'sample', 'submitterDetails'] - print(self.get_expected_json()) - self.assertTrue(self.get_expected_json() == json_data) + self.assertEqual(self.get_expected_json(), json_data) # assert json schema with open(self.eva_schema) as eva_schema_file: @@ -45,7 +44,7 @@ def test_conversion_2_json(self) -> None: biosample_json_schema = json.load(biosample_schema_file) # assert created json file sample field conforms to eva-biosamples schema - jsonschema.validate(json_data['sample'][7]['bioSampleObject'], biosample_json_schema) + jsonschema.validate(json_data['sample'][3]['bioSampleObject'], biosample_json_schema) # assert created json file conform to eva_schema resolver = jsonschema.RefResolver.from_schema(eva_json_schema) @@ -133,92 +132,48 @@ def get_expected_json(self): ], "sample": [ { - "analysisAlias": "VD1", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD2", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": ["VD3"], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003" }, { - "analysisAlias": "VD4", - "sampleInVCF": "sample4", - "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" - ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" - ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } - } - }, - { - "analysisAlias": "VD5", + "analysisAlias": ["VD4", "VD5"], "sampleInVCF": "sample4", "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" - ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" - ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } + "name": "Lm_17_S8", + "characteristics": { + "title": [ + {"text": "Bastet normal sample"} + ], + "description": [ + {"text": "Test Description"} + ], + "taxId": [ + {"text": "9447"} + ], + "scientificName": [ + {"text": "Lemur catta"} + ], + "sex": [ + {"text": "Female"} + ], + "tissueType": [ + {"text": "skin"} + ], + "species": [ + {"text": "Lemur catta"} + ] + } } } ], From 9a044ed0d27f3cb0f442d71ebffc1b9141424a39 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 9 Jul 2024 15:20:44 +0100 Subject: [PATCH 06/11] fix more tests --- bin/xlsx2json.py | 6 +- eva_sub_cli/metadata_utils.py | 3 +- .../EVA_Submission_test_with_asm_report.json | 97 ++++++++----------- tests/resources/sample_checker/metadata.json | 26 +---- tests/test_xlsx2json.py | 7 ++ 5 files changed, 55 insertions(+), 84 deletions(-) diff --git a/bin/xlsx2json.py b/bin/xlsx2json.py index 250d10a..767adaa 100644 --- a/bin/xlsx2json.py +++ b/bin/xlsx2json.py @@ -241,7 +241,7 @@ def get_sample_json_data(self): json_key = self.xlsx_conf[WORKSHEETS_KEY_NAME][SAMPLE] sample_json = {json_key: []} for row in self.get_rows(): - row.pop('row_num') + row_num = row.pop('row_num') json_value = {self.translate_header(SAMPLE, k): v for k, v in row.items() if v is not None} bio_sample_acc = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_ACCESSION_KEY] @@ -262,12 +262,12 @@ def get_sample_json_data(self): if sample_name not in json_value: self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have ' f'{SAMPLE_NAME_KEY} populated', - sheet=SAMPLE, column=SAMPLE_NAME_KEY) + sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY) return None if scientific_name not in json_value: self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have ' f'{SCIENTIFIC_NAME_KEY} populated', - sheet=SAMPLE, column=SCIENTIFIC_NAME_KEY) + sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY) return None biosample_obj = self.get_biosample_object(json_value) diff --git a/eva_sub_cli/metadata_utils.py b/eva_sub_cli/metadata_utils.py index 77a8a7c..ce967ae 100644 --- a/eva_sub_cli/metadata_utils.py +++ b/eva_sub_cli/metadata_utils.py @@ -10,7 +10,8 @@ def get_samples_per_analysis(metadata): """Returns mapping of analysis alias to sample names, based on metadata.""" samples_per_analysis = defaultdict(list) for sample_info in metadata.get('sample', []): - samples_per_analysis[sample_info.get('analysisAlias')].append(sample_info.get('sampleInVCF')) + for analysis_alias in sample_info.get('analysisAlias', []): + samples_per_analysis[analysis_alias].append(sample_info.get('sampleInVCF')) return { analysis_alias: set(samples) for analysis_alias, samples in samples_per_analysis.items() diff --git a/tests/resources/EVA_Submission_test_with_asm_report.json b/tests/resources/EVA_Submission_test_with_asm_report.json index 892a0c2..5c3ede3 100644 --- a/tests/resources/EVA_Submission_test_with_asm_report.json +++ b/tests/resources/EVA_Submission_test_with_asm_report.json @@ -64,90 +64,73 @@ ], "sample": [ { - "analysisAlias": "VD1", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD2", + "analysisAlias": [ + "VD1", + "VD2", + "VD3" + ], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD3", - "sampleInVCF": "sample2", - "bioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": [ + "VD3" + ], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003" }, { - "analysisAlias": "VD4", + "analysisAlias": [ + "VD4", + "VD5" + ], "sampleInVCF": "sample4", "bioSampleObject": { "name": "Lm_17_S8", "characteristics": { - "bioSampleName": "Lm_17_S8", "title": [ - "Bastet normal sample" + { + "text": "Bastet normal sample" + } ], "description": [ - "Test Description" + { + "text": "Test Description" + } ], "taxId": [ - 9447 + { + "text": "9447" + } ], "scientificName": [ - "Lemur catta" + { + "text": "Lemur catta" + } ], - "sex": "Female", - "tissueType": "skin", - "species": [ - "Lemur catta" - ] - } - } - }, - { - "analysisAlias": "VD5", - "sampleInVCF": "sample4", - "bioSampleObject": { - "name": "Lm_17_S8", - "characteristics": { - "bioSampleName": "Lm_17_S8", - "title": [ - "Bastet normal sample" + "sex": [ + { + "text": "Female" + } ], - "description": [ - "Test Description" - ], - "taxId": [ - 9447 - ], - "scientificName": [ - "Lemur catta" + "tissueType": [ + { + "text": "skin" + } ], - "sex": "Female", - "tissueType": "skin", "species": [ - "Lemur catta" + { + "text": "Lemur catta" + } ] } } diff --git a/tests/resources/sample_checker/metadata.json b/tests/resources/sample_checker/metadata.json index 8854cde..20a9d0b 100644 --- a/tests/resources/sample_checker/metadata.json +++ b/tests/resources/sample_checker/metadata.json @@ -23,37 +23,17 @@ }, "sample": [ { - "analysisAlias": "VD1", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample1", "BioSampleAccession": "SAME00001" }, { - "analysisAlias": "VD1", + "analysisAlias": ["VD1", "VD2", "VD3"], "sampleInVCF": "sample2", "BioSampleAccession": "SAME00002" }, { - "analysisAlias": "VD2", - "sampleInVCF": "sample1", - "BioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD2", - "sampleInVCF": "sample2", - "BioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", - "sampleInVCF": "sample1", - "BioSampleAccession": "SAME00001" - }, - { - "analysisAlias": "VD3", - "sampleInVCF": "sample2", - "BioSampleAccession": "SAME00002" - }, - { - "analysisAlias": "VD3", + "analysisAlias": ["VD3"], "sampleInVCF": "sample3", "BioSampleAccession": "SAME00003" } diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py index 22dd80f..84a6c63 100644 --- a/tests/test_xlsx2json.py +++ b/tests/test_xlsx2json.py @@ -29,7 +29,14 @@ def test_conversion_2_json(self) -> None: xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx') self.parser = XlsxParser(xls_filename, self.conf_filename) output_json = os.path.join(self.resource_dir, 'EVA_Submission_test_output.json') + errors_yaml = os.path.join(self.resource_dir, 'EVA_Submission_test_errors.yml') self.parser.json(output_json) + self.parser.save_errors(errors_yaml) + + # confirm no errors + with open(errors_yaml) as open_file: + errors_data = yaml.safe_load(open_file) + assert errors_data == [] with open(output_json) as open_file: json_data = json.load(open_file) From 58af1877556492b0962d0051960ccdc78a28b8e9 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 9 Jul 2024 15:40:46 +0100 Subject: [PATCH 07/11] fix test --- tests/test_docker_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index d736235..5486804 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -35,7 +35,7 @@ def setUp(self): "submitterDetails": [], "project": {}, "sample": [ - {"analysisAlias": "AA", "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"} + {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"} ], "analysis": [ {"analysisAlias": "AA"} From 57b933068b71f5653a018aef610acb84fdef7734 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 9 Jul 2024 16:11:03 +0100 Subject: [PATCH 08/11] test for ENA project accession pattern --- eva_sub_cli/nextflow/nextflow.config | 3 +++ tests/test_docker_validator.py | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 eva_sub_cli/nextflow/nextflow.config diff --git a/eva_sub_cli/nextflow/nextflow.config b/eva_sub_cli/nextflow/nextflow.config new file mode 100644 index 0000000..c2e6d9e --- /dev/null +++ b/eva_sub_cli/nextflow/nextflow.config @@ -0,0 +1,3 @@ +process { + errorStrategy = 'finish' +} \ No newline at end of file diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index 5486804..28fc2c3 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -33,7 +33,9 @@ def setUp(self): [os.path.join(self.assembly_reports, 'input_passed.txt')]) sub_metadata = { "submitterDetails": [], - "project": {}, + "project": { + "parentProject": "PRJ_INVALID" + }, "sample": [ {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"} ], @@ -131,6 +133,11 @@ def test_validate(self): 'md5': '96a80c9368cc3c37095c86fbe6044fb2'} ] + # Check metadata errors + with open(os.path.join(self.validator.output_dir, 'other_validations', 'metadata_validation.txt')) as open_file: + metadata_val_lines = {l.strip() for l in open_file.readlines()} + assert 'must match pattern "^PRJ(EB|NA)\d+$"' in metadata_val_lines + def test_validate_from_excel(self): self.validator_from_excel.validate() self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml)) From f17ae6b471f39343191841280bb6553df82c02b4 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 10 Jul 2024 08:54:39 +0100 Subject: [PATCH 09/11] catch ConnectionError in insdc check --- bin/check_fasta_insdc.py | 2 +- tests/test_docker_validator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/check_fasta_insdc.py b/bin/check_fasta_insdc.py index fb44a4c..8b2c727 100644 --- a/bin/check_fasta_insdc.py +++ b/bin/check_fasta_insdc.py @@ -123,7 +123,7 @@ def assess_fasta(input_fasta, analyses, assembly_in_metadata): possible_assemblies = containing_assemblies else: possible_assemblies &= containing_assemblies - except HTTPError as e: + except (ConnectionError, HTTPError) as e: # Server errors from either ENA refget or EVA contig alias will halt the check prematurely. # Report the error but do not return from the method, so that incomplete results can be reported # (i.e. any sequences found to be INSDC and any compatible assemblies so far) diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index 28fc2c3..72aba00 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -136,7 +136,7 @@ def test_validate(self): # Check metadata errors with open(os.path.join(self.validator.output_dir, 'other_validations', 'metadata_validation.txt')) as open_file: metadata_val_lines = {l.strip() for l in open_file.readlines()} - assert 'must match pattern "^PRJ(EB|NA)\d+$"' in metadata_val_lines + assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines def test_validate_from_excel(self): self.validator_from_excel.validate() From 0a1f7539466a57746cdd713014c6cd185301b3db Mon Sep 17 00:00:00 2001 From: April Shen Date: Fri, 12 Jul 2024 09:12:50 +0100 Subject: [PATCH 10/11] cache taxonomy codes --- eva_sub_cli/nextflow/nextflow.config | 3 --- eva_sub_cli/semantic_metadata.py | 17 +++++++++++++---- tests/test_semantic_metadata.py | 5 +++-- 3 files changed, 16 insertions(+), 9 deletions(-) delete mode 100644 eva_sub_cli/nextflow/nextflow.config diff --git a/eva_sub_cli/nextflow/nextflow.config b/eva_sub_cli/nextflow/nextflow.config deleted file mode 100644 index c2e6d9e..0000000 --- a/eva_sub_cli/nextflow/nextflow.config +++ /dev/null @@ -1,3 +0,0 @@ -process { - errorStrategy = 'finish' -} \ No newline at end of file diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index 69db577..a11d370 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -28,6 +28,8 @@ class SemanticMetadataChecker(AppLogger): def __init__(self, metadata): self.metadata = metadata self.errors = [] + # Caches whether taxonomy code is valid or not + self.taxonomy_valid = {} def write_result_yaml(self, output_path): with open(output_path, 'w') as open_yaml: @@ -67,10 +69,17 @@ def check_project_accession(self, project_acc, json_path): @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3)) def check_taxonomy_code(self, taxonomy_code, json_path): - try: - download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}') - except Exception: - self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') + taxonomy_code = int(taxonomy_code) + if taxonomy_code in self.taxonomy_valid: + if self.taxonomy_valid[taxonomy_code] is False: + self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') + else: + try: + download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}') + self.taxonomy_valid[taxonomy_code] = True + except Exception: + self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code') + self.taxonomy_valid[taxonomy_code] = False def add_error(self, property, description): """ diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py index c6ab042..ab61f7b 100644 --- a/tests/test_semantic_metadata.py +++ b/tests/test_semantic_metadata.py @@ -33,7 +33,7 @@ def test_check_all_taxonomy_codes(self): { "bioSampleObject": { "characteristics": { - "taxId": [{"text": "9447"}] + "taxId": [{"text": "9606"}] } } }, @@ -48,7 +48,8 @@ def test_check_all_taxonomy_codes(self): } checker = SemanticMetadataChecker(metadata) with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: - m_ena_download.side_effect = [True, True, Exception('problem downloading')] + # Mock should only be called once per taxonomy code + m_ena_download.side_effect = [True, Exception('problem downloading')] checker.check_all_taxonomy_codes() self.assertEqual(checker.errors, [ { From d14b80f9be93b184ad7c2b3139c09265ad286830 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 15 Jul 2024 10:16:08 +0100 Subject: [PATCH 11/11] mark integration tests and add insdc check unit tests --- .github/workflows/tests.yml | 2 +- pytest.ini | 3 + tests/test_check_fasta_insdc.py | 107 +++++++++++++++++++++----------- 3 files changed, 76 insertions(+), 36 deletions(-) create mode 100644 pytest.ini diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9dca22e..901f7e7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,4 +33,4 @@ jobs: flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - PYTHONPATH=. pytest tests + PYTHONPATH=. pytest tests -k "not integration" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..183f1c9 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + integration: integration test diff --git a/tests/test_check_fasta_insdc.py b/tests/test_check_fasta_insdc.py index a1d34ff..116e6cc 100644 --- a/tests/test_check_fasta_insdc.py +++ b/tests/test_check_fasta_insdc.py @@ -2,6 +2,7 @@ from unittest import TestCase from unittest.mock import patch +import pytest import requests as requests from bin.check_fasta_insdc import assess_fasta, get_analyses_and_reference_genome_from_metadata @@ -10,14 +11,77 @@ class TestFastaChecker(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') + def test_get_analysis_and_reference_genome_from_metadata(self): + metadata_json = os.path.join(self.resource_dir, 'sample_checker', 'metadata.json') + vcf_file = os.path.join(self.resource_dir, 'sample_checker', 'example1.vcf.gz') + analyses, reference = get_analyses_and_reference_genome_from_metadata([vcf_file], metadata_json) + assert analyses == {'VD1'} + assert reference == 'GCA_000001405.27' + def test_assess_fasta_is_insdc(self): input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') - results = assess_fasta(input_fasta, ['analysis'], None) - assert results == { - 'all_insdc': True, - 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], - 'possible_assemblies': {'GCA_000146045.2'} - } + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = {'sequence_name': 'chr1'} + m_get_assemblies.return_value = {'GCA_000146045.2'} + results = assess_fasta(input_fasta, ['analysis'], None) + assert results == { + 'all_insdc': True, + 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'possible_assemblies': {'GCA_000146045.2'} + } + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = None + m_get_assemblies.return_value = set() + results = assess_fasta(input_fasta, ['analysis'], None) + assert results == { + 'all_insdc': False, + 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': False}] + } + + def test_assess_fasta_matches_metadata(self): + input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = {'sequence_name': 'I'} + m_get_assemblies.return_value = {'GCA_000146045.2'} + results = assess_fasta(input_fasta, ['analysis'], 'GCA_000146045.2') + assert results == { + 'all_insdc': True, + 'sequences': [ + {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'possible_assemblies': {'GCA_000146045.2'}, + 'metadata_assembly_compatible': True, + 'associated_analyses': ['analysis'], + 'assembly_in_metadata': 'GCA_000146045.2' + } + results = assess_fasta(input_fasta, ['analysis'], 'GCA_002915635.1') + assert results == { + 'all_insdc': True, + 'sequences': [ + {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'possible_assemblies': {'GCA_000146045.2'}, + 'metadata_assembly_compatible': False, + 'associated_analyses': ['analysis'], + 'assembly_in_metadata': 'GCA_002915635.1' + } + + def test_assess_fasta_http_error(self): + input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') + with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \ + patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: + m_get_refget.return_value = {'sequence_name': 'I'} + m_get_assemblies.side_effect = requests.HTTPError('500 Internal Server Error') + results = assess_fasta(input_fasta, ['analysis'], None) + assert results == { + 'all_insdc': True, + 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], + 'connection_error': '500 Internal Server Error' + } + + @pytest.mark.integration + def test_assess_fasta_is_insdc_integration(self): input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'input_passed.fa') results = assess_fasta(input_fasta, ['analysis'], None) assert results == { @@ -25,7 +89,8 @@ def test_assess_fasta_is_insdc(self): 'sequences': [{'sequence_name': 'chr1', 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d', 'insdc': False}] } - def test_assess_fasta_matches_metadata(self): + @pytest.mark.integration + def test_assess_fasta_matches_metadata_integration(self): input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') results = assess_fasta(input_fasta, ['analysis'], 'GCA_000146045.2') assert results == { @@ -37,31 +102,3 @@ def test_assess_fasta_matches_metadata(self): 'associated_analyses': ['analysis'], 'assembly_in_metadata': 'GCA_000146045.2' } - results = assess_fasta(input_fasta, ['analysis'], 'GCA_002915635.1') - assert results == { - 'all_insdc': True, - 'sequences': [ - {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], - 'possible_assemblies': {'GCA_000146045.2'}, - 'metadata_assembly_compatible': False, - 'associated_analyses': ['analysis'], - 'assembly_in_metadata': 'GCA_002915635.1' - } - - def test_get_analysis_and_reference_genome_from_metadata(self): - metadata_json = os.path.join(self.resource_dir, 'sample_checker', 'metadata.json') - vcf_file = os.path.join(self.resource_dir, 'sample_checker', 'example1.vcf.gz') - analyses, reference = get_analyses_and_reference_genome_from_metadata([vcf_file], metadata_json) - assert analyses == {'VD1'} - assert reference == 'GCA_000001405.27' - - def test_assess_fasta_http_error(self): - input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa') - with patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies: - m_get_assemblies.side_effect = requests.HTTPError('500 Internal Server Error') - results = assess_fasta(input_fasta, ['analysis'], None) - assert results == { - 'all_insdc': True, - 'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}], - 'connection_error': '500 Internal Server Error' - }