From 80931a8594f297c480901402eead2e026a517e6e Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Thu, 4 Jul 2024 15:42:18 +0100
Subject: [PATCH 01/11] semantic metadata checks WIP

---
 bin/check_metadata_semantics.py     | 70 +++++++++++++++++++++++++++++
 eva_sub_cli/etc/eva_schema.json     | 13 ++++--
 eva_sub_cli/nextflow/validation.nf  | 21 ++++++++-
 eva_sub_cli/validators/validator.py |  1 +
 requirements.txt                    |  7 +--
 5 files changed, 105 insertions(+), 7 deletions(-)
 create mode 100644 bin/check_metadata_semantics.py

diff --git a/bin/check_metadata_semantics.py b/bin/check_metadata_semantics.py
new file mode 100644
index 0000000..4091038
--- /dev/null
+++ b/bin/check_metadata_semantics.py
@@ -0,0 +1,70 @@
+import argparse
+import json
+import re
+
+import requests
+import yaml
+
+from retry import retry
+from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
+from ebi_eva_common_pyutils.logger import logging_config
+
+
+logger = logging_config.get_logger(__name__)
+
+PARENT_PROJECT_KEY = 'parentProject'
+CHILD_PROJECTS_KEY = 'childProjects'
+PEER_PROJECTS_KEY = 'peerProjects'
+
+
+@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
+def check_existing_project_in_ena(project_accession):
+    """Check if a project accession exists and is public in ENA"""
+    try:
+        download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_accession}')
+    except requests.exceptions.HTTPError:
+        return False
+    return True
+
+
+def check_project_accession(errors, project_acc, key_name, idx=None):
+    if not check_existing_project_in_ena(project_acc):
+        field_name = f'/project/{key_name}'
+        if idx is not None:
+            field_name += f'/{idx}'
+        errors[field_name] = f'{project_acc} does not exist or is private'
+
+
+def write_result_yaml(output_yaml, results):
+    with open(output_yaml, 'w') as open_yaml:
+        yaml.safe_dump(data=results, stream=open_yaml)
+
+
+def check_all_project_accessions(metadata):
+    """Check that ENA project accessions exist and are public"""
+    errors = {}
+    project = metadata['project']
+    check_project_accession(errors, project[PARENT_PROJECT_KEY], PARENT_PROJECT_KEY)
+    for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]):
+        check_project_accession(errors, accession, CHILD_PROJECTS_KEY, idx)
+    for idx, accession in enumerate(project[PEER_PROJECTS_KEY]):
+        check_project_accession(errors, accession, PEER_PROJECTS_KEY, idx)
+    return errors
+
+
+def main():
+    arg_parser = argparse.ArgumentParser(description='Perform semantic checks on the metadata')
+    arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file')
+    arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
+                            help='Path to the location of the results')
+    args = arg_parser.parse_args()
+    logging_config.add_stdout_handler()
+    with open(args.metadata_json) as open_json:
+        metadata = json.load(open_json)
+        errors = check_all_project_accessions(metadata)
+    # TODO other errors
+    write_result_yaml(args.output_yaml, errors)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json
index e069fee..78c2c22 100644
--- a/eva_sub_cli/etc/eva_schema.json
+++ b/eva_sub_cli/etc/eva_schema.json
@@ -94,18 +94,18 @@
           "items": true
         },
         "parentProject": {
-          "type": "string"
+          "$ref": "#/definitions/projectAccession"
         },
         "childProjects": {
           "type": "array",
           "items": {
-            "type": "string"
+            "$ref": "#/definitions/projectAccession"
           }
         },
         "peerProjects": {
           "type": "array",
           "items": {
-            "type": "string"
+            "$ref": "#/definitions/projectAccession"
           }
         },
         "links": {
@@ -349,5 +349,12 @@
         }
       }
     }
+  },
+  "definitions": {
+    "projectAccession": {
+      "type": "string",
+      "description": "ENA project accession",
+      "pattern": "^PRJ(EB|NA)\\d+$"
+    }
   }
 }
\ No newline at end of file
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
index 2b6d5d1..a382150 100644
--- a/eva_sub_cli/nextflow/validation.nf
+++ b/eva_sub_cli/nextflow/validation.nf
@@ -29,7 +29,8 @@ params.executable = [
 params.python_scripts = [
     "samples_checker": "samples_checker.py",
     "fasta_checker": "check_fasta_insdc.py",
-    "xlsx2json": "xlsx2json.py"
+    "xlsx2json": "xlsx2json.py",
+    "semantic_checker": "check_metadata_semantics.py"
 ]
 // prefix to prepend to all provided path
 params.base_dir = ""
@@ -255,3 +256,21 @@ process insdc_checker {
     $params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
     """
 }
+
+
+process metadata_semantic_check {
+    publishDir output_dir,
+            overwrite: true,
+            mode: "copy"
+
+    input:
+    path(metadata_json)
+
+    output:
+    path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml
+
+    script:
+    """
+    $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml
+    """
+}
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
index e35fd00..c2fd996 100755
--- a/eva_sub_cli/validators/validator.py
+++ b/eva_sub_cli/validators/validator.py
@@ -363,6 +363,7 @@ def _collect_metadata_results(self):
         self._convert_biovalidator_validation_to_spreadsheet()
         self._write_spreadsheet_validation_results()
         self._collect_md5sum_to_metadata()
+        # TODO add semantic metadata checks
 
     def _load_spreadsheet_conversion_errors(self):
         errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
diff --git a/requirements.txt b/requirements.txt
index 0746216..5ee6871 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
-pyyaml
+ebi_eva_common_pyutils==0.6.3
 jinja2
+jsonschema
 minify_html==0.11.1
 openpyxl
+pyyaml
 requests
-jsonschema
-ebi_eva_common_pyutils==0.6.3
\ No newline at end of file
+retry
\ No newline at end of file

From 71b28bcef66642d495cf9808fb9fee09e0f29ed9 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Fri, 5 Jul 2024 11:54:16 +0100
Subject: [PATCH 02/11] WIP - refactoring

---
 bin/check_metadata_semantics.py     |  58 ++-----------
 eva_sub_cli/semantic_metadata.py    | 128 ++++++++++++++++++++++++++++
 eva_sub_cli/validators/validator.py |  13 ++-
 3 files changed, 145 insertions(+), 54 deletions(-)
 create mode 100644 eva_sub_cli/semantic_metadata.py

diff --git a/bin/check_metadata_semantics.py b/bin/check_metadata_semantics.py
index 4091038..07705ec 100644
--- a/bin/check_metadata_semantics.py
+++ b/bin/check_metadata_semantics.py
@@ -1,55 +1,7 @@
 import argparse
 import json
-import re
 
-import requests
-import yaml
-
-from retry import retry
-from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
-from ebi_eva_common_pyutils.logger import logging_config
-
-
-logger = logging_config.get_logger(__name__)
-
-PARENT_PROJECT_KEY = 'parentProject'
-CHILD_PROJECTS_KEY = 'childProjects'
-PEER_PROJECTS_KEY = 'peerProjects'
-
-
-@retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
-def check_existing_project_in_ena(project_accession):
-    """Check if a project accession exists and is public in ENA"""
-    try:
-        download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_accession}')
-    except requests.exceptions.HTTPError:
-        return False
-    return True
-
-
-def check_project_accession(errors, project_acc, key_name, idx=None):
-    if not check_existing_project_in_ena(project_acc):
-        field_name = f'/project/{key_name}'
-        if idx is not None:
-            field_name += f'/{idx}'
-        errors[field_name] = f'{project_acc} does not exist or is private'
-
-
-def write_result_yaml(output_yaml, results):
-    with open(output_yaml, 'w') as open_yaml:
-        yaml.safe_dump(data=results, stream=open_yaml)
-
-
-def check_all_project_accessions(metadata):
-    """Check that ENA project accessions exist and are public"""
-    errors = {}
-    project = metadata['project']
-    check_project_accession(errors, project[PARENT_PROJECT_KEY], PARENT_PROJECT_KEY)
-    for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]):
-        check_project_accession(errors, accession, CHILD_PROJECTS_KEY, idx)
-    for idx, accession in enumerate(project[PEER_PROJECTS_KEY]):
-        check_project_accession(errors, accession, PEER_PROJECTS_KEY, idx)
-    return errors
+from eva_sub_cli.semantic_metadata import SemanticMetadataChecker
 
 
 def main():
@@ -58,12 +10,12 @@ def main():
     arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
                             help='Path to the location of the results')
     args = arg_parser.parse_args()
-    logging_config.add_stdout_handler()
+
     with open(args.metadata_json) as open_json:
         metadata = json.load(open_json)
-        errors = check_all_project_accessions(metadata)
-    # TODO other errors
-    write_result_yaml(args.output_yaml, errors)
+        checker = SemanticMetadataChecker(metadata)
+        checker.check_all()
+        checker.write_result_yaml(args.output_yaml)
 
 
 if __name__ == "__main__":
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
new file mode 100644
index 0000000..a224b74
--- /dev/null
+++ b/eva_sub_cli/semantic_metadata.py
@@ -0,0 +1,128 @@
+import yaml
+
+from retry import retry
+from ebi_eva_common_pyutils.ena_utils import download_xml_from_ena
+from ebi_eva_common_pyutils.logger import AppLogger
+
+
+PROJECT_KEY = 'project'
+PARENT_PROJECT_KEY = 'parentProject'
+CHILD_PROJECTS_KEY = 'childProjects'
+PEER_PROJECTS_KEY = 'peerProjects'
+SAMPLE_KEY = 'sample'
+BIOSAMPLE_OBJECT_KEY = 'bioSampleObject'
+CHARACTERISTICS_KEY = 'characteristics'
+TAX_ID_KEY = 'taxId'
+
+
+def cast_list(l, type_to_cast=str):
+    for e in l:
+        yield type_to_cast(e)
+
+
+class SemanticMetadataChecker(AppLogger):
+
+    def __init__(self, metadata):
+        self.metadata = metadata
+        self.errors = []
+
+    def write_result_yaml(self, output_path):
+        with open(output_path, 'w') as open_yaml:
+            yaml.safe_dump(data=self.errors, stream=open_yaml)
+
+    def check_all(self):
+        self.check_all_project_accessions()
+        self.check_all_taxonomy_codes()
+
+    def check_all_project_accessions(self):
+        """Check that ENA project accessions exist and are public"""
+        project = self.metadata[PROJECT_KEY]
+        self.check_project_accession(project[PARENT_PROJECT_KEY], f'/{PROJECT_KEY}/{PARENT_PROJECT_KEY}')
+        for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]):
+            self.check_project_accession(accession, f'/{PROJECT_KEY}/{CHILD_PROJECTS_KEY}/{idx}')
+        for idx, accession in enumerate(project[PEER_PROJECTS_KEY]):
+            self.check_project_accession(accession, f'/{PROJECT_KEY}/{PEER_PROJECTS_KEY}/{idx}')
+
+    def check_all_taxonomy_codes(self):
+        """Check that taxonomy IDs are valid according to ENA"""
+        project = self.metadata[PROJECT_KEY]
+        self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
+        # Check sample taxonomies for novel samples
+        for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
+            if BIOSAMPLE_OBJECT_KEY in sample:
+                self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY],
+                                         f'/{SAMPLE_KEY}/{idx}.{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
+
+    @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
+    def check_project_accession(self, project_acc, json_path):
+        try:
+            download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{project_acc}')
+        except Exception:
+            self.add_error(json_path, f'{project_acc} does not exist or is private')
+
+    @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
+    def check_taxonomy_code(self, taxonomy_code, json_path):
+        try:
+            download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
+        except Exception:
+            self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
+
+    def add_error(self, property, description):
+        # Ensure that errors match the format of biovalidator errors
+        self.errors.append({'property': property, 'description': description})
+
+    def check_analysis_alias_coherence(self):
+        # TODO modify these two methods
+        """Check that the same analysis aliases are used in analysis, sample, and files"""
+        analysis_aliases = [analysis_row['Analysis Alias'] for analysis_row in self.metadata['Analysis']]
+        self.same_set(
+            analysis_aliases,
+            [analysis_alias.strip() for sample_row in self.metadata['Sample'] for analysis_alias in
+             sample_row['Analysis Alias'].split(',')],
+            'Analysis Alias', 'Samples'
+        )
+        self.same_set(analysis_aliases, [file_row['Analysis Alias'] for file_row in self.metadata['Files']],
+                      'Analysis Alias', 'Files')
+
+    def same_set(self, list1, list2, list1_desc, list2_desc):
+        if not set(list1) == set(list2):
+            list1_list2 = sorted(cast_list(set(list1).difference(list2)))
+            list2_list1 = sorted(cast_list(set(list2).difference(list1)))
+            errors = []
+            if list1_list2:
+                errors.append('%s present in %s not in %s' % (','.join(list1_list2), list1_desc, list2_desc))
+            if list2_list1:
+                errors.append('%s present in %s not in %s' % (','.join(list2_list1), list2_desc, list1_desc))
+            self.error_list.append('Check %s vs %s: %s' % (list1_desc, list2_desc, ' -- '.join(errors)))
+    #
+    # def _validate_existing_biosample(sample_data, row_num, accession):
+    #     """This function only check if the existing sample has the expected fields present"""
+    #     found_collection_date = False
+    #     for key in ['collection_date', 'collection date']:
+    #         if key in sample_data['characteristics'] and \
+    #                 # TODO date just needs to exist, format should be checked in json schema (?)
+    #             self._check_date(sample_data['characteristics'][key][0]['text']):
+    #         found_collection_date = True
+    #     if not found_collection_date:
+    #         self.error_list.append(
+    #             f'In row {row_num}, existing sample accession {accession} does not have a valid collection date')
+    #     found_geo_loc = False
+    #     for key in ['geographic location (country and/or sea)']:
+    #         if key in sample_data['characteristics'] and sample_data['characteristics'][key][0]['text']:
+    #             found_geo_loc = True
+    #     if not found_geo_loc:
+    #         self.error_list.append(
+    #             f'In row {row_num}, existing sample accession {accession} does not have a valid geographic location')
+    #
+    # def check_all_sample_accessions(metadata):
+    #     """Check that BioSample accessions exist and are public"""
+    #     for row in self.metadata['Sample']:
+    #         if row.get('Sample Accession'):
+    #             sample_accession = row.get('Sample Accession').strip()
+    #             try:
+    #                 sample_data = self.communicator.follows_link('samples', join_url=sample_accession)
+    #                 _validate_existing_biosample(sample_data, row.get('row_num'), sample_accession)
+    #             except ValueError:
+    #                 self.error_list.append(
+    #                     f'In Sample, row {row.get("row_num")} BioSamples accession {sample_accession} '
+    #                     f'does not exist or is private')
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
index c2fd996..004215b 100755
--- a/eva_sub_cli/validators/validator.py
+++ b/eva_sub_cli/validators/validator.py
@@ -360,10 +360,10 @@ def _collect_metadata_results(self):
         self.results['metadata_check'] = {}
         self._load_spreadsheet_conversion_errors()
         self._parse_biovalidator_validation_results()
+        self._parse_semantic_metadata_results()
         self._convert_biovalidator_validation_to_spreadsheet()
         self._write_spreadsheet_validation_results()
         self._collect_md5sum_to_metadata()
-        # TODO add semantic metadata checks
 
     def _load_spreadsheet_conversion_errors(self):
         errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
@@ -420,6 +420,17 @@ def _parse_metadata_property(self, property_str):
             logger.error(f'Cannot parse {property_str} in JSON metadata error')
             return None, None, None
 
+    def _parse_semantic_metadata_results(self):
+        errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
+                                                            'metadata_semantic_check.yml'))
+        if not errors_file:
+            return
+        with open(errors_file) as open_yaml:
+            # errors is a list of dicts matching format of biovalidator errors
+            errors = yaml.safe_load(open_yaml)
+            # biovalidator error parsing always places a list here, even if no errors
+            self.results['metadata_check']['json_errors'] += errors
+
     def _convert_biovalidator_validation_to_spreadsheet(self):
         config_file = os.path.join(ETC_DIR, "spreadsheet2json_conf.yaml")
         with open(config_file) as open_file:

From bf35bb48e7633cffcb10a157968039d28633a4e1 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Mon, 8 Jul 2024 14:45:36 +0100
Subject: [PATCH 03/11] add analysis alias coherence, tests

---
 eva_sub_cli/etc/eva_schema.json  |   8 ++-
 eva_sub_cli/semantic_metadata.py | 103 ++++++++++++++-----------------
 eva_sub_cli/submit.py            |   1 -
 tests/test_semantic_metadata.py  |  93 ++++++++++++++++++++++++++++
 4 files changed, 146 insertions(+), 59 deletions(-)
 create mode 100644 tests/test_semantic_metadata.py

diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json
index 78c2c22..e4e3028 100644
--- a/eva_sub_cli/etc/eva_schema.json
+++ b/eva_sub_cli/etc/eva_schema.json
@@ -296,8 +296,12 @@
         ],
         "properties": {
           "analysisAlias": {
-            "type": "string",
-            "description": "Alias of the analysis performed on this sample."
+            "type": "array",
+            "description": "Aliases of the analyses performed on this sample",
+            "minItems": 1,
+            "items": {
+              "type": "string"
+            }
           },
           "sampleInVCF": {
             "type": "string",
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
index a224b74..3f7de66 100644
--- a/eva_sub_cli/semantic_metadata.py
+++ b/eva_sub_cli/semantic_metadata.py
@@ -5,14 +5,18 @@
 from ebi_eva_common_pyutils.logger import AppLogger
 
 
+# TODO check for a nicer way to do this
 PROJECT_KEY = 'project'
+ANALYSIS_KEY = 'analysis'
+SAMPLE_KEY = 'sample'
+FILES_KEY = 'files'
 PARENT_PROJECT_KEY = 'parentProject'
 CHILD_PROJECTS_KEY = 'childProjects'
 PEER_PROJECTS_KEY = 'peerProjects'
-SAMPLE_KEY = 'sample'
 BIOSAMPLE_OBJECT_KEY = 'bioSampleObject'
 CHARACTERISTICS_KEY = 'characteristics'
 TAX_ID_KEY = 'taxId'
+ANALYSIS_ALIAS_KEY = 'analysisAlias'
 
 
 def cast_list(l, type_to_cast=str):
@@ -33,25 +37,27 @@ def write_result_yaml(self, output_path):
     def check_all(self):
         self.check_all_project_accessions()
         self.check_all_taxonomy_codes()
+        self.check_analysis_alias_coherence()
 
     def check_all_project_accessions(self):
-        """Check that ENA project accessions exist and are public"""
+        """Check that ENA project accessions exist and are public."""
         project = self.metadata[PROJECT_KEY]
-        self.check_project_accession(project[PARENT_PROJECT_KEY], f'/{PROJECT_KEY}/{PARENT_PROJECT_KEY}')
-        for idx, accession in enumerate(project[CHILD_PROJECTS_KEY]):
+        if PARENT_PROJECT_KEY in project:
+            self.check_project_accession(project[PARENT_PROJECT_KEY], f'/{PROJECT_KEY}/{PARENT_PROJECT_KEY}')
+        for idx, accession in enumerate(project.get(CHILD_PROJECTS_KEY, [])):
             self.check_project_accession(accession, f'/{PROJECT_KEY}/{CHILD_PROJECTS_KEY}/{idx}')
-        for idx, accession in enumerate(project[PEER_PROJECTS_KEY]):
+        for idx, accession in enumerate(project.get(PEER_PROJECTS_KEY, [])):
             self.check_project_accession(accession, f'/{PROJECT_KEY}/{PEER_PROJECTS_KEY}/{idx}')
 
     def check_all_taxonomy_codes(self):
-        """Check that taxonomy IDs are valid according to ENA"""
+        """Check that taxonomy IDs are valid according to ENA."""
         project = self.metadata[PROJECT_KEY]
         self.check_taxonomy_code(project[TAX_ID_KEY], f'/{PROJECT_KEY}/{TAX_ID_KEY}')
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
             if BIOSAMPLE_OBJECT_KEY in sample:
                 self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY],
-                                         f'/{SAMPLE_KEY}/{idx}.{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
+                                         f'/{SAMPLE_KEY}/{idx}.{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
 
     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
     def check_project_accession(self, project_acc, json_path):
@@ -68,61 +74,46 @@ def check_taxonomy_code(self, taxonomy_code, json_path):
             self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
 
     def add_error(self, property, description):
+        """
+        Add an error, conforming to the format of biovalidator errors.
+
+        :param property: JSON property of the error. This will be converted to sheet/row/column in spreadsheet if needed.
+        :param description: description of the error.
+        """
         # Ensure that errors match the format of biovalidator errors
         self.errors.append({'property': property, 'description': description})
 
     def check_analysis_alias_coherence(self):
-        # TODO modify these two methods
-        """Check that the same analysis aliases are used in analysis, sample, and files"""
-        analysis_aliases = [analysis_row['Analysis Alias'] for analysis_row in self.metadata['Analysis']]
-        self.same_set(
-            analysis_aliases,
-            [analysis_alias.strip() for sample_row in self.metadata['Sample'] for analysis_alias in
-             sample_row['Analysis Alias'].split(',')],
-            'Analysis Alias', 'Samples'
-        )
-        self.same_set(analysis_aliases, [file_row['Analysis Alias'] for file_row in self.metadata['Files']],
-                      'Analysis Alias', 'Files')
-
-    def same_set(self, list1, list2, list1_desc, list2_desc):
+        """Check that the same analysis aliases are used in analysis, sample, and files."""
+        analysis_aliases = [analysis[ANALYSIS_ALIAS_KEY] for analysis in self.metadata[ANALYSIS_KEY]]
+        aliases_in_samples = [alias for sample in self.metadata[SAMPLE_KEY] for alias in sample[ANALYSIS_ALIAS_KEY]]
+        aliases_in_files = [file[ANALYSIS_ALIAS_KEY] for file in self.metadata[FILES_KEY]]
+
+        self.same_set(analysis_aliases, aliases_in_samples, 'Analysis', 'Samples',
+                      f'/{SAMPLE_KEY}/{ANALYSIS_ALIAS_KEY}')
+        self.same_set(analysis_aliases, aliases_in_files, 'Analysis', 'Files',
+                      f'/{FILES_KEY}/{ANALYSIS_ALIAS_KEY}')
+
+    def same_set(self, list1, list2, list1_desc, list2_desc, json_path):
+        """
+        Compare contents of two lists and add error messages.
+
+        :param list1: first list to compare
+        :param list2: second list to compare
+        :param list1_desc: test description of first list, used only in error message
+        :param list2_desc: text description of second list, used only in error message
+        :param json_path: property where the error message will appear
+        """
         if not set(list1) == set(list2):
             list1_list2 = sorted(cast_list(set(list1).difference(list2)))
             list2_list1 = sorted(cast_list(set(list2).difference(list1)))
-            errors = []
             if list1_list2:
-                errors.append('%s present in %s not in %s' % (','.join(list1_list2), list1_desc, list2_desc))
+                self.add_error(
+                    property=json_path,
+                    description=f'{",".join(list1_list2)} present in {list1_desc} not in {list2_desc}'
+                )
             if list2_list1:
-                errors.append('%s present in %s not in %s' % (','.join(list2_list1), list2_desc, list1_desc))
-            self.error_list.append('Check %s vs %s: %s' % (list1_desc, list2_desc, ' -- '.join(errors)))
-    #
-    # def _validate_existing_biosample(sample_data, row_num, accession):
-    #     """This function only check if the existing sample has the expected fields present"""
-    #     found_collection_date = False
-    #     for key in ['collection_date', 'collection date']:
-    #         if key in sample_data['characteristics'] and \
-    #                 # TODO date just needs to exist, format should be checked in json schema (?)
-    #             self._check_date(sample_data['characteristics'][key][0]['text']):
-    #         found_collection_date = True
-    #     if not found_collection_date:
-    #         self.error_list.append(
-    #             f'In row {row_num}, existing sample accession {accession} does not have a valid collection date')
-    #     found_geo_loc = False
-    #     for key in ['geographic location (country and/or sea)']:
-    #         if key in sample_data['characteristics'] and sample_data['characteristics'][key][0]['text']:
-    #             found_geo_loc = True
-    #     if not found_geo_loc:
-    #         self.error_list.append(
-    #             f'In row {row_num}, existing sample accession {accession} does not have a valid geographic location')
-    #
-    # def check_all_sample_accessions(metadata):
-    #     """Check that BioSample accessions exist and are public"""
-    #     for row in self.metadata['Sample']:
-    #         if row.get('Sample Accession'):
-    #             sample_accession = row.get('Sample Accession').strip()
-    #             try:
-    #                 sample_data = self.communicator.follows_link('samples', join_url=sample_accession)
-    #                 _validate_existing_biosample(sample_data, row.get('row_num'), sample_accession)
-    #             except ValueError:
-    #                 self.error_list.append(
-    #                     f'In Sample, row {row.get("row_num")} BioSamples accession {sample_accession} '
-    #                     f'does not exist or is private')
+                self.add_error(
+                    property=json_path,
+                    description=f'{",".join(list2_list1)} present in {list2_desc} not in {list1_desc}'
+                )
diff --git a/eva_sub_cli/submit.py b/eva_sub_cli/submit.py
index c67bb93..8f71128 100644
--- a/eva_sub_cli/submit.py
+++ b/eva_sub_cli/submit.py
@@ -95,4 +95,3 @@ def submit(self):
         # Complete the submission
         self.info(f'Complete submission')
         self._complete_submission()
-
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
new file mode 100644
index 0000000..c329bc5
--- /dev/null
+++ b/tests/test_semantic_metadata.py
@@ -0,0 +1,93 @@
+from unittest import TestCase
+from unittest.mock import patch
+
+from eva_sub_cli.semantic_metadata import SemanticMetadataChecker
+
+
+class TestSemanticMetadata(TestCase):
+
+    def test_check_all_project_accessions(self):
+        metadata = {
+            "project": {
+                "parentProject": "PRJEB123",
+                "childProjects": ["PRJEB456", "PRJEBNA"]
+            },
+        }
+        checker = SemanticMetadataChecker(metadata)
+        with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
+            m_ena_download.side_effect = [True, True, Exception('problem downloading')]
+            checker.check_all_project_accessions()
+            self.assertEqual(checker.errors, [
+                {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'}
+            ])
+
+    def test_check_all_taxonomy_codes(self):
+        metadata = {
+            "project": {
+                "taxId": 9606,
+            },
+            "sample": [
+                {
+                    "bioSampleAccession": "SAME00003"
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            # TODO these might need to be lists of strings
+                            "taxId": 9447
+                        }
+                    }
+                },
+                {
+                    "bioSampleObject": {
+                        "characteristics": {
+                            "taxId": 1234
+                        }
+                    }
+                }
+            ]
+        }
+        checker = SemanticMetadataChecker(metadata)
+        with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
+            m_ena_download.side_effect = [True, True, Exception('problem downloading')]
+            checker.check_all_taxonomy_codes()
+            self.assertEqual(checker.errors, [
+                {
+                    'property': '/sample/2.bioSampleObject/characteristics/taxId',
+                    'description': '1234 is not a valid taxonomy code'
+                }
+            ])
+
+    def test_check_analysis_alias_coherence(self):
+        metadata = {
+            "analysis": [
+                {"analysisAlias": "alias1"},
+                {"analysisAlias": "alias2"}
+            ],
+            "sample": [
+                {
+                    "bioSampleAccession": "SAME00003",
+                    "analysisAlias": ["alias_1", "alias_2"]
+                },
+                {
+                    "bioSampleAccession": "SAME00004",
+                    "analysisAlias": ["alias2"]
+                }
+            ],
+            "files": [
+                {
+                    "analysisAlias": "alias1",
+                    "fileName": "example1.vcf.gz"
+                },
+                {
+                    "analysisAlias": "alias2",
+                    "fileName": "example2.vcf.gz"
+                }
+            ]
+        }
+        checker = SemanticMetadataChecker(metadata)
+        checker.check_analysis_alias_coherence()
+        self.assertEqual(checker.errors, [
+            {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
+            {'property': '/sample/analysisAlias', 'description': 'alias_1,alias_2 present in Samples not in Analysis'}
+        ])

From 10bebd365b3973321830b3fb4822f11a69ca5485 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Mon, 8 Jul 2024 17:09:18 +0100
Subject: [PATCH 04/11] update tests

---
 eva_sub_cli/semantic_metadata.py              |  2 +-
 eva_sub_cli/validators/validator.py           | 24 +++++-
 .../metadata_semantic_check.yml               |  8 ++
 .../other_validations/metadata_validation.txt | 12 +--
 tests/test_docker_validator.py                |  2 +-
 tests/test_main.py                            |  1 -
 tests/test_semantic_metadata.py               |  2 +-
 tests/test_validator.py                       | 74 +++++++++++++------
 8 files changed, 88 insertions(+), 37 deletions(-)
 create mode 100644 tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml

diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
index 3f7de66..7716750 100644
--- a/eva_sub_cli/semantic_metadata.py
+++ b/eva_sub_cli/semantic_metadata.py
@@ -57,7 +57,7 @@ def check_all_taxonomy_codes(self):
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
             if BIOSAMPLE_OBJECT_KEY in sample:
                 self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY],
-                                         f'/{SAMPLE_KEY}/{idx}.{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
+                                         f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
 
     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
     def check_project_accession(self, project_acc, json_path):
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
index 004215b..1e2a973 100755
--- a/eva_sub_cli/validators/validator.py
+++ b/eva_sub_cli/validators/validator.py
@@ -412,7 +412,11 @@ def clean_read(ifile):
 
     def _parse_metadata_property(self, property_str):
         if property_str.startswith('.'):
-            return property_str.strip('.'), None, None
+            return property_str.strip('./'), None, None
+        # First attempt to parse as BioSample object
+        sheet, row, col = self._parse_sample_metadata_property(property_str)
+        if sheet is not None and row is not None and col is not None:
+            return sheet, row, col
         match = re.match(r'/(\w+)(/(\d+))?([./](\w+))?', property_str)
         if match:
             return match.group(1), match.group(3), match.group(5)
@@ -420,6 +424,12 @@ def _parse_metadata_property(self, property_str):
             logger.error(f'Cannot parse {property_str} in JSON metadata error')
             return None, None, None
 
+    def _parse_sample_metadata_property(self, property_str):
+        match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
+        if match:
+            return 'sample', match.group(1), match.group(2)
+        return None, None, None
+
     def _parse_semantic_metadata_results(self):
         errors_file = resolve_single_file_path(os.path.join(self.output_dir, 'other_validations',
                                                             'metadata_semantic_check.yml'))
@@ -446,13 +456,19 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
             if row_json is None and attribute_json is None:
                 new_description = f'Sheet "{sheet}" is missing'
             elif row_json is None:
-                new_description = f'In sheet "{sheet}", column "{column}" is not populated'
+                if 'have required' not in error['description']:
+                    new_description = error['description']
+                else:
+                    new_description = f'In sheet "{sheet}", column "{column}" is not populated'
             elif attribute_json and column:
-                new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
+                if 'have required' not in error['description']:
+                    new_description = error['description']
+                else:
+                    new_description = f'In sheet "{sheet}", row "{row}", column "{column}" is not populated'
             else:
                 new_description = error["description"].replace(sheet_json, sheet)
             if column is None:
-                # We do not know this attribute. It's most likely about bioSampleObject
+                # We do not know this attribute.
                 continue
             if 'schema' in new_description:
                 # This is an error specific to json schema
diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
new file mode 100644
index 0000000..c0c130c
--- /dev/null
+++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_semantic_check.yml
@@ -0,0 +1,8 @@
+- description: PRJEBNA does not exist or is private
+  property: /project/childProjects/1
+- description: 1234 is not a valid taxonomy code
+  property: /sample/2/bioSampleObject/characteristics/taxId
+- description: alias1 present in Analysis not in Samples
+  property: /sample/analysisAlias
+- description: alias_1,alias_2 present in Samples not in Analysis
+  property: /sample/analysisAlias
diff --git a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt
index d3229f0..3b805ec 100644
--- a/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt
+++ b/tests/resources/validation_reports/validation_output/other_validations/metadata_validation.txt
@@ -8,21 +8,21 @@
 2023-06-29T10:03:45.008Z [info] Validation failed with errors: data should have required property 'files', data/project should have required property 'title', data/project should have required property 'description', data/project should have required property 'taxId', data/project should have required property 'centre', data/analysis/0 should have required property 'analysisTitle', data/analysis/0 should have required property 'description', data/analysis/0 should have required property 'experimentType', data/analysis/0 should have required property 'referenceGenome', data/sample/0 should have required property 'bioSampleAccession', data/sample/0 should have required property 'bioSampleObject', data/sample/0 should match exactly one schema in oneOf
 [1m[31m[40m Validation failed with following error(s):
  [0m
-[1m[31m[40m .files
+[1m[31m[40m /files
 	should have required property 'files'
-/project.title
+/project/title
 	should have required property 'title'
 /project/taxId
         must have required property 'taxId'
 /project/holdDate
         must match format "date"
-/analysis/0.description
+/analysis/0/description
 	should have required property 'description'
-/analysis/0.referenceGenome
+/analysis/0/referenceGenome
 	should have required property 'referenceGenome'
-/sample/0.bioSampleAccession
+/sample/0/bioSampleAccession
 	should have required property 'bioSampleAccession'
-/sample/0.bioSampleObject
+/sample/0/bioSampleObject
 	should have required property 'bioSampleObject'
 /sample/0
 	should match exactly one schema in oneOf
diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
index 5d2b000..d736235 100644
--- a/tests/test_docker_validator.py
+++ b/tests/test_docker_validator.py
@@ -35,7 +35,7 @@ def setUp(self):
             "submitterDetails": [],
             "project": {},
             "sample": [
-                {"analysisAlias":  "AA", "sampleInVCF":  "HG00096", "BioSampleAccession": "SAME0000096"}
+                {"analysisAlias": "AA", "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
             ],
             "analysis": [
                 {"analysisAlias": "AA"}
diff --git a/tests/test_main.py b/tests/test_main.py
index b281388..2a19196 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -33,7 +33,6 @@ def setUp(self) -> None:
         os.makedirs(self.test_sub_dir)
         shutil.copy(os.path.join(self.resource_dir, 'EVA_Submission_test.json'), self.metadata_json)
 
-
     def tearDown(self) -> None:
         shutil.rmtree(self.test_sub_dir)
 
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
index c329bc5..fc28216 100644
--- a/tests/test_semantic_metadata.py
+++ b/tests/test_semantic_metadata.py
@@ -53,7 +53,7 @@ def test_check_all_taxonomy_codes(self):
             checker.check_all_taxonomy_codes()
             self.assertEqual(checker.errors, [
                 {
-                    'property': '/sample/2.bioSampleObject/characteristics/taxId',
+                    'property': '/sample/2/bioSampleObject/characteristics/taxId',
                     'description': '1234 is not a valid taxonomy code'
                 }
             ])
diff --git a/tests/test_validator.py b/tests/test_validator.py
index 292ffa6..f01b4e1 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -59,15 +59,21 @@ def test__collect_validation_workflow_results(self):
             },
             'metadata_check': {
                 'json_errors': [
-                    {'property': '.files', 'description': "should have required property 'files'"},
-                    {'property': '/project.title', 'description': "should have required property 'title'"},
+                    {'property': '/files', 'description': "should have required property 'files'"},
+                    {'property': '/project/title', 'description': "should have required property 'title'"},
                     {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
                     {'property': '/project/holdDate', 'description': 'must match format "date"'},
-                    {'property': '/analysis/0.description', 'description': "should have required property 'description'"},
-                    {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"},
-                    {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
-                    {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
-                    {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
+                    {'property': '/analysis/0/description', 'description': "should have required property 'description'"},
+                    {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"},
+                    {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
+                    {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
+                    {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
+                    {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
+                    {'property': '/sample/2/bioSampleObject/characteristics/taxId',
+                     'description': '1234 is not a valid taxonomy code'},
+                    {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
+                    {'property': '/sample/analysisAlias',
+                     'description': 'alias_1,alias_2 present in Samples not in Analysis'},
                 ],
                 'spreadsheet_errors': [
                     # NB. Wouldn't normally get conversion error + validation errors together, but it is supported.
@@ -79,13 +85,21 @@ def test__collect_validation_workflow_results(self):
                     {'sheet': 'Project', 'row': '', 'column': 'Tax ID',
                      'description': 'In sheet "Project", column "Tax ID" is not populated'},
                     {'sheet': 'Project', 'row': '', 'column': 'Hold Date',
-                     'description': 'In sheet "Project", column "Hold Date" is not populated'},
+                     'description': 'must match format "date"'},
                     {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
                      'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
                     {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
                      'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
                     {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
                      'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
+                    {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
+                     'description': 'PRJEBNA does not exist or is private'},
+                    {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id',
+                     'description': '1234 is not a valid taxonomy code'},
+                    {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+                     'description': 'alias1 present in Analysis not in Samples'},
+                    {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+                     'description': 'alias_1,alias_2 present in Samples not in Analysis'}
                 ]
             }
         }
@@ -121,33 +135,40 @@ def test_parse_biovalidator_validation_results(self):
         self.validator.results['metadata_check'] = {}
         self.validator._parse_biovalidator_validation_results()
         assert self.validator.results['metadata_check']['json_errors'] == [
-            {'property': '.files', 'description': "should have required property 'files'"},
-            {'property': '/project.title', 'description': "should have required property 'title'"},
+            {'property': '/files', 'description': "should have required property 'files'"},
+            {'property': '/project/title', 'description': "should have required property 'title'"},
             {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
             {'property': '/project/holdDate', 'description': 'must match format "date"'},
-            {'property': '/analysis/0.description', 'description': "should have required property 'description'"},
-            {'property': '/analysis/0.referenceGenome', 'description': "should have required property 'referenceGenome'"},
-            {'property': '/sample/0.bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
-            {'property': '/sample/0.bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
+            {'property': '/analysis/0/description', 'description': "should have required property 'description'"},
+            {'property': '/analysis/0/referenceGenome', 'description': "should have required property 'referenceGenome'"},
+            {'property': '/sample/0/bioSampleAccession', 'description': "should have required property 'bioSampleAccession'"},
+            {'property': '/sample/0/bioSampleObject', 'description': "should have required property 'bioSampleObject'"},
             {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
         ]
 
     def test_convert_biovalidator_validation_to_spreadsheet(self):
         self.validator.results['metadata_check'] = {
             'json_errors': [
-                {'property': '.files', 'description': "should have required property 'files'"},
-                {'property': '/project.title', 'description': "should have required property 'title'"},
+                {'property': '/files', 'description': "should have required property 'files'"},
+                {'property': '/project/title', 'description': "should have required property 'title'"},
                 {'property': '/project/taxId', 'description': "must have required property 'taxId'"},
                 {'property': '/project/holdDate', 'description': 'must match format "date"'},
-                {'property': '/analysis/0.description',
+                {'property': '/analysis/0/description',
                  'description': "should have required property 'description'"},
-                {'property': '/analysis/0.referenceGenome',
+                {'property': '/analysis/0/referenceGenome',
                  'description': "should have required property 'referenceGenome'"},
-                {'property': '/sample/0.bioSampleAccession',
+                {'property': '/sample/0/bioSampleAccession',
                  'description': "should have required property 'bioSampleAccession'"},
-                {'property': '/sample/0.bioSampleObject',
+                {'property': '/sample/0/bioSampleObject',
                  'description': "should have required property 'bioSampleObject'"},
-                {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'}
+                {'property': '/sample/0', 'description': 'should match exactly one schema in oneOf'},
+                # Semantic checks
+                {'property': '/project/childProjects/1', 'description': 'PRJEBNA does not exist or is private'},
+                {'property': '/sample/2/bioSampleObject/characteristics/taxId',
+                 'description': '1234 is not a valid taxonomy code'},
+                {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'},
+                {'property': '/sample/analysisAlias',
+                 'description': 'alias_1,alias_2 present in Samples not in Analysis'}
             ]
         }
         self.validator._convert_biovalidator_validation_to_spreadsheet()
@@ -159,13 +180,20 @@ def test_convert_biovalidator_validation_to_spreadsheet(self):
             {'sheet': 'Project', 'row': '', 'column': 'Tax ID',
              'description': 'In sheet "Project", column "Tax ID" is not populated'},
             {'sheet': 'Project', 'row': '', 'column': 'Hold Date',
-             'description': 'In sheet "Project", column "Hold Date" is not populated'},
+             'description': 'must match format "date"'},
             {'sheet': 'Analysis', 'row': 2, 'column': 'Description',
              'description': 'In sheet "Analysis", row "2", column "Description" is not populated'},
             {'sheet': 'Analysis', 'row': 2, 'column': 'Reference',
              'description': 'In sheet "Analysis", row "2", column "Reference" is not populated'},
             {'sheet': 'Sample', 'row': 3, 'column': 'Sample Accession',
-             'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'}
+             'description': 'In sheet "Sample", row "3", column "Sample Accession" is not populated'},
+            {'sheet': 'Project', 'row': '', 'column': 'Child Project(s)',
+             'description': 'PRJEBNA does not exist or is private'},
+            {'sheet': 'Sample', 'row': 5, 'column': 'Tax Id', 'description': '1234 is not a valid taxonomy code'},
+            {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+             'description': 'alias1 present in Analysis not in Samples'},
+            {'sheet': 'Sample', 'row': '', 'column': 'Analysis Alias',
+             'description': 'alias_1,alias_2 present in Samples not in Analysis'}
         ]
 
     def test_parse_assembly_check_log(self):

From 2930ffaa528764a8be9d87fa5519a2a0f2744164 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Tue, 9 Jul 2024 13:17:47 +0100
Subject: [PATCH 05/11] fix biosamples characteristic conversion

---
 bin/xlsx2json.py                           |  37 +++----
 eva_sub_cli/semantic_metadata.py           |   6 +-
 eva_sub_cli/validators/docker_validator.py |   2 +-
 tests/resources/EVA_Submission_test.json   | 109 ++++++++-------------
 tests/test_semantic_metadata.py            |   5 +-
 tests/test_xlsx2json.py                    | 105 ++++++--------------
 6 files changed, 97 insertions(+), 167 deletions(-)

diff --git a/bin/xlsx2json.py b/bin/xlsx2json.py
index 826cc8d..250d10a 100644
--- a/bin/xlsx2json.py
+++ b/bin/xlsx2json.py
@@ -198,7 +198,7 @@ def get_project_json_data(self):
         project_data = self.get_rows()
         if len(project_data) > 1:
             logger.warning(f"{PROJECT} worksheet expects a single row of info but more than one found in the file. "
-                            f"Only the first row's data will be taken into consideration")
+                           f"Only the first row's data will be taken into consideration")
 
         first_row = project_data[0]
         first_row.pop('row_num')
@@ -213,25 +213,29 @@ def get_biosample_object(self, data):
 
         # BioSample expects any of organism or species field
         data[SPECIES] = data[scientific_name]
-        # For some of the fields, BioSample expects value in arrays
-        data = {k: [v] if k in ['title', 'description', 'taxId', 'scientificName', 'commonName', SPECIES] else v
-                for k, v in data.items()}
+        # BioSample name goes in its own attribute, not part of characteristics
+        biosample_name = data.pop(sample_name)
+        # For all characteristics, BioSample expects value in arrays of objects
+        data = {k: [{'text': self.serialize(v)}] for k, v in data.items()}
 
         biosample_object = {
-            "name": data[sample_name],
+            "name": biosample_name,
             "characteristics": data
         }
 
         return biosample_object
 
+    @staticmethod
+    def serialize(value):
+        """Create a text representation of the value provided"""
+        if isinstance(value, datetime.date):
+            return value.strftime('%Y-%m-%d')
+        return str(value)
+
     def get_sample_data_with_split_analysis_alias(self, data, analysis_alias, sample_name_in_vcf):
         analysis_alias_list = data[analysis_alias].split(',')
         sample_in_vcf_val = data[sample_name_in_vcf]
-        sample_data = []
-        for aa in analysis_alias_list:
-            sample_data.append({analysis_alias: aa, sample_name_in_vcf: sample_in_vcf_val})
-
-        return sample_data
+        return {analysis_alias: analysis_alias_list, sample_name_in_vcf: sample_in_vcf_val}
 
     def get_sample_json_data(self):
         json_key = self.xlsx_conf[WORKSHEETS_KEY_NAME][SAMPLE]
@@ -243,13 +247,11 @@ def get_sample_json_data(self):
 
             analysis_alias = self.xlsx_conf[SAMPLE][REQUIRED_HEADERS_KEY_NAME][ANALYSIS_ALIAS_KEY]
             sample_name_in_vcf = self.xlsx_conf[SAMPLE][REQUIRED_HEADERS_KEY_NAME][SAMPLE_NAME_IN_VCF_KEY]
-            sample_data_with_split_aa = self.get_sample_data_with_split_analysis_alias(
-                json_value, analysis_alias, sample_name_in_vcf)
+            sample_data = self.get_sample_data_with_split_analysis_alias(json_value, analysis_alias, sample_name_in_vcf)
 
             if bio_sample_acc in json_value and json_value[bio_sample_acc]:
-                sample_data_with_biosample_acc = [dict(item, bioSampleAccession=json_value[bio_sample_acc]) for item in
-                                                  sample_data_with_split_aa]
-                sample_json[json_key].extend(sample_data_with_biosample_acc)
+                sample_data.update(bioSampleAccession=json_value[bio_sample_acc])
+                sample_json[json_key].append(sample_data)
             else:
                 json_value.pop(analysis_alias)
                 json_value.pop(sample_name_in_vcf)
@@ -269,9 +271,8 @@ def get_sample_json_data(self):
                     return None
 
                 biosample_obj = self.get_biosample_object(json_value)
-                sample_data_with_biosample_obj = [dict(item, bioSampleObject=biosample_obj) for item in
-                                                  sample_data_with_split_aa]
-                sample_json[json_key].extend(sample_data_with_biosample_obj)
+                sample_data.update(bioSampleObject=biosample_obj)
+                sample_json[json_key].append(sample_data)
 
         return sample_json
 
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
index 7716750..69db577 100644
--- a/eva_sub_cli/semantic_metadata.py
+++ b/eva_sub_cli/semantic_metadata.py
@@ -5,7 +5,6 @@
 from ebi_eva_common_pyutils.logger import AppLogger
 
 
-# TODO check for a nicer way to do this
 PROJECT_KEY = 'project'
 ANALYSIS_KEY = 'analysis'
 SAMPLE_KEY = 'sample'
@@ -56,7 +55,7 @@ def check_all_taxonomy_codes(self):
         # Check sample taxonomies for novel samples
         for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
             if BIOSAMPLE_OBJECT_KEY in sample:
-                self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY],
+                self.check_taxonomy_code(sample[BIOSAMPLE_OBJECT_KEY][CHARACTERISTICS_KEY][TAX_ID_KEY][0]['text'],
                                          f'/{SAMPLE_KEY}/{idx}/{BIOSAMPLE_OBJECT_KEY}/{CHARACTERISTICS_KEY}/{TAX_ID_KEY}')
 
     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
@@ -80,7 +79,6 @@ def add_error(self, property, description):
         :param property: JSON property of the error. This will be converted to sheet/row/column in spreadsheet if needed.
         :param description: description of the error.
         """
-        # Ensure that errors match the format of biovalidator errors
         self.errors.append({'property': property, 'description': description})
 
     def check_analysis_alias_coherence(self):
@@ -100,7 +98,7 @@ def same_set(self, list1, list2, list1_desc, list2_desc, json_path):
 
         :param list1: first list to compare
         :param list2: second list to compare
-        :param list1_desc: test description of first list, used only in error message
+        :param list1_desc: text description of first list, used only in error message
         :param list2_desc: text description of second list, used only in error message
         :param json_path: property where the error message will appear
         """
diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
index fbea681..3eb4b1f 100644
--- a/eva_sub_cli/validators/docker_validator.py
+++ b/eva_sub_cli/validators/docker_validator.py
@@ -12,7 +12,7 @@
 logger = logging_config.get_logger(__name__)
 
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev8'
+container_tag = 'v0.0.1.dev9'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
 
diff --git a/tests/resources/EVA_Submission_test.json b/tests/resources/EVA_Submission_test.json
index 2015d31..3be0ce3 100644
--- a/tests/resources/EVA_Submission_test.json
+++ b/tests/resources/EVA_Submission_test.json
@@ -3,7 +3,6 @@
     {
       "lastName": "Smith",
       "firstName": "John",
-      "telephone": "+1234567890",
       "email": "john.smith@example.com",
       "laboratory": "Genomics Lab",
       "centre": "University of Example",
@@ -12,7 +11,6 @@
     {
       "lastName": "Doe",
       "firstName": "Jane",
-      "telephone": "+1234567890",
       "email": "jane.doe@example.com",
       "laboratory": "Bioinformatics Lab",
       "centre": "University of Example",
@@ -21,7 +19,6 @@
   ],
   "project": {
     "title": "Example Project",
-    "projectAlias": "EP",
     "description": "An example project for demonstration purposes",
     "centre": "University of Example",
     "taxId": 9606,
@@ -32,119 +29,99 @@
       "analysisTitle": "Variant Detection 1",
       "analysisAlias": "VD1",
       "description": "An example analysis for demonstration purposes",
-      "ProjectTitle": "Example Project",
       "experimentType": "Whole genome sequencing",
       "referenceGenome": "GCA_000001405.27",
-	  "referenceFasta": "GCA_000001405.27_fasta.fa",
+      "referenceFasta": "GCA_000001405.27_fasta.fa",
       "platform": "BGISEQ-500"
     },
     {
       "analysisTitle": "Variant Detection 2",
       "analysisAlias": "VD2",
       "description": "An example analysis for demonstration purposes",
-      "ProjectTitle": "Example Project",
       "experimentType": "Whole genome sequencing",
       "referenceGenome": "GCA_000001405.27",
-	  "referenceFasta": "GCA_000001405.27_fasta.fa",
+      "referenceFasta": "GCA_000001405.27_fasta.fa",
       "platform": "BGISEQ-500"
     },
     {
       "analysisTitle": "Variant Detection 3",
       "analysisAlias": "VD3",
       "description": "An example analysis for demonstration purposes",
-      "ProjectTitle": "Example Project",
       "experimentType": "Whole genome sequencing",
       "referenceGenome": "GCA_000001405.27",
-	  "referenceFasta": "GCA_000001405.27_fasta.fa",
+      "referenceFasta": "GCA_000001405.27_fasta.fa",
       "platform": "BGISEQ-500"
     }
   ],
   "sample": [
     {
-      "analysisAlias": "VD1",
-      "sampleInVCF": "sample1",
-      "bioSampleAccession": "SAME00001"
-    },
-    {
-      "analysisAlias": "VD2",
-      "sampleInVCF": "sample1",
-      "bioSampleAccession": "SAME00001"
-    },
-    {
-      "analysisAlias": "VD3",
+      "analysisAlias": [
+        "VD1",
+        "VD2",
+        "VD3"
+      ],
       "sampleInVCF": "sample1",
       "bioSampleAccession": "SAME00001"
     },
     {
-      "analysisAlias": "VD1",
-      "sampleInVCF": "sample2",
-      "bioSampleAccession": "SAME00002"
-    },
-    {
-      "analysisAlias": "VD2",
+      "analysisAlias": [
+        "VD1",
+        "VD2",
+        "VD3"
+      ],
       "sampleInVCF": "sample2",
       "bioSampleAccession": "SAME00002"
     },
     {
-      "analysisAlias": "VD3",
-      "sampleInVCF": "sample2",
-      "bioSampleAccession": "SAME00002"
-    },
-    {
-      "analysisAlias": "VD3",
+      "analysisAlias": [
+        "VD3"
+      ],
       "sampleInVCF": "sample3",
       "bioSampleAccession": "SAME00003"
     },
     {
-      "analysisAlias": "VD4",
+      "analysisAlias": [
+        "VD4",
+        "VD5"
+      ],
       "sampleInVCF": "sample4",
       "bioSampleObject": {
         "name": "Lm_17_S8",
         "characteristics": {
-          "bioSampleName": "Lm_17_S8",
           "title": [
-            "Bastet normal sample"
+            {
+              "text": "Bastet normal sample"
+            }
           ],
           "description": [
-            "Test Description"
+            {
+              "text": "Test Description"
+            }
           ],
           "taxId": [
-            9447
+            {
+              "text": "9447"
+            }
           ],
           "scientificName": [
-            "Lemur catta"
+            {
+              "text": "Lemur catta"
+            }
           ],
-          "sex": "Female",
-          "tissueType": "skin",
-          "species": [
-            "Lemur catta"
-          ]
-        }
-      }
-    },
-    {
-      "analysisAlias": "VD5",
-      "sampleInVCF": "sample4",
-      "bioSampleObject": {
-        "name": "Lm_17_S8",
-        "characteristics": {
-          "bioSampleName": "Lm_17_S8",
-          "title": [
-            "Bastet normal sample"
+          "sex": [
+            {
+              "text": "Female"
+            }
           ],
-          "description": [
-            "Test Description"
-          ],
-          "taxId": [
-            9447
-          ],
-          "scientificName": [
-            "Lemur catta"
+          "tissueType": [
+            {
+              "text": "skin"
+            }
           ],
-          "sex": "Female",
-          "tissueType": "skin",
           "species": [
-            "Lemur catta"
+            {
+              "text": "Lemur catta"
+            }
           ]
         }
       }
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
index fc28216..c6ab042 100644
--- a/tests/test_semantic_metadata.py
+++ b/tests/test_semantic_metadata.py
@@ -33,15 +33,14 @@ def test_check_all_taxonomy_codes(self):
                 {
                     "bioSampleObject": {
                         "characteristics": {
-                            # TODO these might need to be lists of strings
-                            "taxId": 9447
+                            "taxId": [{"text": "9447"}]
                         }
                     }
                 },
                 {
                     "bioSampleObject": {
                         "characteristics": {
-                            "taxId": 1234
+                            "taxId": [{"text": "1234"}]
                         }
                     }
                 }
diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py
index f292d06..22dd80f 100644
--- a/tests/test_xlsx2json.py
+++ b/tests/test_xlsx2json.py
@@ -35,8 +35,7 @@ def test_conversion_2_json(self) -> None:
             json_data = json.load(open_file)
             # assert json file is created with expected data
             assert sorted(json_data.keys()) == ['analysis', 'files', 'project', 'sample', 'submitterDetails']
-            print(self.get_expected_json())
-            self.assertTrue(self.get_expected_json() == json_data)
+            self.assertEqual(self.get_expected_json(), json_data)
 
         # assert json schema
         with open(self.eva_schema) as eva_schema_file:
@@ -45,7 +44,7 @@ def test_conversion_2_json(self) -> None:
             biosample_json_schema = json.load(biosample_schema_file)
 
         # assert created json file sample field conforms to eva-biosamples schema
-        jsonschema.validate(json_data['sample'][7]['bioSampleObject'], biosample_json_schema)
+        jsonschema.validate(json_data['sample'][3]['bioSampleObject'], biosample_json_schema)
 
         # assert created json file conform to eva_schema
         resolver = jsonschema.RefResolver.from_schema(eva_json_schema)
@@ -133,92 +132,48 @@ def get_expected_json(self):
             ],
             "sample": [
                 {
-                    "analysisAlias": "VD1",
-                    "sampleInVCF": "sample1",
-                    "bioSampleAccession": "SAME00001"
-                },
-                {
-                    "analysisAlias": "VD2",
-                    "sampleInVCF": "sample1",
-                    "bioSampleAccession": "SAME00001"
-                },
-                {
-                    "analysisAlias": "VD3",
+                    "analysisAlias": ["VD1", "VD2", "VD3"],
                     "sampleInVCF": "sample1",
                     "bioSampleAccession": "SAME00001"
                 },
                 {
-                    "analysisAlias": "VD1",
+                    "analysisAlias": ["VD1", "VD2", "VD3"],
                     "sampleInVCF": "sample2",
                     "bioSampleAccession": "SAME00002"
                 },
                 {
-                    "analysisAlias": "VD2",
-                    "sampleInVCF": "sample2",
-                    "bioSampleAccession": "SAME00002"
-                },
-                {
-                    "analysisAlias": "VD3",
-                    "sampleInVCF": "sample2",
-                    "bioSampleAccession": "SAME00002"
-                },
-                {
-                    "analysisAlias": "VD3",
+                    "analysisAlias": ["VD3"],
                     "sampleInVCF": "sample3",
                     "bioSampleAccession": "SAME00003"
                 },
                 {
-                    "analysisAlias": "VD4",
-                    "sampleInVCF": "sample4",
-                    "bioSampleObject": {
-                      "name": "Lm_17_S8",
-                      "characteristics": {
-                        "bioSampleName": "Lm_17_S8",
-                        "title": [
-                          "Bastet normal sample"
-                        ],
-                        "description": [
-                          "Test Description"
-                        ],
-                        "taxId": [
-                          9447
-                        ],
-                        "scientificName": [
-                          "Lemur catta"
-                        ],
-                        "sex": "Female",
-                        "tissueType": "skin",
-                        "species": [
-                          "Lemur catta"
-                        ]
-                      }
-                    }
-                 },
-                {
-                    "analysisAlias": "VD5",
+                    "analysisAlias": ["VD4", "VD5"],
                     "sampleInVCF": "sample4",
                     "bioSampleObject": {
-                      "name": "Lm_17_S8",
-                      "characteristics": {
-                        "bioSampleName": "Lm_17_S8",
-                        "title": [
-                          "Bastet normal sample"
-                        ],
-                        "description": [
-                          "Test Description"
-                        ],
-                        "taxId": [
-                          9447
-                        ],
-                        "scientificName": [
-                          "Lemur catta"
-                        ],
-                        "sex": "Female",
-                        "tissueType": "skin",
-                        "species": [
-                          "Lemur catta"
-                        ]
-                      }
+                        "name": "Lm_17_S8",
+                        "characteristics": {
+                            "title": [
+                                {"text": "Bastet normal sample"}
+                            ],
+                            "description": [
+                                {"text": "Test Description"}
+                            ],
+                            "taxId": [
+                                {"text": "9447"}
+                            ],
+                            "scientificName": [
+                                {"text": "Lemur catta"}
+                            ],
+                            "sex": [
+                                {"text": "Female"}
+                            ],
+                            "tissueType": [
+                                {"text": "skin"}
+                            ],
+                            "species": [
+                                {"text": "Lemur catta"}
+                            ]
+                        }
                     }
                 }
             ],

From 9a044ed0d27f3cb0f442d71ebffc1b9141424a39 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Tue, 9 Jul 2024 15:20:44 +0100
Subject: [PATCH 06/11] fix more tests

---
 bin/xlsx2json.py                              |  6 +-
 eva_sub_cli/metadata_utils.py                 |  3 +-
 .../EVA_Submission_test_with_asm_report.json  | 97 ++++++++-----------
 tests/resources/sample_checker/metadata.json  | 26 +----
 tests/test_xlsx2json.py                       |  7 ++
 5 files changed, 55 insertions(+), 84 deletions(-)

diff --git a/bin/xlsx2json.py b/bin/xlsx2json.py
index 250d10a..767adaa 100644
--- a/bin/xlsx2json.py
+++ b/bin/xlsx2json.py
@@ -241,7 +241,7 @@ def get_sample_json_data(self):
         json_key = self.xlsx_conf[WORKSHEETS_KEY_NAME][SAMPLE]
         sample_json = {json_key: []}
         for row in self.get_rows():
-            row.pop('row_num')
+            row_num = row.pop('row_num')
             json_value = {self.translate_header(SAMPLE, k): v for k, v in row.items() if v is not None}
             bio_sample_acc = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_ACCESSION_KEY]
 
@@ -262,12 +262,12 @@ def get_sample_json_data(self):
                 if sample_name not in json_value:
                     self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
                                    f'{SAMPLE_NAME_KEY} populated',
-                                   sheet=SAMPLE, column=SAMPLE_NAME_KEY)
+                                   sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY)
                     return None
                 if scientific_name not in json_value:
                     self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
                                    f'{SCIENTIFIC_NAME_KEY} populated',
-                                   sheet=SAMPLE, column=SCIENTIFIC_NAME_KEY)
+                                   sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY)
                     return None
 
                 biosample_obj = self.get_biosample_object(json_value)
diff --git a/eva_sub_cli/metadata_utils.py b/eva_sub_cli/metadata_utils.py
index 77a8a7c..ce967ae 100644
--- a/eva_sub_cli/metadata_utils.py
+++ b/eva_sub_cli/metadata_utils.py
@@ -10,7 +10,8 @@ def get_samples_per_analysis(metadata):
     """Returns mapping of analysis alias to sample names, based on metadata."""
     samples_per_analysis = defaultdict(list)
     for sample_info in metadata.get('sample', []):
-        samples_per_analysis[sample_info.get('analysisAlias')].append(sample_info.get('sampleInVCF'))
+        for analysis_alias in sample_info.get('analysisAlias', []):
+            samples_per_analysis[analysis_alias].append(sample_info.get('sampleInVCF'))
     return {
         analysis_alias: set(samples)
         for analysis_alias, samples in samples_per_analysis.items()
diff --git a/tests/resources/EVA_Submission_test_with_asm_report.json b/tests/resources/EVA_Submission_test_with_asm_report.json
index 892a0c2..5c3ede3 100644
--- a/tests/resources/EVA_Submission_test_with_asm_report.json
+++ b/tests/resources/EVA_Submission_test_with_asm_report.json
@@ -64,90 +64,73 @@
   ],
   "sample": [
     {
-      "analysisAlias": "VD1",
-      "sampleInVCF": "sample1",
-      "bioSampleAccession": "SAME00001"
-    },
-    {
-      "analysisAlias": "VD2",
-      "sampleInVCF": "sample1",
-      "bioSampleAccession": "SAME00001"
-    },
-    {
-      "analysisAlias": "VD3",
+      "analysisAlias": [
+        "VD1",
+        "VD2",
+        "VD3"
+      ],
       "sampleInVCF": "sample1",
       "bioSampleAccession": "SAME00001"
     },
     {
-      "analysisAlias": "VD1",
-      "sampleInVCF": "sample2",
-      "bioSampleAccession": "SAME00002"
-    },
-    {
-      "analysisAlias": "VD2",
+      "analysisAlias": [
+        "VD1",
+        "VD2",
+        "VD3"
+      ],
       "sampleInVCF": "sample2",
       "bioSampleAccession": "SAME00002"
     },
     {
-      "analysisAlias": "VD3",
-      "sampleInVCF": "sample2",
-      "bioSampleAccession": "SAME00002"
-    },
-    {
-      "analysisAlias": "VD3",
+      "analysisAlias": [
+        "VD3"
+      ],
       "sampleInVCF": "sample3",
       "bioSampleAccession": "SAME00003"
     },
     {
-      "analysisAlias": "VD4",
+      "analysisAlias": [
+        "VD4",
+        "VD5"
+      ],
       "sampleInVCF": "sample4",
       "bioSampleObject": {
         "name": "Lm_17_S8",
         "characteristics": {
-          "bioSampleName": "Lm_17_S8",
           "title": [
-            "Bastet normal sample"
+            {
+              "text": "Bastet normal sample"
+            }
           ],
           "description": [
-            "Test Description"
+            {
+              "text": "Test Description"
+            }
           ],
           "taxId": [
-            9447
+            {
+              "text": "9447"
+            }
           ],
           "scientificName": [
-            "Lemur catta"
+            {
+              "text": "Lemur catta"
+            }
           ],
-          "sex": "Female",
-          "tissueType": "skin",
-          "species": [
-            "Lemur catta"
-          ]
-        }
-      }
-    },
-    {
-      "analysisAlias": "VD5",
-      "sampleInVCF": "sample4",
-      "bioSampleObject": {
-        "name": "Lm_17_S8",
-        "characteristics": {
-          "bioSampleName": "Lm_17_S8",
-          "title": [
-            "Bastet normal sample"
+          "sex": [
+            {
+              "text": "Female"
+            }
           ],
-          "description": [
-            "Test Description"
-          ],
-          "taxId": [
-            9447
-          ],
-          "scientificName": [
-            "Lemur catta"
+          "tissueType": [
+            {
+              "text": "skin"
+            }
           ],
-          "sex": "Female",
-          "tissueType": "skin",
           "species": [
-            "Lemur catta"
+            {
+              "text": "Lemur catta"
+            }
           ]
         }
       }
diff --git a/tests/resources/sample_checker/metadata.json b/tests/resources/sample_checker/metadata.json
index 8854cde..20a9d0b 100644
--- a/tests/resources/sample_checker/metadata.json
+++ b/tests/resources/sample_checker/metadata.json
@@ -23,37 +23,17 @@
   },
   "sample": [
     {
-      "analysisAlias":  "VD1",
+      "analysisAlias":  ["VD1", "VD2", "VD3"],
       "sampleInVCF":  "sample1",
       "BioSampleAccession": "SAME00001"
     },
     {
-      "analysisAlias":  "VD1",
+      "analysisAlias":  ["VD1", "VD2", "VD3"],
       "sampleInVCF":  "sample2",
       "BioSampleAccession": "SAME00002"
     },
     {
-      "analysisAlias":  "VD2",
-      "sampleInVCF":  "sample1",
-      "BioSampleAccession": "SAME00001"
-    },
-    {
-      "analysisAlias":  "VD2",
-      "sampleInVCF":  "sample2",
-      "BioSampleAccession": "SAME00002"
-    },
-    {
-      "analysisAlias":  "VD3",
-      "sampleInVCF":  "sample1",
-      "BioSampleAccession": "SAME00001"
-    },
-    {
-      "analysisAlias":  "VD3",
-      "sampleInVCF":  "sample2",
-      "BioSampleAccession": "SAME00002"
-    },
-    {
-      "analysisAlias":  "VD3",
+      "analysisAlias":  ["VD3"],
       "sampleInVCF":  "sample3",
       "BioSampleAccession": "SAME00003"
     }
diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py
index 22dd80f..84a6c63 100644
--- a/tests/test_xlsx2json.py
+++ b/tests/test_xlsx2json.py
@@ -29,7 +29,14 @@ def test_conversion_2_json(self) -> None:
         xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx')
         self.parser = XlsxParser(xls_filename, self.conf_filename)
         output_json = os.path.join(self.resource_dir, 'EVA_Submission_test_output.json')
+        errors_yaml = os.path.join(self.resource_dir, 'EVA_Submission_test_errors.yml')
         self.parser.json(output_json)
+        self.parser.save_errors(errors_yaml)
+
+        # confirm no errors
+        with open(errors_yaml) as open_file:
+            errors_data = yaml.safe_load(open_file)
+            assert errors_data == []
 
         with open(output_json) as open_file:
             json_data = json.load(open_file)

From 58af1877556492b0962d0051960ccdc78a28b8e9 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Tue, 9 Jul 2024 15:40:46 +0100
Subject: [PATCH 07/11] fix test

---
 tests/test_docker_validator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
index d736235..5486804 100644
--- a/tests/test_docker_validator.py
+++ b/tests/test_docker_validator.py
@@ -35,7 +35,7 @@ def setUp(self):
             "submitterDetails": [],
             "project": {},
             "sample": [
-                {"analysisAlias": "AA", "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
+                {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
             ],
             "analysis": [
                 {"analysisAlias": "AA"}

From 57b933068b71f5653a018aef610acb84fdef7734 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Tue, 9 Jul 2024 16:11:03 +0100
Subject: [PATCH 08/11] test for ENA project accession pattern

---
 eva_sub_cli/nextflow/nextflow.config | 3 +++
 tests/test_docker_validator.py       | 9 ++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 eva_sub_cli/nextflow/nextflow.config

diff --git a/eva_sub_cli/nextflow/nextflow.config b/eva_sub_cli/nextflow/nextflow.config
new file mode 100644
index 0000000..c2e6d9e
--- /dev/null
+++ b/eva_sub_cli/nextflow/nextflow.config
@@ -0,0 +1,3 @@
+process {
+    errorStrategy = 'finish'
+}
\ No newline at end of file
diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
index 5486804..28fc2c3 100644
--- a/tests/test_docker_validator.py
+++ b/tests/test_docker_validator.py
@@ -33,7 +33,9 @@ def setUp(self):
                             [os.path.join(self.assembly_reports, 'input_passed.txt')])
         sub_metadata = {
             "submitterDetails": [],
-            "project": {},
+            "project": {
+                "parentProject": "PRJ_INVALID"
+            },
             "sample": [
                 {"analysisAlias": ["AA"], "sampleInVCF": "HG00096", "bioSampleAccession": "SAME0000096"}
             ],
@@ -131,6 +133,11 @@ def test_validate(self):
                  'md5': '96a80c9368cc3c37095c86fbe6044fb2'}
             ]
 
+        # Check metadata errors
+        with open(os.path.join(self.validator.output_dir, 'other_validations', 'metadata_validation.txt')) as open_file:
+            metadata_val_lines = {l.strip() for l in open_file.readlines()}
+            assert 'must match pattern "^PRJ(EB|NA)\d+$"' in metadata_val_lines
+
     def test_validate_from_excel(self):
         self.validator_from_excel.validate()
         self.assertTrue(os.path.isfile(self.validator_from_excel._sample_check_yaml))

From f17ae6b471f39343191841280bb6553df82c02b4 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Wed, 10 Jul 2024 08:54:39 +0100
Subject: [PATCH 09/11] catch ConnectionError in insdc check

---
 bin/check_fasta_insdc.py       | 2 +-
 tests/test_docker_validator.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/check_fasta_insdc.py b/bin/check_fasta_insdc.py
index fb44a4c..8b2c727 100644
--- a/bin/check_fasta_insdc.py
+++ b/bin/check_fasta_insdc.py
@@ -123,7 +123,7 @@ def assess_fasta(input_fasta, analyses, assembly_in_metadata):
                 possible_assemblies = containing_assemblies
             else:
                 possible_assemblies &= containing_assemblies
-    except HTTPError as e:
+    except (ConnectionError, HTTPError) as e:
         # Server errors from either ENA refget or EVA contig alias will halt the check prematurely.
         # Report the error but do not return from the method, so that incomplete results can be reported
         # (i.e. any sequences found to be INSDC and any compatible assemblies so far)
diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py
index 28fc2c3..72aba00 100644
--- a/tests/test_docker_validator.py
+++ b/tests/test_docker_validator.py
@@ -136,7 +136,7 @@ def test_validate(self):
         # Check metadata errors
         with open(os.path.join(self.validator.output_dir, 'other_validations', 'metadata_validation.txt')) as open_file:
             metadata_val_lines = {l.strip() for l in open_file.readlines()}
-            assert 'must match pattern "^PRJ(EB|NA)\d+$"' in metadata_val_lines
+            assert 'must match pattern "^PRJ(EB|NA)\\d+$"' in metadata_val_lines
 
     def test_validate_from_excel(self):
         self.validator_from_excel.validate()

From 0a1f7539466a57746cdd713014c6cd185301b3db Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Fri, 12 Jul 2024 09:12:50 +0100
Subject: [PATCH 10/11] cache taxonomy codes

---
 eva_sub_cli/nextflow/nextflow.config |  3 ---
 eva_sub_cli/semantic_metadata.py     | 17 +++++++++++++----
 tests/test_semantic_metadata.py      |  5 +++--
 3 files changed, 16 insertions(+), 9 deletions(-)
 delete mode 100644 eva_sub_cli/nextflow/nextflow.config

diff --git a/eva_sub_cli/nextflow/nextflow.config b/eva_sub_cli/nextflow/nextflow.config
deleted file mode 100644
index c2e6d9e..0000000
--- a/eva_sub_cli/nextflow/nextflow.config
+++ /dev/null
@@ -1,3 +0,0 @@
-process {
-    errorStrategy = 'finish'
-}
\ No newline at end of file
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
index 69db577..a11d370 100644
--- a/eva_sub_cli/semantic_metadata.py
+++ b/eva_sub_cli/semantic_metadata.py
@@ -28,6 +28,8 @@ class SemanticMetadataChecker(AppLogger):
     def __init__(self, metadata):
         self.metadata = metadata
         self.errors = []
+        # Caches whether taxonomy code is valid or not
+        self.taxonomy_valid = {}
 
     def write_result_yaml(self, output_path):
         with open(output_path, 'w') as open_yaml:
@@ -67,10 +69,17 @@ def check_project_accession(self, project_acc, json_path):
 
     @retry(tries=4, delay=2, backoff=1.2, jitter=(1, 3))
     def check_taxonomy_code(self, taxonomy_code, json_path):
-        try:
-            download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
-        except Exception:
-            self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
+        taxonomy_code = int(taxonomy_code)
+        if taxonomy_code in self.taxonomy_valid:
+            if self.taxonomy_valid[taxonomy_code] is False:
+                self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
+        else:
+            try:
+                download_xml_from_ena(f'https://www.ebi.ac.uk/ena/browser/api/xml/{taxonomy_code}')
+                self.taxonomy_valid[taxonomy_code] = True
+            except Exception:
+                self.add_error(json_path, f'{taxonomy_code} is not a valid taxonomy code')
+                self.taxonomy_valid[taxonomy_code] = False
 
     def add_error(self, property, description):
         """
diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py
index c6ab042..ab61f7b 100644
--- a/tests/test_semantic_metadata.py
+++ b/tests/test_semantic_metadata.py
@@ -33,7 +33,7 @@ def test_check_all_taxonomy_codes(self):
                 {
                     "bioSampleObject": {
                         "characteristics": {
-                            "taxId": [{"text": "9447"}]
+                            "taxId": [{"text": "9606"}]
                         }
                     }
                 },
@@ -48,7 +48,8 @@ def test_check_all_taxonomy_codes(self):
         }
         checker = SemanticMetadataChecker(metadata)
         with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download:
-            m_ena_download.side_effect = [True, True, Exception('problem downloading')]
+            # Mock should only be called once per taxonomy code
+            m_ena_download.side_effect = [True, Exception('problem downloading')]
             checker.check_all_taxonomy_codes()
             self.assertEqual(checker.errors, [
                 {

From d14b80f9be93b184ad7c2b3139c09265ad286830 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Mon, 15 Jul 2024 10:16:08 +0100
Subject: [PATCH 11/11] mark integration tests and add insdc check unit tests

---
 .github/workflows/tests.yml     |   2 +-
 pytest.ini                      |   3 +
 tests/test_check_fasta_insdc.py | 107 +++++++++++++++++++++-----------
 3 files changed, 76 insertions(+), 36 deletions(-)
 create mode 100644 pytest.ini

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9dca22e..901f7e7 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -33,4 +33,4 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        PYTHONPATH=. pytest tests
+        PYTHONPATH=. pytest tests -k "not integration"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..183f1c9
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    integration: integration test
diff --git a/tests/test_check_fasta_insdc.py b/tests/test_check_fasta_insdc.py
index a1d34ff..116e6cc 100644
--- a/tests/test_check_fasta_insdc.py
+++ b/tests/test_check_fasta_insdc.py
@@ -2,6 +2,7 @@
 from unittest import TestCase
 from unittest.mock import patch
 
+import pytest
 import requests as requests
 
 from bin.check_fasta_insdc import assess_fasta, get_analyses_and_reference_genome_from_metadata
@@ -10,14 +11,77 @@
 class TestFastaChecker(TestCase):
     resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
 
+    def test_get_analysis_and_reference_genome_from_metadata(self):
+        metadata_json = os.path.join(self.resource_dir, 'sample_checker', 'metadata.json')
+        vcf_file = os.path.join(self.resource_dir, 'sample_checker', 'example1.vcf.gz')
+        analyses, reference = get_analyses_and_reference_genome_from_metadata([vcf_file], metadata_json)
+        assert analyses == {'VD1'}
+        assert reference == 'GCA_000001405.27'
+
     def test_assess_fasta_is_insdc(self):
         input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa')
-        results = assess_fasta(input_fasta, ['analysis'], None)
-        assert results == {
-            'all_insdc': True,
-            'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}],
-            'possible_assemblies': {'GCA_000146045.2'}
-        }
+        with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \
+                patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies:
+            m_get_refget.return_value = {'sequence_name': 'chr1'}
+            m_get_assemblies.return_value = {'GCA_000146045.2'}
+            results = assess_fasta(input_fasta, ['analysis'], None)
+            assert results == {
+                'all_insdc': True,
+                'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}],
+                'possible_assemblies': {'GCA_000146045.2'}
+            }
+        with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \
+                patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies:
+            m_get_refget.return_value = None
+            m_get_assemblies.return_value = set()
+            results = assess_fasta(input_fasta, ['analysis'], None)
+            assert results == {
+                'all_insdc': False,
+                'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': False}]
+            }
+
+    def test_assess_fasta_matches_metadata(self):
+        input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa')
+        with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \
+                patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies:
+            m_get_refget.return_value = {'sequence_name': 'I'}
+            m_get_assemblies.return_value = {'GCA_000146045.2'}
+            results = assess_fasta(input_fasta, ['analysis'], 'GCA_000146045.2')
+            assert results == {
+                'all_insdc': True,
+                'sequences': [
+                    {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}],
+                'possible_assemblies': {'GCA_000146045.2'},
+                'metadata_assembly_compatible': True,
+                'associated_analyses': ['analysis'],
+                'assembly_in_metadata': 'GCA_000146045.2'
+            }
+            results = assess_fasta(input_fasta, ['analysis'], 'GCA_002915635.1')
+            assert results == {
+                'all_insdc': True,
+                'sequences': [
+                    {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}],
+                'possible_assemblies': {'GCA_000146045.2'},
+                'metadata_assembly_compatible': False,
+                'associated_analyses': ['analysis'],
+                'assembly_in_metadata': 'GCA_002915635.1'
+            }
+
+    def test_assess_fasta_http_error(self):
+        input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa')
+        with patch('bin.check_fasta_insdc.get_refget_metadata', autospec=True) as m_get_refget, \
+                patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies:
+            m_get_refget.return_value = {'sequence_name': 'I'}
+            m_get_assemblies.side_effect = requests.HTTPError('500 Internal Server Error')
+            results = assess_fasta(input_fasta, ['analysis'], None)
+            assert results == {
+                'all_insdc': True,
+                'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}],
+                'connection_error': '500 Internal Server Error'
+            }
+
+    @pytest.mark.integration
+    def test_assess_fasta_is_insdc_integration(self):
         input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'input_passed.fa')
         results = assess_fasta(input_fasta, ['analysis'], None)
         assert results == {
@@ -25,7 +89,8 @@ def test_assess_fasta_is_insdc(self):
             'sequences': [{'sequence_name': 'chr1', 'sequence_md5': 'd2b3f22704d944f92a6bc45b6603ea2d', 'insdc': False}]
         }
 
-    def test_assess_fasta_matches_metadata(self):
+    @pytest.mark.integration
+    def test_assess_fasta_matches_metadata_integration(self):
         input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa')
         results = assess_fasta(input_fasta, ['analysis'], 'GCA_000146045.2')
         assert results == {
@@ -37,31 +102,3 @@ def test_assess_fasta_matches_metadata(self):
             'associated_analyses': ['analysis'],
             'assembly_in_metadata': 'GCA_000146045.2'
         }
-        results = assess_fasta(input_fasta, ['analysis'], 'GCA_002915635.1')
-        assert results == {
-            'all_insdc': True,
-            'sequences': [
-                {'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}],
-            'possible_assemblies': {'GCA_000146045.2'},
-            'metadata_assembly_compatible': False,
-            'associated_analyses': ['analysis'],
-            'assembly_in_metadata': 'GCA_002915635.1'
-        }
-
-    def test_get_analysis_and_reference_genome_from_metadata(self):
-        metadata_json = os.path.join(self.resource_dir, 'sample_checker', 'metadata.json')
-        vcf_file = os.path.join(self.resource_dir, 'sample_checker', 'example1.vcf.gz')
-        analyses, reference = get_analyses_and_reference_genome_from_metadata([vcf_file], metadata_json)
-        assert analyses == {'VD1'}
-        assert reference == 'GCA_000001405.27'
-
-    def test_assess_fasta_http_error(self):
-        input_fasta = os.path.join(self.resource_dir, 'fasta_files', 'Saccharomyces_cerevisiae_I.fa')
-        with patch('bin.check_fasta_insdc._get_containing_assemblies_paged', autospec=True) as m_get_assemblies:
-            m_get_assemblies.side_effect = requests.HTTPError('500 Internal Server Error')
-            results = assess_fasta(input_fasta, ['analysis'], None)
-            assert results == {
-                'all_insdc': True,
-                'sequences': [{'sequence_name': 'I', 'sequence_md5': '6681ac2f62509cfc220d78751b8dc524', 'insdc': True}],
-                'connection_error': '500 Internal Server Error'
-            }