EBIvariation · apriltuesday · Sep 16, 2024 · Sep 6, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/eva_sub_cli/executables/xlsx2json.py b/eva_sub_cli/executables/xlsx2json.py
@@ -52,14 +52,16 @@ def __init__(self, xlsx_filename, conf_filename):
         try:
             self.workbook = load_workbook(xlsx_filename, read_only=True)
         except Exception as e:
-            self.add_error(f'Error loading {xlsx_filename}: {e}')
+            self.add_error(f'Error loading {xlsx_filename}: {repr(e)}')
+            self.file_loaded = False
             return
-        self.worksheets = None
+        self.worksheets = []
         self._active_worksheet = None
         self.row_offset = {}
         self.headers = {}
-        self.valid = None
+        self.file_loaded = True
         self.errors = []
+        self.valid_worksheets()
 
     @property
     def active_worksheet(self):
@@ -77,14 +79,8 @@ def active_worksheet(self, worksheet):
 
     def valid_worksheets(self):
         """
-        Get the list of the names of worksheets which have all the configured required headers
-        :return: list of valid worksheet names in the Excel file
-        :rtype: list
+        Populate self.worksheets and self.headers for the worksheets which have the expected title and header row.
         """
-        if self.worksheets is not None:
-            return self.worksheets
-
-        self.worksheets = []
         sheet_titles = self.workbook.sheetnames
 
         for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
@@ -97,31 +93,10 @@ def valid_worksheets(self):
             header_row = self.xlsx_conf[title].get(HEADERS_KEY_ROW, 1)
             if worksheet.max_row < header_row + 1:
                 continue
-            # Check required headers are present
+            # Store headers and worksheet title
             self.headers[title] = [cell.value if cell.value is None else cell.value.strip()
                                    for cell in worksheet[header_row]]
-            required_headers = self.xlsx_conf[title].get(REQUIRED_HEADERS_KEY_NAME, [])
-            if set(required_headers) <= set(self.headers[title]):  # issubset
-                self.worksheets.append(title)
-            else:
-                missing_headers = set(required_headers) - set(self.headers[title])
-                for header in missing_headers:
-                    self.add_error(f'Worksheet {title} is missing required header {header}',
-                                   sheet=title, column=header)
-
-        return self.worksheets
-
-    def is_valid(self):
-        """
-        Check that is all the worksheets contain required headers
-        :return: True if all the worksheets contain required headers. False otherwise
-        :rtype: bool
-        """
-        if self.valid is None:
-            self.valid = True
-            self.valid_worksheets()
-
-        return self.valid
+            self.worksheets.append(title)
 
     @staticmethod
     def cast_value(value, type_name):
@@ -219,16 +194,17 @@ def get_biosample_object(self, data):
         scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]
 
         # BioSample expects any of organism or species field
-        data[SPECIES] = data[scientific_name]
+        if scientific_name in data:
+            data[SPECIES] = data[scientific_name]
         # BioSample name goes in its own attribute, not part of characteristics
-        biosample_name = data.pop(sample_name)
-        # For all characteristics, BioSample expects value in arrays of objects
-        data = {k: [{'text': self.serialize(v)}] for k, v in data.items()}
+        biosample_name = data.pop(sample_name, None)
 
+        # For all characteristics, BioSample expects value in arrays of objects
         biosample_object = {
-            "name": biosample_name,
-            "characteristics": data
+            'characteristics': {k: [{'text': self.serialize(v)}] for k, v in data.items()}
         }
+        if biosample_name is not None:
+            biosample_object['name'] = biosample_name
 
         return biosample_object
 
@@ -263,30 +239,15 @@ def get_sample_json_data(self):
                 json_value.pop(analysis_alias)
                 json_value.pop(sample_name_in_vcf)
 
-                # Check for headers that are required only in this case
-                sample_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_NAME_KEY]
-                scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]
-                if sample_name not in json_value:
-                    self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
-                                   f'{SAMPLE_NAME_KEY} populated',
-                                   sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY)
-                    return None
-                if scientific_name not in json_value:
-                    self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
-                                   f'{SCIENTIFIC_NAME_KEY} populated',
-                                   sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY)
-                    return None
-
                 biosample_obj = self.get_biosample_object(json_value)
                 sample_data.update(bioSampleObject=biosample_obj)
                 sample_json[json_key].append(sample_data)
 
         return sample_json
 
     def json(self, output_json_file):
-        # First check that all sheets present have the required headers;
-        # also guards against the case where conversion fails in init
-        if not self.is_valid():
+        # If the file could not be loaded at all, return without generating JSON.
+        if not self.file_loaded:
             return
         json_data = {}
         for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
@@ -295,8 +256,6 @@ def json(self, output_json_file):
                 json_data.update(self.get_project_json_data())
             elif title == SAMPLE:
                 sample_data = self.get_sample_json_data()
-                if sample_data is None:  # missing conditionally required headers
-                    return
                 json_data.update(sample_data)
             else:
                 json_data[self.xlsx_conf[WORKSHEETS_KEY_NAME][title]] = []
@@ -324,7 +283,6 @@ def add_error(self, message, sheet='', row='', column=''):
         """Adds a conversion error using the same structure as other validation errors,
-        and marks the spreadsheet as invalid."""
+        appending it to self.errors (no validity flag is updated here)."""
         self.errors.append({'sheet': sheet, 'row': row, 'column': column, 'description': message})
-        self.valid = False
 
     def save_errors(self, errors_yaml_file):
         with open(errors_yaml_file, 'w') as open_file:

diff --git a/eva_sub_cli/validators/docker_validator.py b/eva_sub_cli/validators/docker_validator.py
@@ -1,4 +1,3 @@
-import argparse
 import csv
 import os
 import re
@@ -12,7 +11,7 @@
 logger = logging_config.get_logger(__name__)
 
 container_image = 'ebivariation/eva-sub-cli'
-container_tag = 'v0.0.1.dev16'
+container_tag = 'v0.0.1.dev17'
 container_validation_dir = '/opt/vcf_validation'
 container_validation_output_dir = 'vcf_validation_output'
 

diff --git a/eva_sub_cli/validators/validation_results_parsers.py b/eva_sub_cli/validators/validation_results_parsers.py
@@ -134,6 +134,10 @@ def clean_read(ifile):
                 if line.startswith('Validation failed with following error(s):'):
                     collect = True
             else:
+                while line and not line.startswith('/'):
+                    # Sometimes there are multiple (possibly redundant) errors listed under a single property;
+                    # we only report the first, so skip ahead to the next property line (one starting with '/')
+                    line = clean_read(open_file)
                 line2 = clean_read(open_file)
                 if line is None or line2 is None:
                     break  # EOF
@@ -164,6 +168,9 @@ def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
     attributes_dict = {}
     attributes_dict.update(xls2json_conf[sheet].get('required', {}))
     attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
+    attributes_dict['Scientific Name'] = 'species'
+    attributes_dict['BioSample Name'] = 'name'
+
     for attribute in attributes_dict:
         if attributes_dict[attribute] == json_attribute:
             return attribute
@@ -185,7 +192,12 @@ def parse_metadata_property(property_str):
 
 
 def parse_sample_metadata_property(property_str):
+    # Check characteristics
     match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
     if match:
         return 'sample', match.group(1), match.group(2)
+    # Check name
+    match = re.match(r'/sample/(\d+)/bioSampleObject/name', property_str)
+    if match:
+        return 'sample', match.group(1), 'name'
     return None, None, None
diff --git a/eva_sub_cli/validators/validator.py b/eva_sub_cli/validators/validator.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 import csv
 import datetime
-import glob
 import json
 import logging
 import os
@@ -345,7 +344,7 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
             sheet = convert_metadata_sheet(sheet_json, xls2json_conf)
             row = convert_metadata_row(sheet, row_json, xls2json_conf)
             column = convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
-            if row_json is None and attribute_json is None:
+            if row_json is None and attribute_json is None and sheet is not None:
                 new_description = f'Sheet "{sheet}" is missing'
             elif row_json is None:
                 if 'have required' not in error['description']:

diff --git a/tests/resources/EVA_Submission_test_fails.xlsx b/tests/resources/EVA_Submission_test_fails.xlsx
diff --git a/...ces/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml b/...ces/validation_reports/validation_output/other_validations/metadata_conversion_errors.yml
@@ -1,4 +1,4 @@
-- column: Tax ID
-  description: Worksheet Project is missing required header Tax ID
+- column: ''
+  description: 'Error loading problem.xlsx: Exception()'
   row: ''
-  sheet: Project
+  sheet: ''
diff --git a/.../resources/validation_reports/validation_output/other_validations/metadata_validation.txt b/.../resources/validation_reports/validation_output/other_validations/metadata_validation.txt
@@ -26,4 +26,19 @@
 	should have required property 'bioSampleObject'
 /sample/0
 	should match exactly one schema in oneOf
+/sample/3/bioSampleObject/name
+        must have required property 'name'
+        must have required property 'name'
+        must have required property 'name'
+/sample/3/bioSampleObject/characteristics/organism
+        must have required property 'organism'
+        must have required property 'organism'
+/sample/3/bioSampleObject/characteristics/Organism
+        must have required property 'Organism'
+/sample/3/bioSampleObject/characteristics/species
+        must have required property 'species'
+/sample/3/bioSampleObject/characteristics/Species
+        must have required property 'Species'
+/sample/3/bioSampleObject/characteristics
+        must match a schema in anyOf
  [0m