Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3564 - Simplify metadata conversion and validation #55

Merged
merged 5 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 17 additions & 59 deletions eva_sub_cli/executables/xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,16 @@ def __init__(self, xlsx_filename, conf_filename):
try:
self.workbook = load_workbook(xlsx_filename, read_only=True)
except Exception as e:
self.add_error(f'Error loading {xlsx_filename}: {e}')
self.add_error(f'Error loading {xlsx_filename}: {repr(e)}')
self.file_loaded = False
return
self.worksheets = None
self.worksheets = []
self._active_worksheet = None
self.row_offset = {}
self.headers = {}
self.valid = None
self.file_loaded = True
self.errors = []
self.valid_worksheets()

@property
def active_worksheet(self):
Expand All @@ -77,14 +79,8 @@ def active_worksheet(self, worksheet):

def valid_worksheets(self):
"""
Get the list of the names of worksheets which have all the configured required headers
:return: list of valid worksheet names in the Excel file
:rtype: list
Get the list of the names of worksheets which have the expected title and header row.
"""
if self.worksheets is not None:
return self.worksheets

self.worksheets = []
sheet_titles = self.workbook.sheetnames

for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
Expand All @@ -97,31 +93,10 @@ def valid_worksheets(self):
header_row = self.xlsx_conf[title].get(HEADERS_KEY_ROW, 1)
if worksheet.max_row < header_row + 1:
continue
# Check required headers are present
# Store headers and worksheet title
self.headers[title] = [cell.value if cell.value is None else cell.value.strip()
for cell in worksheet[header_row]]
required_headers = self.xlsx_conf[title].get(REQUIRED_HEADERS_KEY_NAME, [])
if set(required_headers) <= set(self.headers[title]): # issubset
self.worksheets.append(title)
else:
missing_headers = set(required_headers) - set(self.headers[title])
for header in missing_headers:
self.add_error(f'Worksheet {title} is missing required header {header}',
sheet=title, column=header)

return self.worksheets

def is_valid(self):
"""
Check that is all the worksheets contain required headers
:return: True if all the worksheets contain required headers. False otherwise
:rtype: bool
"""
if self.valid is None:
self.valid = True
self.valid_worksheets()

return self.valid
self.worksheets.append(title)

@staticmethod
def cast_value(value, type_name):
Expand Down Expand Up @@ -219,16 +194,17 @@ def get_biosample_object(self, data):
scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]

# BioSample expects any of organism or species field
data[SPECIES] = data[scientific_name]
if scientific_name in data:
data[SPECIES] = data[scientific_name]
Comment on lines +197 to +198
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth having a mechanism that convert a excel spreadsheet into more than one BioSample field ?
We could then provide this in the spreadsheet2json_conf.yaml as

Sample:
  header_row: 3
  optional:
    Scientific Name: [scientificName, species]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could do, it would only be used for this case though as far as I know - the name field goes outside of characteristics so has to be handled differently. I probably won't add it to this PR but we should keep it in mind.

# BioSample name goes in its own attribute, not part of characteristics
biosample_name = data.pop(sample_name)
# For all characteristics, BioSample expects value in arrays of objects
data = {k: [{'text': self.serialize(v)}] for k, v in data.items()}
biosample_name = data.pop(sample_name, None)

# For all characteristics, BioSample expects value in arrays of objects
biosample_object = {
"name": biosample_name,
"characteristics": data
'characteristics': {k: [{'text': self.serialize(v)}] for k, v in data.items()}
}
if biosample_name is not None:
biosample_object['name'] = biosample_name

return biosample_object

Expand Down Expand Up @@ -263,30 +239,15 @@ def get_sample_json_data(self):
json_value.pop(analysis_alias)
json_value.pop(sample_name_in_vcf)

# Check for headers that are required only in this case
sample_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SAMPLE_NAME_KEY]
scientific_name = self.xlsx_conf[SAMPLE][OPTIONAL_HEADERS_KEY_NAME][SCIENTIFIC_NAME_KEY]
if sample_name not in json_value:
self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
f'{SAMPLE_NAME_KEY} populated',
sheet=SAMPLE, row=row_num, column=SAMPLE_NAME_KEY)
return None
if scientific_name not in json_value:
self.add_error(f'If BioSample Accession is not provided, the {SAMPLE} worksheet should have '
f'{SCIENTIFIC_NAME_KEY} populated',
sheet=SAMPLE, row=row_num, column=SCIENTIFIC_NAME_KEY)
return None

biosample_obj = self.get_biosample_object(json_value)
sample_data.update(bioSampleObject=biosample_obj)
sample_json[json_key].append(sample_data)

return sample_json

def json(self, output_json_file):
# First check that all sheets present have the required headers;
# also guards against the case where conversion fails in init
if not self.is_valid():
# If the file could not be loaded at all, return without generating JSON.
if not self.file_loaded:
return
json_data = {}
for title in self.xlsx_conf[WORKSHEETS_KEY_NAME]:
Expand All @@ -295,8 +256,6 @@ def json(self, output_json_file):
json_data.update(self.get_project_json_data())
elif title == SAMPLE:
sample_data = self.get_sample_json_data()
if sample_data is None: # missing conditionally required headers
return
json_data.update(sample_data)
else:
json_data[self.xlsx_conf[WORKSHEETS_KEY_NAME][title]] = []
Expand Down Expand Up @@ -324,7 +283,6 @@ def add_error(self, message, sheet='', row='', column=''):
"""Adds a conversion error using the same structure as other validation errors,
and marks the spreadsheet as invalid."""
self.errors.append({'sheet': sheet, 'row': row, 'column': column, 'description': message})
self.valid = False

def save_errors(self, errors_yaml_file):
with open(errors_yaml_file, 'w') as open_file:
Expand Down
3 changes: 1 addition & 2 deletions eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import argparse
import csv
import os
import re
Expand All @@ -12,7 +11,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev16'
container_tag = 'v0.0.1.dev17'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
12 changes: 12 additions & 0 deletions eva_sub_cli/validators/validation_results_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def clean_read(ifile):
if line.startswith('Validation failed with following error(s):'):
collect = True
else:
while line and not line.startswith('/'):
# Sometimes there are multiple (possibly redundant) errors listed under a single property,
# we only report the first
Comment on lines +138 to +139
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably should report that to biovalidator.js

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I'll make an issue for them.

line = clean_read(open_file)
line2 = clean_read(open_file)
if line is None or line2 is None:
break # EOF
Expand Down Expand Up @@ -164,6 +168,9 @@ def convert_metadata_attribute(sheet, json_attribute, xls2json_conf):
attributes_dict = {}
attributes_dict.update(xls2json_conf[sheet].get('required', {}))
attributes_dict.update(xls2json_conf[sheet].get('optional', {}))
attributes_dict['Scientific Name'] = 'species'
attributes_dict['BioSample Name'] = 'name'

for attribute in attributes_dict:
if attributes_dict[attribute] == json_attribute:
return attribute
Expand All @@ -185,7 +192,12 @@ def parse_metadata_property(property_str):


def parse_sample_metadata_property(property_str):
# Check characteristics
match = re.match(r'/sample/(\d+)/bioSampleObject/characteristics/(\w+)', property_str)
if match:
return 'sample', match.group(1), match.group(2)
# Check name
match = re.match(r'/sample/(\d+)/bioSampleObject/name', property_str)
if match:
return 'sample', match.group(1), 'name'
return None, None, None
3 changes: 1 addition & 2 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python
import csv
import datetime
import glob
import json
import logging
import os
Expand Down Expand Up @@ -345,7 +344,7 @@ def _convert_biovalidator_validation_to_spreadsheet(self):
sheet = convert_metadata_sheet(sheet_json, xls2json_conf)
row = convert_metadata_row(sheet, row_json, xls2json_conf)
column = convert_metadata_attribute(sheet, attribute_json, xls2json_conf)
if row_json is None and attribute_json is None:
if row_json is None and attribute_json is None and sheet is not None:
new_description = f'Sheet "{sheet}" is missing'
elif row_json is None:
if 'have required' not in error['description']:
Expand Down
Binary file modified tests/resources/EVA_Submission_test_fails.xlsx
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- column: Tax ID
description: Worksheet Project is missing required header Tax ID
- column: ''
description: 'Error loading problem.xlsx: Exception()'
row: ''
sheet: Project
sheet: ''
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,19 @@
should have required property 'bioSampleObject'
/sample/0
should match exactly one schema in oneOf
/sample/3/bioSampleObject/name
must have required property 'name'
must have required property 'name'
must have required property 'name'
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure what's going on here, but this is the output I get from biovalidator...

/sample/3/bioSampleObject/characteristics/organism
must have required property 'organism'
must have required property 'organism'
/sample/3/bioSampleObject/characteristics/Organism
must have required property 'Organism'
/sample/3/bioSampleObject/characteristics/species
must have required property 'species'
/sample/3/bioSampleObject/characteristics/Species
must have required property 'Species'
/sample/3/bioSampleObject/characteristics
must match a schema in anyOf

Loading
Loading