From 32749377bf34c1b54c1a9ddbfae84705c97d446b Mon Sep 17 00:00:00 2001
From: Damion Dooley
Date: Wed, 4 Sep 2024 23:38:44 -0700
Subject: [PATCH] Create dh-validate.py

This is the new validation script for tsv/csv/xlsx/xls data files.
---
 script/dh-validate.py | 523 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 523 insertions(+)
 create mode 100755 script/dh-validate.py

diff --git a/script/dh-validate.py b/script/dh-validate.py
new file mode 100755
index 00000000..9598fb44
--- /dev/null
+++ b/script/dh-validate.py
@@ -0,0 +1,523 @@
+#!/usr/bin/env python3
+
+# dh-validate.py
+#
+# A script to validate DataHarmonizer data files against a given schema and,
+# optionally, a class, for data files that don't identify which class their
+# fields belong to. Passes the data file on to linkml-validate.
+#
+# Navigate to a particular /web/templates/[template folder]/schema.yaml
+#
+# Options:
+#   -s, --schema FILE        Schema file to validate data against
+#   -C, --target-class TEXT
+#   -S, --index-slot TEXT    Top-level slot. Required for CSV dumping/loading
+#   -V, --version            Show the version and exit.
+#
+# > cd web/templates/[template folder]/
+# > python ../../../script/dh-validate.py
+#
+# Allowed data file types: '.tsv','.csv','.xls','.xlsx','.json','.yml','.yaml'
+#
+# dh-validate.py --schema schema.yaml test_good.csv
+# dh-validate.py --schema schema.yaml --target-class "CanCOGeN Covid-19" test_good.csv
+#
+# To prepare tsv or csv files for the above validation, the first line of a
+# DataHarmonizer-generated data file, which holds its section headers, must
+# be removed, and if the 2nd line has spaces in its column/slot names, these
+# must be replaced by underscores. Sed can do both:
+#
+# > sed '1d;2 s/ /_/g' exampleInput/validTestData_2-1-2.tsv > test_good.tsv
+#
+# Author: Damion Dooley
+#
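+# For illustration (hypothetical column names; real DH exports vary by
+# template), a two-line DataHarmonizer header such as:
+#
+#   Database identifiers<TAB><TAB>
+#   specimen collector sample ID<TAB>sample collected by
+#
+# becomes, after the sed command above, the single header row:
+#
+#   specimen_collector_sample_ID<TAB>sample_collected_by
+#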
Default is "schema.yaml"'); + + parser.add_argument("-C", "--target-class", + dest="target_class", + metavar="TEXT", + required=False, + help="A schema class to test given data file against."); + + parser.add_argument("-S", "--index-slot", + dest="index_slot", + metavar="TEXT", + required=False, + help="A slot within target class which acts as an identifier or unique key. This is required for CSV dumping/loading. It is optional when this slot can be identified automatically as the only identifier or unique_key in the slot."); + + parser.add_argument('data_sources', + metavar='DATA_SOURCES', + #nargs='+', + help="One or more data files to validate using given schema."); + + """ parser.add_argument("-V", "--version", + dest="version", + default=False, + help="Return version # and then exit.", + ) + """ + + return parser.parse_args(); + + +# For a given schema, return the class that possibly fits data to be +# validated. Also return the "index_slot" or likely identifier or +# primary key slot, useful in converting tabular tsv / csv data. +# +# Currently tries to match inputted target_class if any. +# Alternately returns class if there is only one in schema. +# Future: could sniff data to see what matches best in case where +# there are multiple classes. + +def getTargetClass(SCHEMA, target_class, slot_key = None): + templates = {}; + for name, class_obj in SCHEMA.all_classes().items(): + # Generate schema's list of possible template classes: + if name == 'dh_interface': # Archaic class + continue; + templates[name] = class_obj; + + if target_class and not target_class in templates: + exit("The given validation target class [" + target_class + "] was not found in schema!"); + + # So far many DH schemas only have one class so only possibility is to validate against that: + if not target_class: + if len(templates) == 1: + target_class = next(iter(templates.keys())); + else: + # FUTURE: determine if one of the multiple classes is a good match for given data. + exit("No validation class was provided, and no default could be found!"); + + # Determined target_class at this point. 
+
+
+# A dictionary of all the sections that may appear on the first line of a DH
+# data file.
+def getSlotGroupTitleDict(SCHEMA, target_class):
+
+    slot_group_titles = {}
+
+    template = SCHEMA.get_class(target_class)
+    for name in SCHEMA.class_slots(target_class):
+        slot = template['attributes'][name]
+        #print("SLOT", slot)
+        if slot['slot_group']:
+            slot_group_titles[slot['slot_group']] = True
+
+    return slot_group_titles
+
+
+def getSlotTitleToNameDict(SCHEMA, target_class):
+    slot_title_name_map = {}
+
+    template = SCHEMA.get_class(target_class)
+    for name in SCHEMA.class_slots(target_class):
+        slot = template['attributes'][name]
+        slot_title_name_map[slot['title']] = name
+
+    return slot_title_name_map
+
+
+# Used to compose the columns of a new output file. Uses the class.slots
+# array to determine order, which should be the same as the order in
+# slot_usage and its rank. FUTURE: ensure sort by slot_usage rank, if any.
+def getSlotNameToTitleDict(SCHEMA, target_class):
+    slot_name_title_map = OrderedDict()
+
+    template = SCHEMA.get_class(target_class)
+    for name in SCHEMA.class_slots(target_class):
+        slot = template['attributes'][name]
+        slot_name_title_map[name] = slot['title'] or None
+
+    return slot_name_title_map
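+
+
+# For example (hypothetical slot), a slot declared as
+#
+#   sample_id:
+#     title: sample ID
+#
+# yields slot_title_name_map['sample ID'] == 'sample_id' from
+# getSlotTitleToNameDict(), while getSlotNameToTitleDict() gives the inverse
+# mapping, preserved in class_slots() order for composing output columns.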
+
+
+# LinkML validation doesn't care about the order of fields, so we can add
+# missing schema headers in the appropriate columns (according to ordering)
+# with empty values. This enables a "new" tabular data file to be saved.
+# Subsequent data columns have to be mapped over, however.
+#
+# Returns converted headers along with a report of mismatched headers, which
+# arise when an old schema is applied to a newer data file or vice versa.
+# In contrast to JSON, schema-version-appropriate tabular data should mention
+# ALL columns/slots in SCHEMA.
+def getNormalizedHeaders(SCHEMA, target_class, row, slot_header_map, slot_title_map):
+
+    report = OrderedDict()
+    header_count = 0
+
+    # data_row_map will always have a place for every position in row.
+    data_row_map = []
+    ignored = OrderedDict()
+
+    for field in row:
+        if field in slot_header_map:  # Row already mentions slot name.
+            data_row_map.append(field)
+            header_count += 1
+            continue
+
+        if field in slot_title_map:  # Row mentions slot title, so translate.
+            data_row_map.append(slot_title_map[field])
+            report[field] = 'Mapped "' + field + '" to ' + slot_title_map[field]
+            header_count += 1
+            continue
+
+        # Possibly old naming that doesn't match title but would match name; try:
+        # try_name = lower(regexreplace(regexreplace(field, "[ /]", "_"), "[-()]", ""))
+        # ...
+
+        data_row_map.append('')
+        ignored[field] = 'Ignored "' + field + '", not in schema.'
+
+    # TO DO: Determine how to handle rows with fewer or more unmatched columns.
+    if header_count < len(row):
+        exit("Data file is missing fields: " + str(report))
+        # FUTURE: allow shorter (empty tail) rows? NO.
+        # Take out unused columns?
+
+    report |= ignored  # Add ignored fields to the end of the report.
+
+    return (tuple(data_row_map), report, header_count)
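+
+
+# Sketch of the mapping behaviour (hypothetical slots): with schema slot
+# 'sample_id' titled 'sample ID' and slot 'collected_by' titled 'collected
+# by', the header row
+#   ['sample_id', 'collected by', 'mystery column']
+# maps to ('sample_id', 'collected_by', ''); the report records the title
+# translation and 'mystery column' is flagged as ignored. Since header_count
+# (2) < len(row) (3), the current code then exits with the "missing fields"
+# message.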
+
+
+# A normalized data file has tsv/csv/xls/xlsx files converted to JSON format.
+# (This does not handle .yaml, .yml, .json-ld since those are validated ok
+# with data elements saved by LinkML slot name rather than title.)
+#
+# FUTURE: handle the situation where for every section there is only one slot
+# name/title.
+def getNormalizedDataFile(SCHEMA, target_class, data_source, temp_base):
+
+    if not os.path.isfile(data_source):
+        exit("ERROR: Data file not found: " + data_source)
+
+    # slot_group_titles = getSlotGroupTitleDict(SCHEMA, target_class)
+    slot_title_map = getSlotTitleToNameDict(SCHEMA, target_class)
+    slot_header_map = getSlotNameToTitleDict(SCHEMA, target_class)
+    file_path_obj = pathlib.Path(data_source)
+    reader = None
+
+    if file_path_obj.suffix in ['.xls', '.xlsx']:
+        file_mode = "rb"
+    else:
+        file_mode = "r"
+
+    with open(data_source, file_mode) as data_handle:
+
+        match file_path_obj.suffix:
+
+            case '.yaml' | '.yml':
+                exit("ERROR: getNormalizedDataFile() does not process yaml/yml files.")
+
+            # For CSV and TSV, if reader.fieldnames has empty labels, it means
+            # it's not a header line, i.e. we are skipping a "section" line or
+            # other preamble. (We don't even need to use "next(data_handle)":
+            # constructing a DictReader consumes one line of the handle when
+            # reading .fieldnames.) Here we find reader.fieldnames AND move
+            # the reader TO THE FIRST ROW OF DATA.
+            case '.csv' | '.tsv':
+                while True:
+                    reader = csv.DictReader(data_handle, dialect=('excel' if file_path_obj.suffix == ".csv" else 'excel-tab'))
+                    # An empty fieldname indicates a superfluous initial row.
+                    if '' not in reader.fieldnames:
+                        # Found a row of full table cells. Additional test on cell text = slot name/title?
+                        (header_row, report, header_count) = getNormalizedHeaders(SCHEMA, target_class, reader.fieldnames, slot_header_map, slot_title_map)
+                        break
+
+            # Excel types can hold multiple tabs, each of which needs to be
+            # validated separately. Excel returns a tuple, one value position
+            # for each column.
+            case '.xlsx':
+
+                workbook = load_workbook(data_source)
+
+                if target_class in workbook.sheetnames:  # e.g. ['Sheet1'], also note "sheet.title"
+                    sheet = workbook.worksheets[workbook.sheetnames.index(target_class)]
+                else:
+                    if len(workbook.sheetnames) > 1:
+                        exit("ERROR: getNormalizedDataFile() cannot find " + target_class + " tab in excel spreadsheet tabs: " + str(workbook.sheetnames))
+                    else:
+                        sheet = workbook.worksheets[0]  # Pick the one and only tab/sheet.
+
+                reader = sheet.iter_rows(values_only=True)
+
+                for row in reader:
+                    # Skip superfluous header lines:
+                    if '' not in row:
+                        # Found a row with a value in each cell, so likely a header.
+                        (header_row, report, header_count) = getNormalizedHeaders(SCHEMA, target_class, row, slot_header_map, slot_title_map)
+                        break
+
+            # Aligning .xls with the same output as .xlsx:
+            case '.xls':
+
+                workbook = xlrd.open_workbook(data_source)
+                sheets = workbook.nsheets
+                sheetnames = workbook.sheet_names()
+                if target_class in sheetnames:
+                    sheet = workbook.sheet_by_index(sheetnames.index(target_class))
+                else:
+                    if sheets > 1:
+                        exit("ERROR: getNormalizedDataFile() cannot find " + target_class + " tab in excel spreadsheet tabs: " + str(sheetnames))
+                    else:
+                        sheet = workbook.sheet_by_index(0)
+
+                # Returns an array of values for each row:
+                reader = iter(tuple([sheet.cell_value(rx, cx) for cx in range(sheet.ncols)]) for rx in range(sheet.nrows))
+
+                for row in reader:
+                    # Skip superfluous header lines:
+                    if '' not in row:
+                        (header_row, report, header_count) = getNormalizedHeaders(SCHEMA, target_class, row, slot_header_map, slot_title_map)
+                        break
+
+            case _:
+                exit("ERROR: Data file doesn't have a compatible type ('.tsv/.csv/.xls/.xlsx/.json/.yml/.yaml'): " + data_source)
+
+        # for item in report:
+        #     print(report[item])
+
+        # Write both normalized TSV and YAML files. (Renamed from "yaml" so
+        # the yaml module import is not shadowed.)
+        template = SCHEMA.get_class(target_class)
+        yaml_data = writeTmpFiles(SCHEMA, template, header_row, reader, temp_base)
+
+    return yaml_data
+
+
+def writeTmpFiles(SCHEMA, template, header_row, reader, temp_base):
+    # The first row of DH tabular data may be slot_groups, and the 2nd DH row
+    # likely has column/field/slot names which need adjustment.
+    # (newline='' prevents an extra blank line.)
+    data = []
+
+    with open(temp_base + '.tsv', 'w', newline='') as tsv_file:
+
+        writer = csv.DictWriter(tsv_file, fieldnames=header_row, dialect='excel-tab')
+        writer.writeheader()
+
+        # We are in the data rows now, which can be written to the new file:
+        for row in reader:
+
+            if type(row) is tuple:  # xls/xlsx: row is a tuple of cell values.
+                row_data = {k: v for k, v in zip(header_row, row)}
+
+            else:  # tsv/csv dict: e.g. row = {'first name': 'foo', ...}, where keys have to be converted.
+                row_data = {k: row[v] for k, v in zip(header_row, row)}
+            #print("ROW:", row_data)
+
+            # YAML only gets fields that have values, as well as
+            # transformation of some data types:
+            data.append(getLinkMLTransform(SCHEMA, template, row_data))
+            writer.writerow(row_data)
+
+    YAMLDumper().dump(data, temp_base + '.yaml')
+    return data
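+
+
+# Sketch of the round trip (hypothetical slots and values): a normalized row
+#   {'sample_id': 'S-001', 'count': '42', 'note': ''}
+# is written as-is to <temp_base>.tsv, while the record appended to the
+# <temp_base>.yaml dump has empty fields dropped and ranges coerced by
+# getLinkMLTransform(), e.g. {'sample_id': 'S-001', 'count': 42} when the
+# count slot has an integer range.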
+
+
+# Could be made more efficient by running a vertical loop on columns of a
+# given data type.
+def getLinkMLTransform(SCHEMA, template, row_data):
+
+    data = {}
+    for key, val in row_data.items():
+        if val:  # Only return dict keys that have values.
+            slot = template['attributes'][key]
+            output_val = val
+            ranges = []
+
+            if slot['range']:
+                ranges = [slot['range']]
+            else:
+                for range_type in ['any_of', 'exactly_one_of', 'none_of', 'all_of']:
+                    if slot[range_type]:
+                        # e.g. 'any_of': [AnonymousSlotExpression({'range': 'decimal'}),
+                        #                 AnonymousSlotExpression({'range': 'NullValueMenu'})]
+                        ranges = [binding.range for binding in slot[range_type]]
+                        break
+
+            # ISSUE: If a slot is integer, decimal or date, but its value is
+            # saved as a string in the yaml file, linkml-validate throws an
+            # error. We must adjust the saved datatype.
+            for slot_range in ranges:
+                match slot_range:
+                    case 'integer':
+                        if isInteger(val):
+                            output_val = int(val)
+                    case 'decimal' | 'float':
+                        if isDecimal(val):  # Note str.isdecimal() does NOT test for decimals.
+                            if '.' in val:
+                                output_val = float(val)
+                            else:
+                                output_val = int(val)
+                    #case 'date':
+                    case _:  # Nothing to do, but this is an error situation since all slots have ranges.
+                        #print(SCHEMA.get_enums(slot_range))
+                        pass
+
+            if slot['multivalued'] == True:
+                output_val = [output_val]
+            data[key] = output_val
+
+    return data
+
+
+def isDecimal(x):
+    try:
+        float(x)
+        return True
+    except ValueError:
+        return False
+
+
+def isInteger(x):
+    try:
+        int(x)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+###############################################################################
+
+warnings = []
+
+args = init_parser()
+
+if not os.path.isfile(args.schema_path):
+    exit("LinkML schema file not found: " + args.schema_path)
+
+with open(args.schema_path, "r") as schema_handle:
+
+    # Use SchemaView() to generate inferred slot attributes, like in the
+    # schema.json generated by tabular_to_schema.py. This converts the
+    # schema, loaded as a Python dict, into a LinkML SchemaView object.
+    schema_obj = yaml.safe_load(schema_handle)
+
+    if not "classes" in schema_obj or not "slots" in schema_obj:
+        exit("Given schema is missing classes or slots.")
+
+    SCHEMA = SchemaView(yaml.dump(schema_obj, sort_keys=False))
+    # Brings in any "imports:". This also includes built-in linkml:types.
+    SCHEMA.merge_imports()
+
+    # Loop through each class and replace it with its induced version, which
+    # includes an attributes dictionary containing inferred slot definitions.
+    for name, class_obj in SCHEMA.all_classes().items():
+        # Note classDef["@type"]: "ClassDefinition" is only in json output.
+        if SCHEMA.class_slots(name):
+            new_obj = SCHEMA.induced_class(name)
+            SCHEMA.add_class(new_obj)
+
+    (target_class, slot_key) = getTargetClass(SCHEMA, args.target_class, args.index_slot)
+    # HAD TO normalize target class: "CanCoGEN Covid-19" => "CanCOGeNCovid19"
+
+    # Cycle through each data_source file to validate. (Multiple files can be
+    # passed as one space-separated argument.)
+    for data_source in args.data_sources.split():
+
+        print("VALIDATING: ", data_source)
+
+        file_path_obj = pathlib.Path(data_source)
+        if file_path_obj.suffix in ['.json', '.json-ld', '.yaml', '.yml']:
+
+            # FUTURE: Handle slot name / title variations here too.
+            validation_file = data_source
+
+        else:
+            # Deal with section headers and column headers given as titles.
+            # Writes a temporary file with all fields renamed.
+            temp_file = file_path_obj.stem + ".tmp"
+            getNormalizedDataFile(SCHEMA, target_class, data_source, temp_file)
+            validation_file = temp_file + '.yaml'
+
+        subprocess.run(["linkml-validate", "-s", args.schema_path, "-C", target_class, validation_file])  # input='foobar'.encode('utf-8')
+
+    print("File scan complete.")
+
+if len(warnings):
+    print("\nWARNING: \n", "\n ".join(warnings))
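+
+
+# A possible in-process alternative to the subprocess call above, once the
+# import issue noted at the top of this file is resolved (untested sketch;
+# assumes a linkml release that exposes linkml.validator.validate):
+#
+#   from linkml.validator import validate
+#   report = validate(data, args.schema_path, target_class)
+#   for result in report.results:
+#       print(result.severity, result.message)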
+
+
+# SNIPPETS:
+#
+#stderr = None
+#try:
+# e.g. > linkml-convert -s schema.yaml -C CanCOGeNCovid19 --index-slot specimen_collector_sample_id -o validTestData_2-1-2.tmp.tsv.json validTestData_2-1-2.tmp.tsv
+# ISSUE: slots with range "any_of" may have content, but if required=True they ARE THROWING an ERROR.
+#run_state = subprocess.check_output(["linkml-convert", "-s", args.schema_path, "-C", target_class, "--index-slot", slot_key, "-o", temp_file + '.json', temp_file])  #, stderr=subprocess.STDOUT
+"""
+except BaseException as inst:
+
+    print(type(inst))  # the exception type
+    print(inst.args)   # arguments stored in .args
+    print(inst)        # __str__ allows args to be printed directly,
+                       # but may be overridden in exception subclasses
+
+finally:
+"""
+
+"""
+# A challenge trying to get linkml-validate working via the Python module:
+
+report = validate(data_handle, args.schema_path, "Person")  # , "Person"
+if not report.results:
+    print('The instance is valid!')
+else:
+    for result in report.results:
+        print(result.message)
+"""
\ No newline at end of file