From 32749377bf34c1b54c1a9ddbfae84705c97d446b Mon Sep 17 00:00:00 2001
From: Damion Dooley
Date: Wed, 4 Sep 2024 23:38:44 -0700
Subject: [PATCH] Create dh-validate.py

This is the new validation script for tsv/csv/xlsx/xls data files.
---
 script/dh-validate.py | 523 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 523 insertions(+)
 create mode 100755 script/dh-validate.py

diff --git a/script/dh-validate.py b/script/dh-validate.py
new file mode 100755
index 00000000..9598fb44
--- /dev/null
+++ b/script/dh-validate.py
@@ -0,0 +1,523 @@
+#!/usr/bin/env python3
+
+# dh-validate.py
+#
+# A script to validate DataHarmonizer data files against a given schema and,
+# optionally, a class, for data files that don't identify which class their
+# fields belong to. Passes the data file on to linkml-validate.
+#
+# Navigate to a particular /web/templates/[template folder]/schema.yaml
+#
+# Options:
+#   -s, --schema FILE        Schema file to validate data against
+#   -C, --target-class TEXT
+#   -S, --index-slot TEXT    Top-level slot. Required for CSV dumping/loading
+#   -V, --version            Show the version and exit.
+#
+# > cd web/templates/[template folder]/
+# > python ../../../script/dh-validate.py
+#
+# Allowed data file types: '.tsv','.csv','.xls','.xlsx','.json','.yml','.yaml'
+#
+# dh-validate.py --schema schema.yaml test_good.csv
+# dh-validate.py --schema schema.yaml --target-class "CanCOGeN Covid-19" test_good.csv
+#
+# To prepare tsv or csv files for the above validation, the first line of a
+# DataHarmonizer-generated data file, which holds its section headers, must
+# be removed, and if the 2nd line has spaces in its column/slot names, these
+# must be replaced by underscores. Sed can do both:
+#
+# > sed '1d;2 s/ /_/g' exampleInput/validTestData_2-1-2.tsv > test_good.tsv
+#
+# Author: Damion Dooley
+#
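+# For illustration (hypothetical column names; real DH exports vary by
+# template), a two-line DataHarmonizer header such as:
+#
+#   Database identifiers<TAB><TAB>
+#   specimen collector sample ID<TAB>sample collected by
+#
+# becomes, after the sed command above, the single header row:
+#
+#   specimen_collector_sample_ID<TAB>sample_collected_by
+#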
Default is "schema.yaml"'); + + parser.add_argument("-C", "--target-class", + dest="target_class", + metavar="TEXT", + required=False, + help="A schema class to test given data file against."); + + parser.add_argument("-S", "--index-slot", + dest="index_slot", + metavar="TEXT", + required=False, + help="A slot within target class which acts as an identifier or unique key. This is required for CSV dumping/loading. It is optional when this slot can be identified automatically as the only identifier or unique_key in the slot."); + + parser.add_argument('data_sources', + metavar='DATA_SOURCES', + #nargs='+', + help="One or more data files to validate using given schema."); + + """ parser.add_argument("-V", "--version", + dest="version", + default=False, + help="Return version # and then exit.", + ) + """ + + return parser.parse_args(); + + +# For a given schema, return the class that possibly fits data to be +# validated. Also return the "index_slot" or likely identifier or +# primary key slot, useful in converting tabular tsv / csv data. +# +# Currently tries to match inputted target_class if any. +# Alternately returns class if there is only one in schema. +# Future: could sniff data to see what matches best in case where +# there are multiple classes. + +def getTargetClass(SCHEMA, target_class, slot_key = None): + templates = {}; + for name, class_obj in SCHEMA.all_classes().items(): + # Generate schema's list of possible template classes: + if name == 'dh_interface': # Archaic class + continue; + templates[name] = class_obj; + + if target_class and not target_class in templates: + exit("The given validation target class [" + target_class + "] was not found in schema!"); + + # So far many DH schemas only have one class so only possibility is to validate against that: + if not target_class: + if len(templates) == 1: + target_class = next(iter(templates.keys())); + else: + # FUTURE: determine if one of the multiple classes is a good match for given data. + exit("No validation class was provided, and no default could be found!"); + + # Determined target_class at this point. 
+
+
+# A dictionary of all the sections that may appear on the first line of a DH
+# data file.
+def getSlotGroupTitleDict(SCHEMA, target_class):
+
+    slot_group_titles = {}
+
+    template = SCHEMA.get_class(target_class)
+    for name in SCHEMA.class_slots(target_class):
+        slot = template['attributes'][name]
+        #print("SLOT", slot)
+        if slot['slot_group']:
+            slot_group_titles[slot['slot_group']] = True
+
+    return slot_group_titles
+
+
+def getSlotTitleToNameDict(SCHEMA, target_class):
+    slot_title_name_map = {}
+
+    template = SCHEMA.get_class(target_class)
+    for name in SCHEMA.class_slots(target_class):
+        slot = template['attributes'][name]
+        slot_title_name_map[slot['title']] = name
+
+    return slot_title_name_map
+
+
+# Used to compose the columns of a new output file. Uses the class.slots
+# array to determine order, which should be the same as the order in
+# slot_usage and its rank. FUTURE: ensure sort by slot_usage rank, if any.
+def getSlotNameToTitleDict(SCHEMA, target_class):
+    slot_name_title_map = OrderedDict()
+
+    template = SCHEMA.get_class(target_class)
+    for name in SCHEMA.class_slots(target_class):
+        slot = template['attributes'][name]
+        slot_name_title_map[name] = slot['title'] or None
+
+    return slot_name_title_map
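+
+
+# For example (hypothetical slot), a slot declared as
+#
+#   sample_id:
+#     title: sample ID
+#
+# yields slot_title_name_map['sample ID'] == 'sample_id' from
+# getSlotTitleToNameDict(), while getSlotNameToTitleDict() gives the inverse
+# mapping, preserved in class_slots() order for composing output columns.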
+
+
+# LinkML validation doesn't care about the order of fields, so we can add
+# missing schema headers in the appropriate columns (according to ordering)
+# with empty values. This enables a "new" tabular data file to be saved.
+# Subsequent data columns have to be mapped over, however.
+#
+# Returns converted headers along with a report of mismatched headers, which
+# arise when an old schema is applied to a newer data file or vice versa.
+# In contrast to JSON, schema-version-appropriate tabular data should mention
+# ALL columns/slots in SCHEMA.
+def getNormalizedHeaders(SCHEMA, target_class, row, slot_header_map, slot_title_map):
+
+    report = OrderedDict()
+    header_count = 0
+
+    # data_row_map will always have a place for every position in row.
+    data_row_map = []
+    ignored = OrderedDict()
+
+    for field in row:
+        if field in slot_header_map:  # Row already mentions slot name.
+            data_row_map.append(field)
+            header_count += 1
+            continue
+
+        if field in slot_title_map:  # Row mentions slot title, so translate.
+            data_row_map.append(slot_title_map[field])
+            report[field] = 'Mapped "' + field + '" to ' + slot_title_map[field]
+            header_count += 1
+            continue
+
+        # Possibly old naming that doesn't match title but would match name; try:
+        # try_name = lower(regexreplace(regexreplace(field, "[ /]", "_"), "[-()]", ""))
+        # ...
+
+        data_row_map.append('')
+        ignored[field] = 'Ignored "' + field + '", not in schema.'
+
+    # TO DO: Determine how to handle rows with fewer or more unmatched columns.
+    if header_count < len(row):
+        exit("Data file is missing fields: " + str(report))
+        # FUTURE: allow shorter (empty tail) rows? NO.
+        # Take out unused columns?
+
+    report |= ignored  # Add ignored fields to the end of the report.
+
+    return (tuple(data_row_map), report, header_count)
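+
+
+# Sketch of the mapping behaviour (hypothetical slots): with schema slot
+# 'sample_id' titled 'sample ID' and slot 'collected_by' titled 'collected
+# by', the header row
+#   ['sample_id', 'collected by', 'mystery column']
+# maps to ('sample_id', 'collected_by', ''); the report records the title
+# translation and 'mystery column' is flagged as ignored. Since header_count
+# (2) < len(row) (3), the current code then exits with the "missing fields"
+# message.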
+
+
+# A normalized data file has tsv/csv/xls/xlsx files converted to JSON format.
+# (This does not handle .yaml, .yml, .json-ld since those are validated ok
+# with data elements saved by LinkML slot name rather than title.)
+#
+# FUTURE: handle the situation where for every section there is only one slot
+# name/title.
+def getNormalizedDataFile(SCHEMA, target_class, data_source, temp_base):
+
+    if not os.path.isfile(data_source):
+        exit("ERROR: Data file not found: " + data_source)
+
+    # slot_group_titles = getSlotGroupTitleDict(SCHEMA, target_class)
+    slot_title_map = getSlotTitleToNameDict(SCHEMA, target_class)
+    slot_header_map = getSlotNameToTitleDict(SCHEMA, target_class)
+    file_path_obj = pathlib.Path(data_source)
+    reader = None
+
+    if file_path_obj.suffix in ['.xls', '.xlsx']:
+        file_mode = "rb"
+    else:
+        file_mode = "r"
+
+    with open(data_source, file_mode) as data_handle:
+
+        match file_path_obj.suffix:
+
+            case '.yaml' | '.yml':
+                exit("ERROR: getNormalizedDataFile() does not process yaml/yml files.")
+
+            # For CSV and TSV, if reader.fieldnames has empty labels, it means
+            # it's not a header line, i.e. we are skipping a "section" line or
+            # other preamble. (We don't even need to use "next(data_handle)":
+            # constructing a DictReader consumes one line of the handle when
+            # reading .fieldnames.) Here we find reader.fieldnames AND move
+            # the reader TO THE FIRST ROW OF DATA.
+            case '.csv' | '.tsv':
+                while True:
+                    reader = csv.DictReader(data_handle, dialect=('excel' if file_path_obj.suffix == ".csv" else 'excel-tab'))
+                    # An empty fieldname indicates a superfluous initial row.
+                    if '' not in reader.fieldnames:
+                        # Found a row of full table cells. Additional test on cell text = slot name/title?
+                        (header_row, report, header_count) = getNormalizedHeaders(SCHEMA, target_class, reader.fieldnames, slot_header_map, slot_title_map)
+                        break
+
+            # Excel types can hold multiple tabs, each of which needs to be
+            # validated separately. Excel returns a tuple, one value position
+            # for each column.
+            case '.xlsx':
+
+                workbook = load_workbook(data_source)
+
+                if target_class in workbook.sheetnames:  # e.g. ['Sheet1'], also note "sheet.title"
+                    sheet = workbook.worksheets[workbook.sheetnames.index(target_class)]
+                else:
+                    if len(workbook.sheetnames) > 1:
+                        exit("ERROR: getNormalizedDataFile() cannot find " + target_class + " tab in excel spreadsheet tabs: " + str(workbook.sheetnames))
+                    else:
+                        sheet = workbook.worksheets[0]  # Pick the one and only tab/sheet.
+
+                reader = sheet.iter_rows(values_only=True)
+
+                for row in reader:
+                    # Skip superfluous header lines:
+                    if '' not in row:
+                        # Found a row with a value in each cell, so likely a header.
+                        (header_row, report, header_count) = getNormalizedHeaders(SCHEMA, target_class, row, slot_header_map, slot_title_map)
+                        break
+
+            # Aligning .xls with the same output as .xlsx:
+            case '.xls':
+
+                workbook = xlrd.open_workbook(data_source)
+                sheets = workbook.nsheets
+                sheetnames = workbook.sheet_names()
+                if target_class in sheetnames:
+                    sheet = workbook.sheet_by_index(sheetnames.index(target_class))
+                else:
+                    if sheets > 1:
+                        exit("ERROR: getNormalizedDataFile() cannot find " + target_class + " tab in excel spreadsheet tabs: " + str(sheetnames))
+                    else:
+                        sheet = workbook.sheet_by_index(0)
+
+                # Returns an array of values for each row:
+                reader = iter(tuple([sheet.cell_value(rx, cx) for cx in range(sheet.ncols)]) for rx in range(sheet.nrows))
+
+                for row in reader:
+                    # Skip superfluous header lines:
+                    if '' not in row:
+                        (header_row, report, header_count) = getNormalizedHeaders(SCHEMA, target_class, row, slot_header_map, slot_title_map)
+                        break
+
+            case _:
+                exit("ERROR: Data file doesn't have a compatible type ('.tsv/.csv/.xls/.xlsx/.json/.yml/.yaml'): " + data_source)
+
+        # for item in report:
+        #     print(report[item])
+
+        # Write both normalized TSV and YAML files. (Renamed from "yaml" so
+        # the yaml module import is not shadowed.)
+        template = SCHEMA.get_class(target_class)
+        yaml_data = writeTmpFiles(SCHEMA, template, header_row, reader, temp_base)
+
+    return yaml_data
+
+
+def writeTmpFiles(SCHEMA, template, header_row, reader, temp_base):
+    # The first row of DH tabular data may be slot_groups, and the 2nd DH row
+    # likely has column/field/slot names which need adjustment.
+    # (newline='' prevents an extra blank line.)
+    data = []
+
+    with open(temp_base + '.tsv', 'w', newline='') as tsv_file:
+
+        writer = csv.DictWriter(tsv_file, fieldnames=header_row, dialect='excel-tab')
+        writer.writeheader()
+
+        # We are in the data rows now, which can be written to the new file:
+        for row in reader:
+
+            if type(row) is tuple:  # xls/xlsx: row is a tuple of cell values.
+                row_data = {k: v for k, v in zip(header_row, row)}
+
+            else:  # tsv/csv dict: e.g. row = {'first name': 'foo', ...}, where keys have to be converted.
+                row_data = {k: row[v] for k, v in zip(header_row, row)}
+            #print("ROW:", row_data)
+
+            # YAML only gets fields that have values, as well as
+            # transformation of some data types:
+            data.append(getLinkMLTransform(SCHEMA, template, row_data))
+            writer.writerow(row_data)
+
+    YAMLDumper().dump(data, temp_base + '.yaml')
+    return data
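+
+
+# Sketch of the round trip (hypothetical slots and values): a normalized row
+#   {'sample_id': 'S-001', 'count': '42', 'note': ''}
+# is written as-is to <temp_base>.tsv, while the record appended to the
+# <temp_base>.yaml dump has empty fields dropped and ranges coerced by
+# getLinkMLTransform(), e.g. {'sample_id': 'S-001', 'count': 42} when the
+# count slot has an integer range.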
+
+
+# Could be made more efficient by running a vertical loop on columns of a
+# given data type.
+def getLinkMLTransform(SCHEMA, template, row_data):
+
+    data = {}
+    for key, val in row_data.items():
+        if val:  # Only return dict keys that have values.
+            slot = template['attributes'][key]
+            output_val = val
+            ranges = []
+
+            if slot['range']:
+                ranges = [slot['range']]
+            else:
+                for range_type in ['any_of', 'exactly_one_of', 'none_of', 'all_of']:
+                    if slot[range_type]:
+                        # e.g. 'any_of': [AnonymousSlotExpression({'range': 'decimal'}),
+                        #                 AnonymousSlotExpression({'range': 'NullValueMenu'})]
+                        ranges = [binding.range for binding in slot[range_type]]
+                        break
+
+            # ISSUE: If a slot is integer, decimal or date, but its value is
+            # saved as a string in the yaml file, linkml-validate throws an
+            # error. We must adjust the saved datatype.
+            for slot_range in ranges:
+                match slot_range:
+                    case 'integer':
+                        if isInteger(val):
+                            output_val = int(val)
+                    case 'decimal' | 'float':
+                        if isDecimal(val):  # Note str.isdecimal() does NOT test for decimals.
+                            if '.' in val:
+                                output_val = float(val)
+                            else:
+                                output_val = int(val)
+                    #case 'date':
+                    case _:  # Nothing to do, but this is an error situation since all slots have ranges.
+                        #print(SCHEMA.get_enums(slot_range))
+                        pass
+
+            if slot['multivalued'] == True:
+                output_val = [output_val]
+            data[key] = output_val
+
+    return data
+
+
+def isDecimal(x):
+    try:
+        float(x)
+        return True
+    except ValueError:
+        return False
+
+
+def isInteger(x):
+    try:
+        int(x)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+###############################################################################
+
+warnings = []
+
+args = init_parser()
+
+if not os.path.isfile(args.schema_path):
+    exit("LinkML schema file not found: " + args.schema_path)
+
+with open(args.schema_path, "r") as schema_handle:
+
+    # Use SchemaView() to generate inferred slot attributes, like in the
+    # schema.json generated by tabular_to_schema.py. This converts the
+    # schema, loaded as a Python dict, into a LinkML SchemaView object.
+    schema_obj = yaml.safe_load(schema_handle)
+
+    if not "classes" in schema_obj or not "slots" in schema_obj:
+        exit("Given schema is missing classes or slots.")
+
+    SCHEMA = SchemaView(yaml.dump(schema_obj, sort_keys=False))
+    # Brings in any "imports:". This also includes built-in linkml:types.
+    SCHEMA.merge_imports()
+
+    # Loop through each class and replace it with its induced version, which
+    # includes an attributes dictionary containing inferred slot definitions.
+    for name, class_obj in SCHEMA.all_classes().items():
+        # Note classDef["@type"]: "ClassDefinition" is only in json output.
+        if SCHEMA.class_slots(name):
+            new_obj = SCHEMA.induced_class(name)
+            SCHEMA.add_class(new_obj)
+
+    (target_class, slot_key) = getTargetClass(SCHEMA, args.target_class, args.index_slot)
+    # HAD TO normalize target class: "CanCoGEN Covid-19" => "CanCOGeNCovid19"
+
+    # Cycle through each data_source file to validate. (Multiple files can be
+    # passed as one space-separated argument.)
+    for data_source in args.data_sources.split():
+
+        print("VALIDATING: ", data_source)
+
+        file_path_obj = pathlib.Path(data_source)
+        if file_path_obj.suffix in ['.json', '.json-ld', '.yaml', '.yml']:
+
+            # FUTURE: Handle slot name / title variations here too.
+            validation_file = data_source
+
+        else:
+            # Deal with section headers and column headers given as titles.
+            # Writes a temporary file with all fields renamed.
+            temp_file = file_path_obj.stem + ".tmp"
+            getNormalizedDataFile(SCHEMA, target_class, data_source, temp_file)
+            validation_file = temp_file + '.yaml'
+
+        subprocess.run(["linkml-validate", "-s", args.schema_path, "-C", target_class, validation_file])  # input='foobar'.encode('utf-8')
+
+    print("File scan complete.")
+
+if len(warnings):
+    print("\nWARNING: \n", "\n ".join(warnings))
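+
+
+# A possible in-process alternative to the subprocess call above, once the
+# import issue noted at the top of this file is resolved (untested sketch;
+# assumes a linkml release that exposes linkml.validator.validate):
+#
+#   from linkml.validator import validate
+#   report = validate(data, args.schema_path, target_class)
+#   for result in report.results:
+#       print(result.severity, result.message)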
+
+
+# SNIPPETS:
+#
+#stderr = None
+#try:
+# e.g. > linkml-convert -s schema.yaml -C CanCOGeNCovid19 --index-slot specimen_collector_sample_id -o validTestData_2-1-2.tmp.tsv.json validTestData_2-1-2.tmp.tsv
+# ISSUE: slots with range "any_of" may have content, but if required=True they ARE THROWING an ERROR.
+#run_state = subprocess.check_output(["linkml-convert", "-s", args.schema_path, "-C", target_class, "--index-slot", slot_key, "-o", temp_file + '.json', temp_file])  #, stderr=subprocess.STDOUT
+"""
+except BaseException as inst:
+
+    print(type(inst))  # the exception type
+    print(inst.args)   # arguments stored in .args
+    print(inst)        # __str__ allows args to be printed directly,
+                       # but may be overridden in exception subclasses
+
+finally:
+"""
+
+"""
+# A challenge trying to get linkml-validate working via the Python module:
+
+report = validate(data_handle, args.schema_path, "Person")  # , "Person"
+if not report.results:
+    print('The instance is valid!')
+else:
+    for result in report.results:
+        print(result.message)
+"""
\ No newline at end of file