diff --git a/script/dh-validate.py b/script/dh-validate.py index 65c4ea7b..c732d04b 100644 --- a/script/dh-validate.py +++ b/script/dh-validate.py @@ -419,16 +419,35 @@ def getLinkMLTransform(SCHEMA, template, row_data): if slot['multivalued'] == True: output_val = [x.strip() for x in re.split(DELIMITERS, output_val)]; - # If key isn't in snake_case, then convert it to that, since - # linkml-validate insists on that: + # ISSUE: () preserved rather than eliminated # ISSUE: diagnostic pcr Ct value 1 not transformed to diagnostic_pcr_ct_value_1 - # ISSUE: geo_loc_name_(state_province_territory) not transformed to geo_loc_name_(state_province_territory) - # ISSUE: NML submitted specimen type not transformed to nml_submitted_specimen_type - # ISSUE: specimen collector sample ID not transformed to specimen_collector_sample_id - - # This relabling of key helps - key = re.sub("[-]","",re.sub("[ /]","_", key)); #LinkML doesn't convert to lowercase + # ISSUE: + + + # For validation, LinkML will transform both schema and slot labels into + # what it considers are standardized names, so we have to anticipate what + # new slot label will be via search and replace. Convert keys to + # **snake_case** since linkml-validate insists on that. However: + # - Forward slashes and parentheses are preserved though this is + # nonstandard, so: + # "geo_loc name (state/province/territory)" + # is changed to + # "geo_loc_name_(state/province/territory)" + # - Case is preserved though that is non-standard. So + # "specimen collector sample ID" + # is changed to + # "specimen_collector_sample_ID" + # + # - Validating caps CamelCase Enums is hard, e.g. if an Enum is named + # "geo_loc_name (state/province/territory) menu" + # LinkML will automatically rename this to + # "GeoLocName(state/province/territory)Menu" + # However, it doesn't update the name in slot range expressions! + # Hence these must be renamed in source schema. 
def set_min_max(slot, slot_minimum_value, slot_maximum_value):
    """Record numeric bounds on ``slot``, or defer non-numeric ones to 'todos'.

    Each bound is handled independently:

    * empty / missing value  -> nothing is recorded for that bound;
    * integer text           -> stored as ``int`` under 'minimum_value' /
      'maximum_value';
    * other numeric text     -> stored as ``float`` under the same key;
    * anything else          -> kept as text in ``slot['todos']``, prefixed
      with '>=' (minimum) or '<=' (maximum).

    Args:
        slot: dict describing the slot; mutated in place.
        slot_minimum_value: text of the lower bound ('' or None for none).
        slot_maximum_value: text of the upper bound ('' or None for none).
    """
    # Truthiness instead of `> ''`: identical for strings, and a missing
    # (None) cell is skipped rather than raising TypeError.
    if slot_minimum_value:
        if isInteger(slot_minimum_value):
            slot['minimum_value'] = int(slot_minimum_value)
        elif isDecimal(slot_minimum_value):
            slot['minimum_value'] = float(slot_minimum_value)
        else:
            # A non-numeric minimum starts a fresh todo list for this slot.
            slot['todos'] = ['>=' + slot_minimum_value]

    if slot_maximum_value:
        if isInteger(slot_maximum_value):
            slot['maximum_value'] = int(slot_maximum_value)
        elif isDecimal(slot_maximum_value):
            slot['maximum_value'] = float(slot_maximum_value)
        else:
            # slot may have no 'todos' key yet (minimum was numeric or
            # absent), so look it up with .get() instead of indexing.
            todos = slot.get('todos') or []
            todos.append('<=' + slot_maximum_value)
            slot['todos'] = todos


def isDecimal(x):
    """Return True if ``float(x)`` succeeds, False otherwise."""
    try:
        float(x)
    except (TypeError, ValueError):
        # TypeError covers None and other non-string, non-numeric inputs.
        return False
    return True


def isInteger(x):
    """Return True if ``int(x)`` succeeds, False otherwise."""
    try:
        int(x)
    except (TypeError, ValueError):
        # TypeError covers None and other non-string, non-numeric inputs.
        return False
    return True
- if row.get('title','') > '': + # subsequent rows may not have a name. + if row.get('name','') > '' or row.get('title','') > '': # Process default language title - + name = row.get('name'); title = row.get('title'); - name = row.get('name',''); - if name == '': name = title; - print ("name:", name) - - description = row.get('description',''); + if not name: # For enumerations that don't have separate name field + name = title; if not (name in enumerations): enum = { 'name': name, 'title': title, - 'description': description, 'permissible_values': {} }; enumerations[name] = enum; @@ -393,36 +407,29 @@ def set_enums(enum_path, schema, locale_schemas, export_format, warnings): for lcode in locale_schemas.keys(): locale_schema = locale_schemas[lcode]; locale_schema['enums'][name] = { - 'name': name, # default (usu. english) name acts as key + 'name': name, # Acts as key 'permissible_values': {} }; # Provide translation title if available for this menu. locale_title = row.get('title_' + lcode, ''); if locale_title > '': - locale_schema['enums'][name]['title'] = row.get('title_' + lcode, title); - - # Provide translation description if available for this menu. 
- locale_description = row.get('description_' + lcode, ''); - if locale_description > '': - locale_schema['enums'][name]['description'] = locale_description; - + locale_schema['enums'][name]['title'] = locale_title; - # If there is a title (or name) of an emum at play - if name and name > '': + if name > '': # Text is label of a particular menu choice # Loop scans through columns until it gets a value for depth in range(1,6): menu_x = 'menu_' + str(depth); - choice_value = row.get(menu_x); + choice_text = row.get(menu_x); # Here there is a menu item to process - if choice_value > '': - del menu_path[depth-1:] # Menu path always points to parent + if choice_text > '': + del choice_path[depth-1:] # Menu path always points to parent description = row.get('description',''); meaning = row.get('meaning',''); - choice = {'text' : choice_value} + choice = {'text' : choice_text} if description > '': choice['description'] = description; if meaning > '': choice['meaning'] = meaning; @@ -430,21 +437,22 @@ def set_enums(enum_path, schema, locale_schemas, export_format, warnings): set_mappings(choice, row, export_format); # IMPLEMENTS FLAT LIST WITH IS_A HIERARCHY - if len(menu_path) > 0: - choice['is_a'] = menu_path[-1]; # Last item in path + if len(choice_path) > 0: + choice['is_a'] = choice_path[-1]; # Last item in path - enum['permissible_values'][choice_value] = choice - menu_path.append(choice_value) + enum['permissible_values'][choice_text] = choice; + choice_path.append(choice_text); for lcode in locale_schemas.keys(): translation = row.get(menu_x + '_' + lcode, ''); - if translation > '': + if translation > '' and translation != choice['text']: - local_choice = copy.deepcopy(choice); - del local_choice['text']; # in language variant files this isn't needed. 
- local_choice['title'] = translation; + local_choice = {'title': translation} + description = row.get(description + '_' + lcode, ''); + if description: + local_choice['description': description]; - locale_schemas[lcode]['enums'][name]['permissible_values'][choice_value] = local_choice; + locale_schemas[lcode]['enums'][name]['permissible_values'][choice_text] = local_choice; break;