From c99a2f9af98c249d06422176fccbbf460e1fbe81 Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Wed, 20 Sep 2023 18:27:28 +1000 Subject: [PATCH 1/4] chg: refactor translation detection - overall goal is to avoid using only the first few lines of survey or choices sheet data to decide whether to use multi-language behaviour (itext) or not. - xls2json.py: re-use dealias func for finding headers as well as data - translation_checks.py: use headers data to find translations - survey.py: - declare _translations key and store and translations info in it - tidy setup_translations to data prep vs. _translations update steps - add translation type key to help differentiate question vs choice - annotate survey elements with info about translations, dynamic choices, and media, so that this doesn't need to be re-checked later by individual elements. - decide whether to emit itext labels for choices on a per-itemset basis, rather than switching into itext mode for all choices. This avoids needing to mangle untranslated itemsets into translated when other itemsets in the survey are translated (otherwise they would get no itext, not even a hypen "-"). - add lru_cache upper limit so that it doesn't grow forever - update add_to_nested_dict to allow updating existing dict - question.py: - declare translations, dynamic choices, and media keys set by survey and use them to determine relevant behaviour instead of re-checking - utils.py: remove multi-language parameter from has_dynamic_label because it's clearer to do the same logic external to the function. - add tests for multi-language detection + with media --- pyxform/builder.py | 12 +- pyxform/constants.py | 2 + pyxform/instance_info.py | 15 -- pyxform/question.py | 31 ++- pyxform/survey.py | 194 +++++++++++------- pyxform/utils.py | 27 +-- .../validators/pyxform/translations_checks.py | 43 ++-- pyxform/xform2json.py | 9 +- pyxform/xls2json.py | 72 +++++-- tests/test_translations.py | 146 ++++++++++++- tests/test_xform2json.py | 7 +- tests/xpath_helpers/choices.py | 2 + tests/xpath_helpers/questions.py | 43 +++- 13 files changed, 427 insertions(+), 176 deletions(-) delete mode 100644 pyxform/instance_info.py diff --git a/pyxform/builder.py b/pyxform/builder.py index 4092028c..d7edad2c 100644 --- a/pyxform/builder.py +++ b/pyxform/builder.py @@ -83,12 +83,13 @@ def __init__(self, **kwargs): # I don't know why we would need an explicit none option for # select alls self._add_none_option = False + self._sections: Optional[Dict[str, Dict]] = None self.set_sections(kwargs.get("sections", {})) # dictionary of setvalue target and value tuple indexed by triggering element self.setvalues_by_triggering_ref = {} # For tracking survey-level choices while recursing through the survey. - self.choices: Dict[str, Any] = {} + self._choices: Dict[str, Any] = {} def set_sections(self, sections): """ @@ -112,13 +113,13 @@ def create_survey_element_from_dict( if d[const.TYPE] in self.SECTION_CLASSES: if d[const.TYPE] == const.SURVEY: - self.choices = copy.deepcopy(d.get(const.CHOICES, {})) + self._choices = copy.deepcopy(d.get(const.CHOICES, {})) section = self._create_section_from_dict(d) if d[const.TYPE] == const.SURVEY: section.setvalues_by_triggering_ref = self.setvalues_by_triggering_ref - section.choices = self.choices + section.choices = self._choices return section elif d[const.TYPE] == const.LOOP: @@ -263,7 +264,7 @@ def _create_section_from_dict(self, d): survey_element = self.create_survey_element_from_dict(copy.deepcopy(child)) if child[const.TYPE].endswith(const.SELECT_OR_OTHER_SUFFIX): select_question = survey_element[0] - itemset_choices = self.choices.get(select_question[const.ITEMSET], None) + itemset_choices = self._choices.get(select_question[const.ITEMSET], None) if ( itemset_choices is not None and isinstance(itemset_choices, list) @@ -307,7 +308,8 @@ def _create_loop_from_dict(self, d): # TODO: Verify that nothing breaks if this returns a list return result.children - def _name_and_label_substitutions(self, question_template, column_headers): + @staticmethod + def _name_and_label_substitutions(question_template, column_headers): # if the label in column_headers has multiple languages setup a # dictionary by language to do substitutions. info_by_lang = {} diff --git a/pyxform/constants.py b/pyxform/constants.py index 73c0ed65..a1f469ce 100644 --- a/pyxform/constants.py +++ b/pyxform/constants.py @@ -64,6 +64,8 @@ SELECT_ALL_THAT_APPLY = "select all that apply" SELECT_OR_OTHER_SUFFIX = " or specify other" RANK = "rank" +QUESTION = "question" +CHOICE = "choice" CHOICES = "choices" # XLS Specific constants diff --git a/pyxform/instance_info.py b/pyxform/instance_info.py deleted file mode 100644 index 049d8974..00000000 --- a/pyxform/instance_info.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- -""" -InstanceInfo class module. -""" - - -class InstanceInfo: - """Standardise Instance details relevant during XML generation.""" - - def __init__(self, type, context, name, src, instance): - self.type = type - self.context = context - self.name = name - self.src = src - self.instance = instance diff --git a/pyxform/question.py b/pyxform/question.py index 1b53a0e2..4421e630 100644 --- a/pyxform/question.py +++ b/pyxform/question.py @@ -14,15 +14,19 @@ from pyxform.errors import PyXFormError from pyxform.question_type_dictionary import QUESTION_TYPE_DICT from pyxform.survey_element import SurveyElement -from pyxform.utils import ( - PYXFORM_REFERENCE_REGEX, - default_is_dynamic, - has_dynamic_label, - node, -) +from pyxform.utils import PYXFORM_REFERENCE_REGEX, default_is_dynamic, node class Question(SurveyElement): + FIELDS = SurveyElement.FIELDS.copy() + FIELDS.update( + { + "_itemset_multi_language": bool, + "_itemset_has_media": bool, + "_itemset_dyn_label": bool, + } + ) + def validate(self): SurveyElement.validate(self) @@ -202,12 +206,6 @@ def build_xml(self): for element in self.xml_label_and_hint(): result.appendChild(element) - choices = survey.get("choices") - multi_language = False - if choices is not None and len(choices) > 0: - first_choices = next(iter(choices.values())) - multi_language = isinstance(first_choices[0].get("label"), dict) - # itemset are only supposed to be strings, # check to prevent the rare dicts that show up if self["itemset"] and isinstance(self["itemset"], str): @@ -227,16 +225,13 @@ def build_xml(self): else EXTERNAL_CHOICES_ITEMSET_REF_LABEL, ) - has_media = False - has_dyn_label = False + multi_language = self.get("_itemset_multi_language", False) + has_media = self.get("_itemset_has_media", False) + has_dyn_label = self.get("_itemset_dyn_label", False) is_previous_question = bool( PYXFORM_REFERENCE_REGEX.search(self.get("itemset")) ) - if choices.get(itemset): - has_media = bool(choices[itemset][0].get("media")) - has_dyn_label = has_dynamic_label(choices[itemset], multi_language) - if file_extension in EXTERNAL_INSTANCE_EXTENSIONS: itemset = itemset else: diff --git a/pyxform/survey.py b/pyxform/survey.py index 242fbceb..9551413d 100644 --- a/pyxform/survey.py +++ b/pyxform/survey.py @@ -10,14 +10,13 @@ from collections import defaultdict from datetime import datetime from functools import lru_cache -from typing import Iterator, List, Optional, Tuple +from typing import Generator, Iterator, List, Optional, Tuple from pyxform import aliases, constants from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS from pyxform.errors import PyXFormError, ValidationError from pyxform.external_instance import ExternalInstance from pyxform.instance import SurveyInstance -from pyxform.instance_info import InstanceInfo from pyxform.parsing import instance_expression from pyxform.question import Option, Question from pyxform.section import Section @@ -47,6 +46,26 @@ RE_XML_TEXT = re.compile(r"(>)\n\s*(\s[^<>\s].*?)\n\s*(\s bool: + if ( + label is not None + and isinstance(label, str) + and re.search(BRACKETED_TAG_REGEX, label) is not None + ): + return True + else: + return False + + +def recursive_dict(): + return defaultdict(recursive_dict) + + class Survey(Section): """ Survey class - represents the full XForm XML. @@ -174,7 +209,7 @@ class Survey(Section): constants.COMPACT_DELIMITER: str, "file_name": str, "default_language": str, - "_translations": dict, + "_translations": recursive_dict, "submission_url": str, "auto_send": str, "auto_delete": str, @@ -267,27 +302,30 @@ def xml(self): "h:html", node("h:head", node("h:title", self.title), self.xml_model()), node("h:body", *self.xml_control(), **body_kwargs), - **nsmap + **nsmap, ) def get_setvalues_for_question_name(self, question_name): return self.setvalues_by_triggering_ref.get("${%s}" % question_name) - @staticmethod - def _generate_static_instances(list_name, choice_list) -> InstanceInfo: + def _generate_static_instances(self, list_name, choice_list) -> InstanceInfo: """ - Generates elements for static data - (e.g. choices for select type questions) - - Note that per commit message 0578242 and in xls2json.py R539, an - instance is only output for select items defined in the choices sheet - when the item has a choice_filter, and it is that way for backwards - compatibility. + Generate elements for static data (e.g. choices for selects) """ instance_element_list = [] - multi_language = isinstance(choice_list[0].get("label"), dict) has_media = bool(choice_list[0].get("media")) - has_dyn_label = has_dynamic_label(choice_list, multi_language) + has_dyn_label = has_dynamic_label(choice_list) + multi_language = False + if isinstance(self._translations, dict): + choices = tuple( + k + for items in self._translations.values() + for k, v in items.items() + if v.get(constants.TYPE, "") == constants.CHOICE + and k.split("-")[0] == list_name + ) + if 0 < len(choices): + multi_language = True for idx, choice in enumerate(choice_list): choice_element_list = [] @@ -302,7 +340,7 @@ def _generate_static_instances(list_name, choice_list) -> InstanceInfo: if ( not multi_language and not has_media - and not has_dynamic_label(choice_list, multi_language) + and not has_dyn_label and isinstance(value, str) and name == "label" ): @@ -391,9 +429,9 @@ def get_instance_info(element, file_id): uri = "jr://file-csv/{}.csv".format(file_id) return InstanceInfo( - type=u"pulldata", + type="pulldata", context="[type: {t}, name: {n}]".format( - t=element[u"parent"][u"type"], n=element[u"parent"][u"name"] + t=element["parent"]["type"], n=element["parent"]["name"] ), name=file_id, src=uri, @@ -626,7 +664,11 @@ def xml_instance(self, **kwargs): def _add_to_nested_dict(self, dicty, path, value): if len(path) == 1: - dicty[path[0]] = value + key = path[0] + if key in dicty and isinstance(dicty[key], dict) and isinstance(value, dict): + dicty[key].update(value) + else: + dicty[key] = value return if path[0] not in dicty: dicty[path[0]] = {} @@ -638,32 +680,69 @@ def _setup_translations(self): setup media and itext functions """ - def _setup_choice_translations(name, choice_value, itext_id): - for media_type_or_language, value in choice_value.items(): # noqa + def _setup_choice_translations( + name, choice_value, itext_id + ) -> Generator[Tuple[List[str], str], None, None]: + for media_or_lang, value in choice_value.items(): # noqa if isinstance(value, dict): for language, val in value.items(): - self._add_to_nested_dict( - self._translations, - [language, itext_id, media_type_or_language], - val, - ) + yield ([language, itext_id, media_or_lang], val) else: - if name == "media": - self._add_to_nested_dict( - self._translations, - [self.default_language, itext_id, media_type_or_language], - value, - ) + if name == constants.MEDIA: + yield ([self.default_language, itext_id, media_or_lang], value) else: - self._add_to_nested_dict( - self._translations, - [media_type_or_language, itext_id, "long"], - value, + yield ([media_or_lang, itext_id, "long"], value) + + itemsets_multi_language = set() + itemsets_has_media = set() + itemsets_has_dyn_label = set() + + def get_choices(): + for list_name, choice_list in self.choices.items(): + multi_language = False + has_media = False + dyn_label = False + choices = [] + for idx, choice in enumerate(choice_list): + for col_name, choice_value in choice.items(): + lang_choice = None + if col_name == constants.MEDIA: + has_media = True + if isinstance(choice_value, dict): + lang_choice = choice_value + multi_language = True + elif col_name == constants.LABEL: + lang_choice = {self.default_language: choice_value} + if is_label_dynamic(choice_value): + dyn_label = True + if lang_choice is not None: + # e.g. (label, {"default": "Yes"}, "consent", 0) + choices.append((col_name, lang_choice, list_name, idx)) + if multi_language or has_media or dyn_label: + if multi_language: + itemsets_multi_language.add(list_name) + if has_media: + itemsets_has_media.add(list_name) + if dyn_label: + itemsets_has_dyn_label.add(list_name) + for c in choices: + yield from _setup_choice_translations( + c[0], c[1], f"{c[2]}-{c[3]}" ) - self._translations = defaultdict(dict) # pylint: disable=W0201 + for path, value in get_choices(): + last_path = path.pop() + leaf_value = {last_path: value, constants.TYPE: constants.CHOICE} + self._add_to_nested_dict(self._translations, path, leaf_value) + select_types = set(aliases.select.keys()) for element in self.iter_descendants(): + itemset = element.get("itemset") + if itemset is not None: + element._itemset_multi_language = itemset in itemsets_multi_language + element._itemset_has_media = itemset in itemsets_has_media + element._itemset_dyn_label = itemset in itemsets_has_dyn_label + # Skip creation of translations for choices in selects. The creation of these # translations is done futher below in this function. parent = element.get("parent") @@ -685,37 +764,15 @@ def _setup_choice_translations(name, choice_value, itext_id): form: { "text": d["text"], "output_context": d["output_context"], - } + }, + constants.TYPE: constants.QUESTION, } ) - # This code sets up translations for choices in selects. - for list_name, choice_list in self.choices.items(): - multi_language = isinstance(choice_list[0].get("label"), dict) - has_media = bool(choice_list[0].get("media")) - if ( - not multi_language - and not has_media - and not has_dynamic_label(choice_list, multi_language) - ): - continue - for idx, choice in enumerate(choice_list): - for name, choice_value in choice.items(): - itext_id = "-".join([list_name, str(idx)]) - if isinstance(choice_value, dict): - _setup_choice_translations(name, choice_value, itext_id) - elif name == "label": - self._add_to_nested_dict( - self._translations, - [self.default_language, itext_id, "long"], - choice_value, - ) - def _add_empty_translations(self): """ - Adds translations so that every itext element has the same elements \ - accross every language. - When translations are not provided "-" will be used. + Adds translations so that every itext element has the same elements across every + language. When translations are not provided "-" will be used. This disables any of the default_language fallback functionality. """ paths = {} @@ -767,7 +824,6 @@ def _set_up_media_translations(media_dict, translation_key): # Create the required dictionaries in _translations, # then add media as a leaf value: - if language not in self._translations: self._translations[language] = {} @@ -783,9 +839,6 @@ def _set_up_media_translations(media_dict, translation_key): translations_trans_key[media_type] = media - if not self._translations: - self._translations = defaultdict(dict) # pylint: disable=W0201 - for survey_element in self.iter_descendants(): # Skip set up of media for choices in selects. Translations for their media # content should have been set up in _setup_translations, with one copy of @@ -818,6 +871,9 @@ def itext(self): raise Exception() for media_type, media_value in content.items(): + # Ignore key indicating Question or Choice translation type. + if media_type == constants.TYPE: + continue if isinstance(media_value, dict): value, output_inserted = self.insert_output_values( media_value["text"], context=media_value["output_context"] diff --git a/pyxform/utils.py b/pyxform/utils.py index 7786cb1c..2b7ddb48 100644 --- a/pyxform/utils.py +++ b/pyxform/utils.py @@ -10,7 +10,7 @@ import re from collections import namedtuple from json.decoder import JSONDecodeError -from typing import List, Tuple +from typing import Dict, List, Tuple from xml.dom.minidom import Element, Text, parseString import openpyxl @@ -277,17 +277,20 @@ def default_is_dynamic(element_default, element_type=None): return False -# If the first or second choice label includes a reference, we must use itext. -# Check the first two choices in case first is something like "Other". -def has_dynamic_label(choice_list, multi_language): - if not multi_language: - for i in range(0, min(2, len(choice_list))): - if ( - choice_list[i].get("label") is not None - and re.search(BRACKETED_TAG_REGEX, choice_list[i].get("label")) - is not None - ): - return True +def has_dynamic_label(choice_list: "List[Dict[str, str]]") -> bool: + """ + If the first or second choice label includes a reference, we must use itext. + + Check the first two choices in case first is something like "Other". + """ + for c in choice_list[:2]: + choice_label = c.get("label") + if ( + choice_label is not None + and isinstance(choice_label, str) + and re.search(BRACKETED_TAG_REGEX, choice_label) is not None + ): + return True return False diff --git a/pyxform/validators/pyxform/translations_checks.py b/pyxform/validators/pyxform/translations_checks.py index 722017ba..01121d76 100644 --- a/pyxform/validators/pyxform/translations_checks.py +++ b/pyxform/validators/pyxform/translations_checks.py @@ -6,9 +6,9 @@ from pyxform.errors import PyXFormError if TYPE_CHECKING: - from typing import Dict, List, Optional, Sequence, Set, Union + from typing import Dict, List, Optional, Sequence, Set, Tuple - SheetData = List[Dict[str, Union[str, Dict]]] + SheetData = Tuple[Tuple[str, ...]] Warnings = List[str] @@ -93,31 +93,20 @@ def __init__( def _find_translations( self, sheet_data: "SheetData", translatable_columns: "Dict[str, str]" ): - def process_cell(typ, cell): - if cell is not None: - if typ in translatable_columns.keys(): - name = translatable_columns[typ] - if isinstance(cell, str): - self.seen[const.DEFAULT_LANGUAGE_VALUE].append(name) - self.columns_seen.add(name) - elif isinstance(cell, dict): - for lng in cell: - self.seen[lng].append(name) - self.columns_seen.add(name) - - if 0 < len(sheet_data): - # e.g. ("name", "q1"), ("label", {"en": "Hello", "fr": "Bonjour"}) - for column_type, cell_content in sheet_data[0].items(): - if column_type == const.MEDIA and isinstance(cell_content, dict): - # e.g. ("audio", {"eng": "my.mp3"}) - for media_type, media_cell in cell_content.items(): - process_cell(typ=media_type, cell=media_cell) - if column_type == const.BIND: - # e.g. ("jr:constraintMsg", "Try again") - for bind_type, bind_cell in cell_content.items(): - process_cell(typ=bind_type, cell=bind_cell) - else: - process_cell(typ=column_type, cell=cell_content) + def process_header(head): + if head[0] in translatable_columns.keys(): + name = translatable_columns[head[0]] + if len(head) == 1: + self.seen[const.DEFAULT_LANGUAGE_VALUE].append(name) + elif len(head) == 2: + self.seen[head[1]].append(name) + self.columns_seen.add(name) + + for header in sheet_data: + if 1 < len(header) and header[0] in (const.MEDIA, const.BIND): + process_header(head=header[1:]) + else: + process_header(head=header) def seen_default_only(self) -> bool: return 0 == len(self.seen) or ( diff --git a/pyxform/xform2json.py b/pyxform/xform2json.py index c8497733..4cf0ebcc 100644 --- a/pyxform/xform2json.py +++ b/pyxform/xform2json.py @@ -668,7 +668,8 @@ def _get_choices(self) -> Dict[str, Any]: choices[instance["id"]] = items return choices - def _get_name_from_ref(self, ref): + @staticmethod + def _get_name_from_ref(ref): """given /xlsform_spec_test/launch, return the string after the last occurance of the character '/' """ @@ -678,10 +679,12 @@ def _get_name_from_ref(self, ref): else: return ref[pos + 1 :].strip() - def _expand_child(self, obj_list): + @staticmethod + def _expand_child(obj_list): return obj_list - def _shorten_xpaths_in_string(self, text): + @staticmethod + def _shorten_xpaths_in_string(text): def get_last_item(xpath_str): last_item = xpath_str.split("/") return last_item[len(last_item) - 1].strip() diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py index e1fe6cae..f720a264 100644 --- a/pyxform/xls2json.py +++ b/pyxform/xls2json.py @@ -8,7 +8,7 @@ import re import sys from collections import Counter -from typing import IO, Any, Dict, List, Optional +from typing import IO, Any, Dict, List, Optional, Tuple from pyxform import aliases, constants from pyxform.constants import ( @@ -97,13 +97,25 @@ def replace_smart_quotes_in_dict(_d): _d[key] = value +class DealiasAndGroupHeadersResult: + __slots__ = ("headers", "data") + + def __init__(self, headers: Tuple[Tuple[str, ...], ...], data: List[Dict]): + """ + :param headers: Distinct headers seen in the sheet, parsed / split if applicable. + :param data: Sheet data rows, in grouped dict format. + """ + self.headers: Tuple[Tuple[str, ...], ...] = headers + self.data: List[Dict] = data + + def dealias_and_group_headers( - dict_array: "List[Dict]", - header_aliases: "Dict", + dict_array: List[Dict], + header_aliases: Dict[str, str], use_double_colons: bool, default_language: str = constants.DEFAULT_LANGUAGE_VALUE, ignore_case: bool = False, -): +) -> DealiasAndGroupHeadersResult: """ For each row in the worksheet, group all keys that contain a double colon. So @@ -117,10 +129,10 @@ def dealias_and_group_headers( """ group_delimiter = "::" out_dict_array = list() + seen_headers = {} for row in dict_array: out_row = dict() for header, val in row.items(): - if ignore_case: header = header.lower() @@ -150,9 +162,12 @@ def dealias_and_group_headers( new_key = tokens[0] new_value = list_to_nested_dict(tokens[1:] + [val]) out_row = merge_dicts(out_row, {new_key: new_value}, default_language) + seen_headers[tuple(tokens)] = None out_dict_array.append(out_row) - return out_dict_array + return DealiasAndGroupHeadersResult( + headers=tuple(seen_headers.keys()), data=out_dict_array + ) def dealias_types(dict_array): @@ -474,9 +489,11 @@ def workbook_to_json( settings_sheet_headers = [] settings_sheet = dealias_and_group_headers( - settings_sheet_headers, aliases.settings_header, use_double_colons + dict_array=settings_sheet_headers, + header_aliases=aliases.settings_header, + use_double_colons=use_double_colons, ) - settings = settings_sheet[0] if len(settings_sheet) > 0 else {} + settings = settings_sheet.data[0] if len(settings_sheet.data) > 0 else {} replace_smart_quotes_in_dict(settings) default_language = settings.get(constants.DEFAULT_LANGUAGE_KEY, default_language) @@ -514,10 +531,13 @@ def workbook_to_json( replace_smart_quotes_in_dict(choice_item) external_choices_sheet = dealias_and_group_headers( - external_choices_sheet, aliases.list_header, use_double_colons, default_language + dict_array=external_choices_sheet, + header_aliases=aliases.list_header, + use_double_colons=use_double_colons, + default_language=default_language, ) external_choices = group_dictionaries_by_key( - external_choices_sheet, constants.LIST_NAME + list_of_dicts=external_choices_sheet.data, key=constants.LIST_NAME ) # ########## Choices sheet ########## @@ -525,9 +545,14 @@ def workbook_to_json( for choice_item in choices_sheet: replace_smart_quotes_in_dict(choice_item) choices_sheet = dealias_and_group_headers( - choices_sheet, aliases.list_header, use_double_colons, default_language + dict_array=choices_sheet, + header_aliases=aliases.list_header, + use_double_colons=use_double_colons, + default_language=default_language, + ) + combined_lists = group_dictionaries_by_key( + list_of_dicts=choices_sheet.data, key=constants.LIST_NAME ) - combined_lists = group_dictionaries_by_key(choices_sheet, constants.LIST_NAME) # To combine the warning into one message, the check for missing choices translation # columns is run with Survey sheet below. @@ -614,23 +639,30 @@ def workbook_to_json( if clean_text_values_enabled: survey_sheet = clean_text_values(survey_sheet) survey_sheet = dealias_and_group_headers( - survey_sheet, aliases.survey_header, use_double_colons, default_language + dict_array=survey_sheet, + header_aliases=aliases.survey_header, + use_double_colons=use_double_colons, + default_language=default_language, ) - survey_sheet = dealias_types(survey_sheet) + survey_sheet.data = dealias_types(dict_array=survey_sheet.data) # Check for missing translations. The choices sheet is checked here so that the # warning can be combined into one message. sheet_translations = SheetTranslations( - survey_sheet=survey_sheet, - choices_sheet=choices_sheet, + survey_sheet=survey_sheet.headers, + choices_sheet=choices_sheet.headers, ) sheet_translations.missing_check(warnings=warnings) # No spell check for OSM sheet (infrequently used, many spurious matches). osm_sheet = dealias_and_group_headers( - workbook_dict.get(constants.OSM, []), aliases.list_header, True + dict_array=workbook_dict.get(constants.OSM, []), + header_aliases=aliases.list_header, + use_double_colons=True, + ) + osm_tags = group_dictionaries_by_key( + list_of_dicts=osm_sheet.data, key=constants.LIST_NAME ) - osm_tags = group_dictionaries_by_key(osm_sheet, constants.LIST_NAME) # ################################# # Parse the survey sheet while generating a survey in our json format: @@ -673,7 +705,7 @@ def workbook_to_json( # row by row, validate questions, throwing errors and adding warnings # where needed. - for row in survey_sheet: + for row in survey_sheet.data: row_number += 1 if stack[-1] is not None: prev_control_type = stack[-1]["control_type"] @@ -1650,7 +1682,7 @@ def _setup_question_types_dictionary(self): header_aliases={}, use_double_colons=use_double_colons, default_language=constants.DEFAULT_LANGUAGE_VALUE, - ) + ).data self._dict = organize_by_values(self._dict, "name") diff --git a/tests/test_translations.py b/tests/test_translations.py index 8116a0bb..b6a85ba0 100644 --- a/tests/test_translations.py +++ b/tests/test_translations.py @@ -416,6 +416,146 @@ def run(name): ): run(name=f"questions={count}, without check (seconds):") + def test_translation_detection__survey_and_choices_columns_present(self): + """Should identify that the survey is multi-language when first row(s) empty.""" + md = """ + | survey | | | | | + | | type | name | label | label::en | + | | select_one c0 | f | f | | + | | select_one c1 | q1 | Question 1 | Question A | + | choices | | | | | | + | | list name | name | label | label::en | label::fr | + | | c0 | n | l | | | + | | c1 | na | la | | | + | | c1 | nb | lb | lb-e | lb-f | + """ + self.assertPyxformXform( + md=md, + xml__xpath_match=[ + xpq.body_select1_itemset("f"), + xpq.body_label_inline("select1", "f", "f"), + xpq.body_select1_itemset("q1"), + xpq.body_label_itext("select1", "q1"), + xpq.model_itext_label("q1", DEFAULT_LANG, "Question 1"), + xpq.model_itext_label("q1", "en", "Question A"), + xpq.model_itext_label("q1", "fr", "-"), + xpc.model_instance_choices_label("c0", (("n", "l"),)), + xpc.model_instance_choices_itext("c1", ("na", "nb")), + xpc.model_itext_choice_text_label_by_pos( + DEFAULT_LANG, "c1", ("la", "lb") + ), + xpc.model_itext_choice_text_label_by_pos("en", "c1", ("-", "lb-e")), + xpc.model_itext_choice_text_label_by_pos("fr", "c1", ("-", "lb-f")), + ], + ) + + def test_translation_detection__survey_columns_not_present(self): + """Should identify that the survey is multi-language when only choices translated.""" + md = """ + | survey | | | | + | | type | name | label | + | | select_one c0 | f | f | + | | select_one c1 | q1 | Question 1 | + | choices | | | | | | + | | list name | name | label | label::en | label::fr | + | | c0 | n | l | | | + | | c1 | na | la | | | + | | c1 | nb | lb | lb-e | lb-f | + """ + self.assertPyxformXform( + md=md, + xml__xpath_match=[ + xpq.body_select1_itemset("f"), + xpq.body_label_inline("select1", "f", "f"), + xpq.body_select1_itemset("q1"), + xpq.body_label_inline("select1", "q1", "Question 1"), + xpc.model_instance_choices_label("c0", (("n", "l"),)), + xpc.model_instance_choices_itext("c1", ("na", "nb")), + xpc.model_itext_choice_text_label_by_pos( + DEFAULT_LANG, "c1", ("la", "lb") + ), + xpc.model_itext_choice_text_label_by_pos("en", "c1", ("-", "lb-e")), + xpc.model_itext_choice_text_label_by_pos("fr", "c1", ("-", "lb-f")), + ], + ) + + def test_translation_detection__only_survey_columns_present(self): + """Should identify that the survey is multi-language when first row(s) empty.""" + md = """ + | survey | | | | | + | | type | name | label | label::en | + | | select_one c0 | f | f | | + | | select_one c1 | q1 | Question 1 | Question A | + | choices | | | | + | | list name | name | label | + | | c0 | n | l | + | | c1 | na | la | + | | c1 | nb | lb | + """ + self.assertPyxformXform( + md=md, + xml__xpath_match=[ + xpq.body_select1_itemset("f"), + xpq.body_label_inline("select1", "f", "f"), + xpq.body_select1_itemset("q1"), + xpq.body_label_itext("select1", "q1"), + xpq.model_itext_label("q1", DEFAULT_LANG, "Question 1"), + xpq.model_itext_label("q1", "en", "Question A"), + xpc.model_instance_choices_label("c0", (("n", "l"),)), + xpc.model_instance_choices_label("c1", (("na", "la"), ("nb", "lb"))), + ], + ) + + def test_translation_detection__survey_columns_present_with_media(self): + """Should identify that the survey is multi-language when first row(s) empty.""" + md = """ + | survey | | | | | | + | | type | name | label | label::en | image::en | + | | select_one c0 | f | f | | | + | | select_one c1 | q1 | Question 1 | Question A | c1.png | + | choices | | | | | | + | | list name | name | label | label::en | label::fr | audio::de | + | | c0 | n | l | | | | + | | c1 | na | la | | | | + | | c1 | nb | lb | lb-e | | c1_nb.mp3 | + | | c1 | nc | lc | lc-e | lc-f | c1_nc.mp3 | + """ + self.assertPyxformXform( + md=md, + debug=True, + xml__xpath_match=[ + xpq.body_select1_itemset("f"), + xpq.body_label_inline("select1", "f", "f"), + xpq.body_select1_itemset("q1"), + xpq.body_label_itext("select1", "q1"), + xpq.model_itext_label("q1", DEFAULT_LANG, "Question 1"), + xpq.model_itext_label( + "q1", + "en", + "Question A", + ), + xpq.model_itext_form("q1", "en", "image", "c1.png"), + xpc.model_instance_choices_label("c0", (("n", "l"),)), + xpc.model_itext_choice_text_label_by_pos( + DEFAULT_LANG, "c1", ("la", "lb", "lc") + ), + xpc.model_itext_choice_text_label_by_pos( + "en", "c1", ("-", "lb-e", "lc-e") + ), + xpc.model_itext_choice_text_label_by_pos("fr", "c1", ("-", "-", "lc-f")), + xpc.model_itext_choice_text_label_by_pos("de", "c1", ("-", "-", "-")), + xpc.model_itext_choice_media_by_pos( + "de", + "c1", + ( + ((None, None),), + (("audio", "c1_nb.mp3"),), + (("audio", "c1_nc.mp3"),), + ), + ), + ], + ) + class TestTranslationsSurvey(PyxformTestCase): """Translations behaviour of columns in the Survey sheet.""" @@ -1090,9 +1230,9 @@ def test_missing_translation__one_lang_simple__warn__default(self): | | type | name | label | | | select_one c1 | q1 | Question 1 | | choices | | | | - | | list name | name | label | label::eng | media::audio | - | | c1 | na | la-d | la-e | la-d.mp3 | - | | c1 | nb | lb-d | lb-e | lb-d.mp3 | + | | list name | name | label::eng | media::audio | + | | c1 | na | la-e | la-d.mp3 | + | | c1 | nb | lb-e | lb-d.mp3 | """ cols = {CHOICES: {"default": ("label",), "eng": ("media::audio",)}} warning = format_missing_translations_msg(_in=cols) diff --git a/tests/test_xform2json.py b/tests/test_xform2json.py index ba9a56ce..7d283fb0 100644 --- a/tests/test_xform2json.py +++ b/tests/test_xform2json.py @@ -35,9 +35,10 @@ def test_convert_toJSON_multi_language(self): kwargs={"id_string": "id", "name": "multi-language", "title": "some-title"}, autoname=False, ) - expected_xml = survey.to_xml() + expected = survey.to_xml() generated_json = survey.to_json() - survey = create_survey_element_from_dict(json.loads(generated_json)) + survey_from_builder = create_survey_element_from_dict(json.loads(generated_json)) + observed = survey_from_builder.to_xml() - self.assertEqual(expected_xml, survey.to_xml()) + self.assertEqual(expected, observed) diff --git a/tests/xpath_helpers/choices.py b/tests/xpath_helpers/choices.py index dddcdb49..d546221b 100644 --- a/tests/xpath_helpers/choices.py +++ b/tests/xpath_helpers/choices.py @@ -106,6 +106,7 @@ def model_itext_choice_media_by_pos( """ for idx, choice in enumerate(choices) for form, media in choice + if form is not None ) ) return f""" @@ -134,6 +135,7 @@ def model_no_itext_choice_media_by_pos( """ for idx, choice in enumerate(choices) for form, media in choice + if form is not None ) ) return f""" diff --git a/tests/xpath_helpers/questions.py b/tests/xpath_helpers/questions.py index c6af3b65..bf865ab3 100644 --- a/tests/xpath_helpers/questions.py +++ b/tests/xpath_helpers/questions.py @@ -1,6 +1,6 @@ class XPathHelper: """ - XPath expressions for choices assertions. + XPath expressions for questions assertions. """ @staticmethod @@ -20,6 +20,47 @@ def model_instance_bind(q_name: str, _type: str): ] """ + @staticmethod + def model_itext_label(q_name: str, lang: str, q_label: str): + """Model itext contains the question label.""" + return f""" + /h:html/h:head/x:model/x:itext/x:translation[@lang='{lang}'] + /x:text[@id='/test_name/{q_name}:label'] + /x:value[not(@form) and text()='{q_label}'] + """ + + @staticmethod + def model_itext_form(q_name: str, lang: str, form: str, fname: str): + """Model itext contains an alternate form itext for the label or hint.""" + prefix = { + "audio": ("label", "jr://audio/"), + "image": ("label", "jr://images/"), + "big-image": ("label", "jr://images/"), + "video": ("label", "jr://video/"), + "guidance": ("hint", ""), + } + return f""" + /h:html/h:head/x:model/x:itext/x:translation[@lang='{lang}'] + /x:text[@id='/test_name/{q_name}:{prefix[form][0]}'] + /x:value[@form='{form}' and text()='{prefix[form][1]}{fname}'] + """ + + @staticmethod + def body_label_inline(q_type: str, q_name: str, q_label: str): + """Body element contains the question label.""" + return f""" + /h:html/h:body/x:{q_type}[@ref='/test_name/{q_name}'] + /x:label[not(@ref) and text()='{q_label}'] + """ + + @staticmethod + def body_label_itext(q_type: str, q_name: str): + """Body element references itext for the question label.""" + return f""" + /h:html/h:body/x:{q_type}[@ref='/test_name/{q_name}'] + /x:label[@ref="jr:itext('/test_name/{q_name}:label')" and not(text())] + """ + @staticmethod def body_select1_itemset(q_name: str): """Body has a select1 with an itemset, and no inline items.""" From f353d54f5a070c457cf3305d6993f7ca54c1a1b5 Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Tue, 26 Sep 2023 12:37:51 +1000 Subject: [PATCH 2/4] dev: fix/elaborate on type annotations --- pyxform/entities/entities_parsing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyxform/entities/entities_parsing.py b/pyxform/entities/entities_parsing.py index f4c4a620..30a60f0d 100644 --- a/pyxform/entities/entities_parsing.py +++ b/pyxform/entities/entities_parsing.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import Any, Dict, List from pyxform import constants from pyxform.errors import PyXFormError @@ -6,8 +6,8 @@ def get_entity_declaration( - entities_sheet: Dict, workbook_dict: Dict, warnings: List -) -> Dict: + entities_sheet: List[Dict], workbook_dict: Dict[str, List[Dict]], warnings: List[str] +) -> Dict[str, Any]: if len(entities_sheet) == 0: similar = find_sheet_misspellings( key=constants.ENTITIES, keys=workbook_dict.keys() @@ -81,7 +81,7 @@ def get_validated_dataset_name(entity): def validate_entity_saveto( - row: Dict, row_number: int, entity_declaration: Dict, in_repeat: bool + row: Dict, row_number: int, entity_declaration: Dict[str, Any], in_repeat: bool ): save_to = row.get("bind", {}).get("entities:saveto", "") if not save_to: From 26026a6cd1b12b93f1f895e6036a9c21679bbc1a Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Tue, 26 Sep 2023 12:45:17 +1000 Subject: [PATCH 3/4] chg: use more performant data type for finding element lineage - For insert/pop on the left side: O(1) for deque vs. O(n) for list. - Avoids creating lots of new lists with [] + []. - Did not investigate whether the "flat" filter is still necessary. --- pyxform/survey_element.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pyxform/survey_element.py b/pyxform/survey_element.py index 2831a82d..937b26ac 100644 --- a/pyxform/survey_element.py +++ b/pyxform/survey_element.py @@ -4,6 +4,7 @@ """ import json import re +from collections import deque from functools import lru_cache from typing import TYPE_CHECKING @@ -188,16 +189,14 @@ def get_lineage(self): """ Return a the list [root, ..., self._parent, self] """ - result = [self] + result = deque((self,)) current_element = self while current_element.parent: current_element = current_element.parent - result = [current_element] + result + result.appendleft(current_element) # For some reason the root element has a True flat property... - output = [result[0]] - for item in result[1:]: - if not item.get("flat"): - output.append(item) + output = [result.popleft()] + output.extend([i for i in result if not i.get("flat")]) return output def get_root(self): From 05ad654093d5b9f3c8c687374746e0b6adedd41d Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Tue, 26 Sep 2023 12:48:21 +1000 Subject: [PATCH 4/4] fix: return type change for dealias_and_group_headers for entities sheet - needed after rebase on latest master --- pyxform/xls2json.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py index f720a264..2fca218b 100644 --- a/pyxform/xls2json.py +++ b/pyxform/xls2json.py @@ -626,9 +626,13 @@ def workbook_to_json( # ########## Entities sheet ########### entities_sheet = workbook_dict.get(constants.ENTITIES, []) entities_sheet = dealias_and_group_headers( - entities_sheet, aliases.entities_header, False + dict_array=entities_sheet, + header_aliases=aliases.entities_header, + use_double_colons=False, + ) + entity_declaration = get_entity_declaration( + entities_sheet=entities_sheet.data, workbook_dict=workbook_dict, warnings=warnings ) - entity_declaration = get_entity_declaration(entities_sheet, workbook_dict, warnings) # ########## Survey sheet ########### survey_sheet = workbook_dict[constants.SURVEY]