Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

647: include all choice lists in output #736

Merged
merged 3 commits into from
Nov 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ dependencies = [
dev = [
"formencode==2.1.0", # Compare XML
"lxml==5.3.0", # XPath test expressions
"psutil==6.0.0", # Process info for performance tests
"ruff==0.6.4", # Format and lint
"psutil==6.1.0", # Process info for performance tests
"ruff==0.7.1", # Format and lint
]

[project.urls]
Expand Down
7 changes: 4 additions & 3 deletions pyxform/entities/entities_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from pyxform import constants as const
from pyxform.aliases import yes_no
from pyxform.errors import PyXFormError
from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
from pyxform.parsing.expression import is_xml_tag
from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings

EC = const.EntityColumns

Expand Down Expand Up @@ -73,7 +74,7 @@ def get_validated_dataset_name(entity):
f"Invalid entity list name: '{dataset}'. Names may not include periods."
)

if not is_valid_xml_tag(dataset):
if not is_xml_tag(dataset):
if isinstance(dataset, bytes):
dataset = dataset.decode("utf-8")

Expand Down Expand Up @@ -118,7 +119,7 @@ def validate_entity_saveto(
f"{error_start} the entity property name '{save_to}' starts with reserved prefix {const.ENTITIES_RESERVED_PREFIX}."
)

if not is_valid_xml_tag(save_to):
if not is_xml_tag(save_to):
if isinstance(save_to, bytes):
save_to = save_to.decode("utf-8")

Expand Down
18 changes: 16 additions & 2 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def get_expression_lexer() -> re.Scanner:
"TIME": time_regex,
"NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
# https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
"OPS_MATH": r"[\*\+\-]|mod|div",
"OPS_MATH": r"[\*\+\-]| mod | div ",
"OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
"OPS_BOOL": r"and|or",
"OPS_BOOL": r" and | or ",
"OPS_UNION": r"\|",
"OPEN_PAREN": r"\(",
"CLOSE_PAREN": r"\)",
Expand Down Expand Up @@ -107,3 +107,17 @@ def is_single_token_expression(expression: str, token_types: Iterable[str]) -> b
return True
else:
return False


def is_pyxform_reference(value: str) -> bool:
"""
Does the input string contain only a valid Pyxform reference? e.g. ${my_question}
"""
return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",))


def is_xml_tag(value: str) -> bool:
"""
Does the input string contain only a valid XML tag / element name?
"""
return is_single_token_expression(expression=value, token_types=("NAME",))
4 changes: 2 additions & 2 deletions pyxform/survey_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pyxform import aliases as alias
from pyxform import constants as const
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_xml_tag
from pyxform.question_type_dictionary import QUESTION_TYPE_DICT
from pyxform.utils import (
BRACKETED_TAG_REGEX,
Expand All @@ -19,7 +20,6 @@
node,
)
from pyxform.xls2json import print_pyobj_to_json
from pyxform.xlsparseutils import is_valid_xml_tag

if TYPE_CHECKING:
from pyxform.utils import DetachableElement
Expand Down Expand Up @@ -140,7 +140,7 @@ def add_children(self, children):
SUPPORTED_MEDIA = ("image", "big-image", "audio", "video")

def validate(self):
if not is_valid_xml_tag(self.name):
if not is_xml_tag(self.name):
invalid_char = re.search(INVALID_XFORM_TAG_REGEXP, self.name)
raise PyXFormError(
f"The name '{self.name}' contains an invalid character '{invalid_char.group(0)}'. Names {const.XML_IDENTIFIER_ERROR_MESSAGE}"
Expand Down
59 changes: 59 additions & 0 deletions pyxform/validators/pyxform/choices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from pyxform import constants
from pyxform.errors import PyXFormError

INVALID_NAME = (
"[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
"Choices must have a name. "
"Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
)
INVALID_LABEL = (
"[row : {row}] On the 'choices' sheet, the 'label' value is invalid. "
"Choices should have a label. "
Copy link
Contributor

@lognaturel lognaturel Nov 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is optional/should because a choice list might now just be used for reference and not for a select?

Copy link
Contributor Author

@lindsay-stevens lindsay-stevens Nov 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason for this check being a "should" / warning is to match existing behaviour - historically choice labels aren't required, but that might change per #534

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ohhh, thank you for introducing me to my past self. :D

"Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
)
INVALID_HEADER = (
"[row : 1] On the 'choices' sheet, the '{column}' value is invalid. "
"Column headers must not be empty and must not contain spaces. "
"Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
)
INVALID_DUPLICATE = (
"[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
"Choice names must be unique for each choice list. "
"If this is intentional, use the setting 'allow_choice_duplicates'. "
"Learn more: https://xlsform.org/#choice-names."
)


def validate_headers(
headers: tuple[tuple[str, ...], ...], warnings: list[str]
) -> list[str]:
def check():
for header in headers:
header = header[0]
if header != constants.LIST_NAME_S and (" " in header or header == ""):
warnings.append(INVALID_HEADER.format(column=header))
yield header

return list(check())


def validate_choices(
options: list[dict], warnings: list[str], allow_duplicates: bool = False
) -> None:
seen_options = set()
duplicate_errors = []
for option in options:
if "name" not in option:
raise PyXFormError(INVALID_NAME.format(row=option["__row"]))
elif "label" not in option:
warnings.append(INVALID_LABEL.format(row=option["__row"]))

if not allow_duplicates:
name = option["name"]
if name in seen_options:
duplicate_errors.append(INVALID_DUPLICATE.format(row=option["__row"]))
else:
seen_options.add(name)

if 0 < len(duplicate_errors):
raise PyXFormError("\n".join(duplicate_errors))
6 changes: 2 additions & 4 deletions pyxform/validators/pyxform/question_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_single_token_expression
from pyxform.parsing.expression import is_pyxform_reference
from pyxform.utils import PYXFORM_REFERENCE_REGEX

BACKGROUND_GEOPOINT_CALCULATION = "[row : {r}] For 'background-geopoint' questions, the 'calculation' column must be empty."
Expand All @@ -25,9 +25,7 @@ def validate_background_geopoint_calculation(row: dict, row_num: int) -> bool:

def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool:
"""A background-geopoint must have a trigger."""
if not row.get("trigger", False) or not is_single_token_expression(
expression=row["trigger"], token_types=["PYXFORM_REF"]
):
if not row.get("trigger", False) or not is_pyxform_reference(value=row["trigger"]):
raise PyXFormError(TRIGGER_INVALID.format(r=row_num, t=row["type"]))
return True

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
import re
from collections.abc import KeysView

from pyxform import constants
from pyxform.utils import levenshtein_distance

# http://www.w3.org/TR/REC-xml/
TAG_START_CHAR = r"[a-zA-Z:_]"
TAG_CHAR = r"[a-zA-Z:_0-9\-.]"
XFORM_TAG_REGEXP = re.compile(rf"^{TAG_START_CHAR}{TAG_CHAR}*$")


def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
"""
Expand Down Expand Up @@ -36,10 +30,3 @@ def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
return msg
else:
return None


def is_valid_xml_tag(tag):
"""
Use a regex to see if there are any invalid characters (i.e. spaces).
"""
return re.search(XFORM_TAG_REGEXP, tag)
119 changes: 36 additions & 83 deletions pyxform/xls2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import os
import re
import sys
from collections import Counter
from typing import IO, Any

from pyxform import aliases, constants
Expand All @@ -21,15 +20,16 @@
validate_entity_saveto,
)
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_single_token_expression
from pyxform.parsing.expression import is_pyxform_reference, is_xml_tag
from pyxform.utils import PYXFORM_REFERENCE_REGEX, coalesce, default_is_dynamic
from pyxform.validators.pyxform import choices as vc
from pyxform.validators.pyxform import parameters_generic, select_from_file
from pyxform.validators.pyxform import question_types as qt
from pyxform.validators.pyxform.android_package_name import validate_android_package_name
from pyxform.validators.pyxform.pyxform_reference import validate_pyxform_reference_syntax
from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings
from pyxform.validators.pyxform.translations_checks import SheetTranslations
from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict
from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag

SMART_QUOTES = {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'}
RE_SMART_QUOTES = re.compile(r"|".join(re.escape(old) for old in SMART_QUOTES))
Expand Down Expand Up @@ -175,7 +175,12 @@ def dealias_types(dict_array):
return dict_array


def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool = False):
def clean_text_values(
sheet_name: str,
data: list[dict],
strip_whitespace: bool = False,
add_row_number: bool = False,
) -> list[dict]:
"""
Go though the dict array and strips all text values.
Also replaces multiple spaces with single spaces.
Expand All @@ -192,6 +197,8 @@ def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool
validate_pyxform_reference_syntax(
value=value, sheet_name=sheet_name, row_number=row_number, key=key
)
if add_row_number:
row["__row"] = row_number
return data


Expand Down Expand Up @@ -513,82 +520,41 @@ def workbook_to_json(

# ########## Choices sheet ##########
choices_sheet = workbook_dict.get(constants.CHOICES, [])
choices_sheet = clean_text_values(sheet_name=constants.CHOICES, data=choices_sheet)
choices_sheet = clean_text_values(
sheet_name=constants.CHOICES,
data=choices_sheet,
add_row_number=True,
)
choices_sheet = dealias_and_group_headers(
dict_array=choices_sheet,
header_aliases=aliases.list_header,
use_double_colons=use_double_colons,
default_language=default_language,
)
combined_lists = group_dictionaries_by_key(
choices = group_dictionaries_by_key(
list_of_dicts=choices_sheet.data, key=constants.LIST_NAME_S
)
# To combine the warning into one message, the check for missing choices translation
# columns is run with Survey sheet below.

choices = combined_lists
# Make sure all the options have the required properties:
warnedabout = set()
for list_name, options in choices.items():
# Warn and remove invalid headers in case the form uses headers for notes.
invalid_headers = vc.validate_headers(choices_sheet.headers, warnings)
allow_duplicates = aliases.yes_no.get(
settings.get("allow_choice_duplicates", False), False
)
for options in choices.values():
vc.validate_choices(
options=options,
warnings=warnings,
allow_duplicates=allow_duplicates,
)
for option in options:
if "name" not in option:
info = "[list_name : " + list_name + "]"
raise PyXFormError(
"On the choices sheet there is a option with no name. " + info
)
if "label" not in option:
info = "[list_name : " + list_name + "]"
warnings.append(
"On the choices sheet there is a option with no label. " + info
)
# chrislrobert's fix for a cryptic error message:
# see: https://code.google.com/p/opendatakit/issues/detail?id=833&start=200
option_keys = list(option.keys())
for headername in option_keys:
# Using warnings and removing the bad columns
# instead of throwing errors because some forms
# use choices column headers for notes.
if " " in headername:
if headername not in warnedabout:
warnedabout.add(headername)
warnings.append(
"On the choices sheet there is "
+ 'a column ("'
+ headername
+ '") with an illegal header. '
+ "Headers cannot include spaces."
)
del option[headername]
elif headername == "":
warnings.append(
"On the choices sheet there is a value"
+ " in a column with no header."
)
del option[headername]
list_name_choices = [option.get("name") for option in options]
if len(list_name_choices) != len(set(list_name_choices)):
duplicate_setting = settings.get("allow_choice_duplicates")
for v in Counter(list_name_choices).values():
if v > 1:
if not duplicate_setting or duplicate_setting.capitalize() != "Yes":
choice_duplicates = [
item
for item, count in Counter(list_name_choices).items()
if count > 1
]

if choice_duplicates:
raise PyXFormError(
"The name column for the '{}' choice list contains these duplicates: {}. Duplicate names "
"will be impossible to identify in analysis unless a previous value in a cascading "
"select differentiates them. If this is intentional, you can set the "
"allow_choice_duplicates setting to 'yes'. Learn more: https://xlsform.org/#choice-names.".format(
list_name,
", ".join(
[f"'{dupe}'" for dupe in choice_duplicates]
),
)
)
for invalid_header in invalid_headers:
option.pop(invalid_header, None)
del option["__row"]

if 0 < len(choices):
json_dict[constants.CHOICES] = choices

# ########## Entities sheet ###########
entities_sheet = workbook_dict.get(constants.ENTITIES, [])
Expand Down Expand Up @@ -944,7 +910,7 @@ def workbook_to_json(
ROW_FORMAT_STRING % row_number + " Question or group with no name."
)
question_name = str(row[constants.NAME])
if not is_valid_xml_tag(question_name):
if not is_xml_tag(question_name):
if isinstance(question_name, bytes):
question_name = question_name.decode("utf-8")

Expand Down Expand Up @@ -1021,10 +987,7 @@ def workbook_to_json(
repeat_count_expression = new_json_dict.get("control", {}).get("jr:count")
if repeat_count_expression:
# Simple expressions don't require a new node, they can reference directly.
simple_expression = is_single_token_expression(
expression=repeat_count_expression, token_types=["PYXFORM_REF"]
)
if not simple_expression:
if not is_pyxform_reference(value=repeat_count_expression):
generated_node_name = new_json_dict["name"] + "_count"
parent_children_array.append(
{
Expand Down Expand Up @@ -1242,16 +1205,6 @@ def workbook_to_json(
choice_filter=row.get(constants.CHOICE_FILTER),
file_extension=file_extension,
)
# Add the choice to the survey choices if it's being used in an itemset.
if (
constants.ITEMSET in new_json_dict
and new_json_dict[constants.ITEMSET] == list_name
and list_name in choices
):
# Initialise choices output if none added already.
if constants.CHOICES not in json_dict:
json_dict[constants.CHOICES] = {}
json_dict[constants.CHOICES][list_name] = choices[list_name]

# Code to deal with table_list appearance flags
# (for groups of selects)
Expand Down
Empty file added tests/parsing/__init__.py
Empty file.
Loading