724: raise an error if a pyxform reference is malformed #734

Merged Oct 29, 2024 (2 commits)
Changes from 1 commit
99 changes: 97 additions & 2 deletions pyxform/parsing/expression.py
@@ -1,13 +1,108 @@
import re
from collections.abc import Iterable
+from functools import lru_cache
+from typing import NamedTuple

-from pyxform.utils import parse_expression

def get_expression_lexer() -> re.Scanner:
    """
    Get an expression lexer (scanner) for parsing.
    """
    # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
    # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
    # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
    # and https://www.w3.org/TR/REC-xml-names/#NT-NCName
    namestartchar = (
        r"([A-Z]|_|[a-z]|[\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
        + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
        + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
        + r"|[\U00010000-\U000EFFFF])"
    )
    # additional characters allowed in NCNames after the first character
    namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
    ncname_regex = (
        r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
    )
    ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"

    date_regex = r"-?\d{4}-\d{2}-\d{2}"
    time_regex = r"\d{2}:\d{2}:\d{2}(\.\s+)?(((\+|\-)\d{2}:\d{2})|Z)?"
    date_time_regex = date_regex + "T" + time_regex

    # Rule order is significant - match priority runs top to bottom.
    lexer_rules = {
        # https://www.w3.org/TR/xmlschema-2/#dateTime
        "DATETIME": date_time_regex,
        "DATE": date_regex,
        "TIME": time_regex,
        "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
        # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
        "OPS_MATH": r"[\*\+\-]|mod|div",
        "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
        "OPS_BOOL": r"and|or",
        "OPS_UNION": r"\|",
        "OPEN_PAREN": r"\(",
        "CLOSE_PAREN": r"\)",
        "BRACKET": r"\[\]\{\}",
        "PARENT_REF": r"\.\.",
        "SELF_REF": r"\.",
        "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
        "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
        "COMMA": r",",
        "WHITESPACE": r"\s+",
        "PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
        "FUNC_CALL": ncname_regex + r"\(",
        "XPATH_PRED_START": ncname_regex + r"\[",
        "XPATH_PRED_END": r"\]",
        "URI_SCHEME": ncname_regex + r"://",
        "NAME": ncname_regex,  # Must be after rules containing ncname_regex.
        "PYXFORM_REF_START": r"\$\{",
        "PYXFORM_REF_END": r"\}",
        "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
    }

    def get_tokenizer(name):
        def tokenizer(scan, value):
            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

        return tokenizer

    lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(lexicon)

Contributor comment on the re.Scanner note:
"a masterpiece of work to get inspiration from, but not as a tool to give out to anybody"
-- https://mail.python.org/pipermail/python-dev/2003-April/035070.html
Feels like annoying fence-sitting! https://bugs.python.org/issue5337

class ExpLexerToken(NamedTuple):
    name: str
    value: str
    start: int
    end: int


# Scanner takes a few 100ms to compile so use this shared instance.
_EXPRESSION_LEXER = get_expression_lexer()


@lru_cache(maxsize=1024)
def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
    """
    Parse an expression.

    Use this function instead of _EXPRESSION_LEXER to take advantage of caching.

    :param text: The expression.
    :return: The parsed tokens, and any remaining unparsed text.
    """
    tokens, remainder = _EXPRESSION_LEXER.scan(text)
    return tokens, remainder


def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
    """
    Does the expression contain a single token of one of the provided token types?
    """
-    tokens, _ = parse_expression(text=expression.strip())
+    tokens, _ = parse_expression(expression.strip())
    if 1 == len(tokens) and tokens[0].name in token_types:
        return True
    else:
        return False
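A minimal sketch of how the new token rules separate well-formed from malformed references (assuming a pyxform build that contains this commit; the expected token lists are approximate):

from pyxform.parsing.expression import parse_expression

# A complete reference matches the single PYXFORM_REF rule.
tokens, _ = parse_expression("${age} > 18")
print([t.name for t in tokens if t.name != "WHITESPACE"])
# roughly: ['PYXFORM_REF', 'OPS_COMP', 'NUMBER']

# An unclosed reference falls through to the new PYXFORM_REF_START rule,
# which is what the validator added by this PR looks for.
tokens, _ = parse_expression("${age > 18")
print([t.name for t in tokens if t.name != "WHITESPACE"])
# roughly: ['PYXFORM_REF_START', 'NAME', 'OPS_COMP', 'NUMBER']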
9 changes: 4 additions & 5 deletions pyxform/parsing/instance_expression.py
@@ -1,7 +1,7 @@
import re
from typing import TYPE_CHECKING

-from pyxform.utils import BRACKETED_TAG_REGEX, EXPRESSION_LEXER, ExpLexerToken, node
+from pyxform.parsing.expression import ExpLexerToken, parse_expression
+from pyxform.utils import BRACKETED_TAG_REGEX, node

if TYPE_CHECKING:
from pyxform.survey import Survey
@@ -37,7 +37,7 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
    path_enter = False
    pred_enter = False
    last_token = None
-    tokens, _ = EXPRESSION_LEXER.scan(xml_text)
+    tokens, _ = parse_expression(xml_text)
    boundaries = []

    for t in tokens:
@@ -111,8 +111,7 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
        old_str = xml_text[start:end]
        # Pass the new string through the pyxform reference replacer.
        # noinspection PyProtectedMember
-        new_str = re.sub(
-            BRACKETED_TAG_REGEX,
+        new_str = BRACKETED_TAG_REGEX.sub(
            lambda m: survey._var_repl_function(m, context),
            old_str,
        )
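The switch from re.sub(BRACKETED_TAG_REGEX, ...) to BRACKETED_TAG_REGEX.sub(...) works because the pattern is already compiled in utils.py; a standalone sketch of the equivalence, where var_repl is a hypothetical stand-in for survey._var_repl_function:

import re

# Same pattern as utils.BRACKETED_TAG_REGEX in this diff.
BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}")

def var_repl(match: re.Match) -> str:
    # Hypothetical stand-in: wrap the referenced name in an output element.
    return f'<output value="/data/{match.group(2)}"/>'

old_str = "Hello ${name}, you are ${age} years old."
# pattern.sub(repl, s) is equivalent to re.sub(pattern, repl, s),
# minus the per-call pattern-cache lookup.
print(BRACKETED_TAG_REGEX.sub(var_repl, old_str))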
96 changes: 3 additions & 93 deletions pyxform/utils.py
@@ -9,19 +9,18 @@
import re
from io import StringIO
from json.decoder import JSONDecodeError
-from typing import Any, NamedTuple
+from typing import Any
from xml.dom import Node
from xml.dom.minidom import Element, Text, _write_data

from defusedxml.minidom import parseString

from pyxform import constants as const
from pyxform.errors import PyXFormError
+from pyxform.parsing.expression import parse_expression

SEP = "_"

-INVALID_XFORM_TAG_REGEXP = r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*"
+INVALID_XFORM_TAG_REGEXP = re.compile(r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*")
LAST_SAVED_INSTANCE_NAME = "__last-saved"
BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}")
LAST_SAVED_REGEX = re.compile(r"\${last-saved#(.*?)}")
@@ -334,94 +334,5 @@ def levenshtein_distance(a: str, b: str) -> int:
    return v0[n]


def get_expression_lexer() -> re.Scanner:
    """
    Get an expression lexer (scanner) for parsing.
    """
    # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
    # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
    # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
    # and https://www.w3.org/TR/REC-xml-names/#NT-NCName
    namestartchar = (
        r"([A-Z]|_|[a-z]|[\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
        + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
        + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
        + r"|[\U00010000-\U000EFFFF])"
    )
    # additional characters allowed in NCNames after the first character
    namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
    ncname_regex = (
        r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
    )
    ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"

    date_regex = r"-?\d{4}-\d{2}-\d{2}"
    time_regex = r"\d{2}:\d{2}:\d{2}(\.\s+)?(((\+|\-)\d{2}:\d{2})|Z)?"
    date_time_regex = date_regex + "T" + time_regex

    # Rule order is significant - match priority runs top to bottom.
    lexer_rules = {
        # https://www.w3.org/TR/xmlschema-2/#dateTime
        "DATETIME": date_time_regex,
        "DATE": date_regex,
        "TIME": time_regex,
        "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
        # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
        "OPS_MATH": r"[\*\+\-]|mod|div",
        "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
        "OPS_BOOL": r"and|or",
        "OPS_UNION": r"\|",
        "OPEN_PAREN": r"\(",
        "CLOSE_PAREN": r"\)",
        "BRACKET": r"\[\]\{\}",
        "PARENT_REF": r"\.\.",
        "SELF_REF": r"\.",
        "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
        "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
        "COMMA": r",",
        "WHITESPACE": r"\s+",
        "PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
        "FUNC_CALL": ncname_regex + r"\(",
        "XPATH_PRED_START": ncname_regex + r"\[",
        "XPATH_PRED_END": r"\]",
        "URI_SCHEME": ncname_regex + r"://",
        "NAME": ncname_regex,  # Must be after rules containing ncname_regex.
        "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
    }

    def get_tokenizer(name):
        def tokenizer(scan, value):
            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

        return tokenizer

    lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(lexicon)


class ExpLexerToken(NamedTuple):
    name: str
    value: str
    start: int
    end: int


# Scanner takes a few 100ms to compile so use this shared instance.
EXPRESSION_LEXER = get_expression_lexer()


def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
    """
    Parse a "default" expression, well enough to identify dynamic defaults vs. not.

    :param text: The expression.
    :return: The parsed tokens, and any remaining unparsed text.
    """
    tokens, remainder = EXPRESSION_LEXER.scan(text)
    return tokens, remainder


def coalesce(*args):
    return next((a for a in args if a is not None), None)
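coalesce is easy to confuse with a plain `or` chain, so a quick illustration (the function body is copied from above):

def coalesce(*args):
    return next((a for a in args if a is not None), None)

assert coalesce(None, 0, 1) == 0     # keeps falsy values; (None or 0 or 1) gives 1
assert coalesce("", "x") == ""       # empty string is not None, so it wins
assert coalesce(None, None) is None  # no non-None argument at all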
8 changes: 5 additions & 3 deletions pyxform/validators/error_cleaner.py
@@ -4,6 +4,8 @@

import re

+ERROR_MESSAGE_REGEX = re.compile(r"(/[a-z0-9\-_]+(?:/[a-z0-9\-_]+)+)", flags=re.I)


class ErrorCleaner:
"""Cleans up raw error messages from XForm validators for end users."""
@@ -22,9 +24,9 @@ def _replace_xpath_with_tokens(match):

    @staticmethod
    def _cleanup_errors(error_message):
-        pattern = r"(/[a-z0-9\-_]+(?:/[a-z0-9\-_]+)+)"
-        error_message = re.sub(
-            pattern, ErrorCleaner._replace_xpath_with_tokens, error_message, flags=re.I
+        error_message = ERROR_MESSAGE_REGEX.sub(
+            ErrorCleaner._replace_xpath_with_tokens,
+            error_message,
        )
        lines = str(error_message).strip().splitlines()
        no_dupes = [
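For reference, a standalone sketch of what the hoisted ERROR_MESSAGE_REGEX matches — absolute, multi-step XPath-like paths (the sample message is illustrative):

import re

# Same pattern as above; re.I makes the character classes case-insensitive.
ERROR_MESSAGE_REGEX = re.compile(r"(/[a-z0-9\-_]+(?:/[a-z0-9\-_]+)+)", flags=re.I)

msg = "Invalid calculate for /data/age in expression"
print(ERROR_MESSAGE_REGEX.findall(msg))      # ['/data/age']
print(ERROR_MESSAGE_REGEX.findall("/data"))  # [] - needs at least two path steps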
53 changes: 53 additions & 0 deletions pyxform/validators/pyxform/pyxform_reference.py
@@ -0,0 +1,53 @@
from pyxform import constants as co
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import parse_expression

PYXFORM_REFERENCE_INVALID = (
    "[row : {row_number}] On the '{sheet}' sheet, the '{column}' value is invalid. "
    "Reference expressions must only include question names, and end with '}}'."
)


def validate_pyxform_reference_syntax(
    value: str, sheet_name: str, row_number: int, key: str
) -> None:
    # Skip columns in potentially large sheets where references are not allowed.
    if sheet_name == co.SURVEY:
        if key in (co.TYPE, co.NAME):
            return
    elif sheet_name == co.CHOICES:
        if key in (co.LIST_NAME_S, co.LIST_NAME_U, co.NAME):
            return
    elif sheet_name == co.ENTITIES:
        if key in (co.LIST_NAME_S, co.LIST_NAME_U):
            return

    tokens, _ = parse_expression(value)
    start_token = None

    for t in tokens:
        # The start of an expression.
        if t is not None and t.name == "PYXFORM_REF_START" and start_token is None:
            start_token = t
        # Tokens that are part of an expression.
        elif start_token is not None:
            if t.name == "NAME":
                continue
            elif t.name == "PYXFORM_REF_END":
                start_token = None
            elif t.name in ("PYXFORM_REF_START", "PYXFORM_REF"):
                msg = PYXFORM_REFERENCE_INVALID.format(
                    sheet=sheet_name, row_number=row_number, column=key
                )
                raise PyXFormError(msg)
            else:
                msg = PYXFORM_REFERENCE_INVALID.format(
                    sheet=sheet_name, row_number=row_number, column=key
                )
                raise PyXFormError(msg)

    if start_token is not None:
        msg = PYXFORM_REFERENCE_INVALID.format(
            sheet=sheet_name, row_number=row_number, column=key
        )
        raise PyXFormError(msg)
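To illustrate the intended behaviour, a sketch assuming a pyxform build that includes this PR (the sheet, row, and column values are illustrative):

from pyxform.errors import PyXFormError
from pyxform.validators.pyxform.pyxform_reference import (
    validate_pyxform_reference_syntax,
)

# A complete ${...} reference passes silently.
validate_pyxform_reference_syntax("${age} > 18", "survey", 2, "relevant")

# An unclosed reference raises, with the sheet/row/column in the message.
try:
    validate_pyxform_reference_syntax("${age > 18", "survey", 2, "relevant")
except PyXFormError as e:
    print(e)  # [row : 2] On the 'survey' sheet, the 'relevant' value is invalid. ...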
4 changes: 1 addition & 3 deletions pyxform/validators/pyxform/question_types.py
@@ -2,8 +2,6 @@
Validations for question types.
"""

-import re

from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_single_token_expression
from pyxform.utils import PYXFORM_REFERENCE_REGEX
@@ -37,7 +35,7 @@ def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool:
def validate_references(referrers: list[tuple[dict, int]], questions: set[str]) -> bool:
    """Triggers must refer to a question that exists."""
    for row, row_num in referrers:
-        matches = re.match(PYXFORM_REFERENCE_REGEX, row["trigger"])
+        matches = PYXFORM_REFERENCE_REGEX.match(row["trigger"])
        if matches is not None:
            trigger = matches.groups()[0]
            if trigger not in questions:
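This change swaps module-level re.match for the compiled pattern's own .match; a sketch of the equivalence (the pattern below is a hypothetical stand-in — the real PYXFORM_REFERENCE_REGEX lives in pyxform.utils and is not shown in this diff):

import re

# Hypothetical stand-in for pyxform.utils.PYXFORM_REFERENCE_REGEX.
PYXFORM_REFERENCE_REGEX = re.compile(r"\$\{(.*?)\}")

row = {"trigger": "${q1}"}
# pattern.match(s) is equivalent to re.match(pattern, s) for a compiled pattern.
matches = PYXFORM_REFERENCE_REGEX.match(row["trigger"])
if matches is not None:
    print(matches.groups()[0])  # q1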
2 changes: 1 addition & 1 deletion pyxform/validators/pyxform/translations_checks.py
@@ -8,7 +8,7 @@
if TYPE_CHECKING:
    from collections.abc import Sequence

-SheetData = tuple[tuple[str, ...]]
+SheetData = tuple[tuple[str, ...], ...]
Warnings = list[str]
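The SheetData fix matters because tuple[tuple[str, ...]] types a one-element tuple, while the trailing ", ..." makes the outer tuple variable-length — which is what a sheet of many rows actually is. A quick illustration:

# Before: tuple[tuple[str, ...]] - exactly ONE row of strings.
# After: tuple[tuple[str, ...], ...] - any number of rows of strings.
SheetData = tuple[tuple[str, ...], ...]

sheet: SheetData = (
    ("type", "name", "label"),
    ("text", "q1", "First question"),
)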

