Skip to content

Commit

Permalink
Merge pull request #734 from lindsay-stevens/pyxform-724
Browse files Browse the repository at this point in the history
724: raise an error if a pyxform reference is malformed
  • Loading branch information
lognaturel authored Oct 29, 2024
2 parents a724215 + d9a1b5a commit 61bb3c3
Show file tree
Hide file tree
Showing 12 changed files with 304 additions and 165 deletions.
99 changes: 97 additions & 2 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,108 @@
import re
from collections.abc import Iterable
from functools import lru_cache
from typing import NamedTuple

from pyxform.utils import parse_expression

def get_expression_lexer() -> re.Scanner:
    """
    Get an expression lexer (scanner) for parsing.

    The returned scanner's ``scan(text)`` method returns a list of
    ExpLexerToken plus any trailing text that matched no rule.
    """
    # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
    # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
    # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
    # and https://www.w3.org/TR/REC-xml-names/#NT-NCName
    namestartchar = (
        # Fix: the \xc0-\xd6 class was missing its opening "[", so it matched
        # the literal text "À-Ö]" instead of the accented-letter range.
        r"([A-Z]|_|[a-z]|[\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
        + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
        + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
        + r"|[\U00010000-\U000EFFFF])"
    )
    # additional characters allowed in NCNames after the first character
    namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
    ncname_regex = (
        r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
    )
    # Allow an optional "prefix:" qualified name.
    ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"

    date_regex = r"-?\d{4}-\d{2}-\d{2}"
    # Fix: fractional seconds are digits, not whitespace ("\.\d+", not "\.\s+").
    time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
    date_time_regex = date_regex + "T" + time_regex

    # Rule order is significant - match priority runs top to bottom.
    lexer_rules = {
        # https://www.w3.org/TR/xmlschema-2/#dateTime
        "DATETIME": date_time_regex,
        "DATE": date_regex,
        "TIME": time_regex,
        "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
        # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
        "OPS_MATH": r"[\*\+\-]|mod|div",
        "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
        "OPS_BOOL": r"and|or",
        "OPS_UNION": r"\|",
        "OPEN_PAREN": r"\(",
        "CLOSE_PAREN": r"\)",
        "BRACKET": r"\[\]\{\}",
        "PARENT_REF": r"\.\.",
        "SELF_REF": r"\.",
        "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
        "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
        "COMMA": r",",
        "WHITESPACE": r"\s+",
        "PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
        "FUNC_CALL": ncname_regex + r"\(",
        "XPATH_PRED_START": ncname_regex + r"\[",
        "XPATH_PRED_END": r"\]",
        "URI_SCHEME": ncname_regex + r"://",
        "NAME": ncname_regex,  # Must be after rules containing ncname_regex.
        "PYXFORM_REF_START": r"\$\{",
        "PYXFORM_REF_END": r"\}",
        "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
    }

    def get_tokenizer(name):
        # Each rule gets a callback that wraps the match in an ExpLexerToken
        # carrying the rule name, the matched text, and the match span.
        def tokenizer(scan, value):
            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

        return tokenizer

    lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(lexicon)


# Scanner takes a few 100ms to compile, so the shared _EXPRESSION_LEXER
# instance below is reused rather than rebuilding a scanner per call.
class ExpLexerToken(NamedTuple):
    # Lexer rule name, e.g. "PYXFORM_REF" or "NAME".
    name: str
    # The matched text.
    value: str
    # Start / end offsets of the match within the scanned string.
    start: int
    end: int


_EXPRESSION_LEXER = get_expression_lexer()


@lru_cache(maxsize=1024)
def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
    """
    Tokenise an expression string.

    Prefer this function over calling _EXPRESSION_LEXER directly: results for
    repeated expressions are served from an LRU cache.

    :param text: The expression.
    :return: The parsed tokens, and any remaining unparsed text.
    """
    return _EXPRESSION_LEXER.scan(text)


def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
"""
Does the expression contain single token of one of the provided token types?
"""
tokens, _ = parse_expression(text=expression.strip())
tokens, _ = parse_expression(expression.strip())
if 1 == len(tokens) and tokens[0].name in token_types:
return True
else:
Expand Down
9 changes: 4 additions & 5 deletions pyxform/parsing/instance_expression.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from typing import TYPE_CHECKING

from pyxform.utils import BRACKETED_TAG_REGEX, EXPRESSION_LEXER, ExpLexerToken, node
from pyxform.parsing.expression import ExpLexerToken, parse_expression
from pyxform.utils import BRACKETED_TAG_REGEX, node

if TYPE_CHECKING:
from pyxform.survey import Survey
Expand Down Expand Up @@ -37,7 +37,7 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
path_enter = False
pred_enter = False
last_token = None
tokens, _ = EXPRESSION_LEXER.scan(xml_text)
tokens, _ = parse_expression(xml_text)
boundaries = []

for t in tokens:
Expand Down Expand Up @@ -111,8 +111,7 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
old_str = xml_text[start:end]
# Pass the new string through the pyxform reference replacer.
# noinspection PyProtectedMember
new_str = re.sub(
BRACKETED_TAG_REGEX,
new_str = BRACKETED_TAG_REGEX.sub(
lambda m: survey._var_repl_function(m, context),
old_str,
)
Expand Down
96 changes: 3 additions & 93 deletions pyxform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,18 @@
import re
from io import StringIO
from json.decoder import JSONDecodeError
from typing import Any, NamedTuple
from typing import Any
from xml.dom import Node
from xml.dom.minidom import Element, Text, _write_data

from defusedxml.minidom import parseString

from pyxform import constants as const
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import parse_expression

SEP = "_"

INVALID_XFORM_TAG_REGEXP = r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*"

INVALID_XFORM_TAG_REGEXP = re.compile(r"[^a-zA-Z:_][^a-zA-Z:_0-9\-.]*")
LAST_SAVED_INSTANCE_NAME = "__last-saved"
BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}")
LAST_SAVED_REGEX = re.compile(r"\${last-saved#(.*?)}")
Expand Down Expand Up @@ -334,94 +333,5 @@ def levenshtein_distance(a: str, b: str) -> int:
return v0[n]


def get_expression_lexer() -> re.Scanner:
    """
    Get an expression lexer (scanner) for parsing.

    The returned scanner's ``scan(text)`` method returns a list of
    ExpLexerToken plus any trailing text that matched no rule.
    """
    # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
    # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
    # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
    # and https://www.w3.org/TR/REC-xml-names/#NT-NCName
    namestartchar = (
        # Fix: the \xc0-\xd6 class was missing its opening "[", so it matched
        # the literal text "À-Ö]" instead of the accented-letter range.
        r"([A-Z]|_|[a-z]|[\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|"
        + r"[\u0370-\u037d]|[\u037f-\u1fff]|[\u200c-\u200d]|[\u2070-\u218f]|"
        + r"[\u2c00-\u2fef]|[\u3001-\uD7FF]|[\uF900-\uFDCF]|[\uFDF0-\uFFFD]"
        + r"|[\U00010000-\U000EFFFF])"
    )
    # additional characters allowed in NCNames after the first character
    namechar_extra = r"[-.0-9\xb7\u0300-\u036f\u203f-\u2040]"
    ncname_regex = (
        r"(" + namestartchar + r")(" + namestartchar + r"|" + namechar_extra + r")*"
    )
    # Allow an optional "prefix:" qualified name.
    ncname_regex = ncname_regex + r"(:" + ncname_regex + r")?"

    date_regex = r"-?\d{4}-\d{2}-\d{2}"
    # Fix: fractional seconds are digits, not whitespace ("\.\d+", not "\.\s+").
    time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
    date_time_regex = date_regex + "T" + time_regex

    # Rule order is significant - match priority runs top to bottom.
    lexer_rules = {
        # https://www.w3.org/TR/xmlschema-2/#dateTime
        "DATETIME": date_time_regex,
        "DATE": date_regex,
        "TIME": time_regex,
        "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
        # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
        "OPS_MATH": r"[\*\+\-]|mod|div",
        "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
        "OPS_BOOL": r"and|or",
        "OPS_UNION": r"\|",
        "OPEN_PAREN": r"\(",
        "CLOSE_PAREN": r"\)",
        "BRACKET": r"\[\]\{\}",
        "PARENT_REF": r"\.\.",
        "SELF_REF": r"\.",
        "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
        "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
        "COMMA": r",",
        "WHITESPACE": r"\s+",
        "PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
        "FUNC_CALL": ncname_regex + r"\(",
        "XPATH_PRED_START": ncname_regex + r"\[",
        "XPATH_PRED_END": r"\]",
        "URI_SCHEME": ncname_regex + r"://",
        "NAME": ncname_regex,  # Must be after rules containing ncname_regex.
        "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
    }

    def get_tokenizer(name):
        # Each rule gets a callback that wraps the match in an ExpLexerToken
        # carrying the rule name, the matched text, and the match span.
        def tokenizer(scan, value):
            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

        return tokenizer

    lexicon = [(v, get_tokenizer(k)) for k, v in lexer_rules.items()]
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(lexicon)


# Scanner takes a few 100ms to compile, so the shared EXPRESSION_LEXER
# instance below is reused rather than rebuilding a scanner per call.
class ExpLexerToken(NamedTuple):
    # Lexer rule name, e.g. "PYXFORM_REF" or "NAME".
    name: str
    # The matched text.
    value: str
    # Start / end offsets of the match within the scanned string.
    start: int
    end: int


EXPRESSION_LEXER = get_expression_lexer()


def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
    """
    Parse a "default" expression, well enough to identify dynamic defaults vs. not.

    :param text: The expression.
    :return: The parsed tokens, and any remaining unparsed text.
    """
    return EXPRESSION_LEXER.scan(text)


def coalesce(*args):
    """Return the first argument that is not None, or None if all are None."""
    for value in args:
        if value is not None:
            return value
    return None
8 changes: 5 additions & 3 deletions pyxform/validators/error_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import re

ERROR_MESSAGE_REGEX = re.compile(r"(/[a-z0-9\-_]+(?:/[a-z0-9\-_]+)+)", flags=re.I)


class ErrorCleaner:
"""Cleans up raw error messages from XForm validators for end users."""
Expand All @@ -22,9 +24,9 @@ def _replace_xpath_with_tokens(match):

@staticmethod
def _cleanup_errors(error_message):
pattern = r"(/[a-z0-9\-_]+(?:/[a-z0-9\-_]+)+)"
error_message = re.sub(
pattern, ErrorCleaner._replace_xpath_with_tokens, error_message, flags=re.I
error_message = ERROR_MESSAGE_REGEX.sub(
ErrorCleaner._replace_xpath_with_tokens,
error_message,
)
lines = str(error_message).strip().splitlines()
no_dupes = [
Expand Down
53 changes: 53 additions & 0 deletions pyxform/validators/pyxform/pyxform_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from pyxform import constants as co
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import parse_expression

PYXFORM_REFERENCE_INVALID = (
    "[row : {row_number}] On the '{sheet}' sheet, the '{column}' value is invalid. "
    "Reference expressions must only include question names, and end with '}}'."
)


def validate_pyxform_reference_syntax(
    value: str, sheet_name: str, row_number: int, key: str
) -> None:
    """
    Raise an error if the value contains a malformed ${...} pyxform reference.

    :param value: The cell value to check.
    :param sheet_name: Sheet the value came from, used to skip reference-free
      columns and for the error message.
    :param row_number: Row the value came from, for the error message.
    :param key: Column the value came from, for the error message.
    :raises PyXFormError: if a reference is opened but never closed, or
      contains anything other than a single question name.
    """

    def raise_invalid() -> None:
        # Single place to format and raise so the message stays consistent.
        raise PyXFormError(
            PYXFORM_REFERENCE_INVALID.format(
                sheet=sheet_name, row_number=row_number, column=key
            )
        )

    # Skip columns in potentially large sheets where references are not allowed.
    if sheet_name == co.SURVEY:
        if key in (co.TYPE, co.NAME):
            return
    elif sheet_name == co.CHOICES:
        if key in (co.LIST_NAME_S, co.LIST_NAME_U, co.NAME):
            return
    elif sheet_name == co.ENTITIES:
        # Bug fix: the original used `key == (co.LIST_NAME_S, co.LIST_NAME_U)`,
        # comparing the string to a tuple, which never matched; a membership
        # test was intended (as in the CHOICES branch above).
        if key in (co.LIST_NAME_S, co.LIST_NAME_U):
            return

    tokens, _ = parse_expression(value)
    start_token = None

    for t in tokens:
        # The start of an expression.
        if t is not None and t.name == "PYXFORM_REF_START" and start_token is None:
            start_token = t
        # Tokens that are part of an expression.
        elif start_token is not None:
            if t.name == "NAME":
                continue
            elif t.name == "PYXFORM_REF_END":
                start_token = None
            else:
                # Anything else inside an open reference (including a nested
                # "${" or a complete "${...}") is malformed. The original had
                # two branches here raising the identical error; collapsed.
                raise_invalid()

    # A reference was opened but never closed.
    if start_token is not None:
        raise_invalid()
4 changes: 1 addition & 3 deletions pyxform/validators/pyxform/question_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
Validations for question types.
"""

import re

from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_single_token_expression
from pyxform.utils import PYXFORM_REFERENCE_REGEX
Expand Down Expand Up @@ -37,7 +35,7 @@ def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool:
def validate_references(referrers: list[tuple[dict, int]], questions: set[str]) -> bool:
"""Triggers must refer to a question that exists."""
for row, row_num in referrers:
matches = re.match(PYXFORM_REFERENCE_REGEX, row["trigger"])
matches = PYXFORM_REFERENCE_REGEX.match(row["trigger"])
if matches is not None:
trigger = matches.groups()[0]
if trigger not in questions:
Expand Down
2 changes: 1 addition & 1 deletion pyxform/validators/pyxform/translations_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
if TYPE_CHECKING:
from collections.abc import Sequence

SheetData = tuple[tuple[str, ...]]
SheetData = tuple[tuple[str, ...], ...]
Warnings = list[str]


Expand Down
Loading

0 comments on commit 61bb3c3

Please sign in to comment.