Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ConjectureData.for_choices #4219

Merged
merged 7 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
RELEASE_TYPE: patch

This patch improves shrinking involving long strings or byte sequences whose value is not relevant to the failure.
Internal code refactoring for the typed choice sequence (:issue:`3921`). May have some neutral effect on shrinking.
115 changes: 59 additions & 56 deletions hypothesis-python/src/hypothesis/internal/conjecture/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,16 +1071,19 @@ def ir_value_permitted(value, ir_type, kwargs):
raise NotImplementedError(f"unhandled type {type(value)} of ir value {value}")


def ir_size(ir: Iterable[IRType]) -> int:
def ir_size(ir: Iterable[Union[IRNode, NodeTemplate, IRType]]) -> int:
from hypothesis.database import ir_to_bytes

return len(ir_to_bytes(ir))


def ir_size_nodes(nodes: Iterable[Union[IRNode, NodeTemplate]]) -> int:
size = 0
for node in nodes:
size += node.size if isinstance(node, NodeTemplate) else ir_size([node.value])
for v in ir:
if isinstance(v, IRNode):
size += len(ir_to_bytes([v.value]))
elif isinstance(v, NodeTemplate):
size += v.size
else:
# IRType
size += len(ir_to_bytes([v]))

return size


Expand Down Expand Up @@ -1972,9 +1975,9 @@ def for_buffer(
)

@classmethod
def for_ir_tree(
def for_choices(
cls,
ir_tree_prefix: Sequence[Union[IRNode, NodeTemplate]],
choices: Sequence[Union[NodeTemplate, IRType]],
*,
observer: Optional[DataObserver] = None,
provider: Union[type, PrimitiveProvider] = HypothesisProvider,
Expand All @@ -1985,12 +1988,10 @@ def for_ir_tree(

return cls(
max_length=BUFFER_SIZE,
max_length_ir=(
ir_size_nodes(ir_tree_prefix) if max_length is None else max_length
),
max_length_ir=(ir_size(choices) if max_length is None else max_length),
prefix=b"",
random=random,
ir_tree_prefix=ir_tree_prefix,
ir_prefix=choices,
observer=observer,
provider=provider,
)
Expand All @@ -2003,7 +2004,7 @@ def __init__(
random: Optional[Random],
observer: Optional[DataObserver] = None,
provider: Union[type, PrimitiveProvider] = HypothesisProvider,
ir_tree_prefix: Optional[Sequence[Union[IRNode, NodeTemplate]]] = None,
ir_prefix: Optional[Sequence[Union[NodeTemplate, IRType]]] = None,
max_length_ir: Optional[int] = None,
) -> None:
from hypothesis.internal.conjecture.engine import BUFFER_SIZE_IR
Expand All @@ -2020,7 +2021,7 @@ def __init__(
self.__prefix = bytes(prefix)
self.__random = random

if ir_tree_prefix is None:
if ir_prefix is None:
assert random is not None or max_length <= len(prefix)

self.blocks = Blocks(self)
Expand Down Expand Up @@ -2081,7 +2082,7 @@ def __init__(

self.extra_information = ExtraInformation()

self.ir_prefix = ir_tree_prefix
self.ir_prefix = ir_prefix
self.ir_nodes: tuple[IRNode, ...] = ()
self.misaligned_at: Optional[MisalignedAt] = None
self.start_example(TOP_LABEL)
Expand Down Expand Up @@ -2125,25 +2126,25 @@ def choices(self) -> tuple[IRType, ...]:
def _draw(self, ir_type, kwargs, *, observe, forced, fake_forced):
# this is somewhat redundant with the length > max_length check at the
# end of the function, but avoids trying to use a null self.random when
# drawing past the node of a ConjectureData.for_ir_tree data.
# drawing past the node of a ConjectureData.for_choices data.
if self.length_ir == self.max_length_ir:
debug_report(f"overrun because hit {self.max_length_ir=}")
self.mark_overrun()

if self.ir_prefix is not None and observe:
if self.index_ir < len(self.ir_prefix):
node_value = self._pop_ir_tree_node(ir_type, kwargs, forced=forced)
choice = self._pop_choice(ir_type, kwargs, forced=forced)
else:
try:
(node_value, _buf) = ir_to_buffer(
(choice, _buf) = ir_to_buffer(
ir_type, kwargs, forced=forced, random=self.__random
)
except StopTest:
debug_report("overrun because ir_to_buffer overran")
self.mark_overrun()

if forced is None:
forced = node_value
forced = choice
fake_forced = True

value = getattr(self.provider, f"draw_{ir_type}")(
Expand Down Expand Up @@ -2346,17 +2347,16 @@ def _pooled_kwargs(self, ir_type, kwargs):
POOLED_KWARGS_CACHE[key] = kwargs
return kwargs

def _pop_ir_tree_node(
def _pop_choice(
self, ir_type: IRTypeName, kwargs: IRKWargsType, *, forced: Optional[IRType]
) -> IRType:
from hypothesis.internal.conjecture.engine import BUFFER_SIZE

assert self.ir_prefix is not None
# checked in _draw
assert self.index_ir < len(self.ir_prefix)

node = self.ir_prefix[self.index_ir]
if isinstance(node, NodeTemplate):
value = self.ir_prefix[self.index_ir]
if isinstance(value, NodeTemplate):
node = value
assert node.size >= 0
# node templates have to be at the end for now, since it's not immediately
# apparent how to handle overruning a node template while generating a single
Expand All @@ -2373,50 +2373,58 @@ def _pop_ir_tree_node(
node.size -= ir_size([value])
if node.size < 0:
self.mark_overrun()
return value

value = node.value
return value # type: ignore # mypy misses eliminating the NodeTemplate type
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be inclined to assert isinstance(...) rather than ignoring, since this is unlikely to be a noticable perf hit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

assert isinstance(ChoiceT) is impossible and assert not isinstance(NodeTemplate) feels bad - I went for a slightly more roundabout type checking solution.


node_ir_type = {
str: "string",
float: "float",
int: "integer",
bool: "boolean",
bytes: "bytes",
}[type(value)]
# If we're trying to:
# * draw a different ir type at the same location
# * draw the same ir type with a different kwargs
#
# then we call this a misalignment, because the choice sequence has
# slipped from what we expected at some point. An easy misalignment is
tybug marked this conversation as resolved.
Show resolved Hide resolved
#
# st.one_of(st.integers(0, 100), st.integers(101, 200))
# one_of(integers(0, 100), integers(101, 200))
#
# where the choice sequence [0, 100] has kwargs {min_value: 0, max_value: 100}
# at position 2, but [0, 101] has kwargs {min_value: 101, max_value: 200} at
# position 2.
# at index 1, but [0, 101] has kwargs {min_value: 101, max_value: 200} at
# index 1.
#
# When we see a misalignment, we can't offer up the stored node value as-is.
# We need to make it appropriate for the requested kwargs and ir type.
# Right now we do that by using bytes as the intermediary to convert between
# ir types/kwargs. In the future we'll probably use the index into a custom
# ordering for an (ir_type, kwargs) pair.
if node.ir_type != ir_type or not ir_value_permitted(
node.value, node.ir_type, kwargs
):
# When the choice sequence becomes misaligned, we generate a new value of the
# type and kwargs the strategy expects.
if node_ir_type != ir_type or not ir_value_permitted(value, ir_type, kwargs):
# only track first misalignment for now.
if self.misaligned_at is None:
self.misaligned_at = (self.index_ir, ir_type, kwargs, forced)
try:
(_value, buffer) = ir_to_buffer(
node.ir_type, node.kwargs, forced=node.value
)
value = buffer_to_ir(
ir_type, kwargs, buffer=buffer + bytes(BUFFER_SIZE - len(buffer))
)
except StopTest:
# must have been an overrun.
# Fill in any misalignments with index 0 choices. An alternative to
# this is using the index of the misaligned choice instead
# of index 0, which may be useful for maintaining
# "similarly-complex choices" in the shrinker. This requires
# attaching an index to every choice in ConjectureData.for_choices,
# which we don't always have (e.g. when reading from db).
#
# If we really wanted this in the future we could make this complexity
# optional, use it if present, and default to index 0 otherwise.
# This complicates our internal api and so I'd like to avoid it
# if possible.
#
# maybe we should fall back to to an arbitrary small value here
# instead? eg
# buffer_to_ir(ir_type, kwargs, buffer=bytes(BUFFER_SIZE))
# Additionally, I don't think slips which require
# slipping to high-complexity values are common. Though arguably
# we may want to expand a bit beyond *just* the simplest choice.
# (we could for example consider sampling choices from index 0-10).
value = choice_from_index(0, ir_type, kwargs)
except ChoiceTooLarge:
# should really never happen with a 0-index choice, but let's be safe.
self.mark_overrun()

self.index_ir += 1
return value
return value # type: ignore # mypy misses eliminating the NodeTemplate type

def as_result(self) -> Union[ConjectureResult, _Overrun]:
"""Convert the result of running this test into
Expand Down Expand Up @@ -2730,8 +2738,3 @@ def ir_to_buffer(ir_type, kwargs, *, forced=None, random=None):
)
value = getattr(cd.provider, f"draw_{ir_type}")(**kwargs, forced=forced)
return (value, cd.buffer)


def buffer_to_ir(ir_type, kwargs, *, buffer):
cd = ConjectureData.for_buffer(buffer)
return getattr(cd.provider, f"draw_{ir_type}")(**kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -765,17 +765,17 @@ def append_node(node):
or any(not v.is_exhausted for v in branch.children.values())
)

def rewrite(self, nodes):
def rewrite(self, choices):
"""Use previously seen ConjectureData objects to return a tuple of
the rewritten choice sequence and the status we would get from running
that with the test function. If the status cannot be predicted
from the existing values it will be None."""
data = ConjectureData.for_ir_tree(nodes)
data = ConjectureData.for_choices(choices)
try:
self.simulate_test_function(data)
return (data.ir_nodes, data.status)
return (data.choices, data.status)
except PreviouslyUnseenBehaviour:
return (nodes, None)
return (choices, None)

def simulate_test_function(self, data):
"""Run a simulated version of the test function recorded by
Expand Down
15 changes: 6 additions & 9 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
_Overrun,
ir_kwargs_key,
ir_size,
ir_size_nodes,
ir_value_key,
)
from hypothesis.internal.conjecture.datatree import (
Expand Down Expand Up @@ -415,7 +414,7 @@ def cached_test_function_ir(
except KeyError:
pass

max_length = min(BUFFER_SIZE_IR, ir_size_nodes(nodes) + extend)
max_length = min(BUFFER_SIZE_IR, ir_size(nodes) + extend)

# explicitly use a no-op DataObserver here instead of a TreeRecordingObserver.
# The reason is we don't expect simulate_test_function to explore new choices
Expand Down Expand Up @@ -584,7 +583,7 @@ def test_function(self, data: ConjectureData) -> None:
initial_traceback = getattr(
data.extra_information, "_expected_traceback", None
)
data = ConjectureData.for_ir_tree(data.ir_nodes)
data = ConjectureData.for_choices(data.choices)
self.__stoppable_test_function(data)
data.freeze()
# TODO: Convert to FlakyFailure on the way out. Should same-origin
Expand Down Expand Up @@ -1067,7 +1066,7 @@ def generate_new_examples(self) -> None:
and not self.interesting_examples
and consecutive_zero_extend_is_invalid < 5
):
prefix_size = ir_size_nodes(prefix)
prefix_size = ir_size(prefix)
minimal_example = self.cached_test_function_ir(
prefix
+ (NodeTemplate("simplest", size=BUFFER_SIZE_IR - prefix_size),)
Expand All @@ -1081,9 +1080,7 @@ def generate_new_examples(self) -> None:
# ConjectureResult object.
assert isinstance(minimal_example, ConjectureResult)
consecutive_zero_extend_is_invalid = 0
minimal_extension = (
ir_size_nodes(minimal_example.ir_nodes) - prefix_size
)
minimal_extension = ir_size(minimal_example.ir_nodes) - prefix_size
max_length = min(prefix_size + minimal_extension * 10, BUFFER_SIZE_IR)

# We could end up in a situation where even though the prefix was
Expand Down Expand Up @@ -1328,8 +1325,8 @@ def new_conjecture_data_ir(
if not self.using_hypothesis_backend:
observer = DataObserver()

return ConjectureData.for_ir_tree(
ir_tree_prefix,
return ConjectureData.for_choices(
[n if isinstance(n, NodeTemplate) else n.value for n in ir_tree_prefix],
observer=observer,
provider=provider,
max_length=max_length,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
Status,
_Overrun,
bits_to_bytes,
ir_size_nodes,
ir_size,
ir_value_permitted,
)
from hypothesis.internal.conjecture.engine import BUFFER_SIZE_IR, ConjectureRunner
Expand Down Expand Up @@ -176,7 +176,7 @@ def attempt_replace(k: int) -> bool:
)
attempt = self.engine.cached_test_function_ir(
attempt_nodes,
extend=BUFFER_SIZE_IR - ir_size_nodes(attempt_nodes),
extend=BUFFER_SIZE_IR - ir_size(attempt_nodes),
)

if self.consider_new_data(attempt):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
ConjectureResult,
IRNode,
Status,
ir_size_nodes,
ir_size,
ir_to_buffer,
ir_value_equal,
ir_value_key,
Expand Down Expand Up @@ -586,7 +586,7 @@ def explain(self):

attempt = nodes[:start] + tuple(replacement) + nodes[end:]
result = self.engine.cached_test_function_ir(
attempt, extend=BUFFER_SIZE_IR - ir_size_nodes(attempt)
attempt, extend=BUFFER_SIZE_IR - ir_size(attempt)
)

# Turns out this was a variable-length part, so grab the infix...
Expand Down
Loading
Loading