A Coding Guide for Property-Based Testing Using Hypothesis with Stateful, Differential, and Metamorphic Test Design

Thank you for reading this post. Don't forget to subscribe!

In this tutorial, we explore property-based testing using Hypothesis and build a rigorous testing pipeline that goes far beyond traditional unit testing. We implement invariants, differential testing, metamorphic testing, targeted exploration, and stateful testing to validate both functional correctness and behavioral guarantees of our systems. Instead of manually crafting edge cases, we let Hypothesis generate structured inputs, shrink failures to minimal counterexamples, and systematically uncover hidden bugs. Also, we demonstrate how modern testing practices can be integrated directly into experimental and research-driven workflows.

import sys, textwrap, subprocess, os, re, math
!{sys.executable} -m pip -q install hypothesis pytest

test_code = r”’
import re, math
import pytest
from hypothesis import (
given, assume, example, settings, note, target,
HealthCheck, Phase
)
from hypothesis import strategies as st
from hypothesis.stateful import RuleBasedStateMachine, rule, invariant, initialize, precondition

def clamp(x: int, lo: int, hi: int) -> int:
    """Return ``x`` limited to the closed interval ``[lo, hi]``.

    Values below ``lo`` map to ``lo``, values above ``hi`` map to ``hi``,
    and anything else is returned unchanged. The lower bound is tested
    first, so it takes precedence if both comparisons hold.
    """
    if x < lo:
        return lo
    if x > hi:
        return hi
    return x

def normalize_whitespace(s: str) -> str:
    """Collapse each whitespace run in ``s`` to one space and trim both ends."""
    # split() with no separator splits on whitespace runs and yields no
    # empty pieces, so re-joining with a single space normalizes the text.
    return " ".join(s.split())

def is_sorted_non_decreasing(xs):
    """Return True when every element of ``xs`` is <= its successor.

    Empty and single-element sequences are considered sorted.
    """
    # Pair each element with its right-hand neighbour instead of indexing.
    return all(left <= right for left, right in zip(xs, xs[1:]))

def merge_sorted(a, b):
    """Merge two already-sorted sequences into one new sorted list.

    Walks both inputs with one cursor each, always taking the smaller head.
    On ties the element from ``a`` is taken first. Neither input is mutated.
    """
    i = j = 0
    out = []
    while i < len(a) and j < len(b):
        if a[i] <= b[j]:
            out.append(a[i])
            i += 1
        else:
            out.append(b[j])
            j += 1
    # At most one of these tails is non-empty once the loop ends.
    out.extend(a[i:])
    out.extend(b[j:])
    return out

def merge_sorted_reference(a, b):
    """Reference merge: concatenate both iterables and sort the result.

    Serves as the trusted oracle that ``merge_sorted`` is compared against.
    """
    return sorted([*a, *b])

We set up the environment by installing Hypothesis and pytest and importing all required modules. We begin constructing the full test suite by defining core utility functions such as clamp, normalize_whitespace, and merge_sorted. We establish the functional foundation that our property-based tests will rigorously validate in later snippets.

def safe_parse_int(s: str):
    """Parse ``s`` as a signed decimal integer without raising.

    Surrounding whitespace is ignored. Returns a ``(ok, payload)`` tuple:
    ``(True, value)`` on success, otherwise ``(False, reason)`` where reason
    is ``"not_an_int"``, ``"too_big"`` (more than 2000 digits) or
    ``"parse_error"``.
    """
    t = s.strip()
    # One optional sign followed by at least one digit, nothing else.
    if re.fullmatch(r"[+-]?\d+", t) is None:
        return (False, "not_an_int")
    # Cap the digit count before handing the text to int().
    if len(t.lstrip("+-")) > 2000:
        return (False, "too_big")
    try:
        return (True, int(t))
    except ValueError:
        # int() signals an unparsable string with ValueError.
        return (False, "parse_error")

def safe_parse_int_alt(s: str):
    """Hand-rolled counterpart of ``safe_parse_int`` for differential testing.

    Deliberately avoids both the regex and ``int()`` so that the two parsers
    share no parsing code. Returns ``(True, value)`` on success, otherwise
    ``(False, "not_an_int")`` or ``(False, "too_big")`` (more than 2000 digits).
    """
    t = s.strip()
    if not t:
        return (False, "not_an_int")

    # Peel off a single optional leading sign.
    sign = 1
    if t[0] == "+":
        t = t[1:]
    elif t[0] == "-":
        sign = -1
        t = t[1:]

    # What remains must be a non-empty run of ASCII digits.
    if not t or any(not ("0" <= ch <= "9") for ch in t):
        return (False, "not_an_int")
    if len(t) > 2000:
        return (False, "too_big")

    # Accumulate the value digit by digit; 48 is ord("0").
    val = 0
    for ch in t:
        val = val * 10 + (ord(ch) - 48)
    return (True, sign * val)

# Strategy for (lo, hi) pairs with lo <= hi: draw two integers in
# [-10_000, 10_000] and put the smaller one first.
bounds = st.tuples(
    st.integers(-10_000, 10_000),
    st.integers(-10_000, 10_000),
).map(lambda pair: (min(pair), max(pair)))

@st.composite
def int_like_strings(draw):
    """Generate strings that look like integers: padding, sign, digits, padding.

    The sign is one of "", "+" or "-"; the digit run has 1 to 300 ASCII
    digits; each side carries up to 5 characters of space/tab/newline padding.
    """
    padding = st.text(alphabet=[" ", "\t", "\n"], min_size=0, max_size=5)
    sign = draw(st.sampled_from(["", "+", "-"]))
    # Code points 48..57 are the ASCII digits "0".."9".
    digits = draw(
        st.text(
            alphabet=st.characters(min_codepoint=48, max_codepoint=57),
            min_size=1,
            max_size=300,
        )
    )
    left_ws = draw(padding)
    right_ws = draw(padding)
    return f"{left_ws}{sign}{digits}{right_ws}"

# Strategy for already-sorted integer lists (0 to 200 elements), used as
# input for the merge tests.
sorted_lists = st.lists(
    st.integers(-10_000, 10_000),
    min_size=0,
    max_size=200,
).map(sorted)

We implement parsing logic and define structured strategies that generate constrained, meaningful test inputs. We create composite strategies such as int_like_strings to precisely control the input space for property validation. We prepare sorted list generators and bounds strategies that enable differential and invariant-based testing.

@settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
@given(x=st.integers(-50_000, 50_000), b=bounds)
def test_clamp_within_bounds(x, b):
    """Invariant: the clamped value always lies inside ``[lo, hi]``."""
    lo, hi = b
    y = clamp(x, lo, hi)
    assert lo <= y <= hi

@settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
@given(x=st.integers(-50_000, 50_000), b=bounds)
def test_clamp_idempotent(x, b):
    """Idempotence: clamping an already-clamped value changes nothing."""
    lo, hi = b
    y = clamp(x, lo, hi)
    assert clamp(y, lo, hi) == y

@settings(max_examples=250)
@given(s=st.text())
@example(" a\t\tb \n c ")
def test_normalize_whitespace_is_idempotent(s):
    """Normalizing twice equals normalizing once, and outer padding is ignored."""
    t = normalize_whitespace(s)
    # Idempotence: a normalized string is a fixed point.
    assert normalize_whitespace(t) == t
    # Metamorphic check: extra leading/trailing whitespace must not matter.
    assert normalize_whitespace(" \n\t " + s + " \t") == normalize_whitespace(s)

@settings(max_examples=250, suppress_health_check=[HealthCheck.too_slow])
@given(a=sorted_lists, b=sorted_lists)
def test_merge_sorted_matches_reference(a, b):
    """Differential test: ``merge_sorted`` must agree with the sort-based oracle."""
    out = merge_sorted(a, b)
    ref = merge_sorted_reference(a, b)
    assert out == ref
    assert is_sorted_non_decreasing(out)

We define core property tests that validate correctness and idempotence across multiple functions. We use Hypothesis decorators to automatically explore edge cases and verify behavioral guarantees such as boundary constraints and deterministic normalization. We also implement differential testing to ensure our merge implementation matches a trusted reference.

@settings(max_examples=250, deadline=200, suppress_health_check=[HealthCheck.too_slow])
@given(s=int_like_strings())
def test_two_parsers_agree_on_int_like_strings(s):
    """Differential test: both parsers accept ``s`` and return the same value."""
    ok1, v1 = safe_parse_int(s)
    ok2, v2 = safe_parse_int_alt(s)
    # Generated inputs have at most 300 digits, so neither parser may reject.
    assert ok1 and ok2
    assert v1 == v2

@settings(max_examples=250)
@given(s=st.text(min_size=0, max_size=200))
def test_safe_parse_int_rejects_non_ints(s):
    """Check ``safe_parse_int`` on arbitrary text against the same grammar.

    The expected outcome is recomputed here from the regex and the
    2000-digit limit, then compared with what the parser returned.
    """
    t = s.strip()
    m = re.fullmatch(r"[+-]?\d+", t)
    ok, val = safe_parse_int(s)
    if m is None:
        assert ok is False
    elif len(t.lstrip("+-")) > 2000:
        # NOTE: cannot trigger while the strategy caps text at 200 characters;
        # kept so the check stays correct if max_size is raised.
        assert ok is False and val == "too_big"
    else:
        assert ok is True and isinstance(val, int)

def variance(xs):
    """Return the sample variance of ``xs`` (divisor ``n - 1``).

    Inputs with fewer than two elements yield ``0.0``.
    """
    if len(xs) < 2:
        return 0.0
    mu = sum(xs) / len(xs)
    return sum((x - mu) ** 2 for x in xs) / (len(xs) - 1)

# Phase.target must be listed explicitly: without it the target() call below
# is recorded but never used to steer generation.
@settings(max_examples=250, phases=[Phase.generate, Phase.target, Phase.shrink])
@given(xs=st.lists(st.integers(-1000, 1000), min_size=0, max_size=80))
def test_statistics_sanity(xs):
    """Sanity and metamorphic checks for ``variance``.

    Variance is 0.0 for fewer than two elements, never negative, and
    unchanged when every element is shifted by the same constant.
    """
    v = variance(xs)
    # Nudge Hypothesis towards high-variance inputs.
    target(v)
    if len(xs) < 2:
        assert v == 0.0
        return
    assert v >= 0.0
    k = 7
    # Shift invariance: adding a constant must not change the variance.
    assert math.isclose(variance([x + k for x in xs]), v, rel_tol=1e-12, abs_tol=1e-12)

We extend our validation to parsing robustness and statistical correctness using targeted exploration. We verify that two independent integer parsers agree on structured inputs and enforce rejection rules on invalid strings. We further implement metamorphic testing by validating invariants of variance under transformation.

class Bank:
    """Minimal bank account that records every successful operation.

    ``balance`` holds the running total; ``ledger`` is a list of
    ``(kind, amount)`` tuples where kind is ``"dep"`` or ``"wd"``.
    """

    def __init__(self):
        self.balance = 0
        self.ledger = []

    def deposit(self, amt: int):
        """Add ``amt`` to the balance.

        Raises:
            ValueError: if ``amt`` is zero or negative.
        """
        if amt <= 0:
            raise ValueError("deposit must be positive")
        self.balance += amt
        self.ledger.append(("dep", amt))

    def withdraw(self, amt: int):
        """Remove ``amt`` from the balance.

        Raises:
            ValueError: if ``amt`` is zero or negative, or exceeds the balance.
        """
        if amt <= 0:
            raise ValueError("withdraw must be positive")
        if amt > self.balance:
            raise ValueError("insufficient funds")
        self.balance -= amt
        self.ledger.append(("wd", amt))

    def replay_balance(self):
        """Recompute the balance from the ledger alone and return it."""
        bal = 0
        for typ, amt in self.ledger:
            bal += amt if typ == "dep" else -amt
        return bal

class BankMachine(RuleBasedStateMachine):
    """Stateful test that drives a ``Bank`` with deposit and withdraw steps.

    After every step the invariants check that the balance is non-negative
    and that replaying the ledger reproduces the balance.
    """

    def __init__(self):
        super().__init__()
        self.bank = Bank()

    @initialize()
    def init(self):
        # A fresh account starts empty, both directly and via the ledger.
        assert self.bank.balance == 0
        assert self.bank.replay_balance() == 0

    @rule(amt=st.integers(min_value=1, max_value=10_000))
    def deposit(self, amt):
        self.bank.deposit(amt)

    # Only offer withdraw when there is money in the account.
    @precondition(lambda self: self.bank.balance > 0)
    @rule(amt=st.integers(min_value=1, max_value=10_000))
    def withdraw(self, amt):
        # Discard draws that would overdraw, so Bank.withdraw never raises here.
        assume(amt <= self.bank.balance)
        self.bank.withdraw(amt)

    @invariant()
    def balance_never_negative(self):
        assert self.bank.balance >= 0

    @invariant()
    def ledger_replay_matches_balance(self):
        assert self.bank.replay_balance() == self.bank.balance


# Expose the state machine under a Test* name so pytest collects it.
TestBankMachine = BankMachine.TestCase
”’

# Write the generated test module to disk so pytest can collect it.
path = "/tmp/test_hypothesis_advanced.py"
with open(path, "w", encoding="utf-8") as f:
    f.write(test_code)

print("Hypothesis version:", __import__("hypothesis").__version__)
print("\nRunning pytest on:", path, "\n")

# Run pytest with the current interpreter and capture its output.
res = subprocess.run([sys.executable, "-m", "pytest", "-q", path], capture_output=True, text=True)
print(res.stdout)
if res.returncode != 0:
    # stderr is only shown when the run did not succeed.
    print(res.stderr)

# Summarize the outcome from the process exit code.
if res.returncode == 0:
    print("\nAll Hypothesis tests passed.")
elif res.returncode == 5:
    print("\nPytest collected no tests.")
else:
    print("\nSome tests failed.")

We implement a stateful system using Hypothesis’s rule-based state machine to simulate a bank account. We define rules, preconditions, and invariants to guarantee balance consistency and ledger integrity under arbitrary operation sequences. We then execute the entire test suite via pytest, allowing Hypothesis to automatically discover counterexamples and verify system correctness.

In conclusion, we built a comprehensive property-based testing framework that validates pure functions, parsing logic, statistical behavior, and even stateful systems with invariants. We leveraged Hypothesis’s shrinking, targeted search, and state machine testing capabilities to move from example-based testing to behavior-driven verification. It allows us to reason about correctness at a higher level of abstraction while maintaining strong guarantees for edge cases and system consistency.

Check out the Full Coding Notebook here. Also, feel free to follow us on Twitter, and don’t forget to join our 130k+ ML SubReddit and subscribe to our Newsletter. Wait! Are you on Telegram? Now you can join us on Telegram as well.

Want to partner with us to promote your GitHub repo, Hugging Face page, product release, webinar, etc.? Connect with us.

Source link