Skip to content

Commit 1103fd6

Browse files
Use SystemRandom for cluster shuffle so random.seed() cannot defeat it
The find_leader sweep shuffles its candidate list to avoid every process picking up the same first node — a stampede-avoidance mechanism. Using random.shuffle from the module-level PRNG meant a downstream random.seed() call (test fixtures, libraries that want deterministic behavior without scoping it) defeated the avoidance: every seeded process picked the same shuffle and piled onto the same node. Switch to a module-level random.SystemRandom() instance. SystemRandom draws from OS entropy and ignores random.seed(). Update the existing shuffle-call pin to patch the new instance attribute. Pin the seed-immunity property in a new test file. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent fdc9409 commit 1103fd6

3 files changed

Lines changed: 86 additions & 10 deletions

File tree

src/dqliteclient/cluster.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,14 @@
4747
# held in memory and serialised into every traceback.
4848
_MAX_ERROR_MESSAGE_SNIPPET = 200
4949

50+
# Use OS-entropy randomness for the per-sweep node shuffle so that the
51+
# stampede-avoidance is not defeated by a downstream call to
52+
# ``random.seed(...)``. Test suites and some libraries seed the global
53+
# PRNG for determinism; if we used ``random.shuffle`` directly, every
54+
# process picking up that seed would produce the same shuffle and pile
55+
# onto the same node. ``SystemRandom`` ignores ``random.seed()``.
56+
_cluster_random = random.SystemRandom()
57+
5058
# Budget for the bounded writer-drain in ``_query_leader``. A
5159
# responsive peer drains FIN/ACK in microseconds; a slow peer must not
5260
# hold up leader discovery. 100 ms is generous for LAN and still
@@ -258,7 +266,7 @@ async def _find_leader_impl(self, *, trust_server_heartbeat: bool) -> str:
258266
# "fix" this toward Go's deterministic behavior without adding
259267
# an explicit stampede-avoidance mechanism elsewhere.
260268
nodes = list(nodes)
261-
random.shuffle(nodes)
269+
_cluster_random.shuffle(nodes)
262270
nodes.sort(key=lambda n: 0 if n.role == NodeRole.VOTER else 1)
263271

264272
errors: list[str] = []

tests/test_cluster.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -460,17 +460,19 @@ async def track(address: str, **_kwargs: object) -> str | None:
460460

461461
async def test_find_leader_calls_random_shuffle_on_candidate_list(self) -> None:
462462
"""Deterministic pin complementing the probabilistic test above:
463-
``find_leader`` invokes ``random.shuffle`` on its candidate list.
463+
``find_leader`` invokes the cluster shuffle on its candidate
464+
list.
464465
465466
The probabilistic test catches "shuffle disabled entirely"
466467
(``len(counts) == 1``) but not "shuffle replaced by some other
467468
deterministic permutation that still produces ≥ 2 distinct
468469
firsts" — a regression vector that would silently slip past.
469-
Patching ``random.shuffle`` directly lets us assert the call
470-
happens, and ``wraps=`` keeps the real shuffle behavior so
471-
``find_leader`` still terminates cleanly.
470+
Patching the bound ``shuffle`` method on the module-level
471+
``_cluster_random`` directly lets us assert the call happens.
472+
Passing ``wraps=`` keeps the real shuffle behavior so
473+
``find_leader`` still terminates cleanly.
472474
"""
473-
import random as _random
475+
from dqliteclient import cluster as cluster_module
474476

475477
store = MemoryNodeStore(["n1:9001", "n2:9001", "n3:9001", "n4:9001"])
476478
client = ClusterClient(store, timeout=0.2)
@@ -479,17 +481,18 @@ async def fail_query(address: str, **_kwargs: object) -> str | None:
479481
raise DqliteConnectionError("not leader")
480482

481483
with (
482-
patch(
483-
"dqliteclient.cluster.random.shuffle",
484-
wraps=_random.shuffle,
484+
patch.object(
485+
cluster_module._cluster_random,
486+
"shuffle",
487+
wraps=cluster_module._cluster_random.shuffle,
485488
) as mock_shuffle,
486489
patch.object(client, "_query_leader", side_effect=fail_query),
487490
contextlib.suppress(ClusterError),
488491
):
489492
await client.find_leader()
490493

491494
assert mock_shuffle.call_count >= 1, (
492-
"find_leader must invoke random.shuffle on its candidate list — "
495+
"find_leader must invoke the cluster shuffle on its candidate list — "
493496
"without the shuffle, parallel callers stampede the first-listed node"
494497
)
495498
# The first positional arg is the list being shuffled in place.
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""Pin: cluster shuffle is immune to ``random.seed()``.
2+
3+
The cluster module uses ``random.SystemRandom`` (exposed as
4+
``_cluster_random``) so that a downstream call to ``random.seed(...)``
5+
— common in test suites or in libraries that "want deterministic
6+
behavior" without scoping it — cannot defeat the per-sweep
7+
stampede-avoidance shuffle.
8+
9+
If we accidentally regress to module-level ``random.shuffle``, the
10+
seeded test below would produce identical shuffles every run and
11+
this test would fail.
12+
"""
13+
14+
from __future__ import annotations
15+
16+
import random
17+
18+
from dqliteclient.cluster import _cluster_random
19+
20+
21+
def test_cluster_random_is_system_random() -> None:
22+
"""The cluster shuffle source is a SystemRandom instance, not the
23+
module-level Random."""
24+
assert isinstance(_cluster_random, random.SystemRandom)
25+
26+
27+
def test_cluster_shuffle_unaffected_by_seed() -> None:
28+
"""Two shuffles taken with identical seeds applied to the global
29+
PRNG must still differ — proving the cluster shuffle does not
30+
consume the global PRNG state."""
31+
nodes_first: list[int] = list(range(100))
32+
nodes_second: list[int] = list(range(100))
33+
34+
random.seed(42)
35+
_cluster_random.shuffle(nodes_first)
36+
37+
random.seed(42)
38+
_cluster_random.shuffle(nodes_second)
39+
40+
# With SystemRandom, two seeded global states do not produce the
41+
# same shuffle. (The probability of a 100-element collision is
42+
# 1/100! — astronomically low.)
43+
assert nodes_first != nodes_second, (
44+
"cluster shuffle must use OS entropy, not the seeded module PRNG"
45+
)
46+
47+
48+
def test_cluster_shuffle_does_not_consume_module_random_state() -> None:
49+
"""A cluster shuffle in between two module-level ``random.random()``
50+
draws must not affect the second draw — proving the cluster
51+
shuffle is isolated from the module PRNG.
52+
"""
53+
random.seed(123)
54+
_ = random.random()
55+
expected_next = random.random()
56+
57+
# Reset, take one draw, then a cluster shuffle, then the next draw.
58+
random.seed(123)
59+
_ = random.random()
60+
_cluster_random.shuffle(list(range(100)))
61+
actual_next = random.random()
62+
63+
assert actual_next == expected_next, (
64+
"cluster shuffle must not consume bytes from the global module-level Random instance"
65+
)

0 commit comments

Comments
 (0)