Skip to content

Commit 7817c24

Browse files
fix: randomize node order in find_leader
find_leader iterated node_store.get_nodes() in a fixed order, so every caller probed the first-listed seed first. That concentrates leader-discovery load on one node and biases the result toward whatever (possibly stale) leader view that node returns. Make a local copy of the node list and shuffle it before iterating. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 20610a6 commit 7817c24

2 files changed

Lines changed: 34 additions & 1 deletion

File tree

src/dqliteclient/cluster.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Cluster management and leader detection for dqlite."""
22

33
import asyncio
4+
import random
45

56
from dqliteclient.connection import DqliteConnection, _parse_address
67
from dqliteclient.exceptions import (
@@ -50,6 +51,12 @@ async def find_leader(self) -> str:
5051
if not nodes:
5152
raise ClusterError("No nodes configured")
5253

54+
# Shuffle so repeated callers don't all stampede the first-listed
55+
# node. A stable first probe concentrates leader-discovery load on
56+
# one seed and biases toward its (possibly stale) leader view.
57+
nodes = list(nodes)
58+
random.shuffle(nodes)
59+
5360
errors: list[str] = []
5461
last_exc: BaseException | None = None
5562

tests/test_cluster.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
"""Tests for cluster management."""
22

3+
import contextlib
34
from unittest.mock import AsyncMock, MagicMock, patch
45

56
import pytest
67

78
from dqliteclient.cluster import ClusterClient
8-
from dqliteclient.exceptions import ClusterError
9+
from dqliteclient.exceptions import ClusterError, DqliteConnectionError
910
from dqliteclient.node_store import MemoryNodeStore, NodeInfo
1011

1112

@@ -264,6 +265,31 @@ async def failing_query(_address: str) -> str | None:
264265
await client.find_leader()
265266
assert exc_info.value.__cause__ is boom
266267

268+
async def test_find_leader_randomizes_node_order(self) -> None:
269+
"""find_leader must not always probe the first-listed node first —
270+
otherwise stampedes and stale-cache biases concentrate on it.
271+
"""
272+
from collections import Counter
273+
274+
store = MemoryNodeStore(["n1:9001", "n2:9001", "n3:9001", "n4:9001"])
275+
client = ClusterClient(store, timeout=0.2)
276+
277+
first_probed: list[str] = []
278+
279+
async def track(address: str) -> str | None:
280+
first_probed.append(address)
281+
raise DqliteConnectionError("not leader")
282+
283+
with patch.object(client, "_query_leader", side_effect=track):
284+
for _ in range(50):
285+
with contextlib.suppress(ClusterError):
286+
await client.find_leader()
287+
288+
# Every probe raises, so each call is assumed to try all 4 nodes; every 4th entry is then the first probe of a call.
289+
firsts = [first_probed[i] for i in range(0, len(first_probed), 4)]
290+
counts = Counter(firsts)
291+
assert len(counts) >= 2, f"find_leader always probed the same node first: {counts}"
292+
267293
async def test_update_nodes(self) -> None:
268294
store = MemoryNodeStore()
269295
client = ClusterClient(store)

0 commit comments

Comments
 (0)