Skip to content

Commit c028b0a

Browse files
fix: find_leader prefers voter nodes
Standby and spare nodes never return a non-empty leader from the LEADER request (gateway.c CHECK_LEADER path returns (0, "") for non-voters). Probing them first wasted RTTs on every find_leader call. Sort nodes so voters come first; preserve the prior shuffle-within-voters for load balancing. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7817c24 commit c028b0a

2 files changed

Lines changed: 44 additions & 3 deletions

File tree

src/dqliteclient/cluster.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,13 @@ async def find_leader(self) -> str:
5151
if not nodes:
5252
raise ClusterError("No nodes configured")
5353

54-
# Shuffle so repeated callers don't all stampede the first-listed
55-
# node. A stable first probe concentrates leader-discovery load on
56-
# one seed and biases toward its (possibly stale) leader view.
54+
# Shuffle first so repeated callers don't stampede the same node;
55+
# then stable-sort by role so voters come before non-voters.
56+
# Standby/spare nodes can never become leader (their LEADER
57+
# response is always (0, "")), so probing them first wastes RTTs.
5758
nodes = list(nodes)
5859
random.shuffle(nodes)
60+
nodes.sort(key=lambda n: 0 if n.role == 0 else 1)
5961

6062
errors: list[str] = []
6163
last_exc: BaseException | None = None

tests/test_cluster.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,45 @@ async def track(address: str) -> str | None:
290290
counts = Counter(firsts)
291291
assert len(counts) >= 2, f"find_leader always probed the same node first: {counts}"
292292

293+
async def test_find_leader_probes_voters_before_non_voters(self) -> None:
294+
"""Non-voter nodes (standby/spare) cannot become leader; probing them
295+
first wastes an RTT. find_leader must prefer voters.
296+
"""
297+
store = MemoryNodeStore()
298+
# Seed with the non-voters (spare, standby) first, then the voters.
299+
await store.set_nodes(
300+
[
301+
NodeInfo(node_id=2, address="spare:9002", role=2), # spare
302+
NodeInfo(node_id=1, address="standby:9003", role=1), # standby
303+
NodeInfo(node_id=3, address="voter1:9001", role=0),
304+
NodeInfo(node_id=4, address="voter2:9004", role=0),
305+
]
306+
)
307+
client = ClusterClient(store, timeout=0.2)
308+
309+
order: list[str] = []
310+
311+
async def track(address: str) -> str | None:
312+
order.append(address)
313+
return None # no leader known — keep probing
314+
315+
from contextlib import suppress
316+
317+
with (
318+
patch.object(client, "_query_leader", side_effect=track),
319+
suppress(ClusterError),
320+
):
321+
await client.find_leader()
322+
323+
# Both voters must be probed before any non-voter (standby or spare).
324+
voter_positions = [i for i, a in enumerate(order) if a.startswith("voter")]
325+
non_voter_positions = [i for i, a in enumerate(order) if not a.startswith("voter")]
326+
assert voter_positions, "no voters were probed"
327+
assert non_voter_positions, "test setup broken — no non-voters"
328+
assert max(voter_positions) < min(non_voter_positions), (
329+
f"voters should be probed first; order={order}"
330+
)
331+
293332
async def test_update_nodes(self) -> None:
294333
store = MemoryNodeStore()
295334
client = ClusterClient(store)

0 commit comments

Comments
 (0)