Skip to content

Commit e115841

Browse files
Observable connect retries with configurable attempt count
- Per-attempt failures are logged at DEBUG level with the attempted leader address and the exception. Previously, retry exhaustion exposed only the final error, hiding whether the cluster was churning leaders, rolling upgrades, or simply unreachable. Operators who enable ``logging.DEBUG`` for ``dqliteclient.cluster`` now see every attempt. - Extract the magic ``max_attempts=3`` into a named module constant ``_DEFAULT_CONNECT_MAX_ATTEMPTS`` and expose a ``max_attempts`` parameter on ``ClusterClient.connect``. The default is unchanged; operators can override without patching the library. Tests: - TestConnectMaxAttempts: default value, override honored, zero rejected. - TestConnectObservability: caplog captures one DEBUG line per attempt. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3ffe64a commit e115841

2 files changed

Lines changed: 111 additions & 10 deletions

File tree

src/dqliteclient/cluster.py

Lines changed: 49 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
"""Cluster management and leader detection for dqlite."""
22

33
import asyncio
4+
import logging
45
import random
56
from collections.abc import Callable, Iterable
67

@@ -15,10 +16,19 @@
1516
from dqliteclient.protocol import DqliteProtocol
1617
from dqliteclient.retry import retry_with_backoff
1718

19+
logger = logging.getLogger(__name__)
20+
1821
# Type alias for a redirect-target policy. Returns True if the address
1922
# should be accepted, False to reject with a ClusterError.
2023
RedirectPolicy = Callable[[str], bool]
2124

25+
# Default attempt count for connect(). Three attempts cover one leader
26+
# change plus one transport hiccup; substantially higher counts risk
27+
# hiding genuine cluster instability under what looks like "a slow
28+
# connect" (ISSUE-109). Operators can override via ClusterClient.connect(
29+
# max_attempts=...).
30+
_DEFAULT_CONNECT_MAX_ATTEMPTS = 3
31+
2232

2333
class ClusterClient:
2434
"""Client with automatic leader detection and failover."""
@@ -150,25 +160,54 @@ async def connect(
150160
database: str = "default",
151161
*,
152162
max_total_rows: int | None = 10_000_000,
163+
max_attempts: int | None = None,
153164
) -> DqliteConnection:
154165
"""Connect to the cluster leader.
155166
156167
Returns a connection to the current leader. ``max_total_rows``
157168
is forwarded to the underlying :class:`DqliteConnection` so
158169
callers (including :class:`ConnectionPool`) can tune the
159170
cumulative row cap from one place.
171+
172+
``max_attempts`` overrides the default
173+
:data:`_DEFAULT_CONNECT_MAX_ATTEMPTS` (ISSUE-109).
174+
175+
Each attempt's failure is logged at DEBUG level with the
176+
attempted leader address and the error, so operators can
177+
enable debug logging to diagnose cluster churn instead of
178+
seeing only the final exception (ISSUE-78).
160179
"""
180+
attempts_cap = (
181+
max_attempts if max_attempts is not None else _DEFAULT_CONNECT_MAX_ATTEMPTS
182+
)
183+
if attempts_cap < 1:
184+
raise ValueError(f"max_attempts must be >= 1, got {attempts_cap}")
185+
186+
attempt_counter = [0]
161187

162188
async def try_connect() -> DqliteConnection:
163-
leader = await self.find_leader()
164-
conn = DqliteConnection(
165-
leader,
166-
database=database,
167-
timeout=self._timeout,
168-
max_total_rows=max_total_rows,
169-
)
170-
await conn.connect()
171-
return conn
189+
attempt_counter[0] += 1
190+
attempt = attempt_counter[0]
191+
leader: str | None = None
192+
try:
193+
leader = await self.find_leader()
194+
conn = DqliteConnection(
195+
leader,
196+
database=database,
197+
timeout=self._timeout,
198+
max_total_rows=max_total_rows,
199+
)
200+
await conn.connect()
201+
return conn
202+
except Exception as exc:
203+
logger.debug(
204+
"ClusterClient.connect attempt %d/%d failed (leader=%r): %s",
205+
attempt,
206+
attempts_cap,
207+
leader,
208+
exc,
209+
)
210+
raise
172211

173212
# Retry only transport-level errors. Leader-change OperationalError
174213
# codes are reclassified into DqliteConnectionError inside
@@ -177,7 +216,7 @@ async def try_connect() -> DqliteConnection:
177216
# into 5 × N_nodes RTTs before propagating.
178217
return await retry_with_backoff(
179218
try_connect,
180-
max_attempts=3,
219+
max_attempts=attempts_cap,
181220
retryable_exceptions=(
182221
DqliteConnectionError,
183222
ClusterError,

tests/test_cluster.py

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -368,3 +368,65 @@ async def test_update_nodes(self) -> None:
368368

369369
stored = await store.get_nodes()
370370
assert len(stored) == 2
371+
372+
373+
class TestConnectMaxAttempts:
374+
"""ISSUE-109: connect() exposes a max_attempts parameter.
375+
376+
The previous hardcoded ``max_attempts=3`` forced operators to patch
377+
the library to tune retry behavior. The default is unchanged; the
378+
knob simply becomes adjustable.
379+
"""
380+
381+
async def test_max_attempts_defaults_to_three(self) -> None:
382+
from dqliteclient.cluster import _DEFAULT_CONNECT_MAX_ATTEMPTS
383+
384+
assert _DEFAULT_CONNECT_MAX_ATTEMPTS == 3
385+
386+
async def test_max_attempts_override_honored(self) -> None:
387+
store = MemoryNodeStore(["localhost:1"]) # unreachable
388+
client = ClusterClient(store, timeout=0.1)
389+
390+
call_count = [0]
391+
392+
async def fake_find_leader() -> str:
393+
call_count[0] += 1
394+
raise DqliteConnectionError("unreachable")
395+
396+
client.find_leader = fake_find_leader # type: ignore[method-assign]
397+
398+
with contextlib.suppress(DqliteConnectionError):
399+
await client.connect(max_attempts=5)
400+
assert call_count[0] == 5, f"Expected 5 attempts with max_attempts=5, got {call_count[0]}"
401+
402+
async def test_max_attempts_zero_rejected(self) -> None:
403+
store = MemoryNodeStore(["localhost:1"])
404+
client = ClusterClient(store, timeout=0.1)
405+
with pytest.raises(ValueError, match=">= 1"):
406+
await client.connect(max_attempts=0)
407+
408+
409+
class TestConnectObservability:
410+
"""ISSUE-78: per-attempt failures are logged at DEBUG for diagnosis."""
411+
412+
async def test_failed_attempts_logged(self, caplog: pytest.LogCaptureFixture) -> None:
413+
import logging
414+
415+
store = MemoryNodeStore(["localhost:1"]) # unreachable
416+
client = ClusterClient(store, timeout=0.1)
417+
418+
async def fake_find_leader() -> str:
419+
raise DqliteConnectionError("simulated")
420+
421+
client.find_leader = fake_find_leader # type: ignore[method-assign]
422+
423+
caplog.set_level(logging.DEBUG, logger="dqliteclient.cluster")
424+
with contextlib.suppress(DqliteConnectionError):
425+
await client.connect(max_attempts=2)
426+
427+
# Every attempt should emit a debug log.
428+
attempt_logs = [r for r in caplog.records if "connect attempt" in r.message]
429+
assert len(attempt_logs) == 2, (
430+
f"Expected 2 per-attempt log lines, got {len(attempt_logs)}: "
431+
f"{[r.message for r in attempt_logs]}"
432+
)

0 commit comments

Comments (0)