Skip to content

Commit e115841

Browse files
Observable connect retries with configurable attempt count
- Per-attempt failures are logged at DEBUG level with the attempted leader address and the exception. Previously, retry exhaustion exposed only the final error, hiding whether the cluster was churning leaders, rolling upgrades, or simply unreachable. Operators who enable ``logging.DEBUG`` for ``dqliteclient.cluster`` now see every attempt. - Extract the magic ``max_attempts=3`` into a named module constant ``_DEFAULT_CONNECT_MAX_ATTEMPTS`` and expose a ``max_attempts`` parameter on ``ClusterClient.connect``. The default is unchanged; operators can override without patching the library. Tests: - TestConnectMaxAttempts: default value, override honored, zero rejected. - TestConnectObservability: caplog captures one DEBUG line per attempt. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3ffe64a commit e115841

2 files changed

Lines changed: 111 additions & 10 deletions

File tree

src/dqliteclient/cluster.py

Lines changed: 49 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
"""Cluster management and leader detection for dqlite."""
22

33
import asyncio
4+
import logging
45
import random
56
from collections.abc import Callable, Iterable
67

@@ -15,10 +16,19 @@
1516
from dqliteclient.protocol import DqliteProtocol
1617
from dqliteclient.retry import retry_with_backoff
1718

19+
logger = logging.getLogger(__name__)
20+
1821
# Type alias for a redirect-target policy. Returns True if the address
1922
# should be accepted, False to reject with a ClusterError.
2023
RedirectPolicy = Callable[[str], bool]
2124

25+
# Default attempt count for connect(). Three attempts cover one leader
26+
# change plus one transport hiccup; substantially higher counts risk
27+
# hiding genuine cluster instability under what looks like "a slow
28+
# connect" (ISSUE-109). Operators can override via ClusterClient.connect(
29+
# max_attempts=...).
30+
_DEFAULT_CONNECT_MAX_ATTEMPTS = 3
31+
2232

2333
class ClusterClient:
2434
"""Client with automatic leader detection and failover."""
@@ -150,25 +160,54 @@ async def connect(
150160
database: str = "default",
151161
*,
152162
max_total_rows: int | None = 10_000_000,
163+
max_attempts: int | None = None,
153164
) -> DqliteConnection:
154165
"""Connect to the cluster leader.
155166
156167
Returns a connection to the current leader. ``max_total_rows``
157168
is forwarded to the underlying :class:`DqliteConnection` so
158169
callers (including :class:`ConnectionPool`) can tune the
159170
cumulative row cap from one place.
171+
172+
``max_attempts`` overrides the default
173+
:data:`_DEFAULT_CONNECT_MAX_ATTEMPTS` (ISSUE-109).
174+
175+
Each attempt's failure is logged at DEBUG level with the
176+
attempted leader address and the error, so operators can
177+
enable debug logging to diagnose cluster churn instead of
178+
seeing only the final exception (ISSUE-78).
160179
"""
180+
attempts_cap = (
181+
max_attempts if max_attempts is not None else _DEFAULT_CONNECT_MAX_ATTEMPTS
182+
)
183+
if attempts_cap < 1:
184+
raise ValueError(f"max_attempts must be >= 1, got {attempts_cap}")
185+
186+
attempt_counter = [0]
161187

162188
async def try_connect() -> DqliteConnection:
163-
leader = await self.find_leader()
164-
conn = DqliteConnection(
165-
leader,
166-
database=database,
167-
timeout=self._timeout,
168-
max_total_rows=max_total_rows,
169-
)
170-
await conn.connect()
171-
return conn
189+
attempt_counter[0] += 1
190+
attempt = attempt_counter[0]
191+
leader: str | None = None
192+
try:
193+
leader = await self.find_leader()
194+
conn = DqliteConnection(
195+
leader,
196+
database=database,
197+
timeout=self._timeout,
198+
max_total_rows=max_total_rows,
199+
)
200+
await conn.connect()
201+
return conn
202+
except Exception as exc:
203+
logger.debug(
204+
"ClusterClient.connect attempt %d/%d failed (leader=%r): %s",
205+
attempt,
206+
attempts_cap,
207+
leader,
208+
exc,
209+
)
210+
raise
172211

173212
# Retry only transport-level errors. Leader-change OperationalError
174213
# codes are reclassified into DqliteConnectionError inside
@@ -177,7 +216,7 @@ async def try_connect() -> DqliteConnection:
177216
# into 5 × N_nodes RTTs before propagating.
178217
return await retry_with_backoff(
179218
try_connect,
180-
max_attempts=3,
219+
max_attempts=attempts_cap,
181220
retryable_exceptions=(
182221
DqliteConnectionError,
183222
ClusterError,

tests/test_cluster.py

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -368,3 +368,65 @@ async def test_update_nodes(self) -> None:
368368

369369
stored = await store.get_nodes()
370370
assert len(stored) == 2
371+
372+
373+
class TestConnectMaxAttempts:
374+
"""ISSUE-109: connect() exposes a max_attempts parameter.
375+
376+
The previous hardcoded ``max_attempts=3`` forced operators to patch
377+
the library to tune retry behavior. The default is unchanged; the
378+
knob simply becomes adjustable.
379+
"""
380+
381+
async def test_max_attempts_defaults_to_three(self) -> None:
382+
from dqliteclient.cluster import _DEFAULT_CONNECT_MAX_ATTEMPTS
383+
384+
assert _DEFAULT_CONNECT_MAX_ATTEMPTS == 3
385+
386+
async def test_max_attempts_override_honored(self) -> None:
387+
store = MemoryNodeStore(["localhost:1"]) # unreachable
388+
client = ClusterClient(store, timeout=0.1)
389+
390+
call_count = [0]
391+
392+
async def fake_find_leader() -> str:
393+
call_count[0] += 1
394+
raise DqliteConnectionError("unreachable")
395+
396+
client.find_leader = fake_find_leader # type: ignore[method-assign]
397+
398+
with contextlib.suppress(DqliteConnectionError):
399+
await client.connect(max_attempts=5)
400+
assert call_count[0] == 5, f"Expected 5 attempts with max_attempts=5, got {call_count[0]}"
401+
402+
async def test_max_attempts_zero_rejected(self) -> None:
403+
store = MemoryNodeStore(["localhost:1"])
404+
client = ClusterClient(store, timeout=0.1)
405+
with pytest.raises(ValueError, match=">= 1"):
406+
await client.connect(max_attempts=0)
407+
408+
409+
class TestConnectObservability:
410+
"""ISSUE-78: per-attempt failures are logged at DEBUG for diagnosis."""
411+
412+
async def test_failed_attempts_logged(self, caplog: pytest.LogCaptureFixture) -> None:
413+
import logging
414+
415+
store = MemoryNodeStore(["localhost:1"]) # unreachable
416+
client = ClusterClient(store, timeout=0.1)
417+
418+
async def fake_find_leader() -> str:
419+
raise DqliteConnectionError("simulated")
420+
421+
client.find_leader = fake_find_leader # type: ignore[method-assign]
422+
423+
caplog.set_level(logging.DEBUG, logger="dqliteclient.cluster")
424+
with contextlib.suppress(DqliteConnectionError):
425+
await client.connect(max_attempts=2)
426+
427+
# Every attempt should emit a debug log.
428+
attempt_logs = [r for r in caplog.records if "connect attempt" in r.message]
429+
assert len(attempt_logs) == 2, (
430+
f"Expected 2 per-attempt log lines, got {len(attempt_logs)}: "
431+
f"{[r.message for r in attempt_logs]}"
432+
)

0 commit comments

Comments (0)