
Commit 8ea4ed7

Add aggregate WARNING at cluster failure decision points
Per-node leader-discovery probes and per-attempt connect retries log at DEBUG so a routine leader flip's per-attempt churn does not spam operator logs at default verbosity. But the same code paths also fire on the all-nodes-failed / attempts-exhausted case, and there was no log line at WARNING/ERROR aggregating that outcome. Operators tailing logs at INFO saw nothing during a cluster-wide failure cascade, only the application-level traceback after the caller caught the exception. If the caller swallowed the exception to retry with backoff, the failure was invisible.

Add two aggregate WARNINGs at the decision points:

- _find_leader_impl: just before the ClusterError raise, after every node has been probed.
- ClusterClient.connect: catch the retry_with_backoff exhaustion and re-raise with a one-line summary of the attempts and last error.

Per-attempt DEBUG lines remain unchanged. Pin both WARNINGs against regression, including a guard that exactly one WARNING fires per aggregate failure (no per-attempt escalation that would reproduce the spam this design was meant to avoid).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent d241d87 commit 8ea4ed7
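Reduced to a minimal sketch, the two-tier pattern the commit message describes looks like this; nodes, probe_node, and the RuntimeError are hypothetical stand-ins, not this library's API:

import logging

logger = logging.getLogger("dqliteclient.cluster")

async def find_leader_sketch(nodes, probe_node):
    """Two-tier logging sketch: per-attempt DEBUG inside the loop,
    exactly one aggregate WARNING at the all-nodes-failed decision
    point."""
    errors = []
    for node in nodes:
        try:
            # probe_node is a hypothetical coroutine returning the
            # leader address advertised by this node.
            return await probe_node(node)
        except OSError as exc:
            # Routine churn (e.g. a leader flip) stays at DEBUG so it
            # does not spam operator logs at default verbosity.
            logger.debug("cluster: probe of %s failed: %s", node, exc)
            errors.append(exc)
    # Decision point: every node failed. This is the one line an
    # operator tailing at INFO or WARNING actually sees.
    logger.warning("cluster: leader discovery failed across %d nodes", len(nodes))
    raise RuntimeError(f"no leader found; {len(errors)} probes failed")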

2 files changed

Lines changed: 117 additions & 13 deletions

File tree

src/dqliteclient/cluster.py

Lines changed: 37 additions & 13 deletions
@@ -423,6 +423,16 @@ async def _find_leader_impl(self, *, trust_server_heartbeat: bool) -> str:
         joined = (
             joined[:_MAX_AGGREGATE_ERROR_PAYLOAD] + f"... [aggregate truncated, {kept} chars]"
         )
+        # Aggregate-failure WARNING. Per-node probes are at DEBUG so
+        # healthy sweeps do not spam logs, but the all-nodes-failed
+        # outcome is the one event operators paged on cluster-wide
+        # unreachable need to see at default verbosity. The errors
+        # string is already capped above so the log line is bounded.
+        logger.warning(
+            "cluster: leader discovery failed across %d nodes; errors=%s",
+            total_nodes,
+            joined,
+        )
         # Chain via ``BaseExceptionGroup`` when more than one node
         # contributed a real exception (the no-leader-known arm
         # produces no exception, only an entry in ``errors``). Single-
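The trailing context above mentions chaining via ``BaseExceptionGroup``. A minimal sketch of that arm, assuming Python 3.11+; the ClusterError stand-in and the raise_aggregate helper are hypothetical, and only the chaining shape comes from the comment:

class ClusterError(Exception):  # stand-in for dqliteclient.exceptions.ClusterError
    pass

def raise_aggregate(joined: str, real_excs: list[Exception]) -> None:
    """Chain the aggregate error: group when several nodes contributed
    real exceptions, plain __cause__ chaining when only one did."""
    if len(real_excs) > 1:
        # Several nodes contributed real exceptions: group them so no
        # per-node traceback is lost.
        raise ClusterError(joined) from BaseExceptionGroup(
            "per-node leader discovery failures", real_excs
        )
    if real_excs:
        raise ClusterError(joined) from real_excs[0]
    # The no-leader-known arm contributes only an entry in ``errors``,
    # not an exception, so there may be nothing to chain.
    raise ClusterError(joined)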
@@ -692,19 +702,33 @@ async def try_connect() -> DqliteConnection:
         # reflects a deterministic configuration mismatch (redirect
         # blocked) and is excluded: retrying would just reproduce it and
         # multiply the wall-clock cost.
-        return await retry_with_backoff(
-            try_connect,
-            max_attempts=attempts_cap,
-            # OSError subsumes TimeoutError / BrokenPipeError /
-            # ConnectionError / ConnectionResetError, so a single
-            # OSError entry covers every stdlib transport-error shape.
-            retryable_exceptions=(
-                DqliteConnectionError,
-                ClusterError,
-                OSError,
-            ),
-            excluded_exceptions=(ClusterPolicyError,),
-        )
+        try:
+            return await retry_with_backoff(
+                try_connect,
+                max_attempts=attempts_cap,
+                # OSError subsumes TimeoutError / BrokenPipeError /
+                # ConnectionError / ConnectionResetError, so a single
+                # OSError entry covers every stdlib transport-error shape.
+                retryable_exceptions=(
+                    DqliteConnectionError,
+                    ClusterError,
+                    OSError,
+                ),
+                excluded_exceptions=(ClusterPolicyError,),
+            )
+        except (DqliteConnectionError, ClusterError, OSError) as exc:
+            # Aggregate-failure WARNING. Per-attempt failures log at
+            # DEBUG (so a routine leader flip's per-attempt churn does
+            # not spam logs at default verbosity), but the
+            # all-attempts-exhausted outcome is the one event paged
+            # operators need to see at default verbosity.
+            logger.warning(
+                "cluster: connect exhausted %d attempts; last_error=%s: %s",
+                attempts_cap,
+                type(exc).__name__,
+                _truncate_error(str(exc)),
+            )
+            raise
 
 
 def allowlist_policy(addresses: Iterable[str]) -> RedirectPolicy:
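retry_with_backoff itself is outside this diff. A minimal sketch consistent with the call site above; the signature is inferred from the keyword arguments shown, while base_delay and the exponential curve are assumptions:

from __future__ import annotations

import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

async def retry_with_backoff(
    fn: Callable[[], Awaitable[T]],
    *,
    max_attempts: int,
    retryable_exceptions: tuple[type[BaseException], ...],
    excluded_exceptions: tuple[type[BaseException], ...] = (),
    base_delay: float = 0.1,  # assumed; the real constants are not in this diff
) -> T:
    """Retry fn with exponential backoff; re-raise the last error once
    max_attempts is exhausted, which is what the except block in the
    hunk above catches and summarizes."""
    last_exc: BaseException | None = None
    for attempt in range(max_attempts):
        try:
            return await fn()
        except excluded_exceptions:
            # Checked first so a deterministic failure (e.g. a policy
            # mismatch that subclasses a retryable type) never retries.
            raise
        except retryable_exceptions as exc:
            last_exc = exc
            if attempt + 1 < max_attempts:
                await asyncio.sleep(base_delay * 2**attempt)
    assert last_exc is not None
    raise last_exc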
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+"""Pin: cluster-wide unreachable / connect-exhaustion log a single
+WARNING summary at the aggregate-failure decision point — distinct
+from the per-attempt DEBUG noise that fires on every individual
+probe / retry.
+
+Per-attempt log lines stay at DEBUG (a routine leader flip's
+per-attempt churn must not spam logs at default verbosity), but the
+all-attempts-exhausted outcome is the one event paged operators need
+to see at default verbosity. Without this, operators tailing logs at
+INFO see nothing during the failure cascade and only the
+application-level traceback after the caller catches the exception.
+"""
+
+from __future__ import annotations
+
+import logging
+from unittest.mock import AsyncMock
+
+import pytest
+
+from dqliteclient.cluster import ClusterClient
+from dqliteclient.exceptions import ClusterError, DqliteConnectionError
+from dqliteclient.node_store import MemoryNodeStore
+
+
+@pytest.mark.asyncio
+async def test_find_leader_logs_aggregate_warning_on_all_nodes_failed(
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    """When every node in the cluster fails leader discovery, an
+    aggregate WARNING fires before the ClusterError raise."""
+    caplog.set_level(logging.DEBUG, logger="dqliteclient.cluster")
+
+    store = MemoryNodeStore(["node-a:9001", "node-b:9001"])
+    cluster = ClusterClient(store, timeout=0.5)
+    cluster._query_leader = AsyncMock(side_effect=DqliteConnectionError("connection refused"))
+
+    with pytest.raises(ClusterError):
+        await cluster.find_leader()
+
+    warnings = [
+        r
+        for r in caplog.records
+        if r.levelno == logging.WARNING and r.name == "dqliteclient.cluster"
+    ]
+    assert any("leader discovery failed" in r.getMessage() for r in warnings), (
+        "Aggregate WARNING must fire when all nodes fail leader discovery; "
+        f"saw warnings={[r.getMessage() for r in warnings]}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_per_node_failures_remain_at_debug(caplog: pytest.LogCaptureFixture) -> None:
+    """Per-attempt failures must NOT escalate to WARNING — those would
+    spam logs during routine leader-flip churn. The aggregate WARNING
+    is the only WARNING that fires."""
+    caplog.set_level(logging.DEBUG, logger="dqliteclient.cluster")
+
+    store = MemoryNodeStore(["node-a:9001", "node-b:9001"])
+    cluster = ClusterClient(store, timeout=0.5)
+    cluster._query_leader = AsyncMock(side_effect=DqliteConnectionError("connection refused"))
+
+    with pytest.raises(ClusterError):
+        await cluster.find_leader()
+
+    # Per-node DEBUG lines should be present; per-node WARNING lines
+    # should not.
+    debug_lines = [
+        r for r in caplog.records if r.levelno == logging.DEBUG and r.name == "dqliteclient.cluster"
+    ]
+    warning_lines = [
+        r
+        for r in caplog.records
+        if r.levelno == logging.WARNING and r.name == "dqliteclient.cluster"
+    ]
+    assert any("find_leader" in r.getMessage() for r in debug_lines)
+    assert len(warning_lines) == 1, (
+        f"Expected exactly one aggregate WARNING; got {len(warning_lines)}: "
+        f"{[r.getMessage() for r in warning_lines]}"
+    )
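The exactly-one-WARNING assertion at the end is the regression guard the commit message calls out. A small helper in the same spirit, should other pins need it; sole_warning is hypothetical and not part of this diff:

import logging

def sole_warning(records: list[logging.LogRecord], logger_name: str) -> logging.LogRecord:
    """Return the single WARNING emitted by logger_name, failing loudly
    when zero or several fired (the regression shape both tests guard)."""
    warnings = [
        r for r in records
        if r.levelno == logging.WARNING and r.name == logger_name
    ]
    assert len(warnings) == 1, [r.getMessage() for r in warnings]
    return warnings[0]

A test would then read record = sole_warning(caplog.records, "dqliteclient.cluster") and assert on record.getMessage().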
