
Commit 8ea4ed7

Add aggregate WARNING at cluster failure decision points
Per-node leader-discovery probes and per-attempt connect retries log at DEBUG so a routine leader flip's per-attempt churn does not spam operator logs at default verbosity. But the same code paths also fire on the all-nodes-failed / attempts-exhausted case, and there was no log line at WARNING/ERROR aggregating that outcome. Operators tailing logs at INFO saw nothing during a cluster-wide failure cascade, only the application-level traceback after the caller caught the exception. If the caller swallowed the exception to retry with backoff, the failure was invisible.

Add two aggregate WARNINGs at the decision points:

- _find_leader_impl: just before the ClusterError raise, after every node has been probed.
- ClusterClient.connect: catch the retry_with_backoff exhaustion and re-raise with a one-line summary of the attempts and last error.

Per-attempt DEBUG lines remain unchanged. Pin both WARNINGs against regression, including a guard that exactly one WARNING fires per aggregate failure (no per-attempt escalation that would reproduce the spam this design was meant to avoid).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent d241d87 commit 8ea4ed7
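Reduced to a minimal sketch, the two-tier pattern the commit message describes looks like this; nodes, probe_node, and the RuntimeError are hypothetical stand-ins, not this library's API:

import logging

logger = logging.getLogger("dqliteclient.cluster")

async def find_leader_sketch(nodes, probe_node):
    """Two-tier logging sketch: per-attempt DEBUG inside the loop,
    exactly one aggregate WARNING at the all-nodes-failed decision
    point."""
    errors = []
    for node in nodes:
        try:
            # probe_node is a hypothetical coroutine returning the
            # leader address advertised by this node.
            return await probe_node(node)
        except OSError as exc:
            # Routine churn (e.g. a leader flip) stays at DEBUG so it
            # does not spam operator logs at default verbosity.
            logger.debug("cluster: probe of %s failed: %s", node, exc)
            errors.append(exc)
    # Decision point: every node failed. This is the one line an
    # operator tailing at INFO or WARNING actually sees.
    logger.warning("cluster: leader discovery failed across %d nodes", len(nodes))
    raise RuntimeError(f"no leader found; {len(errors)} probes failed")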

2 files changed

Lines changed: 117 additions & 13 deletions

File tree

src/dqliteclient/cluster.py

Lines changed: 37 additions & 13 deletions
@@ -423,6 +423,16 @@ async def _find_leader_impl(self, *, trust_server_heartbeat: bool) -> str:
         joined = (
             joined[:_MAX_AGGREGATE_ERROR_PAYLOAD] + f"... [aggregate truncated, {kept} chars]"
         )
+        # Aggregate-failure WARNING. Per-node probes are at DEBUG so
+        # healthy sweeps do not spam logs, but the all-nodes-failed
+        # outcome is the one event operators paged on cluster-wide
+        # unreachable need to see at default verbosity. The errors
+        # string is already capped above so the log line is bounded.
+        logger.warning(
+            "cluster: leader discovery failed across %d nodes; errors=%s",
+            total_nodes,
+            joined,
+        )
         # Chain via ``BaseExceptionGroup`` when more than one node
         # contributed a real exception (the no-leader-known arm
         # produces no exception, only an entry in ``errors``). Single-
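The trailing context above mentions chaining via ``BaseExceptionGroup``. A minimal sketch of that arm, assuming Python 3.11+; the ClusterError stand-in and the raise_aggregate helper are hypothetical, and only the chaining shape comes from the comment:

class ClusterError(Exception):  # stand-in for dqliteclient.exceptions.ClusterError
    pass

def raise_aggregate(joined: str, real_excs: list[Exception]) -> None:
    """Chain the aggregate error: group when several nodes contributed
    real exceptions, plain __cause__ chaining when only one did."""
    if len(real_excs) > 1:
        # Several nodes contributed real exceptions: group them so no
        # per-node traceback is lost.
        raise ClusterError(joined) from BaseExceptionGroup(
            "per-node leader discovery failures", real_excs
        )
    if real_excs:
        raise ClusterError(joined) from real_excs[0]
    # The no-leader-known arm contributes only an entry in ``errors``,
    # not an exception, so there may be nothing to chain.
    raise ClusterError(joined)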
@@ -692,19 +702,33 @@ async def try_connect() -> DqliteConnection:
         # reflects a deterministic configuration mismatch (redirect
         # blocked) and is excluded: retrying would just reproduce it and
         # multiply the wall-clock cost.
-        return await retry_with_backoff(
-            try_connect,
-            max_attempts=attempts_cap,
-            # OSError subsumes TimeoutError / BrokenPipeError /
-            # ConnectionError / ConnectionResetError, so a single
-            # OSError entry covers every stdlib transport-error shape.
-            retryable_exceptions=(
-                DqliteConnectionError,
-                ClusterError,
-                OSError,
-            ),
-            excluded_exceptions=(ClusterPolicyError,),
-        )
+        try:
+            return await retry_with_backoff(
+                try_connect,
+                max_attempts=attempts_cap,
+                # OSError subsumes TimeoutError / BrokenPipeError /
+                # ConnectionError / ConnectionResetError, so a single
+                # OSError entry covers every stdlib transport-error shape.
+                retryable_exceptions=(
+                    DqliteConnectionError,
+                    ClusterError,
+                    OSError,
+                ),
+                excluded_exceptions=(ClusterPolicyError,),
+            )
+        except (DqliteConnectionError, ClusterError, OSError) as exc:
+            # Aggregate-failure WARNING. Per-attempt failures log at
+            # DEBUG (so a routine leader flip's per-attempt churn does
+            # not spam logs at default verbosity), but the
+            # all-attempts-exhausted outcome is the one event paged
+            # operators need to see at default verbosity.
+            logger.warning(
+                "cluster: connect exhausted %d attempts; last_error=%s: %s",
+                attempts_cap,
+                type(exc).__name__,
+                _truncate_error(str(exc)),
+            )
+            raise
 
 
 def allowlist_policy(addresses: Iterable[str]) -> RedirectPolicy:
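retry_with_backoff itself is outside this diff. A minimal sketch consistent with the call site above; the signature is inferred from the keyword arguments shown, while base_delay and the exponential curve are assumptions:

from __future__ import annotations

import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

async def retry_with_backoff(
    fn: Callable[[], Awaitable[T]],
    *,
    max_attempts: int,
    retryable_exceptions: tuple[type[BaseException], ...],
    excluded_exceptions: tuple[type[BaseException], ...] = (),
    base_delay: float = 0.1,  # assumed; the real constants are not in this diff
) -> T:
    """Retry fn with exponential backoff; re-raise the last error once
    max_attempts is exhausted, which is what the except block in the
    hunk above catches and summarizes."""
    last_exc: BaseException | None = None
    for attempt in range(max_attempts):
        try:
            return await fn()
        except excluded_exceptions:
            # Checked first so a deterministic failure (e.g. a policy
            # mismatch that subclasses a retryable type) never retries.
            raise
        except retryable_exceptions as exc:
            last_exc = exc
            if attempt + 1 < max_attempts:
                await asyncio.sleep(base_delay * 2**attempt)
    assert last_exc is not None
    raise last_exc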
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+"""Pin: cluster-wide unreachable / connect-exhaustion log a single
+WARNING summary at the aggregate-failure decision point — distinct
+from the per-attempt DEBUG noise that fires on every individual
+probe / retry.
+
+Per-attempt log lines stay at DEBUG (a routine leader flip's
+per-attempt churn must not spam logs at default verbosity), but the
+all-attempts-exhausted outcome is the one event paged operators need
+to see at default verbosity. Without this, operators tailing logs at
+INFO see nothing during the failure cascade and only the
+application-level traceback after the caller catches the exception.
+"""
+
+from __future__ import annotations
+
+import logging
+from unittest.mock import AsyncMock
+
+import pytest
+
+from dqliteclient.cluster import ClusterClient
+from dqliteclient.exceptions import ClusterError, DqliteConnectionError
+from dqliteclient.node_store import MemoryNodeStore
+
+
+@pytest.mark.asyncio
+async def test_find_leader_logs_aggregate_warning_on_all_nodes_failed(
+    caplog: pytest.LogCaptureFixture,
+) -> None:
+    """When every node in the cluster fails leader discovery, an
+    aggregate WARNING fires before the ClusterError raise."""
+    caplog.set_level(logging.DEBUG, logger="dqliteclient.cluster")
+
+    store = MemoryNodeStore(["node-a:9001", "node-b:9001"])
+    cluster = ClusterClient(store, timeout=0.5)
+    cluster._query_leader = AsyncMock(side_effect=DqliteConnectionError("connection refused"))
+
+    with pytest.raises(ClusterError):
+        await cluster.find_leader()
+
+    warnings = [
+        r
+        for r in caplog.records
+        if r.levelno == logging.WARNING and r.name == "dqliteclient.cluster"
+    ]
+    assert any("leader discovery failed" in r.getMessage() for r in warnings), (
+        "Aggregate WARNING must fire when all nodes fail leader discovery; "
+        f"saw warnings={[r.getMessage() for r in warnings]}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_per_node_failures_remain_at_debug(caplog: pytest.LogCaptureFixture) -> None:
+    """Per-attempt failures must NOT escalate to WARNING — those would
+    spam logs during routine leader-flip churn. The aggregate WARNING
+    is the only WARNING that fires."""
+    caplog.set_level(logging.DEBUG, logger="dqliteclient.cluster")
+
+    store = MemoryNodeStore(["node-a:9001", "node-b:9001"])
+    cluster = ClusterClient(store, timeout=0.5)
+    cluster._query_leader = AsyncMock(side_effect=DqliteConnectionError("connection refused"))
+
+    with pytest.raises(ClusterError):
+        await cluster.find_leader()
+
+    # Per-node DEBUG lines should be present; per-node WARNING lines
+    # should not.
+    debug_lines = [
+        r for r in caplog.records if r.levelno == logging.DEBUG and r.name == "dqliteclient.cluster"
+    ]
+    warning_lines = [
+        r
+        for r in caplog.records
+        if r.levelno == logging.WARNING and r.name == "dqliteclient.cluster"
+    ]
+    assert any("find_leader" in r.getMessage() for r in debug_lines)
+    assert len(warning_lines) == 1, (
+        f"Expected exactly one aggregate WARNING; got {len(warning_lines)}: "
+        f"{[r.getMessage() for r in warning_lines]}"
+    )
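The exactly-one-WARNING assertion at the end is the regression guard the commit message calls out. A small helper in the same spirit, should other pins need it; sole_warning is hypothetical and not part of this diff:

import logging

def sole_warning(records: list[logging.LogRecord], logger_name: str) -> logging.LogRecord:
    """Return the single WARNING emitted by logger_name, failing loudly
    when zero or several fired (the regression shape both tests guard)."""
    warnings = [
        r for r in records
        if r.levelno == logging.WARNING and r.name == logger_name
    ]
    assert len(warnings) == 1, [r.getMessage() for r in warnings]
    return warnings[0]

A test would then read record = sole_warning(caplog.records, "dqliteclient.cluster") and assert on record.getMessage().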
