Skip to content

Commit 9b7a7f6

Browse files
fix: narrow retry and reduce max_attempts for connect()
ClusterClient.connect wrapped try_connect in retry_with_backoff with max_attempts=5 and OperationalError in the retryable set. A non-leader OperationalError (a real SQL error) was retried 5× find_leader amplification = 5 × N_nodes RTTs before surfacing. Since #148 now reclassifies leader-change OperationalErrors into DqliteConnectionError at connect time, we no longer need OperationalError in the retry set. Drop it and reduce max_attempts to 3 so a true transport outage surfaces faster. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c028b0a commit 9b7a7f6

2 files changed

Lines changed: 33 additions & 2 deletions

File tree

src/dqliteclient/cluster.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,13 +124,17 @@ async def try_connect() -> DqliteConnection:
124124
await conn.connect()
125125
return conn
126126

127+
# Retry only transport-level errors. Leader-change OperationalError
128+
# codes are now reclassified into DqliteConnectionError at connect
129+
# time (see #148), so we no longer need OperationalError in the
130+
# retry set — that avoids amplifying a schema/SQL error into 5 ×
131+
# N_nodes RTTs before propagating.
127132
return await retry_with_backoff(
128133
try_connect,
129-
max_attempts=5,
134+
max_attempts=3,
130135
retryable_exceptions=(
131136
DqliteConnectionError,
132137
ClusterError,
133-
OperationalError,
134138
OSError,
135139
TimeoutError,
136140
),

tests/test_cluster.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,33 @@ async def track(address: str) -> str | None:
329329
f"voters should be probed first; order={order}"
330330
)
331331

332+
async def test_connect_does_not_retry_plain_sql_errors(self) -> None:
333+
"""OperationalError without a leader code is a SQL-level error, not
334+
a transport issue — connect() should NOT retry it. Otherwise a
335+
schema mismatch takes 5x find_leader round trips to propagate.
336+
"""
337+
from dqliteclient.exceptions import OperationalError
338+
339+
store = MemoryNodeStore(["localhost:9001"])
340+
client = ClusterClient(store, timeout=0.2)
341+
342+
call_count = 0
343+
344+
async def always_sql_error() -> str:
345+
nonlocal call_count
346+
call_count += 1
347+
raise OperationalError(1, "some sql error")
348+
349+
with (
350+
patch.object(client, "find_leader", side_effect=always_sql_error),
351+
pytest.raises(OperationalError),
352+
):
353+
await client.connect()
354+
355+
assert call_count == 1, (
356+
f"SQL-level OperationalError must not be retried, got {call_count} attempts"
357+
)
358+
332359
async def test_update_nodes(self) -> None:
333360
store = MemoryNodeStore()
334361
client = ClusterClient(store)

0 commit comments

Comments
 (0)