fix: narrow retry and reduce max_attempts for connect()

antoineleclair · claude · antoineleclair · commit 9b7a7f6fbf0c · 2026-04-17T14:18:29.000-04:00
ClusterClient.connect wrapped try_connect in retry_with_backoff with
max_attempts=5 and OperationalError in the retryable set. As a result, a
non-leader OperationalError (a real SQL error) was retried 5 times, and
every attempt repeated find_leader: an amplification of 5 × N_nodes RTTs
before the error surfaced.

Since #148 now reclassifies leader-change OperationalErrors into
DqliteConnectionError at connect time, we no longer need
OperationalError in the retry set. Drop it and reduce max_attempts to
3 so a true transport outage surfaces faster.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/dqliteclient/cluster.py b/src/dqliteclient/cluster.py
@@ -124,13 +124,17 @@ async def try_connect() -> DqliteConnection:
             await conn.connect()
             return conn
 
+        # Retry only transport-level errors. Leader-change OperationalError
+        # codes are now reclassified into DqliteConnectionError at connect
+        # time (see #148), so we no longer need OperationalError in the
+        # retry set — that avoids amplifying a schema/SQL error into 5 ×
+        # N_nodes RTTs before propagating.
         return await retry_with_backoff(
             try_connect,
-            max_attempts=5,
+            max_attempts=3,
             retryable_exceptions=(
                 DqliteConnectionError,
                 ClusterError,
-                OperationalError,
                 OSError,
                 TimeoutError,
             ),
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
@@ -329,6 +329,33 @@ async def track(address: str) -> str | None:
             f"voters should be probed first; order={order}"
         )
 
+    async def test_connect_does_not_retry_plain_sql_errors(self) -> None:
+        """OperationalError without a leader code is a SQL-level error, not
+        a transport issue — connect() should NOT retry it. Otherwise a
+        schema mismatch takes 5x find_leader round trips to propagate.
+        """
+        from dqliteclient.exceptions import OperationalError
+
+        store = MemoryNodeStore(["localhost:9001"])
+        client = ClusterClient(store, timeout=0.2)
+
+        call_count = 0
+
+        async def always_sql_error() -> str:
+            nonlocal call_count
+            call_count += 1
+            raise OperationalError(1, "some sql error")
+
+        with (
+            patch.object(client, "find_leader", side_effect=always_sql_error),
+            pytest.raises(OperationalError),
+        ):
+            await client.connect()
+
+        assert call_count == 1, (
+            f"SQL-level OperationalError must not be retried, got {call_count} attempts"
+        )
+
     async def test_update_nodes(self) -> None:
         store = MemoryNodeStore()
         client = ClusterClient(store)