Skip to content

Commit 94e1327

Browse files
Add operator-actionable discriminators to pool acquire timeout
The pool acquire timeout was the most-likely-to-page exception in the driver and the least actionable. The previous message gave ``max_size`` and ``timeout`` only — an operator paged on a 3am timeout could not tell whether the cause was a leaking application (checked_out at max, idle 0) or a slow cluster, even though both have very different remediations. Add ``pool_id``, ``checked_out`` and ``idle`` to the message and attach a one-sentence remediation hint that discriminates leak vs slow cluster. The neighbouring ``_check_in_use`` site already proves the codebase can write good actionable exceptions; the pool acquire timeout now matches that bar. Pin the new fields and the hint against regression so a future refactor can't accidentally drop them. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent d2d6c13 commit 94e1327

2 files changed

Lines changed: 101 additions & 1 deletion

File tree

src/dqliteclient/pool.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -821,9 +821,22 @@ async def acquire(self) -> AsyncIterator[DqliteConnection]:
821821
# re-check capacity (another coroutine may have freed a slot)
822822
remaining = deadline - loop.time()
823823
if remaining <= 0:
824+
# Compute saturation snapshot once for the message —
825+
# ``_size`` is checked-out-plus-reserved; ``qsize`` is
826+
# idle. A pool at ``checked_out == max_size`` with
827+
# ``idle == 0`` is leaking; otherwise the cluster is
828+
# slow. The ``pool_id`` lets operators correlate the
829+
# failed acquire with the warm-up / drain log lines
830+
# that already include ``id(self)``.
831+
idle = self._pool.qsize()
832+
checked_out = self._size - idle
824833
raise DqliteConnectionError(
825834
f"Timed out waiting for a connection from the pool "
826-
f"(max_size={self._max_size}, timeout={self._timeout}s)"
835+
f"(pool_id={id(self)}, max_size={self._max_size}, "
836+
f"checked_out={checked_out}, idle={idle}, "
837+
f"timeout={self._timeout}s). "
838+
f"If checked_out is at max_size, the application is "
839+
f"leaking connections; otherwise the cluster is slow."
827840
)
828841
# Race the queue against the state-change event so any pool
829842
# state change (close, size decrement, drain) wakes waiters
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""Pin: pool acquire timeout error includes operator-actionable
2+
discriminators (pool_id, checked_out, idle) and a remediation hint.
3+
4+
The message previously gave only ``max_size`` and ``timeout`` — an
5+
operator paged on a 3am ``DqliteConnectionError("Timed out waiting
6+
for a connection from the pool")`` could not tell whether the cause
7+
was a leaking application (checked_out at max, idle 0) or a slow
8+
cluster (checked_out below max, idle 0). Both have the same parameters
9+
in the message but very different remediations.
10+
11+
Pin the new fields against regression so a future refactor can't
12+
accidentally drop them.
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import asyncio
18+
19+
import pytest
20+
21+
from dqliteclient.exceptions import DqliteConnectionError
22+
from dqliteclient.pool import ConnectionPool
23+
24+
25+
def _make_full_pool_at_capacity() -> ConnectionPool:
26+
"""Build a minimal ConnectionPool with no available connections
27+
so an acquire times out immediately."""
28+
import os
29+
30+
pool = ConnectionPool.__new__(ConnectionPool)
31+
pool._closed = False
32+
pool._max_size = 5
33+
pool._size = 5 # checked-out + reserved == max_size
34+
pool._timeout = 0.01
35+
pool._pool = asyncio.Queue() # no connections in the queue
36+
pool._lock = asyncio.Lock()
37+
pool._closed_event = None
38+
pool._addresses = ["host:9001"]
39+
pool._creator_pid = os.getpid()
40+
return pool
41+
42+
43+
async def _acquire_with_timeout(pool: ConnectionPool) -> None:
44+
async with pool.acquire():
45+
pass
46+
47+
48+
@pytest.mark.asyncio
49+
async def test_acquire_timeout_message_includes_pool_id() -> None:
50+
pool = _make_full_pool_at_capacity()
51+
with pytest.raises(DqliteConnectionError) as exc_info:
52+
await _acquire_with_timeout(pool)
53+
text = str(exc_info.value)
54+
assert f"pool_id={id(pool)}" in text
55+
56+
57+
@pytest.mark.asyncio
58+
async def test_acquire_timeout_message_includes_checked_out_and_idle() -> None:
59+
pool = _make_full_pool_at_capacity()
60+
with pytest.raises(DqliteConnectionError) as exc_info:
61+
await _acquire_with_timeout(pool)
62+
text = str(exc_info.value)
63+
assert "checked_out=" in text
64+
assert "idle=" in text
65+
66+
67+
@pytest.mark.asyncio
68+
async def test_acquire_timeout_message_includes_remediation_hint() -> None:
69+
"""The hint discriminates leak vs slow-cluster — pin so a future
70+
edit doesn't strip it."""
71+
pool = _make_full_pool_at_capacity()
72+
with pytest.raises(DqliteConnectionError) as exc_info:
73+
await _acquire_with_timeout(pool)
74+
text = str(exc_info.value)
75+
assert "leaking" in text.lower() or "leak" in text.lower()
76+
77+
78+
@pytest.mark.asyncio
79+
async def test_acquire_timeout_message_keeps_max_size_and_timeout() -> None:
80+
"""Existing fields must still be present — they are referenced
81+
by operators / runbooks."""
82+
pool = _make_full_pool_at_capacity()
83+
with pytest.raises(DqliteConnectionError) as exc_info:
84+
await _acquire_with_timeout(pool)
85+
text = str(exc_info.value)
86+
assert "max_size=5" in text
87+
assert "timeout=" in text

0 commit comments

Comments
 (0)