Skip to content

Commit 778e10b

Browse files
Fail loud on _close_impl re-snapshot cap exhaustion
The cycle-27 CC2 fix added a bounded re-snapshot loop to _close_impl with a cap of 3, with the comment "Cap at 3 to fail loudly on a pathological feedback loop rather than spin." The for-loop had no ``else:`` clause — when the cap was exhausted (a racing _invalidate kept creating fresh pending_drain tasks each iteration), the loop silently fell through (completed every iteration without ``break``) with self._pending_drain still pointing at a live task, defeating the comment's "fail loudly" promise: the residual task was orphaned and surfaced as "Task was destroyed but it is pending" at GC. Add the ``else:`` clause: cancel the residual task (best-effort) so no orphan diagnostic fires, null out self._pending_drain to keep the post-condition consistent with the break path, and log a WARNING so operators see the pathological feedback loop in production logs. Hoist the cap into a named ``_RESNAPSHOT_CAP`` constant so future tuning changes are obvious. Pin the WARNING firing, the residual cancellation, and the absence of "Task was destroyed" diagnostics under an adversarial __await__ that re-plants a fresh non-done pending each iteration. Without the fail-loud branch the warning never fires. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 820716e commit 778e10b

2 files changed

Lines changed: 115 additions & 1 deletion

File tree

src/dqliteclient/connection.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1227,7 +1227,8 @@ async def _close_impl(self) -> None:
12271227
# few rounds the racing _invalidate sees ``_protocol is None``
12281228
# and the cycle terminates). Cap at 3 to fail loudly on a
12291229
# pathological feedback loop rather than spin.
1230-
for _attempt in range(3):
1230+
_RESNAPSHOT_CAP = 3
1231+
for _attempt in range(_RESNAPSHOT_CAP):
12311232
pending = self._pending_drain
12321233
self._pending_drain = None
12331234
if pending is None or pending.done():
@@ -1244,6 +1245,27 @@ async def _close_impl(self) -> None:
12441245
# would be lost forever if absorbed here.
12451246
with contextlib.suppress(Exception, asyncio.CancelledError):
12461247
await pending
1248+
else:
1249+
# Cap exhausted: a racing ``_invalidate`` keeps creating
1250+
# fresh ``_pending_drain`` tasks each iteration. The
1251+
# comment block above promised "fail loudly"; without an
1252+
# explicit log/cancel here, ``self._pending_drain`` would
1253+
# remain set and the residual task would still be
1254+
# orphaned at GC. Cancel it (best-effort) and log a
1255+
# WARNING so operators see the pathological feedback
1256+
# loop in production logs.
1257+
stuck = self._pending_drain
1258+
if stuck is not None and not stuck.done():
1259+
stuck.cancel()
1260+
self._pending_drain = None
1261+
logger.warning(
1262+
"DqliteConnection._close_impl: _pending_drain still set after "
1263+
"%d re-snapshot iterations; cancelling residual task to avoid "
1264+
"'Task was destroyed but it is pending' at GC. This indicates "
1265+
"a pathological _invalidate feedback loop on connection id=%s.",
1266+
_RESNAPSHOT_CAP,
1267+
id(self),
1268+
)
12471269
# Mirror ``_invalidate``'s atomic clear of the transaction
12481270
# bookkeeping. Without this, a raw ``BEGIN`` followed by an
12491271
# explicit ``close()`` and a reconnect on the same instance

tests/test_close_impl_reaps_pending_drain_created_during_await.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,98 @@ def _race_invalidate_callback() -> None:
9393
raise AssertionError(f"Expected no orphaned-task diagnostics; got {msgs}")
9494

9595

96+
@pytest.mark.asyncio
97+
async def test_close_impl_fails_loud_when_resnapshot_cap_exhausted(
98+
caplog: pytest.LogCaptureFixture,
99+
) -> None:
100+
"""Pathological feedback loop: a racing callback re-creates a
101+
fresh ``_pending_drain`` on every iteration. The cap exhausts;
102+
the loop must (a) cancel the residual task to avoid the
103+
"Task was destroyed but it is pending" diagnostic at GC, and
104+
(b) log a WARNING so operators see the loop in production logs.
105+
106+
Pre-fix the loop's ``else:`` clause did not exist — the loop
107+
silently fell through with ``_pending_drain`` still pointing at a live
108+
task and the warning the comment promised never fired.
109+
"""
110+
import logging
111+
112+
loop = asyncio.get_running_loop()
113+
114+
conn = DqliteConnection.__new__(DqliteConnection)
115+
conn._protocol = None
116+
conn._in_transaction = False
117+
conn._tx_owner = None
118+
conn._savepoint_stack = []
119+
conn._savepoint_implicit_begin = False
120+
conn._has_untracked_savepoint = False
121+
conn._invalidation_cause = None
122+
conn._bound_loop = None
123+
124+
# Adversarial: each ``await pending`` that resolves triggers a
125+
# fresh not-done task to be assigned to ``conn._pending_drain``.
126+
# Use a custom awaitable so we can precisely simulate the
127+
# pathological _invalidate feedback loop: the awaitable's
128+
# ``__await__`` yields once (giving control back to _close_impl),
129+
# then sets a fresh adversary on conn._pending_drain before
130+
# returning. This guarantees iteration N+1 ALWAYS sees a
131+
# not-done pending, exhausting the cap.
132+
fresh_tasks: list[asyncio.Task[None]] = []
133+
134+
from collections.abc import Generator
135+
from typing import Any
136+
137+
class _Adversary:
138+
def __init__(self) -> None:
139+
self._real = loop.create_task(asyncio.sleep(0.001))
140+
fresh_tasks.append(self._real)
141+
142+
def done(self) -> bool:
143+
return False # always reports not-done so loop awaits
144+
145+
def cancel(self) -> bool:
146+
return self._real.cancel()
147+
148+
def __await__(self) -> Generator[Any]:
149+
yield from self._real.__await__()
150+
# Before returning, plant a fresh adversary so iteration
151+
# N+1 sees a non-done pending.
152+
conn._pending_drain = _Adversary() # type: ignore[assignment]
153+
154+
conn._pending_drain = _Adversary() # type: ignore[assignment]
155+
156+
captured: list[dict[str, object]] = []
157+
prior_handler = loop.get_exception_handler()
158+
loop.set_exception_handler(lambda _loop, ctx: captured.append(ctx))
159+
with caplog.at_level(logging.WARNING, logger="dqliteclient.connection"):
160+
try:
161+
await conn._close_impl()
162+
await asyncio.sleep(0.05)
163+
finally:
164+
loop.set_exception_handler(prior_handler)
165+
166+
# The fail-loud WARNING must fire when the cap is exhausted.
167+
warnings_seen = [
168+
r
169+
for r in caplog.records
170+
if r.levelno == logging.WARNING and "re-snapshot iterations" in r.getMessage()
171+
]
172+
assert warnings_seen, (
173+
"_close_impl must log a WARNING when the bounded re-snapshot loop's "
174+
"cap is exhausted; without it, the comment's 'fail loudly' promise "
175+
"is unkept and operators have no signal of the feedback loop."
176+
)
177+
# The residual pending_drain must be cleared so no orphaned-task
178+
# diagnostic surfaces.
179+
assert conn._pending_drain is None
180+
asyncio_diagnostics = [
181+
ctx for ctx in captured if "Task was destroyed" in str(ctx.get("message", ""))
182+
]
183+
if asyncio_diagnostics:
184+
msgs = [ctx.get("message") for ctx in asyncio_diagnostics]
185+
raise AssertionError(f"Expected no orphaned-task diagnostics; got {msgs}")
186+
187+
96188
@pytest.mark.asyncio
97189
async def test_close_impl_loop_terminates_when_invalidate_clears_protocol() -> None:
98190
"""The re-snapshot loop terminates after one iteration when the

0 commit comments

Comments (0)