fix(gateway): bound lifespan shutdown hooks to prevent worker hang under uvicorn reload (#2331)

* fix(gateway): bound lifespan shutdown hooks to prevent worker hang

Gateway worker can hang indefinitely in `uvicorn --reload` mode with
the listening socket still bound — all /api/* requests return 504,
and SIGKILL is the only recovery.

Root cause (py-spy dump from a reproduction showed 16+ stacked frames
of signal_handler -> Event.set -> threading.Lock.__enter__ on the
main thread): CPython's `threading.Event` uses `Condition(Lock())`
where the inner Lock is non-reentrant. uvicorn's BaseReload signal
handler calls `should_exit.set()` directly from signal context; if a
second signal (SIGTERM/SIGHUP from the reload supervisor, or
watchfiles-triggered reload) arrives while the first handler holds
the Lock, the reentrant call deadlocks on itself.

The reload supervisor keeps sending those signals only when the
worker fails to exit promptly. DeerFlow's lifespan currently awaits
`stop_channel_service()` with no timeout; if a channel's `stop()`
stalls (e.g. Feishu/Slack WebSocket waiting for an ack), the worker
can't exit, the supervisor keeps signaling, and the deadlock becomes
reachable.

This is a defense-in-depth fix — it does not repair the upstream
uvicorn/CPython issue, but it ensures DeerFlow's lifespan exits
within a bounded window so the supervisor has no reason to keep
firing signals. No behavior change on the happy path.

Wraps the shutdown hook in `asyncio.wait_for(timeout=5.0)` and logs
a warning on timeout before proceeding to worker exit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Update backend/app/gateway/app.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* style: apply make format (ruff) to test assertions

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
JerryChaox 2026-04-23 19:41:26 +08:00 committed by GitHub
parent c42ae3af79
commit 4e72410154
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 84 additions and 2 deletions

View File

@ -1,3 +1,4 @@
import asyncio
import logging
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
@ -32,6 +33,11 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
# Upper bound (seconds) each lifespan shutdown hook is allowed to run.
# Bounds worker exit time so uvicorn's reload supervisor does not keep
# firing signals into a worker that is stuck waiting for shutdown cleanup.
_SHUTDOWN_HOOK_TIMEOUT_SECONDS = 5.0
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
@ -63,11 +69,19 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
yield
# Stop channel service on shutdown
# Stop channel service on shutdown (bounded to prevent worker hang)
try:
from app.channels.service import stop_channel_service
await stop_channel_service()
await asyncio.wait_for(
stop_channel_service(),
timeout=_SHUTDOWN_HOOK_TIMEOUT_SECONDS,
)
except TimeoutError:
logger.warning(
"Channel service shutdown exceeded %.1fs; proceeding with worker exit.",
_SHUTDOWN_HOOK_TIMEOUT_SECONDS,
)
except Exception:
logger.exception("Failed to stop channel service")

View File

@ -0,0 +1,68 @@
"""Regression tests for Gateway lifespan shutdown.
These tests guard the invariant that lifespan shutdown is *bounded*: a
misbehaving channel whose ``stop()`` blocks forever must not keep the
uvicorn worker alive. A hung worker is the precondition for the
signal-reentrancy deadlock described in
``app.gateway.app._SHUTDOWN_HOOK_TIMEOUT_SECONDS``.
"""
from __future__ import annotations
import asyncio
from contextlib import asynccontextmanager
from unittest.mock import MagicMock, patch
from fastapi import FastAPI
@asynccontextmanager
async def _noop_langgraph_runtime(_app):
yield
async def _run_lifespan_with_hanging_stop() -> float:
    """Drive the gateway lifespan with ``stop_channel_service`` hanging forever.

    Startup dependencies are patched out so only the shutdown path is
    exercised; ``stop_channel_service`` is replaced with a coroutine that
    sleeps for an hour, forcing the lifespan's bounded-shutdown logic
    (``asyncio.wait_for``) to fire.

    Returns:
        Elapsed wall-clock seconds spent inside the lifespan context.
    """
    from app.gateway.app import _SHUTDOWN_HOOK_TIMEOUT_SECONDS, lifespan

    async def hang_forever() -> None:
        # Far longer than any sane shutdown bound; wait_for must cut it off.
        await asyncio.sleep(3600)

    app = FastAPI()
    fake_service = MagicMock()
    fake_service.get_status = MagicMock(return_value={})

    async def fake_start():
        return fake_service

    with (
        patch("app.gateway.app.get_app_config"),
        patch("app.gateway.app.get_gateway_config", return_value=MagicMock(host="x", port=0)),
        patch("app.gateway.app.langgraph_runtime", _noop_langgraph_runtime),
        # Patched at the source module: lifespan imports these lazily, so the
        # patched attributes are what it resolves at shutdown time.
        patch("app.channels.service.start_channel_service", side_effect=fake_start),
        patch("app.channels.service.stop_channel_service", side_effect=hang_forever),
    ):
        # get_running_loop() is the supported API inside a coroutine;
        # get_event_loop() is deprecated in this position since Python 3.10.
        loop = asyncio.get_running_loop()
        start = loop.time()
        async with lifespan(app):
            pass
        elapsed = loop.time() - start
    # Guard against the constant drifting so high that this test itself stalls.
    assert _SHUTDOWN_HOOK_TIMEOUT_SECONDS < 30.0, "Timeout constant must stay modest"
    return elapsed
def test_shutdown_is_bounded_when_channel_stop_hangs():
    """Lifespan exit must complete near the configured timeout, not hang."""
    # Imported lazily so collecting this test file does not import the
    # gateway app (and its config side effects) at module load time.
    from app.gateway.app import _SHUTDOWN_HOOK_TIMEOUT_SECONDS

    elapsed = asyncio.run(_run_lifespan_with_hanging_stop())
    # Generous upper bound: timeout + 2s slack for scheduling overhead.
    assert elapsed < _SHUTDOWN_HOOK_TIMEOUT_SECONDS + 2.0, f"Lifespan shutdown took {elapsed:.2f}s; expected <= {_SHUTDOWN_HOOK_TIMEOUT_SECONDS + 2.0:.1f}s"
    # Lower bound: the wait_for should actually have waited.
    assert elapsed >= _SHUTDOWN_HOOK_TIMEOUT_SECONDS - 0.5, f"Lifespan exited too quickly ({elapsed:.2f}s); wait_for may not have been invoked."