mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 11:18:22 +00:00
fix(gateway): bound lifespan shutdown hooks to prevent worker hang under uvicorn reload (#2331)
* fix(gateway): bound lifespan shutdown hooks to prevent worker hang Gateway worker can hang indefinitely in `uvicorn --reload` mode with the listening socket still bound — all /api/* requests return 504, and SIGKILL is the only recovery. Root cause (py-spy dump from a reproduction showed 16+ stacked frames of signal_handler -> Event.set -> threading.Lock.__enter__ on the main thread): CPython's `threading.Event` uses `Condition(Lock())` where the inner Lock is non-reentrant. uvicorn's BaseReload signal handler calls `should_exit.set()` directly from signal context; if a second signal (SIGTERM/SIGHUP from the reload supervisor, or watchfiles-triggered reload) arrives while the first handler holds the Lock, the reentrant call deadlocks on itself. The reload supervisor keeps sending those signals only when the worker fails to exit promptly. DeerFlow's lifespan currently awaits `stop_channel_service()` with no timeout; if a channel's `stop()` stalls (e.g. Feishu/Slack WebSocket waiting for an ack), the worker can't exit, the supervisor keeps signaling, and the deadlock becomes reachable. This is a defense-in-depth fix — it does not repair the upstream uvicorn/CPython issue, but it ensures DeerFlow's lifespan exits within a bounded window so the supervisor has no reason to keep firing signals. No behavior change on the happy path. Wraps the shutdown hook in `asyncio.wait_for(timeout=5.0)` and logs a warning on timeout before proceeding to worker exit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * Update backend/app/gateway/app.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * style: apply make format (ruff) to test assertions Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Willem Jiang <willem.jiang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
parent
c42ae3af79
commit
4e72410154
@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from collections.abc import AsyncGenerator
|
||||
from contextlib import asynccontextmanager
|
||||
@ -32,6 +33,11 @@ logging.basicConfig(
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Upper bound (seconds) each lifespan shutdown hook is allowed to run.
|
||||
# Bounds worker exit time so uvicorn's reload supervisor does not keep
|
||||
# firing signals into a worker that is stuck waiting for shutdown cleanup.
|
||||
_SHUTDOWN_HOOK_TIMEOUT_SECONDS = 5.0
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
@ -63,11 +69,19 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
|
||||
yield
|
||||
|
||||
# Stop channel service on shutdown
|
||||
# Stop channel service on shutdown (bounded to prevent worker hang)
|
||||
try:
|
||||
from app.channels.service import stop_channel_service
|
||||
|
||||
await stop_channel_service()
|
||||
await asyncio.wait_for(
|
||||
stop_channel_service(),
|
||||
timeout=_SHUTDOWN_HOOK_TIMEOUT_SECONDS,
|
||||
)
|
||||
except TimeoutError:
|
||||
logger.warning(
|
||||
"Channel service shutdown exceeded %.1fs; proceeding with worker exit.",
|
||||
_SHUTDOWN_HOOK_TIMEOUT_SECONDS,
|
||||
)
|
||||
except Exception:
|
||||
logger.exception("Failed to stop channel service")
|
||||
|
||||
|
||||
68
backend/tests/test_gateway_lifespan_shutdown.py
Normal file
68
backend/tests/test_gateway_lifespan_shutdown.py
Normal file
@ -0,0 +1,68 @@
|
||||
"""Regression tests for Gateway lifespan shutdown.
|
||||
|
||||
These tests guard the invariant that lifespan shutdown is *bounded*: a
|
||||
misbehaving channel whose ``stop()`` blocks forever must not keep the
|
||||
uvicorn worker alive. A hung worker is the precondition for the
|
||||
signal-reentrancy deadlock described in
|
||||
``app.gateway.app._SHUTDOWN_HOOK_TIMEOUT_SECONDS``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def _noop_langgraph_runtime(_app):
|
||||
yield
|
||||
|
||||
|
||||
async def _run_lifespan_with_hanging_stop() -> float:
    """Drive the lifespan context with stop_channel_service hanging forever.

    Startup collaborators (config, gateway config, langgraph runtime, channel
    start) are patched to no-ops so only the shutdown path is exercised; the
    channel stop hook is replaced with a coroutine that sleeps for an hour,
    simulating a channel whose ``stop()`` never returns.

    Returns:
        Elapsed wall-clock seconds spent inside the lifespan context
        (startup + shutdown), measured on the running event loop's clock.
    """
    from app.gateway.app import _SHUTDOWN_HOOK_TIMEOUT_SECONDS, lifespan

    async def hang_forever() -> None:
        # Far longer than any sane shutdown timeout; only wait_for can end it.
        await asyncio.sleep(3600)

    app = FastAPI()

    fake_service = MagicMock()
    fake_service.get_status = MagicMock(return_value={})

    async def fake_start():
        return fake_service

    with (
        patch("app.gateway.app.get_app_config"),
        patch("app.gateway.app.get_gateway_config", return_value=MagicMock(host="x", port=0)),
        patch("app.gateway.app.langgraph_runtime", _noop_langgraph_runtime),
        patch("app.channels.service.start_channel_service", side_effect=fake_start),
        patch("app.channels.service.stop_channel_service", side_effect=hang_forever),
    ):
        # get_running_loop() is the supported call from within a coroutine;
        # get_event_loop() here is deprecated (DeprecationWarning since 3.10).
        loop = asyncio.get_running_loop()
        start = loop.time()
        async with lifespan(app):
            pass
        elapsed = loop.time() - start

    # Guard against someone "fixing" a slow shutdown by inflating the timeout:
    # this test sleeps for roughly the timeout, so it must stay modest.
    assert _SHUTDOWN_HOOK_TIMEOUT_SECONDS < 30.0, "Timeout constant must stay modest"
    return elapsed
|
||||
|
||||
|
||||
def test_shutdown_is_bounded_when_channel_stop_hangs():
    """Lifespan exit must complete near the configured timeout, not hang."""
    from app.gateway.app import _SHUTDOWN_HOOK_TIMEOUT_SECONDS

    elapsed = asyncio.run(_run_lifespan_with_hanging_stop())

    ceiling = _SHUTDOWN_HOOK_TIMEOUT_SECONDS + 2.0
    floor = _SHUTDOWN_HOOK_TIMEOUT_SECONDS - 0.5

    # Generous upper bound: timeout + 2s slack for scheduling overhead.
    assert elapsed < ceiling, f"Lifespan shutdown took {elapsed:.2f}s; expected <= {ceiling:.1f}s"
    # Lower bound: the wait_for should actually have waited.
    assert elapsed >= floor, f"Lifespan exited too quickly ({elapsed:.2f}s); wait_for may not have been invoked."
|
||||
Loading…
x
Reference in New Issue
Block a user