diff --git a/backend/app/gateway/app.py b/backend/app/gateway/app.py index 39d17498f..92f50b324 100644 --- a/backend/app/gateway/app.py +++ b/backend/app/gateway/app.py @@ -1,3 +1,4 @@ +import asyncio import logging from collections.abc import AsyncGenerator from contextlib import asynccontextmanager @@ -32,6 +33,11 @@ logging.basicConfig( logger = logging.getLogger(__name__) +# Upper bound (seconds) each lifespan shutdown hook is allowed to run. +# Bounds worker exit time so uvicorn's reload supervisor does not keep +# firing signals into a worker that is stuck waiting for shutdown cleanup. +_SHUTDOWN_HOOK_TIMEOUT_SECONDS = 5.0 + @asynccontextmanager async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: @@ -63,11 +69,19 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: yield - # Stop channel service on shutdown + # Stop channel service on shutdown (bounded to prevent worker hang) try: from app.channels.service import stop_channel_service - await stop_channel_service() + await asyncio.wait_for( + stop_channel_service(), + timeout=_SHUTDOWN_HOOK_TIMEOUT_SECONDS, + ) + except TimeoutError: + logger.warning( + "Channel service shutdown exceeded %.1fs; proceeding with worker exit.", + _SHUTDOWN_HOOK_TIMEOUT_SECONDS, + ) except Exception: logger.exception("Failed to stop channel service") diff --git a/backend/tests/test_gateway_lifespan_shutdown.py b/backend/tests/test_gateway_lifespan_shutdown.py new file mode 100644 index 000000000..9319c6268 --- /dev/null +++ b/backend/tests/test_gateway_lifespan_shutdown.py @@ -0,0 +1,68 @@ +"""Regression tests for Gateway lifespan shutdown. + +These tests guard the invariant that lifespan shutdown is *bounded*: a +misbehaving channel whose ``stop()`` blocks forever must not keep the +uvicorn worker alive. A hung worker is the precondition for the +signal-reentrancy deadlock described in +``app.gateway.app._SHUTDOWN_HOOK_TIMEOUT_SECONDS``. 
+"""
+
+from __future__ import annotations
+
+import asyncio
+from contextlib import asynccontextmanager
+from unittest.mock import MagicMock, patch
+
+from fastapi import FastAPI
+
+# Shutdown budget patched into ``app.gateway.app`` while the test runs, so the
+# suite does not have to sleep for the full production timeout on every run.
+_TEST_TIMEOUT_SECONDS = 0.2
+
+
+@asynccontextmanager
+async def _noop_langgraph_runtime(_app):
+    """Replacement for ``app.gateway.app.langgraph_runtime`` that does nothing."""
+    yield
+
+
+async def _run_lifespan_with_hanging_stop(timeout: float = _TEST_TIMEOUT_SECONDS) -> float:
+    """Drive the lifespan context with ``stop_channel_service`` hanging forever.
+
+    Args:
+        timeout: Value patched over ``_SHUTDOWN_HOOK_TIMEOUT_SECONDS`` for the
+            duration of the run.
+
+    Returns:
+        Elapsed seconds, measured on the running event loop's clock, for the
+        whole ``async with lifespan(...)`` block (startup + shutdown).
+    """
+    from app.gateway.app import lifespan
+
+    async def hang_forever() -> None:
+        # The event is never set, so this only ends when it is cancelled.
+        await asyncio.Event().wait()
+
+    fake_service = MagicMock()
+    fake_service.get_status = MagicMock(return_value={})
+
+    async def fake_start():
+        return fake_service
+
+    with (
+        # lifespan reads the constant at call time, so patching the module
+        # attribute is enough to shorten the wait.
+        patch("app.gateway.app._SHUTDOWN_HOOK_TIMEOUT_SECONDS", timeout),
+        patch("app.gateway.app.get_app_config"),
+        patch("app.gateway.app.get_gateway_config", return_value=MagicMock(host="x", port=0)),
+        patch("app.gateway.app.langgraph_runtime", _noop_langgraph_runtime),
+        patch("app.channels.service.start_channel_service", side_effect=fake_start),
+        patch("app.channels.service.stop_channel_service", side_effect=hang_forever),
+    ):
+        # We are inside a coroutine, so the running loop is the right clock.
+        loop = asyncio.get_running_loop()
+        started = loop.time()
+        async with lifespan(FastAPI()):
+            pass
+        return loop.time() - started
+
+
+def test_shutdown_timeout_constant_is_modest():
+    """The production shutdown budget must stay positive and small."""
+    from app.gateway.app import _SHUTDOWN_HOOK_TIMEOUT_SECONDS
+
+    assert 0 < _SHUTDOWN_HOOK_TIMEOUT_SECONDS < 30.0, "Timeout constant must stay modest"
+
+
+def test_shutdown_is_bounded_when_channel_stop_hangs():
+    """Lifespan exit must complete near the configured timeout, not hang."""
+    elapsed = asyncio.run(_run_lifespan_with_hanging_stop(_TEST_TIMEOUT_SECONDS))
+
+    # Generous upper bound: timeout + 2s slack for scheduling overhead.
+    assert elapsed < _TEST_TIMEOUT_SECONDS + 2.0, f"Lifespan shutdown took {elapsed:.2f}s; expected <= {_TEST_TIMEOUT_SECONDS + 2.0:.1f}s"
+    # Lower bound: the wait_for should actually have waited. Half the timeout
+    # leaves room for timers that fire slightly early on coarse clocks.
+    assert elapsed >= _TEST_TIMEOUT_SECONDS / 2, f"Lifespan exited too quickly ({elapsed:.2f}s); wait_for may not have been invoked."