From 8d2e55a05f6a23d4318b910b123d182d1adb9638 Mon Sep 17 00:00:00 2001 From: Xinmin Zeng <135568692+fancyboi999@users.noreply.github.com> Date: Sun, 7 Jun 2026 22:49:55 +0800 Subject: [PATCH] fix(subagent): structured subagent_status field over text parsing (#3146) (#3154) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(subagent): structured subagent_status field over text parsing Closes #3146. ## Why The frontend used to derive subtask card state by string-matching the leading text of the `task` tool's result. That contract surface was fragile — `#3107` BUG-007 and the `#3131` review both surfaced cases where new backend wording (`Task cancelled by user.`, `Task polling timed out after N minutes`, `ToolErrorHandlingMiddleware` exception wrappers) silently broke the card lifecycle. The frontend fallback kept growing more prefixes; any future rewording would break it again. ## Design 1. **Backend → frontend contract**: `ToolMessage.additional_kwargs` carries `subagent_status` (one of `completed | failed | cancelled | timed_out | polling_timed_out`) and an optional `subagent_error` blob. The frontend prefers it over parsing `content`. 2. **Centralised stamping, not 8 sprinkled stamps**: rather than have each of `task_tool.py`'s 5 normal-return + 3 pre-execution `Error:` paths remember to set `additional_kwargs`, `ToolErrorHandlingMiddleware` stamps the field after every task-tool call. Adding a new return path in `task_tool.py` cannot now skip the stamp. 3. **Cross-language contract fixture**: the prefix→status mapping is the one piece both sides must agree on. The shared fixture at `contracts/subagent_status_contract.json` lists every backend return string, the expected status, and what the error substring should contain. Backend test (`backend/tests/test_subagent_status_contract.py`) and frontend test (`frontend/tests/unit/core/tasks/subtask-result.test.ts`) both load that fixture and assert the same cases. A wording drift on either side fails the matching language's test. 4. **Round-trip serialisation pinned**: the round-trip test asserts `ToolMessage.model_dump_json()` → `model_validate_json()` preserves `additional_kwargs.subagent_status`. Catches the case where a future LangChain or Pydantic upgrade silently strips unknown kwargs. 5. **Frontend status collapse documented**: the backend has five status values, the frontend card has three (`completed | failed | in_progress`). `cancelled` / `timed_out` / `polling_timed_out` all collapse to `failed` with the original status preserved in `error`. `parseSubtaskResult` returns `in_progress` for unknown values so a backend that ships a new enum variant before the frontend upgrades degrades to the legacy prefix fallback instead of getting pinned. ## Changes Backend: - `deerflow.subagents.status_contract` — new module exporting `SUBAGENT_STATUS_KEY`, `SUBAGENT_ERROR_KEY`, `SUBAGENT_STATUS_VALUES`, `extract_subagent_status(content)`, and `make_subagent_additional_kwargs(status, error)`. - `ToolErrorHandlingMiddleware`: new `_stamp_task_subagent_status` helper centralises the stamp; `wrap_tool_call` / `awrap_tool_call` stamp on the success path; `_build_error_message` stamps on the wrapper path (carrying `ExcClass: detail` into `subagent_error`). Non-task tools are untouched. - New tests: `test_subagent_status_contract.py` (19 cases from the shared fixture + status-enum / blank-error / unknown-status rejection) and `test_tool_error_handling_subagent_stamp.py` (middleware integration: terminal-content stamps, non-terminal doesn't, non-task tools untouched, async path mirrors sync, existing additional_kwargs survive, JSON round-trip preserved). Frontend: - `parseSubtaskResult(text, additionalKwargs?)` — prefers the structured stamp; falls back to the legacy prefix matcher for historical threads / unknown future status values. - `STRUCTURED_STATUS_TO_SUBTASK` documents the five→three collapse. - `message-list.tsx` passes `message.additional_kwargs` through. - `subtask-result.test.ts` adds a structured-status block + a fixture-driven contract block; legacy prefix tests stay green for the fallback path. Contract: - `contracts/subagent_status_contract.json` — single source of truth both languages load. Whitespace variants, varied N for polling timeouts, the 3 pre-execution `Error:` returns task_tool produces, and the middleware wrapper shape are all in there. ## Test plan - `make lint` clean (backend + frontend). - `pytest tests/test_subagent_status_contract.py tests/test_tool_error_handling_subagent_stamp.py` → 37 passed. - `pnpm test --run` → 103 passed (was 76, +27 new). ## Migration / fallback retirement The text-prefix fallback stays in place until backend telemetry shows the frontend never hits it for newly produced messages. At that point a follow-up PR can drop the prefix branches and keep only the structured-status branch. Refs: bytedance/deer-flow#3138 (split summary), #3107 (origin), #3131 (prior prefix-only fix), #3146 (this issue). * fix(subtask): back-fill result/error from text when structured status present Three follow-ups on the PR #3154 review: 1. `readStructuredStatus` no longer short-circuits the prefix parse. The backend currently stamps only the `subagent_status` enum value; the human-facing `result` body and wrapped-error message still live in `ToolMessage.content`. Dropping the text parse meant successful tasks rendered empty completed pills and wrapped failures lost their diagnostic. Now both shapes get composed: structured status wins, `result`/`error` come from text when both sides agree, and a lying success body under a `failed` stamp is dropped instead of leaking. 2. Replace the ESM-incompatible `__dirname` fixture lookup in subtask-result.test.ts with `fileURLToPath(new URL(..., import.meta.url))`. The frontend package is `"type": "module"`, so the previous path would have thrown at runtime if anything ever changed under the contract directory. 3. Drop the `$schema` reference from contracts/subagent_status_contract.json pointing at a file that doesn't exist in the tree. Three new tests cover the structured + text composition: completed back-fills the success body, failed back-fills the wrapper text, and unrecognised content under a `failed` stamp stays empty rather than echoing noise. --- .../tool_error_handling_middleware.py | 62 +++++- .../deerflow/subagents/status_contract.py | 102 ++++++++++ .../tests/test_subagent_status_contract.py | 78 ++++++++ ...test_tool_error_handling_subagent_stamp.py | 151 +++++++++++++++ contracts/subagent_status_contract.json | 98 ++++++++++ .../workspace/messages/message-list.tsx | 1 + frontend/src/core/tasks/subtask-result.ts | 126 ++++++++++-- .../unit/core/tasks/subtask-result.test.ts | 179 +++++++++++++++++- 8 files changed, 780 insertions(+), 17 deletions(-) create mode 100644 backend/packages/harness/deerflow/subagents/status_contract.py create mode 100644 backend/tests/test_subagent_status_contract.py create mode 100644 backend/tests/test_tool_error_handling_subagent_stamp.py create mode 100644 contracts/subagent_status_contract.json diff --git a/backend/packages/harness/deerflow/agents/middlewares/tool_error_handling_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/tool_error_handling_middleware.py index fd590d5e9..8012f04ef 100644 --- a/backend/packages/harness/deerflow/agents/middlewares/tool_error_handling_middleware.py +++ b/backend/packages/harness/deerflow/agents/middlewares/tool_error_handling_middleware.py @@ -12,10 +12,45 @@ from langgraph.prebuilt.tool_node import ToolCallRequest from langgraph.types import Command from deerflow.config.app_config import AppConfig +from deerflow.subagents.status_contract import ( + extract_subagent_status, + make_subagent_additional_kwargs, +) logger = logging.getLogger(__name__) _MISSING_TOOL_CALL_ID = "missing_tool_call_id" +_TASK_TOOL_NAME = "task" + + +def _stamp_task_subagent_status(message: ToolMessage, *, tool_name: str, error: str | None = None) -> ToolMessage: + """Centralised stamping of ``additional_kwargs.subagent_status``. + + Bytedance/deer-flow issue #3146: the frontend now reads the subagent + status from a structured field instead of parsing the leading text of + the task tool's return string. That contract is enforced here, in the + one place every task tool result flows through, rather than at the 5 + normal-return + 3 ``Error:`` pre-execution branches inside + ``task_tool.py``. Centralisation prevents the "added a new return + path, forgot the stamp" drift mode. + + For non-``task`` tools this is a no-op so other tools' additional_kwargs + conventions are untouched. + """ + if tool_name != _TASK_TOOL_NAME: + return message + content = message.content if isinstance(message.content, str) else "" + status = extract_subagent_status(content) + if status is None: + # Non-terminal streaming chunks or unrecognised shapes leave the + # field unset so the frontend can keep the card on its in-progress + # placeholder until a real terminal frame arrives. + return message + stamp = make_subagent_additional_kwargs(status, error=error) + existing = dict(message.additional_kwargs or {}) + existing.update(stamp) + message.additional_kwargs = existing + return message class ToolErrorHandlingMiddleware(AgentMiddleware[AgentState]): @@ -29,12 +64,31 @@ class ToolErrorHandlingMiddleware(AgentMiddleware[AgentState]): detail = detail[:497] + "..." content = f"Error: Tool '{tool_name}' failed with {exc.__class__.__name__}: {detail}. Continue with available context, or choose an alternative tool." - return ToolMessage( + message = ToolMessage( content=content, tool_call_id=tool_call_id, name=tool_name, status="error", ) + # Stamp the structured subagent status on the wrapper too: the + # frontend would otherwise have to fall back to prefix-matching + # ``Error: Tool 'task' failed ...`` on the wire. The ``subagent_error`` + # carries the same ``ExcClass: detail`` shape the wrapper string + # uses so debugging artifacts stay aligned. + structured_error = f"{exc.__class__.__name__}: {detail}" + return _stamp_task_subagent_status(message, tool_name=tool_name, error=structured_error) + + @staticmethod + def _maybe_stamp(result: ToolMessage | Command, request: ToolCallRequest) -> ToolMessage | Command: + """Apply the subagent stamp to successful task tool returns. + + ``Command`` results bypass the stamp — they encode LangGraph + control flow rather than user-facing tool output. + """ + if not isinstance(result, ToolMessage): + return result + tool_name = str(request.tool_call.get("name") or "") + return _stamp_task_subagent_status(result, tool_name=tool_name) @override def wrap_tool_call( @@ -43,13 +97,14 @@ class ToolErrorHandlingMiddleware(AgentMiddleware[AgentState]): handler: Callable[[ToolCallRequest], ToolMessage | Command], ) -> ToolMessage | Command: try: - return handler(request) + result = handler(request) except GraphBubbleUp: # Preserve LangGraph control-flow signals (interrupt/pause/resume). raise except Exception as exc: logger.exception("Tool execution failed (sync): name=%s id=%s", request.tool_call.get("name"), request.tool_call.get("id")) return self._build_error_message(request, exc) + return self._maybe_stamp(result, request) @override async def awrap_tool_call( @@ -58,13 +113,14 @@ class ToolErrorHandlingMiddleware(AgentMiddleware[AgentState]): handler: Callable[[ToolCallRequest], Awaitable[ToolMessage | Command]], ) -> ToolMessage | Command: try: - return await handler(request) + result = await handler(request) except GraphBubbleUp: # Preserve LangGraph control-flow signals (interrupt/pause/resume). raise except Exception as exc: logger.exception("Tool execution failed (async): name=%s id=%s", request.tool_call.get("name"), request.tool_call.get("id")) return self._build_error_message(request, exc) + return self._maybe_stamp(result, request) def _build_runtime_middlewares( diff --git a/backend/packages/harness/deerflow/subagents/status_contract.py b/backend/packages/harness/deerflow/subagents/status_contract.py new file mode 100644 index 000000000..03fa010af --- /dev/null +++ b/backend/packages/harness/deerflow/subagents/status_contract.py @@ -0,0 +1,102 @@ +"""Backend↔frontend contract for the structured subagent status. + +Bytedance/deer-flow issue #3146: the frontend used to derive the +subtask card state by string-matching the leading text of the +``task`` tool's result. That contract was fragile — any rewording on +the backend silently broke the card lifecycle, and the issue history +of #3107 BUG-007 / #3131 review showed it repeatedly. + +This module replaces the text-shaped contract with a small structured +one carried inside ``ToolMessage.additional_kwargs``: + +- ``subagent_status``: one of ``SUBAGENT_STATUS_VALUES``. +- ``subagent_error`` (optional): the human-readable error blob the + backend recorded. + +The mapping from "task tool result text" to status is the one piece +the backend stamper (``ToolErrorHandlingMiddleware``) and the +frontend fallback parser must agree on. The shared fixture at +``contracts/subagent_status_contract.json`` is the single source of +truth — both sides' tests load it and assert behaviour. +""" + +from __future__ import annotations + +from typing import Literal + +SUBAGENT_STATUS_KEY = "subagent_status" +SUBAGENT_ERROR_KEY = "subagent_error" + +SubagentStatusValue = Literal[ + "completed", + "failed", + "cancelled", + "timed_out", + "polling_timed_out", +] + +#: Enumeration of every value ``subagent_status`` may take. Mirrors the +#: ``valid_status_values`` array in the shared fixture; the contract test +#: pins them against each other. +SUBAGENT_STATUS_VALUES: tuple[SubagentStatusValue, ...] = ( + "completed", + "failed", + "cancelled", + "timed_out", + "polling_timed_out", +) + +# Prefix table — ordered most-specific-first because some prefixes are +# substrings of others ("Task timed out" vs "Task polling timed out", "Task +# failed" vs "Task failed. Error: ..."). The "Task " prefixes come from +# ``task_tool.py``'s 5 normal-return strings; the bare ``Error:`` prefix +# catches both the 3 ``Error:`` pre-execution returns and the wrapper +# produced by ``ToolErrorHandlingMiddleware`` for any task tool exception. +_PREFIX_TO_STATUS: tuple[tuple[str, SubagentStatusValue], ...] = ( + ("Task Succeeded. Result:", "completed"), + ("Task polling timed out", "polling_timed_out"), + ("Task timed out", "timed_out"), + ("Task cancelled by user", "cancelled"), + ("Task failed.", "failed"), + ("Error", "failed"), +) + + +def extract_subagent_status(content: str) -> SubagentStatusValue | None: + """Infer the structured status for a ``task`` tool result string. + + Returns ``None`` when the content does not match any known terminal + prefix. Non-terminal streaming chunks fall into this branch by + design — the middleware then leaves ``subagent_status`` unset so + the frontend keeps the card on its in-progress placeholder until + the real terminal frame arrives. + """ + trimmed = content.strip() + for prefix, status in _PREFIX_TO_STATUS: + if trimmed.startswith(prefix): + return status + return None + + +def make_subagent_additional_kwargs( + status: SubagentStatusValue, + *, + error: str | None = None, +) -> dict[str, str]: + """Build the ``additional_kwargs`` payload the middleware stamps. + + Drops the error field when blank so the JSON wire format never carries + a misleading empty ``subagent_error: ""``. + + Raises: + ValueError: when ``status`` is not in :data:`SUBAGENT_STATUS_VALUES`. + We do not accept arbitrary strings: a typo would silently leak + through to the frontend and degrade to the legacy prefix + fallback rather than failing loudly. + """ + if status not in SUBAGENT_STATUS_VALUES: + raise ValueError(f"invalid subagent status {status!r}; expected one of {SUBAGENT_STATUS_VALUES}") + payload: dict[str, str] = {SUBAGENT_STATUS_KEY: status} + if error and error.strip(): + payload[SUBAGENT_ERROR_KEY] = error.strip() + return payload diff --git a/backend/tests/test_subagent_status_contract.py b/backend/tests/test_subagent_status_contract.py new file mode 100644 index 000000000..95e35ca42 --- /dev/null +++ b/backend/tests/test_subagent_status_contract.py @@ -0,0 +1,78 @@ +"""Contract tests for ``deerflow.subagents.status_contract``. + +Bytedance/deer-flow issue #3146: the backend stamps +``ToolMessage.additional_kwargs.subagent_status`` so the frontend can read +the subagent state from a structured field instead of parsing the result +text. The mapping from "task tool result text" to status is shared with the +frontend through the cross-language fixture file +``contracts/subagent_status_contract.json``. + +These tests pin the backend implementation against that fixture so any +edit on either side surfaces immediately as a test failure. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from deerflow.subagents.status_contract import ( + SUBAGENT_ERROR_KEY, + SUBAGENT_STATUS_KEY, + SUBAGENT_STATUS_VALUES, + extract_subagent_status, + make_subagent_additional_kwargs, +) + +_REPO_ROOT = Path(__file__).resolve().parents[2] +_CONTRACT_PATH = _REPO_ROOT / "contracts" / "subagent_status_contract.json" + + +def _load_contract() -> dict: + return json.loads(_CONTRACT_PATH.read_text(encoding="utf-8")) + + +def test_contract_file_exists(): + assert _CONTRACT_PATH.is_file(), f"missing shared fixture: {_CONTRACT_PATH}" + + +def test_status_values_match_contract(): + """Backend status enum stays aligned with the contract document.""" + contract = _load_contract() + assert set(SUBAGENT_STATUS_VALUES) == set(contract["valid_status_values"]) + + +@pytest.mark.parametrize("case", _load_contract()["cases"], ids=lambda c: c["name"]) +def test_extract_subagent_status_matches_contract(case): + """Every fixture case maps through ``extract_subagent_status`` to the + expected status — covers task_tool's 5 normal returns, the 3 + pre-execution ``Error:`` returns, the middleware-wrapped exception + case, whitespace handling, and the streaming chunk that must stay + unrecognised. + """ + status = extract_subagent_status(case["content"]) + assert status == case["expected_status"], f"case {case['name']!r}: expected {case['expected_status']!r}, got {status!r}" + + +def test_make_subagent_additional_kwargs_includes_status(): + kwargs = make_subagent_additional_kwargs("completed") + assert kwargs == {SUBAGENT_STATUS_KEY: "completed"} + + +def test_make_subagent_additional_kwargs_includes_error_when_present(): + kwargs = make_subagent_additional_kwargs("failed", error="boom") + assert kwargs == {SUBAGENT_STATUS_KEY: "failed", SUBAGENT_ERROR_KEY: "boom"} + + +def test_make_subagent_additional_kwargs_omits_blank_error(): + """Empty / whitespace error must not leak as ``subagent_error: ""``.""" + assert make_subagent_additional_kwargs("failed", error="") == {SUBAGENT_STATUS_KEY: "failed"} + assert make_subagent_additional_kwargs("failed", error=" ") == {SUBAGENT_STATUS_KEY: "failed"} + assert make_subagent_additional_kwargs("failed", error=None) == {SUBAGENT_STATUS_KEY: "failed"} + + +def test_make_subagent_additional_kwargs_rejects_unknown_status(): + with pytest.raises(ValueError, match="invalid subagent status"): + make_subagent_additional_kwargs("garbage") # type: ignore[arg-type] diff --git a/backend/tests/test_tool_error_handling_subagent_stamp.py b/backend/tests/test_tool_error_handling_subagent_stamp.py new file mode 100644 index 000000000..b130e5fe5 --- /dev/null +++ b/backend/tests/test_tool_error_handling_subagent_stamp.py @@ -0,0 +1,151 @@ +"""Regression tests for ToolErrorHandlingMiddleware's subagent status stamp. + +Bytedance/deer-flow issue #3146: rather than stamp +``ToolMessage.additional_kwargs.subagent_status`` from each of +task_tool.py's 5 normal returns + 3 pre-execution Error: returns (which +would be 8 separate places to drift over time), the middleware that +already wraps every tool call does the stamping in one place. These +tests pin that centralisation. + +For non-``task`` tools the middleware must not touch additional_kwargs +— other tools have their own conventions and we do not want to leak a +``subagent_status`` field onto them. +""" + +from __future__ import annotations + +import asyncio +import json +from pathlib import Path + +import pytest +from langchain_core.messages import ToolMessage + +from deerflow.agents.middlewares.tool_error_handling_middleware import ( + ToolErrorHandlingMiddleware, +) +from deerflow.subagents.status_contract import ( + SUBAGENT_ERROR_KEY, + SUBAGENT_STATUS_KEY, +) + +_CONTRACT_PATH = Path(__file__).resolve().parents[2] / "contracts" / "subagent_status_contract.json" + + +def _load_terminal_cases() -> list[dict]: + """Load only the cases that should produce a terminal status stamp.""" + data = json.loads(_CONTRACT_PATH.read_text(encoding="utf-8")) + return [c for c in data["cases"] if c["expected_status"] is not None] + + +class _FakeRequest: + """Stand-in for ``ToolCallRequest`` used by the middleware.""" + + def __init__(self, tool_name: str, tool_call_id: str = "call-1") -> None: + self.tool_call = {"name": tool_name, "id": tool_call_id} + + +@pytest.mark.parametrize("case", _load_terminal_cases(), ids=lambda c: c["name"]) +def test_stamps_subagent_status_on_successful_task_return(case): + """Every terminal task tool result string stamps the matching status.""" + middleware = ToolErrorHandlingMiddleware() + request = _FakeRequest("task") + + def handler(_req): + return ToolMessage(content=case["content"], tool_call_id="call-1", name="task") + + result = middleware.wrap_tool_call(request, handler) + assert isinstance(result, ToolMessage) + assert result.additional_kwargs.get(SUBAGENT_STATUS_KEY) == case["expected_status"] + + +def test_does_not_stamp_unknown_streaming_chunk(): + """Non-terminal content leaves additional_kwargs alone.""" + middleware = ToolErrorHandlingMiddleware() + request = _FakeRequest("task") + + def handler(_req): + return ToolMessage(content="Investigating ...", tool_call_id="call-1", name="task") + + result = middleware.wrap_tool_call(request, handler) + assert SUBAGENT_STATUS_KEY not in (result.additional_kwargs or {}) + + +def test_does_not_stamp_non_task_tool(): + """A non-task tool returning a string that happens to start with + ``Error:`` must not pick up a subagent stamp.""" + middleware = ToolErrorHandlingMiddleware() + request = _FakeRequest("bash") + + def handler(_req): + return ToolMessage(content="Error: command not found", tool_call_id="call-1", name="bash") + + result = middleware.wrap_tool_call(request, handler) + assert SUBAGENT_STATUS_KEY not in (result.additional_kwargs or {}) + + +def test_stamps_failed_when_task_tool_raises(): + """The exception path goes through ``_build_error_message`` which is + the only place ToolErrorHandlingMiddleware ever emits a brand-new + ToolMessage. It must stamp ``failed`` for task too, since the wrapper + text starts with ``Error:``. + """ + middleware = ToolErrorHandlingMiddleware() + request = _FakeRequest("task") + + def handler(_req): + raise RuntimeError("blew up during execution") + + result = middleware.wrap_tool_call(request, handler) + assert isinstance(result, ToolMessage) + assert result.additional_kwargs.get(SUBAGENT_STATUS_KEY) == "failed" + assert "RuntimeError" in result.additional_kwargs.get(SUBAGENT_ERROR_KEY, "") + + +def test_async_wrap_also_stamps(): + """The async wrap path must behave identically.""" + middleware = ToolErrorHandlingMiddleware() + request = _FakeRequest("task") + + async def handler(_req): + return ToolMessage(content="Task Succeeded. Result: ok", tool_call_id="call-1", name="task") + + result = asyncio.run(middleware.awrap_tool_call(request, handler)) + assert result.additional_kwargs.get(SUBAGENT_STATUS_KEY) == "completed" + + +def test_preserves_existing_additional_kwargs(): + """The stamper must not clobber unrelated fields the tool already set.""" + middleware = ToolErrorHandlingMiddleware() + request = _FakeRequest("task") + + def handler(_req): + return ToolMessage( + content="Task Succeeded. Result: ok", + tool_call_id="call-1", + name="task", + additional_kwargs={"existing_field": "must_survive"}, + ) + + result = middleware.wrap_tool_call(request, handler) + assert result.additional_kwargs.get("existing_field") == "must_survive" + assert result.additional_kwargs.get(SUBAGENT_STATUS_KEY) == "completed" + + +def test_additional_kwargs_round_trip_via_json(): + """Pydantic dump → JSON → restore must keep the stamp intact. + + ``ToolMessage`` is what LangGraph serialises into the checkpoint and + what the frontend deserialises off the stream. If a future Pydantic / + LangChain upgrade silently strips unknown ``additional_kwargs`` we + want that to fail loudly here rather than in the wild. + """ + msg = ToolMessage( + content="Task Succeeded. Result: ok", + tool_call_id="call-1", + name="task", + additional_kwargs={SUBAGENT_STATUS_KEY: "completed", SUBAGENT_ERROR_KEY: ""}, + ) + serialised = msg.model_dump_json() + restored = ToolMessage.model_validate_json(serialised) + assert restored.additional_kwargs.get(SUBAGENT_STATUS_KEY) == "completed" diff --git a/contracts/subagent_status_contract.json b/contracts/subagent_status_contract.json new file mode 100644 index 000000000..b89e1fdc1 --- /dev/null +++ b/contracts/subagent_status_contract.json @@ -0,0 +1,98 @@ +{ + "version": 1, + "description": "Cross-language contract test fixture for the subagent status field. The backend stamps ToolMessage.additional_kwargs.subagent_status using these prefixes; the frontend reads the structured field and falls back to the same prefixes. Both sides' tests load this file and must agree.", + "valid_status_values": ["completed", "failed", "cancelled", "timed_out", "polling_timed_out"], + "cases": [ + { + "name": "succeeded", + "origin": "task_tool.py succeeded path", + "content": "Task Succeeded. Result: investigated and produced a 3-page report", + "expected_status": "completed", + "expected_error_contains": null + }, + { + "name": "failed", + "origin": "task_tool.py failed path", + "content": "Task failed. Error: underlying tool raised RuntimeError", + "expected_status": "failed", + "expected_error_contains": "RuntimeError" + }, + { + "name": "cancelled", + "origin": "task_tool.py cancelled path", + "content": "Task cancelled by user.", + "expected_status": "cancelled", + "expected_error_contains": null + }, + { + "name": "timed_out", + "origin": "task_tool.py timed_out path", + "content": "Task timed out. Error: 900 seconds", + "expected_status": "timed_out", + "expected_error_contains": "900" + }, + { + "name": "polling_timed_out", + "origin": "task_tool.py polling timeout safety-net path", + "content": "Task polling timed out after 15 minutes. This may indicate the background task is stuck. Status: RUNNING", + "expected_status": "polling_timed_out", + "expected_error_contains": "15" + }, + { + "name": "polling_timed_out_other_n", + "origin": "varied N coverage", + "content": "Task polling timed out after 1 minutes. Status: RUNNING", + "expected_status": "polling_timed_out", + "expected_error_contains": null + }, + { + "name": "pre_unknown_subagent", + "origin": "task_tool.py pre-execution Error path (unknown subagent type)", + "content": "Error: Unknown subagent type 'foo'. Available: bash, general-purpose", + "expected_status": "failed", + "expected_error_contains": "Unknown subagent" + }, + { + "name": "pre_bash_disabled", + "origin": "task_tool.py pre-execution Error path (host bash disabled)", + "content": "Error: Host bash subagent is disabled by configuration", + "expected_status": "failed", + "expected_error_contains": "disabled" + }, + { + "name": "pre_task_disappeared", + "origin": "task_tool.py pre-execution Error path (background task disappeared)", + "content": "Error: Task 1234 disappeared from background tasks", + "expected_status": "failed", + "expected_error_contains": "disappeared" + }, + { + "name": "wrapper_error", + "origin": "ToolErrorHandlingMiddleware wrap on tool exception", + "content": "Error: Tool 'task' failed with TypeError: 'AsyncCallbackManager' object is not iterable. Continue with available context, or choose an alternative tool.", + "expected_status": "failed", + "expected_error_contains": "TypeError" + }, + { + "name": "streaming_chunk_unknown", + "origin": "non-terminal chunk reaching parser", + "content": "Investigating ...", + "expected_status": null, + "expected_error_contains": null + }, + { + "name": "succeeded_with_surrounding_whitespace", + "origin": "streaming sometimes prepends/appends newlines", + "content": " Task Succeeded. Result: ok ", + "expected_status": "completed", + "expected_error_contains": "ok" + }, + { + "name": "cancelled_with_surrounding_whitespace", + "origin": "streaming whitespace coverage", + "content": " Task cancelled by user.\n", + "expected_status": "cancelled", + "expected_error_contains": null + } + ] +} diff --git a/frontend/src/components/workspace/messages/message-list.tsx b/frontend/src/components/workspace/messages/message-list.tsx index ca8672a3a..df702ef6c 100644 --- a/frontend/src/components/workspace/messages/message-list.tsx +++ b/frontend/src/components/workspace/messages/message-list.tsx @@ -375,6 +375,7 @@ export function MessageList({ if (taskId) { const parsed = parseSubtaskResult( extractTextFromMessage(message), + message.additional_kwargs, ); updateSubtask({ id: taskId, ...parsed }); } diff --git a/frontend/src/core/tasks/subtask-result.ts b/frontend/src/core/tasks/subtask-result.ts index ac4a422a9..8cec19c59 100644 --- a/frontend/src/core/tasks/subtask-result.ts +++ b/frontend/src/core/tasks/subtask-result.ts @@ -8,6 +8,35 @@ export interface SubtaskResultUpdate { error?: string; } +/** + * Structured-status keys the backend stamps onto + * ``ToolMessage.additional_kwargs`` for every ``task`` tool result. + * + * The values mirror the Python contract in + * ``backend/packages/harness/deerflow/subagents/status_contract.py`` + * (``SUBAGENT_STATUS_KEY`` / ``SUBAGENT_ERROR_KEY``). The cross-language + * fixture at ``contracts/subagent_status_contract.json`` pins both sides + * to the same values. + */ +export const SUBAGENT_STATUS_KEY = "subagent_status"; +export const SUBAGENT_ERROR_KEY = "subagent_error"; + +/** + * Map from the backend ``subagent_status`` value to the frontend + * {@link SubtaskStatus} enum. The frontend collapses ``cancelled`` / + * ``timed_out`` / ``polling_timed_out`` into ``failed`` because the + * subtask card only renders three pill states. The richer backend + * vocabulary still survives on ``error`` for tooling that wants the + * detail. + */ +const STRUCTURED_STATUS_TO_SUBTASK: Record = { + completed: "completed", + failed: "failed", + cancelled: "failed", + timed_out: "failed", + polling_timed_out: "failed", +}; + /** * Prefix strings the backend `task` tool writes into its result `content`. * @@ -34,24 +63,68 @@ export const POLLING_TIMEOUT_PREFIX = "Task polling timed out"; export const ERROR_WRAPPER_PATTERN = /^Error\b/i; /** - * Map a `task` tool result string to a {@link SubtaskStatus}. + * Map a `task` tool result to a {@link SubtaskStatus}. * - * Bytedance/deer-flow issue #3107 BUG-007: parent-visible task tool errors do - * not always start with one of the three legacy prefixes (e.g. when - * `ToolErrorHandlingMiddleware` wraps an exception as - * `Error: Tool 'task' failed ...`). Treat any leading `Error:` token as a - * terminal failure so subtask cards stop being stuck on "in_progress". + * Bytedance/deer-flow issue #3146: prefers the structured + * ``additional_kwargs.subagent_status`` field the backend now stamps via + * ``ToolErrorHandlingMiddleware``. Falls back to the legacy prefix + * matching for messages that pre-date the stamping commit (historical + * threads, third-party clients, or any tool path that bypasses the + * middleware). Both shapes converge on the same {@link SubtaskStatus} + * vocabulary the card UI renders. + * + * When the structured field is present, the prefix parser is still run + * so the success `result` body and the wrapped-error message can be + * back-filled from `content`. Today the backend only stamps the + * `subagent_status` enum value — the human-facing payload still lives + * in `content`, so dropping the prefix parse would regress the subtask + * card display. Structured fields win on conflict: if `subagent_status` + * and the text disagree, the text-derived `result`/`error` are + * discarded so a malformed wrapper can't sneak through. * * Returning `in_progress` is the **deliberate** fallback for content that - * matches none of the known prefixes. LangChain only ever emits a - * `ToolMessage` once the tool itself has returned (success or wrapped - * exception), so an unknown shape means "the contract changed underneath us" - * — surfacing it as still-running prompts the operator to investigate, where - * eagerly marking it terminal-failed would mask the drift. + * matches none of the known prefixes and carries no structured stamp. + * LangChain only ever emits a `ToolMessage` once the tool itself has + * returned (success or wrapped exception), so an unknown shape means + * "the contract changed underneath us" — surfacing it as still-running + * prompts the operator to investigate, where eagerly marking it + * terminal-failed would mask the drift. */ -export function parseSubtaskResult(text: string): SubtaskResultUpdate { - const trimmed = text.trim(); +export function parseSubtaskResult( + text: string, + additionalKwargs?: Record | null, +): SubtaskResultUpdate { + const fromText = parseFromText(text.trim()); + const structured = readStructuredStatus(additionalKwargs); + if (!structured) { + return fromText; + } + const update: SubtaskResultUpdate = { status: structured.status }; + // Structured `subagent_error` wins; otherwise inherit the text-derived + // error only when both sides agree on the status (so a "Task Succeeded" + // body can't bleed into a `failed` structured stamp and vice versa). + if (structured.error) { + update.error = structured.error; + } else if ( + fromText.status === structured.status && + fromText.error !== undefined + ) { + update.error = fromText.error; + } + // Result body only matters for `completed`; require text agreement so + // a lying success prefix under a `failed` stamp is dropped. + if ( + structured.status === "completed" && + fromText.status === "completed" && + fromText.result !== undefined + ) { + update.result = fromText.result; + } + return update; +} + +function parseFromText(trimmed: string): SubtaskResultUpdate { if (trimmed.startsWith(SUCCESS_PREFIX)) { return { status: "completed", @@ -86,3 +159,30 @@ export function parseSubtaskResult(text: string): SubtaskResultUpdate { return { status: "in_progress" }; } + +interface StructuredStatus { + status: SubtaskStatus; + error?: string; +} + +function readStructuredStatus( + additionalKwargs: Record | null | undefined, +): StructuredStatus | null { + if (!additionalKwargs) return null; + const rawStatus = additionalKwargs[SUBAGENT_STATUS_KEY]; + if (typeof rawStatus !== "string") return null; + const mapped = STRUCTURED_STATUS_TO_SUBTASK[rawStatus]; + if (mapped === undefined) { + // Unknown future status value — stay on the legacy prefix fallback + // so a backend that ships a new enum variant before the frontend + // upgrades still renders something predictable instead of getting + // pinned to "in_progress" by an empty branch. + return null; + } + const rawError = additionalKwargs[SUBAGENT_ERROR_KEY]; + const result: StructuredStatus = { status: mapped }; + if (typeof rawError === "string" && rawError.trim()) { + result.error = rawError; + } + return result; +} diff --git a/frontend/tests/unit/core/tasks/subtask-result.test.ts b/frontend/tests/unit/core/tasks/subtask-result.test.ts index 4f0597fda..37840b1d4 100644 --- a/frontend/tests/unit/core/tasks/subtask-result.test.ts +++ b/frontend/tests/unit/core/tasks/subtask-result.test.ts @@ -1,6 +1,37 @@ +import { readFileSync } from "node:fs"; +import { fileURLToPath } from "node:url"; + import { describe, expect, it } from "vitest"; -import { parseSubtaskResult } from "@/core/tasks/subtask-result"; +import { + SUBAGENT_ERROR_KEY, + SUBAGENT_STATUS_KEY, + parseSubtaskResult, +} from "@/core/tasks/subtask-result"; + +interface ContractCase { + name: string; + content: string; + expected_status: string | null; + expected_error_contains: string | null; +} + +interface ContractFile { + valid_status_values: string[]; + cases: ContractCase[]; +} + +// The frontend package is ESM (`"type": "module"`), so `__dirname` is not +// defined. Resolve the cross-language fixture relative to this module URL. +const CONTRACT_PATH = fileURLToPath( + new URL( + "../../../../../contracts/subagent_status_contract.json", + import.meta.url, + ), +); +const CONTRACT: ContractFile = JSON.parse( + readFileSync(CONTRACT_PATH, "utf-8"), +) as ContractFile; describe("parseSubtaskResult", () => { it("recognises the standard success prefix", () => { @@ -110,3 +141,149 @@ describe("parseSubtaskResult", () => { expect(parsed.result).toBe("ok"); }); }); + +/** + * Structured-status path (bytedance/deer-flow#3146). + * + * The backend stamps `ToolMessage.additional_kwargs.subagent_status` + * directly. The frontend should prefer that over reverse-engineering it + * from the content string. + */ +describe("parseSubtaskResult — structured additional_kwargs (preferred path)", () => { + it("uses additional_kwargs.subagent_status when present", () => { + const parsed = parseSubtaskResult("Task Succeeded. Result: foo", { + [SUBAGENT_STATUS_KEY]: "completed", + }); + expect(parsed.status).toBe("completed"); + }); + + it("collapses cancelled / timed_out / polling_timed_out to failed for the card UI", () => { + for (const backendStatus of [ + "cancelled", + "timed_out", + "polling_timed_out", + ]) { + const parsed = parseSubtaskResult("anything at all", { + [SUBAGENT_STATUS_KEY]: backendStatus, + }); + expect(parsed.status).toBe("failed"); + } + }); + + it("uses subagent_error when supplied", () => { + const parsed = parseSubtaskResult("ignored content", { + [SUBAGENT_STATUS_KEY]: "failed", + [SUBAGENT_ERROR_KEY]: "boom from backend", + }); + expect(parsed.status).toBe("failed"); + expect(parsed.error).toBe("boom from backend"); + }); + + it("ignores empty / non-string subagent_error", () => { + const parsed = parseSubtaskResult("ignored content", { + [SUBAGENT_STATUS_KEY]: "failed", + [SUBAGENT_ERROR_KEY]: "", + }); + expect(parsed.status).toBe("failed"); + expect(parsed.error).toBeUndefined(); + }); + + it("falls back to prefix parsing when the structured status is missing", () => { + const parsed = parseSubtaskResult("Task Succeeded. Result: foo", { + // No subagent_status here — backend versions that pre-date the + // middleware stamping commit still need to render. + other_field: "irrelevant", + }); + expect(parsed.status).toBe("completed"); + expect(parsed.result).toBe("foo"); + }); + + it("falls back to prefix parsing when the structured status is an unknown future value", () => { + const parsed = parseSubtaskResult("Task Succeeded. Result: foo", { + [SUBAGENT_STATUS_KEY]: "renamed_in_v3", + }); + // Falls back to prefix and still finds the success path. + expect(parsed.status).toBe("completed"); + }); + + it("structured status overrides legacy text — opposite content", () => { + // Defence: if backend sends `failed` structured but the content + // accidentally starts with "Task Succeeded.", we must trust the + // structured field. The structured field is the source of truth. + const parsed = parseSubtaskResult("Task Succeeded. Result: this is a lie", { + [SUBAGENT_STATUS_KEY]: "failed", + }); + expect(parsed.status).toBe("failed"); + // The misleading success body must be dropped — `result` is reserved + // for the completed pill, and the suspicious text isn't replayed as + // an error either. + expect(parsed.result).toBeUndefined(); + expect(parsed.error).toBeUndefined(); + }); + + it("back-fills `result` from the success-prefixed content when structured says completed", () => { + // The backend currently stamps `subagent_status: completed` but the + // success body still lives in `content`. Without back-fill the card + // would render an empty completed pill (regression flagged in PR #3154 + // Copilot review). + const parsed = parseSubtaskResult( + "Task Succeeded. Result: investigated and produced a 3-page report", + { [SUBAGENT_STATUS_KEY]: "completed" }, + ); + expect(parsed.status).toBe("completed"); + expect(parsed.result).toBe("investigated and produced a 3-page report"); + }); + + it("back-fills `error` from a wrapped-error body when structured says failed and no subagent_error", () => { + // Same regression on the failure side: the wrapper text is the only + // place the diagnostic message exists when the backend stamps the + // enum but not `subagent_error`. + const parsed = parseSubtaskResult( + "Error: Tool 'task' failed with TypeError: boom", + { [SUBAGENT_STATUS_KEY]: "failed" }, + ); + expect(parsed.status).toBe("failed"); + expect(parsed.error).toContain("TypeError: boom"); + }); + + it("leaves `error` undefined when structured says failed with no error and unrecognised text", () => { + // Don't dump arbitrary content into the error field — better to render + // an empty `failed` pill than to surface noise. + const parsed = parseSubtaskResult("partial streaming chunk", { + [SUBAGENT_STATUS_KEY]: "failed", + }); + expect(parsed.status).toBe("failed"); + expect(parsed.error).toBeUndefined(); + }); +}); + +/** + * Cross-language contract test (bytedance/deer-flow#3146). + * + * Loads the shared fixture at ``contracts/subagent_status_contract.json`` + * and runs every case through the legacy prefix parser. The matching + * backend test (`backend/tests/test_subagent_status_contract.py`) runs + * the same cases through ``extract_subagent_status``. Any drift between + * the two implementations surfaces here. + * + * Status-collapse expectations: + * - `completed` → `completed` + * - `failed` → `failed` + * - `cancelled` / `timed_out` / `polling_timed_out` → `failed` + * (the frontend card has three pill states, not five) + * - `null` → `in_progress` + */ +describe("parseSubtaskResult — shared contract fixture", () => { + const expectedCardStatus = (backendStatus: string | null): string => { + if (backendStatus === null) return "in_progress"; + if (backendStatus === "completed") return "completed"; + return "failed"; + }; + + for (const c of CONTRACT.cases) { + it(`legacy prefix parser matches contract: ${c.name}`, () => { + const parsed = parseSubtaskResult(c.content); + expect(parsed.status).toBe(expectedCardStatus(c.expected_status)); + }); + } +});