diff --git a/.github/workflows/replay-e2e.yml b/.github/workflows/replay-e2e.yml new file mode 100644 index 000000000..b0c3acae8 --- /dev/null +++ b/.github/workflows/replay-e2e.yml @@ -0,0 +1,108 @@ +name: Replay E2E (front-back contract) + +# Guards the front-back contract via record/replay (no API key in CI): +# Layer 1 — backend golden: replay a recorded trace through the real gateway, +# assert the SSE event sequence matches the committed golden. +# Layer 2 — full-stack render: real Next.js frontend + real gateway (replay +# model) + Chromium; assert the replayed turns render in the browser. +# Triggered by changes on EITHER side of the contract so a backend change can no +# longer pass without the frontend-facing checks running. + +on: + push: + branches: ["main"] + paths: + - "frontend/**" + - "backend/app/gateway/**" + - "backend/packages/harness/**" + - "backend/tests/fixtures/replay/**" + - "backend/tests/replay_provider.py" + - "backend/tests/_replay_fixture.py" + - "backend/tests/seed_runs_router.py" + - "backend/tests/test_replay_golden.py" + - "backend/scripts/run_replay_gateway.py" + - ".github/workflows/replay-e2e.yml" + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + paths: + - "frontend/**" + - "backend/app/gateway/**" + - "backend/packages/harness/**" + - "backend/tests/fixtures/replay/**" + - "backend/tests/replay_provider.py" + - "backend/tests/_replay_fixture.py" + - "backend/tests/seed_runs_router.py" + - "backend/tests/test_replay_golden.py" + - "backend/scripts/run_replay_gateway.py" + - ".github/workflows/replay-e2e.yml" + +concurrency: + group: replay-e2e-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + backend-replay-golden: + name: Layer 1 — backend golden (no API key) + if: github.event_name != 'pull_request' || github.event.pull_request.draft == false + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: 
actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + - name: Install uv + uses: astral-sh/setup-uv@v7 + - name: Install backend dependencies + working-directory: backend + run: uv sync --group dev + - name: Replay golden (backend SSE contract) + working-directory: backend + run: PYTHONPATH=. uv run pytest tests/test_replay_golden.py -v + + fullstack-replay-render: + name: Layer 2 — full-stack render (no API key) + if: github.event_name != 'pull_request' || github.event.pull_request.draft == false + runs-on: ubuntu-latest + timeout-minutes: 25 + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + - name: Install uv + uses: astral-sh/setup-uv@v7 + - name: Install backend dependencies (replay gateway) + working-directory: backend + run: uv sync --group dev + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: "22" + - name: Enable Corepack + run: corepack enable + - name: Use pinned pnpm version + run: corepack prepare pnpm@10.26.2 --activate + - name: Install frontend dependencies + working-directory: frontend + run: pnpm install --frozen-lockfile + - name: Install Playwright Chromium + working-directory: frontend + run: npx playwright install chromium --with-deps + - name: Full-stack replay render (DOM assertions are the gate) + working-directory: frontend + run: pnpm exec playwright test -c playwright.real-backend.config.ts + - name: Upload report + render artifact + uses: actions/upload-artifact@v4 + if: ${{ !cancelled() }} + with: + name: replay-render + path: | + frontend/playwright-report/ + frontend/test-results/ + retention-days: 7 diff --git a/backend/docs/REPLAY_E2E.md b/backend/docs/REPLAY_E2E.md new file mode 100644 index 000000000..546e160c2 --- /dev/null +++ b/backend/docs/REPLAY_E2E.md @@ -0,0 +1,103 @@ +# Record/Replay E2E — front-back contract verification + +Deterministic, **key-free** 
end-to-end checks that a backend change can't +silently break the frontend (and vice-versa). Two complementary layers, fed by a +single recording. + +## Why + +The mock-based frontend e2e hand-writes the backend's JSON/SSE, so a backend +schema or SSE change passes green ("fake green"). These layers replay a recorded +**real** run against the **real** backend (and, for Layer 2, the real frontend), +so contract drift turns the build red instead. + +## The two layers + +- **Layer 1 — backend golden** (`tests/test_replay_golden.py`): replays a fixture + through the real FastAPI gateway with `ReplayChatModel` and asserts the streamed + SSE event sequence equals a committed golden. Fast, no browser. Guards protocol + *shape*. +- **Layer 2 — full-stack render** (`frontend/tests/e2e-real-backend/`): real + Next.js + real gateway (replay model) + Chromium; asserts the replayed + auto-title and a follow-up suggestion render in the browser. Guards semantic + *render*. (Complementary to Layer 1 — neither subsumes the other.) + +Layer 2 also hosts **cross-stack contract scenarios** — the dangerous class +where a backend change silently breaks a frontend assumption and *both sides' +unit tests stay green*. See below. + +## Cross-stack scenario: multi-run render order (`multi-run-order.spec.ts`) + +Regression guard for issue **#3352** (after context compression, refreshing a +thread rendered history out of order). Root cause was a front-back desync: +backend `RunManager.list_by_thread` returns runs **newest-first** (PR #2932), +while the frontend (`core/threads/hooks.ts`) iterated runs and **prepended** each +loaded page — inverting chronological order once the checkpoint no longer held +the older messages. The backend ordering test was green throughout, and the +frontend regression unit test hardcodes "backend returns newest-first" in a mock, +so only a *real frontend against a real backend* catches the desync. + +This scenario does **not** record a conversation. 
It uses a **test-only seeder** +(`tests/seed_runs_router.py`, mounted on the replay gateway only when +`DEERFLOW_ENABLE_TEST_SEED=1`) to stand up a thread with ≥2 runs and per-run +message events — and deliberately **no checkpoint**, which is the #3352 +precondition: it forces the frontend's per-run reload path to be the sole source +of truth so the ordering bug becomes observable. The seeder writes through the +gateway's own run/event stores using the request's auth context, so the real +`list_by_thread` → `/runs/{id}/messages` → prepend path runs live. Reverting the +#3354 frontend fix turns this spec red. + +## How replay works + +`tests/replay_provider.py::ReplayChatModel` returns recorded assistant turns keyed +by a **normalized hash** of the model input (strips `<system-reminder>` blocks, dates, +UUIDs, tmp paths). A miss raises loudly rather than passing silently. The system +prompt is made environment-independent by pinning skills + extensions empty and +disabling memory/summarization (`tests/_replay_fixture.py::build_config_yaml`), so +a fixture replays the same across machines, days, and CI. Replaying needs **no +API key**. + +## Record a new scenario (needs a real key — dev machine only) + +Recording drives the **real frontend** so captured inputs match exactly what the +browser sends; fixtures contain no API key. + +```bash +# 1. drive the real frontend against a real-model gateway, capturing model calls +OPENAI_API_KEY=... OPENAI_API_BASE=<base-url>/v1 \ + DEERFLOW_RECORD_OUT=/tmp/rec/turns.jsonl RECORD_MODEL=<model> \ + bash -c 'cd frontend && pnpm exec playwright test -c playwright.record.config.ts' + +# 2. stitch the capture into a fixture +cd backend && uv run python scripts/build_fixture_from_jsonl.py \ + --jsonl /tmp/rec/turns.jsonl --meta /tmp/rec/turns.jsonl.meta.json \ + --out tests/fixtures/replay/<scenario>.<mode>.json --model <model> + +# 3. regenerate the committed golden +DEERFLOW_WRITE_GOLDEN=1 PYTHONPATH=. 
uv run pytest tests/test_replay_golden.py +``` + +## Run (no key) + +```bash +cd backend && PYTHONPATH=. uv run pytest tests/test_replay_golden.py # Layer 1 +cd frontend && pnpm exec playwright test -c playwright.real-backend.config.ts # Layer 2 +``` + +## CI + +`.github/workflows/replay-e2e.yml` runs both layers on changes to **either** side +of the contract (`frontend/**`, `backend/app/gateway/**`, +`backend/packages/harness/**`, fixtures). DOM assertions are the gate; the rendered +screenshot + Playwright HTML report are uploaded as a CI artifact. + +## Known limitations + +- Visual regression baselines are OS-specific, so they are a **local dev gate + only** (gitignored); CI uploads the render as an artifact for human review + instead of hard-asserting a cross-OS baseline. +- Fixtures are coupled to the recording-time prompt; if new + environment-dependent content enters the system prompt, extend the + normalization in `replay_provider.py` (or pin it in `build_config_yaml`). +- Re-record a scenario if the agent graph changes how many model calls it makes + — the replay raises loudly on a hash miss pointing at the divergence. diff --git a/backend/scripts/build_fixture_from_jsonl.py b/backend/scripts/build_fixture_from_jsonl.py new file mode 100644 index 000000000..9bd7e1f93 --- /dev/null +++ b/backend/scripts/build_fixture_from_jsonl.py @@ -0,0 +1,44 @@ +"""Turn a record-through-browser JSONL capture into a replay fixture. + +The recording gateway (``record_gateway.py``) appends ``{input_hash, output}`` +lines as the frontend drives a real run; the record spec writes a ``.meta.json`` +sidecar with ``{scenario, mode, prompt}``. This stitches them into the fixture +the replay provider + tests consume. 
+""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--jsonl", required=True) + parser.add_argument("--meta", required=True) + parser.add_argument("--out", required=True) + parser.add_argument("--model", default="gpt-5.5") + args = parser.parse_args() + + turns = [json.loads(line) for line in Path(args.jsonl).read_text(encoding="utf-8").splitlines() if line.strip()] + meta = json.loads(Path(args.meta).read_text(encoding="utf-8")) + fixture = { + "scenario": meta["scenario"], + "mode": meta["mode"], + "model": args.model, + "prompt": meta["prompt"], + "context": meta.get("context", {}), + "turns": turns, + } + Path(args.out).write_text(json.dumps(fixture, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"wrote {len(turns)} turn(s) -> {args.out}") + for index, turn in enumerate(turns): + data = turn["output"].get("data", {}) + tool_calls = [tc.get("name") for tc in (data.get("tool_calls") or [])] + print(f" turn {index}: hash={turn['input_hash'][:12]} tool_calls={tool_calls} content={str(data.get('content'))[:50]!r}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/backend/scripts/record_gateway.py b/backend/scripts/record_gateway.py new file mode 100644 index 000000000..ecab4b6cd --- /dev/null +++ b/backend/scripts/record_gateway.py @@ -0,0 +1,109 @@ +"""Recording gateway for *record-through-browser* (Plan A). + +Runs the gateway with a REAL model and a callback that appends every model +call's ``(input_hash, output)`` to a JSONL file. Because the run is driven by +the real frontend (Playwright), the captured inputs are EXACTLY what the +frontend produces (date system-reminder, suggestions/title calls, ...), so the +resulting fixture replays cleanly against the browser. + +Used by ``frontend/playwright.record.config.ts``. 
# Make ``app.*`` (backend root) and the test-only helpers in ``tests/``
# importable when this file is run as a plain script.
_BACKEND = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_BACKEND))
sys.path.insert(0, str(_BACKEND / "tests"))


def _install_capture(out_path: Path) -> None:
    """Patch the model factory so every chat-model call is appended to ``out_path``.

    Wraps ``deerflow.models.factory.create_chat_model`` (and every already
    imported module that holds a reference to the original function) so each
    model it returns carries a callback handler. The handler writes one JSONL
    record ``{"input_hash": ..., "output": ...}`` per generated message, where
    ``input_hash`` comes from ``replay_provider.hash_messages`` applied to the
    first message batch the call received.

    Args:
        out_path: JSONL file that records are appended to.
    """
    from langchain_core.callbacks import BaseCallbackHandler
    from langchain_core.messages import messages_to_dict
    from replay_provider import hash_messages

    import deerflow.models.factory as factory_mod

    class Capture(BaseCallbackHandler):
        """Pairs each model call's input with its output, keyed by ``run_id``."""

        def __init__(self) -> None:
            self.inputs: dict[str, list] = {}

        def on_chat_model_start(self, serialized, messages, *, run_id=None, **kwargs):  # noqa: ANN001
            # ``messages`` is a list of batches; only the first batch is kept.
            self.inputs[str(run_id)] = messages[0] if messages else []

        def on_llm_error(self, error, *, run_id=None, **kwargs):  # noqa: ANN001
            # A failed call never reaches on_llm_end; drop its stashed input so
            # a long recording session does not accumulate dead entries.
            self.inputs.pop(str(run_id), None)

        def on_llm_end(self, response, *, run_id=None, **kwargs):  # noqa: ANN001
            inp = self.inputs.pop(str(run_id), None)
            if inp is None:
                return
            produced = [
                gen.message
                for batch in response.generations
                for gen in batch
                if getattr(gen, "message", None) is not None
            ]
            if not produced:
                return
            # Every generation of one call shares the same input, so hash once.
            input_hash = hash_messages(inp)
            with open(out_path, "a", encoding="utf-8") as handle:
                for message in produced:
                    record = {"input_hash": input_hash, "output": messages_to_dict([message])[0]}
                    handle.write(json.dumps(record, ensure_ascii=False) + "\n")
                handle.flush()

    cb = Capture()
    original = factory_mod.create_chat_model

    def _attach(model) -> None:  # noqa: ANN001
        """Register ``cb`` on ``model`` exactly once, whatever shape ``callbacks`` has."""
        existing = model.callbacks
        if existing is None:
            model.callbacks = [cb]
        elif isinstance(existing, list):
            # Guard against the same instance being returned twice; a second
            # copy of the handler would write every record twice.
            if cb not in existing:
                model.callbacks = existing + [cb]
        else:
            # NOTE(review): a non-list value is presumably a LangChain callback
            # manager; list concatenation would raise TypeError on it, so go
            # through its add_handler API instead — confirm against langchain_core.
            existing.add_handler(cb)

    def wrapped(*args, **kwargs):
        model = original(*args, **kwargs)
        _attach(model)
        return model

    factory_mod.create_chat_model = wrapped
    # Modules that did ``from ... import create_chat_model`` hold their own
    # reference to the original; rebind those too.
    for module in list(sys.modules.values()):
        if getattr(module, "create_chat_model", None) is original:
            module.create_chat_model = wrapped


def main() -> int:
    """Run the recording gateway configured entirely from environment variables.

    Reads ``OPENAI_API_KEY`` / ``OPENAI_API_BASE`` (required),
    ``DEERFLOW_RECORD_OUT`` (required; truncated at start), ``RECORD_PORT``
    (default ``8012``) and ``RECORD_MODEL`` (default ``gpt-5.5``), writes a
    config into a fresh temp directory, installs the capture hook and serves
    ``app.gateway.app:app`` on ``127.0.0.1``.

    Returns:
        ``2`` when a required variable is missing or ``RECORD_PORT`` is not an
        integer (nothing is written in that case); ``0`` after uvicorn returns.
    """
    if not os.environ.get("OPENAI_API_KEY") or not os.environ.get("OPENAI_API_BASE"):
        print("ERROR: set OPENAI_API_KEY and OPENAI_API_BASE (an OpenAI-compatible /v1 endpoint)", file=sys.stderr)
        return 2

    record_out = os.environ.get("DEERFLOW_RECORD_OUT")
    if not record_out:
        print("ERROR: set DEERFLOW_RECORD_OUT to the JSONL path to append captured turns to", file=sys.stderr)
        return 2

    # ``or`` rather than a .get() default: an exported-but-empty variable
    # (``RECORD_PORT=`` / ``RECORD_MODEL=``) falls back to the default too.
    raw_port = os.environ.get("RECORD_PORT") or "8012"
    try:
        port = int(raw_port)
    except ValueError:
        print(f"ERROR: RECORD_PORT must be an integer, got {raw_port!r}", file=sys.stderr)
        return 2
    model = os.environ.get("RECORD_MODEL") or "gpt-5.5"

    out = Path(record_out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text("", encoding="utf-8")  # fresh capture per recording run

    from _replay_fixture import build_config_yaml, prepare_hermetic_extras, real_model_block

    home = Path(tempfile.mkdtemp(prefix="record-gw-"))
    cfg = home / "config.yaml"
    cfg.write_text(build_config_yaml(model_block=real_model_block(model), home=home), encoding="utf-8")
    # Override (not setdefault): the recorder must be hermetic, so an outer
    # DEER_FLOW_HOME can't leak in and shift prompt-affecting paths/skills.
    os.environ["DEER_FLOW_HOME"] = str(home)
    os.environ["DEER_FLOW_CONFIG_PATH"] = str(cfg)
    os.environ["DEER_FLOW_EXTENSIONS_CONFIG_PATH"] = str(prepare_hermetic_extras(home))
    os.environ.setdefault("AUTH_JWT_SECRET", "record-secret")
    os.environ["PYTHONPATH"] = os.pathsep.join(p for p in (str(_BACKEND), str(_BACKEND / "tests"), os.environ.get("PYTHONPATH", "")) if p)

    _install_capture(out)

    import uvicorn

    print(f"[record-gw] model={model} out={out} port={port}", flush=True)
    uvicorn.run("app.gateway.app:app", host="127.0.0.1", port=port, log_level="warning")
    return 0
+ +Builds an ephemeral config that points the model at ``ReplayChatModel`` + a +recorded fixture, then runs uvicorn — no API key, deterministic. Used as a +Playwright ``webServer`` (see ``frontend/playwright.real-backend.config.ts``) and +runnable standalone for debugging:: + + uv run python scripts/run_replay_gateway.py --port 8011 + +``tests/`` is put on the path so the config ``use: replay_provider:ReplayChatModel`` +resolves; ``GATEWAY_CORS_ORIGINS`` is set so the frontend on :3000 can talk to it. +""" + +from __future__ import annotations + +import argparse +import os +import sys +import tempfile +from pathlib import Path + +_BACKEND = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(_BACKEND)) +sys.path.insert(0, str(_BACKEND / "tests")) # replay_provider + build_config_yaml live here + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=int, default=8011) + parser.add_argument("--fixture", default=str(_BACKEND / "tests" / "fixtures" / "replay" / "write_read_file.ultra.json")) + parser.add_argument("--cors", default="http://localhost:3000") + args = parser.parse_args() + + from _replay_fixture import REPLAY_MODEL_BLOCK, build_config_yaml, prepare_hermetic_extras + + home = Path(tempfile.mkdtemp(prefix="replay-gw-")) + cfg = home / "config.yaml" + cfg.write_text(build_config_yaml(model_block=REPLAY_MODEL_BLOCK, home=home), encoding="utf-8") + + # Override (not setdefault): the replay gateway must be hermetic, so an outer + # DEER_FLOW_HOME can't leak in and shift prompt-affecting paths/skills. + os.environ["DEER_FLOW_HOME"] = str(home) + os.environ["DEER_FLOW_CONFIG_PATH"] = str(cfg) + os.environ["DEER_FLOW_EXTENSIONS_CONFIG_PATH"] = str(prepare_hermetic_extras(home)) + os.environ["DEERFLOW_REPLAY_FIXTURE"] = args.fixture + os.environ.setdefault("AUTH_JWT_SECRET", "ci-replay-secret") + os.environ["GATEWAY_CORS_ORIGINS"] = args.cors + # Child / dynamic imports (resolve_class) search PYTHONPATH too. 
+ os.environ["PYTHONPATH"] = os.pathsep.join(p for p in (str(_BACKEND), str(_BACKEND / "tests"), os.environ.get("PYTHONPATH", "")) if p) + + import uvicorn + + target: str | object = "app.gateway.app:app" + # Test-only: attach the run/message seeder used by the multi-run render-order + # e2e (#3352). Imported from tests/ and mounted here only — never in the + # production app. Pass the app object (not the import string) so the extra + # router is registered before uvicorn serves it. + if os.environ.get("DEERFLOW_ENABLE_TEST_SEED") == "1": + from seed_runs_router import router as seed_router + + from app.gateway.app import app as gateway_app + + gateway_app.include_router(seed_router) + target = gateway_app + print("[replay-gw] test-only seed router mounted at /api/test-only/seed-runs", flush=True) + + print(f"[replay-gw] config={cfg} fixture={args.fixture} cors={args.cors} port={args.port}", flush=True) + uvicorn.run(target, host="127.0.0.1", port=args.port, log_level="warning") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/backend/tests/_replay_fixture.py b/backend/tests/_replay_fixture.py new file mode 100644 index 000000000..56f1a080a --- /dev/null +++ b/backend/tests/_replay_fixture.py @@ -0,0 +1,163 @@ +"""Shared config + gateway-drive helpers for the record/replay e2e. + +Record (``scripts/record_gateway.py`` + ``scripts/build_fixture_from_jsonl.py``) +and replay (``tests/test_replay_golden.py``) +MUST drive the gateway through an identical, prompt-affecting config — otherwise +the system prompt differs and the recorded input hashes never match on replay. +Centralising the config builder + drive loop here makes that identity hold by +construction; only the ``models[].use`` block differs (real model vs +``ReplayChatModel``). +""" + +from __future__ import annotations + +import json +import uuid +from pathlib import Path + +# mode -> (thinking_enabled, is_plan_mode, subagent_enabled). 
# mode -> (thinking_enabled, is_plan_mode, subagent_enabled). Mirrors the
# frontend mapping in core/threads/hooks.ts.
MODE_CONTEXT: dict[str, tuple[bool, bool, bool]] = {
    "flash": (False, False, False),
    "thinking": (True, False, False),
    "pro": (True, True, False),
    # thinking_enabled mirrors the frontend `context.mode !== "flash"` (hooks.ts),
    # so ultra is thinking-enabled too.
    "ultra": (True, True, True),
}

# The replay model block: same model NAME as recording (so nothing in the prompt
# shifts), only ``use`` swapped to the deterministic replay provider.
REPLAY_MODEL_BLOCK = """\
  - name: scenario-model
    display_name: Scenario Model
    use: replay_provider:ReplayChatModel
    model: replay"""


def real_model_block(model: str) -> str:
    """Return the ``models:`` list entry used while recording.

    Same entry name as ``REPLAY_MODEL_BLOCK`` but backed by
    ``langchain_openai:ChatOpenAI``; the key and base URL are written as
    ``$OPENAI_API_KEY`` / ``$OPENAI_API_BASE`` references, never as values.

    Args:
        model: Upstream model identifier placed in the ``model:`` field.
    """
    return f"""\
  - name: scenario-model
    display_name: Scenario Model
    use: langchain_openai:ChatOpenAI
    model: {model}
    api_key: $OPENAI_API_KEY
    base_url: $OPENAI_API_BASE"""


def _yaml_path(path: Path) -> str:
    """Render ``path`` as a double-quoted YAML scalar.

    A JSON string literal is also a valid YAML double-quoted scalar, so this
    keeps a path containing `` #``, ``: `` or backslashes from being misparsed.
    """
    return json.dumps(str(path), ensure_ascii=False)


def build_config_yaml(*, model_block: str, home: Path) -> str:
    """Full gateway config. Only ``model_block`` varies between record/replay.

    Everything that shapes the system prompt is pinned so record, replay, and CI
    produce byte-identical prompts regardless of the machine:
      - sandbox / tool_groups / tools — fixed here
      - skills — pointed at an empty ``<home>/skills`` so filesystem skills (incl.
        gitignored custom skills present only on a dev box) never leak into the
        prompt. Pair with an empty ``extensions_config.json`` (no MCP) via
        :func:`prepare_hermetic_extras`.
      - memory / summarization — disabled (background, non-deterministic timing)

    Args:
        model_block: Pre-indented YAML for the single ``models:`` list entry
            (``REPLAY_MODEL_BLOCK`` or the result of :func:`real_model_block`).
        home: Directory that the skills path and the sqlite directory are
            placed under; both are emitted as quoted YAML strings.

    Returns:
        The complete config file content as a string.
    """
    skills_path = _yaml_path(home / "skills")
    sqlite_dir = _yaml_path(home / "db")
    return f"""\
log_level: warning
models:
{model_block}
sandbox:
  use: deerflow.sandbox.local:LocalSandboxProvider
skills:
  path: {skills_path}
  container_path: /mnt/skills
tool_groups:
  - name: file:read
  - name: file:write
tools:
  - name: ls
    group: file:read
    use: deerflow.sandbox.tools:ls_tool
  - name: read_file
    group: file:read
    use: deerflow.sandbox.tools:read_file_tool
  - name: write_file
    group: file:write
    use: deerflow.sandbox.tools:write_file_tool
# Memory + summarization make background / debounced model calls whose timing is
# non-deterministic; disable them so record and replay see the same model-call
# set. (Title stays — it is an in-graph, deterministic call we record.)
memory:
  enabled: false
  injection_enabled: false
summarization:
  enabled: false
agents_api:
  enabled: true
database:
  backend: sqlite
  sqlite_dir: {sqlite_dir}
"""


def prepare_hermetic_extras(home: Path) -> Path:
    """Create the empty skills tree + an empty extensions_config.json so the
    system prompt has no environment-dependent skills/MCP content.

    Creates ``<home>/skills/public`` and ``<home>/skills/custom`` and writes
    ``<home>/extensions_config.json`` containing empty ``mcpServers`` and
    ``skills`` objects.

    Returns the extensions-config path; the caller must point
    ``DEER_FLOW_EXTENSIONS_CONFIG_PATH`` at it. Call before starting the gateway.
    """
    for bucket in ("public", "custom"):
        (home / "skills" / bucket).mkdir(parents=True, exist_ok=True)
    extensions = home / "extensions_config.json"
    extensions.write_text(json.dumps({"mcpServers": {}, "skills": {}}), encoding="utf-8")
    return extensions


def sse_event_shapes(resp) -> list[dict]:
    """Reduce an SSE stream to (event name, sorted top-level data keys).

    Snapshots the *shape* of the stream, not volatile values, so the golden is
    stable across runs while still catching event-sequence / payload-shape drift.

    One entry is produced per ``data:`` line. ``keys`` is the sorted key list
    when the payload is a JSON object, ``None`` for any other JSON value (e.g.
    ``null``), and ``["_raw"]`` when the payload is not valid JSON. The event
    name is cleared at every blank line (the SSE frame boundary), so a frame
    that omits ``event:`` is reported with ``event: None`` instead of silently
    inheriting the previous frame's name.

    Args:
        resp: Any object whose ``iter_lines()`` yields the stream's lines as
            ``str`` without line terminators.
    """
    events: list[dict] = []
    current: str | None = None
    for line in resp.iter_lines():
        if not line:
            # Blank line = end of one SSE frame: forget its event name so it
            # cannot leak into a following frame that has no ``event:`` field.
            current = None
        elif line.startswith("event:"):
            current = line[len("event:") :].strip()
        elif line.startswith("data:"):
            raw = line[len("data:") :].strip()
            try:
                data = json.loads(raw) if raw else {}
            except json.JSONDecodeError:
                # Keep a bounded marker instead of failing; the golden then
                # shows ``["_raw"]`` for this frame.
                data = {"_raw": raw[:200]}
            events.append({"event": current, "keys": sorted(data.keys()) if isinstance(data, dict) else None})
    return events


def drive_gateway(app, *, prompt: str, context: dict) -> list[dict]:
    """Register -> create thread -> POST /runs/stream; return SSE event shapes.

    This is the exact wire path the React frontend uses (LangGraph SDK), driven
    in-process via Starlette's TestClient with the real auth flow.

    Args:
        app: The ASGI application to drive.
        prompt: Content of the single user message sent as run input.
        context: Value sent as the run's ``context`` field.

    Returns:
        The result of :func:`sse_event_shapes` for the streamed response.

    Raises:
        AssertionError: If registration does not return 201, no ``csrf_token``
            cookie is set, thread creation does not return 200, or the stream
            request does not return 200.
    """
    from starlette.testclient import TestClient

    with TestClient(app) as client:
        # A fresh random e-mail per call keeps repeated runs from colliding.
        reg = client.post(
            "/api/v1/auth/register",
            json={"email": f"e2e-{uuid.uuid4().hex[:8]}@example.com", "password": "very-strong-password-123"},
        )
        assert reg.status_code == 201, reg.text
        csrf = client.cookies.get("csrf_token")
        assert csrf, "register must set csrf_token cookie"

        thread_id = str(uuid.uuid4())
        created = client.post("/api/threads", json={"thread_id": thread_id, "metadata": {}}, headers={"X-CSRF-Token": csrf})
        assert created.status_code == 200, created.text

        body = {
            "assistant_id": "lead_agent",
            "input": {"messages": [{"role": "user", "content": prompt}]},
            "config": {"recursion_limit": 50},
            "context": context,
            "stream_mode": ["values"],
        }
        with client.stream("POST", f"/api/threads/{thread_id}/runs/stream", json=body, headers={"X-CSRF-Token": csrf}) as resp:
            assert resp.status_code == 200, resp.read().decode()
            return sse_event_shapes(resp)
b/backend/tests/fixtures/replay/write_read_file.ultra.events.json @@ -0,0 +1,72 @@ +{ + "scenario": "write_read_file", + "mode": "ultra", + "events": [ + { + "event": "metadata", + "keys": [ + "run_id", + "thread_id" + ] + }, + { + "event": "values", + "keys": [ + "artifacts", + "messages", + "viewed_images" + ] + }, + { + "event": "values", + "keys": [ + "artifacts", + "messages", + "thread_data", + "viewed_images" + ] + }, + { + "event": "values", + "keys": [ + "artifacts", + "messages", + "thread_data", + "viewed_images" + ] + }, + { + "event": "values", + "keys": [ + "artifacts", + "messages", + "thread_data", + "viewed_images" + ] + }, + { + "event": "values", + "keys": [ + "artifacts", + "messages", + "thread_data", + "title", + "viewed_images" + ] + }, + { + "event": "values", + "keys": [ + "artifacts", + "messages", + "thread_data", + "title", + "viewed_images" + ] + }, + { + "event": "end", + "keys": null + } + ] +} \ No newline at end of file diff --git a/backend/tests/fixtures/replay/write_read_file.ultra.json b/backend/tests/fixtures/replay/write_read_file.ultra.json new file mode 100644 index 000000000..a534eb2eb --- /dev/null +++ b/backend/tests/fixtures/replay/write_read_file.ultra.json @@ -0,0 +1,224 @@ +{ + "scenario": "write_read_file", + "mode": "ultra", + "model": "gpt-5.5", + "prompt": "Using your own file tools directly, create the file /mnt/user-data/outputs/note.txt with exactly this content: hi from replay. Then read that same file back and reply with its exact contents. Do NOT delegate to a subagent and do NOT use the task tool — do it yourself. 
Do not ask any clarifying questions.", + "context": { + "is_bootstrap": false, + "mode": "ultra", + "thinking_enabled": true, + "is_plan_mode": true, + "subagent_enabled": true + }, + "turns": [ + { + "input_hash": "686cd44a9f17fadc0398768731324f3980480a027593a475fad4583581df677f", + "output": { + "type": "ai", + "data": { + "content": "", + "additional_kwargs": {}, + "response_metadata": { + "finish_reason": "tool_calls", + "model_name": "gpt-5.5", + "model_provider": "openai" + }, + "type": "ai", + "name": null, + "id": "lc_run--019e8c60-8d4b-79a1-8d77-0a67fc360ce4", + "tool_calls": [ + { + "name": "write_file", + "args": { + "description": "Create requested note file", + "path": "/mnt/user-data/outputs/note.txt", + "content": "hi from replay" + }, + "id": "call_UdIzq5Vyx7pu1Usnj4wPCC6G", + "type": "tool_call" + } + ], + "invalid_tool_calls": [], + "usage_metadata": { + "input_tokens": 3285, + "output_tokens": 66, + "total_tokens": 3351, + "input_token_details": { + "audio": 0, + "cache_read": 0 + }, + "output_token_details": { + "audio": 0, + "reasoning": 21 + } + } + } + } + }, + { + "input_hash": "3598aeb87e221ca8f554e4d61ce6d5e8801754606fa5c95a89c38bd6cb623045", + "output": { + "type": "ai", + "data": { + "content": "File Creation and Verification", + "additional_kwargs": {}, + "response_metadata": { + "finish_reason": "stop", + "model_name": "gpt-5.5", + "model_provider": "openai" + }, + "type": "ai", + "name": null, + "id": "lc_run--019e8c60-9c18-72c1-95e8-f6a240747395", + "tool_calls": [], + "invalid_tool_calls": [], + "usage_metadata": { + "input_tokens": 104, + "output_tokens": 53, + "total_tokens": 157, + "input_token_details": { + "audio": 0, + "cache_read": 0 + }, + "output_token_details": { + "audio": 0, + "reasoning": 39 + } + } + } + } + }, + { + "input_hash": "92430ba866abe577c86d2e67eb5158b10f3f19ec306aa9de235bb06736320d70", + "output": { + "type": "ai", + "data": { + "content": "", + "additional_kwargs": {}, + "response_metadata": { + 
"finish_reason": "tool_calls", + "model_name": "gpt-5.5", + "model_provider": "openai" + }, + "type": "ai", + "name": null, + "id": "lc_run--019e8c60-b036-7710-8db9-717ab54e5805", + "tool_calls": [ + { + "name": "read_file", + "args": { + "description": "Read requested note file", + "path": "/mnt/user-data/outputs/note.txt" + }, + "id": "call_0BFNns0FkRb3n2LR0PRrfbIJ", + "type": "tool_call" + } + ], + "invalid_tool_calls": [], + "usage_metadata": { + "input_tokens": 3334, + "output_tokens": 33, + "total_tokens": 3367, + "input_token_details": { + "audio": 0, + "cache_read": 0 + }, + "output_token_details": { + "audio": 0, + "reasoning": 0 + } + } + } + } + }, + { + "input_hash": "8ab757aa51f9d556adcea07c0221445a2b791cc882ef11922babf7f2865d1913", + "output": { + "type": "ai", + "data": { + "content": "hi from replay", + "additional_kwargs": {}, + "response_metadata": { + "finish_reason": "stop", + "model_name": "gpt-5.5", + "model_provider": "openai" + }, + "type": "ai", + "name": null, + "id": "lc_run--019e8c60-bef3-7201-a30a-cbc5f45920ba", + "tool_calls": [], + "invalid_tool_calls": [], + "usage_metadata": { + "input_tokens": 3380, + "output_tokens": 7, + "total_tokens": 3387, + "input_token_details": { + "audio": 0, + "cache_read": 0 + }, + "output_token_details": { + "audio": 0, + "reasoning": 0 + } + } + } + } + }, + { + "input_hash": "fd67723cc8810ce79b4785fec4c251a272a91d677a216c735b23b5f6d3dec0c3", + "output": { + "type": "ai", + "data": { + "content": "[\"Can you append another line to the file?\",\"Can you show the file path again?\",\"Can you delete the file now?\"]", + "additional_kwargs": { + "refusal": null + }, + "response_metadata": { + "token_usage": { + "completion_tokens": 71, + "prompt_tokens": 224, + "total_tokens": 295, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 33, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + 
}, + "input_tokens": 0, + "output_tokens": 0, + "input_tokens_details": null + }, + "model_provider": "openai", + "model_name": "gpt-5.5", + "system_fingerprint": null, + "id": "chatcmpl-DmaI5yVqQ39LRWyugoCEPalKw0gBR", + "finish_reason": "stop", + "logprobs": null + }, + "type": "ai", + "name": null, + "id": "lc_run--019e8c60-d025-7fd2-9cc9-8b4fb8fe1a82-0", + "tool_calls": [], + "invalid_tool_calls": [], + "usage_metadata": { + "input_tokens": 224, + "output_tokens": 71, + "total_tokens": 295, + "input_token_details": { + "audio": 0, + "cache_read": 0 + }, + "output_token_details": { + "audio": 0, + "reasoning": 33 + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/backend/tests/replay_provider.py b/backend/tests/replay_provider.py new file mode 100644 index 000000000..c16c46448 --- /dev/null +++ b/backend/tests/replay_provider.py @@ -0,0 +1,230 @@ +"""Replay a recorded LLM trace deterministically — the "replay" half of +record/replay e2e (mirrors open-design's ``mocks/`` golden traces). + +A fixture is a JSON file capturing the *real* model calls of one scenario, +keyed by a normalized hash of the **input** each call received:: + + { + "scenario": "write_read_file", + "mode": "ultra", + "model": "gpt-5.5", + "turns": [ + {"input_hash": "<sha256 hex digest>", "input_preview": "...", "output": <serialized message>}, + ... + ] + } + +Why hash-by-input (not turn index) +---------------------------------- +A real run makes model calls from several callers — the lead agent's own turns, +``TitleMiddleware`` (auto-title), memory, and possibly subagents. They interleave +and their count/order is not something we want a replay to depend on. Matching by +a normalized hash of the *input messages* means each call gets back exactly the +output that was recorded for that input, regardless of order or which middleware +issued it. 
That keeps the in-graph, deterministic title call part of the +recording; memory/summarization, by contrast, are disabled in the replay config +(``_replay_fixture.py``) because their background, debounced timing is not +reproducible across runs. + +Volatile fields (UUID thread/run/user ids, timestamps, dates, tmp/home paths) +are normalized out before hashing so a recording replays across processes with +different temp dirs. The same ``hash_messages`` is used by the recorder +(``scripts/record_gateway.py``) and here, so record and replay agree by +construction. + +This lives in ``tests/`` (not in the publishable ``deerflow-harness`` package), +matching the repo convention for test-only fakes (cf. ``FakeToolCallingModel`` in +``_agent_e2e_helpers.py``). In-process tests get ``tests/`` on ``sys.path`` for +free via pytest; a standalone replay gateway just needs ``PYTHONPATH`` to include +``backend/tests`` so the config ``use:`` below resolves. + +Point a config model's ``use`` at this class and set the fixture via env:: + + models: + - name: replay-model + use: replay_provider:ReplayChatModel + model: gpt-5.5 # placeholder; ignored + + DEERFLOW_REPLAY_FIXTURE=/path/to/write_read_file.ultra.json + +A cache miss raises loudly with a diagnostic — that is the signal that the +replayed run diverged from the recording (graph changed, a new volatile field +slipped through normalization, or a non-deterministic tool result changed a +downstream input). Re-record or extend normalization; never pass silently. + +Recording lives outside production code too (``scripts/record_gateway.py`` + +``scripts/build_fixture_from_jsonl.py``); CI consumes the fixtures through this +replay side with no API key. 
+""" + +from __future__ import annotations + +import hashlib +import json +import os +import re +from collections import deque +from collections.abc import Iterator +from typing import Any + +from langchain_core.callbacks import CallbackManagerForLLMRun +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import AIMessage, AIMessageChunk, BaseMessage, messages_from_dict +from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult +from langchain_core.runnables import Runnable +from pydantic import PrivateAttr + +_FIXTURE_ENV = "DEERFLOW_REPLAY_FIXTURE" + +# Volatile substrings that differ between a recording run and a replay run but +# carry no semantic weight for matching. Normalized to stable placeholders +# before hashing so the same logical input hashes identically across processes. +# The frontend injects a per-request ```` (current date, weekday, +# dynamic context) that the backend-direct path does not — and its date/weekday +# change every day. Strip the whole block before hashing so a fixture replays +# (a) across days and (b) from both the browser and direct-POST paths. +_SYSTEM_REMINDER_RE = re.compile(r".*?", re.DOTALL) +_UUID_RE = re.compile(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}") +_ISO_TS_RE = re.compile(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?") +_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}") +# Absolute temp/home roots used for per-run isolation (macOS + Linux + DEER_FLOW_HOME tmp). 
+_PATH_RE = re.compile(r"(?:/private)?/(?:var/folders|tmp)/[^\s\"']*") + + +def _normalize_text(text: str) -> str: + text = _SYSTEM_REMINDER_RE.sub("", text) + text = _UUID_RE.sub("", text) + text = _ISO_TS_RE.sub("", text) + text = _DATE_RE.sub("", text) + text = _PATH_RE.sub("", text) + return text + + +def _content_to_text(content: Any) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for block in content: + if isinstance(block, dict): + parts.append(block.get("text", "") or json.dumps(block, sort_keys=True, ensure_ascii=False)) + else: + parts.append(str(block)) + return "".join(parts) + return str(content) + + +def _canonical_messages(messages: list[BaseMessage]) -> str: + """Project messages to a stable shape that excludes volatile metadata/ids. + + Keeps only what determines the model's next output: role, text content, and + tool-call name+args. Drops ``id``, ``response_metadata``, ``usage_metadata``, + and ``tool_call_id`` (all volatile), then normalizes embedded volatile + substrings. + """ + projected: list[dict[str, Any]] = [] + for message in messages: + content = _normalize_text(_content_to_text(message.content)) + tool_calls = getattr(message, "tool_calls", None) + # Drop messages that are empty after normalization — e.g. a turn that was + # nothing but a frontend-injected <system-reminder>. They carry no + # decision-relevant content and differ between client paths. + if not content.strip() and not tool_calls: + continue + entry: dict[str, Any] = {"type": message.type, "content": content} + if tool_calls: + entry["tool_calls"] = [{"name": tc.get("name"), "args": tc.get("args")} for tc in tool_calls] + name = getattr(message, "name", None) + if name: + entry["name"] = name + projected.append(entry) + raw = json.dumps(projected, sort_keys=True, ensure_ascii=False) + return _normalize_text(raw) + + +def hash_messages(messages: list[BaseMessage]) -> str: + """Stable hash of a model call's input.
Shared by recorder and replayer.""" + return hashlib.sha256(_canonical_messages(messages).encode("utf-8")).hexdigest() + + +def _load_fixture(fixture_path: str) -> dict[str, deque[AIMessage]]: + with open(fixture_path, encoding="utf-8") as handle: + payload = json.load(handle) + table: dict[str, deque[AIMessage]] = {} + for index, turn in enumerate(payload.get("turns", [])): + input_hash = turn["input_hash"] + (message,) = messages_from_dict([turn["output"]]) + if not isinstance(message, AIMessage): + raise ValueError(f"replay fixture {fixture_path!r} turn {index} output is {type(message).__name__}, expected AIMessage") + table.setdefault(input_hash, deque()).append(message) + return table + + +class ReplayChatModel(BaseChatModel): + """Returns the recorded assistant output whose input matches this call. + + ``bind_tools`` is a no-op returning ``self`` — recorded turns already carry + the real ``tool_calls``, so the agent dispatches them as if a live model had + produced them. + """ + + _table: dict[str, deque] = PrivateAttr(default_factory=dict) + _fixture_path: str = PrivateAttr(default="") + + def __init__(self, **kwargs: Any) -> None: + # Ignore provider noise the factory forwards from config (model, api_key, + # base_url, ...). Fixture path comes from the ``fixture`` kwarg or env. + fixture_path = kwargs.pop("fixture", None) or os.environ.get(_FIXTURE_ENV) + super().__init__() + if not fixture_path: + raise ValueError(f"ReplayChatModel needs a fixture path via the ``fixture`` kwarg or ${_FIXTURE_ENV}") + self._fixture_path = fixture_path + self._table = _load_fixture(fixture_path) + + @property + def _llm_type(self) -> str: + return "deerflow-replay" + + def _match(self, messages: list[BaseMessage]) -> AIMessage: + key = hash_messages(messages) + bucket = self._table.get(key) + if not bucket: + preview = _canonical_messages(messages) + raise KeyError( + f"replay miss: no recorded output for input hash {key} in {self._fixture_path!r}. 
" + "The replayed run diverged from the recording (graph changed, a non-deterministic tool result " + "altered a downstream input, or a volatile field slipped past normalization). " + f"Known hashes: {sorted(self._table)}. " + f"Normalized input (first 800 chars): {preview[:800]!r}" + ) + return bucket.popleft() + + def _generate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: CallbackManagerForLLMRun | None = None, + **kwargs: Any, + ) -> ChatResult: + return ChatResult(generations=[ChatGeneration(message=self._match(messages))]) + + def _stream( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: CallbackManagerForLLMRun | None = None, + **kwargs: Any, + ) -> Iterator[ChatGenerationChunk]: + turn = self._match(messages) + text = turn.content if isinstance(turn.content, str) else "" + chunk = ChatGenerationChunk(message=AIMessageChunk(content=turn.content, tool_calls=turn.tool_calls, additional_kwargs=turn.additional_kwargs, id=turn.id)) + if run_manager is not None and text: + run_manager.on_llm_new_token(text, chunk=chunk) + yield chunk + + def bind_tools(self, tools: Any, **kwargs: Any) -> Runnable: # type: ignore[override] + return self + + +# Re-export so the recorder shares the exact hashing logic. +__all__ = ["ReplayChatModel", "hash_messages"] diff --git a/backend/tests/seed_runs_router.py b/backend/tests/seed_runs_router.py new file mode 100644 index 000000000..5ca39d290 --- /dev/null +++ b/backend/tests/seed_runs_router.py @@ -0,0 +1,100 @@ +"""Test-only run/message seeder for the multi-run render-order e2e (issue #3352). + +Mounted **only** by ``scripts/run_replay_gateway.py`` (the replay e2e gateway) +and never by the production app, so it cannot ship. It lets a Playwright spec +stand up a thread with >=2 runs whose per-run messages exercise the frontend's +reload / history-rebuild ordering path — with no real model, no recording, and +no API key. 
+ +Why a seeder instead of recording a conversation: issue #3352 only reproduces +when the checkpoint no longer holds the older messages (post-compression), so +the frontend rebuilds them from the per-run history endpoints. A seeder lets us +create exactly that precondition deterministically — runs in the run store + +per-run ``category="message"`` events, and **no checkpoint** — so on reload the +buggy ``findLatestUnloadedRunIndex`` + prepend in ``core/threads/hooks.ts`` is +the sole source of truth and its reversed order becomes observable. + +It writes through the gateway's OWN ``app.state.run_store`` + +``app.state.run_event_store`` using the request's auth context, so the seeded +``user_id`` matches the browser session that reads it back. The event shape +mirrors exactly what ``runtime/journal.py`` writes for real runs +(``event_type`` ``llm.human.input`` / ``llm.ai.response``, ``category`` +``"message"``, ``content`` = ``message.model_dump()``, ``metadata.caller`` = +``"lead_agent"``). +""" + +from __future__ import annotations + +from typing import Literal + +from fastapi import APIRouter, Request +from pydantic import BaseModel + +router = APIRouter(prefix="/api/test-only", tags=["test-only"]) + +# Mirror runtime/journal.py: human prompts are recorded as ``llm.human.input`` +# and assistant turns as ``llm.ai.response``; both land in ``category="message"``. +_EVENT_TYPE = {"human": "llm.human.input", "ai": "llm.ai.response"} + + +class SeedMessage(BaseModel): + role: Literal["human", "ai"] + content: str + id: str + + +class SeedRun(BaseModel): + run_id: str + # ISO timestamp; RunManager.list_by_thread sorts newest-first by created_at, + # so a later created_at must mean a later run for the ordering to be faithful. 
+ created_at: str + messages: list[SeedMessage] + + +class SeedRunsBody(BaseModel): + thread_id: str + runs: list[SeedRun] + + +@router.post("/seed-runs") +async def seed_runs(body: SeedRunsBody, request: Request) -> dict: + """Seed runs + per-run message events for the authenticated user. + + No checkpoint is written: that is the whole point — it forces the frontend's + reload path to rebuild history from the per-run endpoints (the #3352 bug + site) instead of the (correctly ordered) checkpoint snapshot. + """ + from langchain_core.messages import AIMessage, HumanMessage + + run_store = request.app.state.run_store + event_store = request.app.state.run_event_store + + for run in body.runs: + # user_id defaults (AUTO) to the request's auth context, matching the + # browser session that will read these runs back via GET /runs. + await run_store.put( + run.run_id, + thread_id=body.thread_id, + assistant_id="lead_agent", + status="success", + created_at=run.created_at, + ) + events = [] + for m in run.messages: + msg = (HumanMessage if m.role == "human" else AIMessage)(content=m.content, id=m.id) + events.append( + { + "thread_id": body.thread_id, + "run_id": run.run_id, + "event_type": _EVENT_TYPE[m.role], + "category": "message", + "content": msg.model_dump(), + "metadata": {"caller": "lead_agent"}, + "created_at": run.created_at, + } + ) + # One batch per run so seq is monotonic and run1's messages precede + # run2's; the gateway reads them back per-run anyway. 
+ await event_store.put_batch(events) + + return {"ok": True, "thread_id": body.thread_id, "runs": len(body.runs)} diff --git a/backend/tests/test_replay_golden.py b/backend/tests/test_replay_golden.py new file mode 100644 index 000000000..f90bbd88e --- /dev/null +++ b/backend/tests/test_replay_golden.py @@ -0,0 +1,87 @@ +"""Layer 1 of the record/replay e2e: replay a recorded trace through the **real +gateway** with a deterministic ``ReplayChatModel`` (no API key, no network) and +assert the streamed SSE event sequence matches a committed golden. + +This catches backend protocol drift: if a change alters the shape/sequence of +SSE the gateway emits for the recorded scenario, this test goes red. The replay +model serves the recorded assistant turns by input hash, so the agent graph +(write_file -> auto-title -> read_file -> final answer) reproduces offline. + +Fixtures are produced by ``scripts/record_gateway.py`` + +``scripts/build_fixture_from_jsonl.py`` (manual, needs a key). +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +import pytest +from _replay_fixture import REPLAY_MODEL_BLOCK, build_config_yaml, drive_gateway, prepare_hermetic_extras + +FIXTURE_DIR = Path(__file__).parent / "fixtures" / "replay" + + +def _reset_process_singletons(monkeypatch: pytest.MonkeyPatch) -> None: + """Invalidate process-wide caches so the test-only config/home take effect. + + Same set the real-server e2e resets (see test_setup_agent_http_e2e_real_server). 
+ """ + from deerflow.config import app_config as app_config_module + from deerflow.config import paths as paths_module + from deerflow.persistence import engine as engine_module + + for module, attr in ( + (app_config_module, "_app_config"), + (app_config_module, "_app_config_path"), + (app_config_module, "_app_config_mtime"), + (paths_module, "_paths_singleton"), + (engine_module, "_engine"), + (engine_module, "_session_factory"), + ): + monkeypatch.setattr(module, attr, None, raising=False) + + +@pytest.mark.no_auto_user +def test_replay_write_read_file_ultra_matches_golden(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + scenario, mode = "write_read_file", "ultra" + fixture_path = FIXTURE_DIR / f"{scenario}.{mode}.json" + events_path = FIXTURE_DIR / f"{scenario}.{mode}.events.json" + fixture = json.loads(fixture_path.read_text(encoding="utf-8")) + + home = tmp_path / "home" + home.mkdir() + monkeypatch.setenv("DEER_FLOW_HOME", str(home)) + monkeypatch.setenv("DEERFLOW_REPLAY_FIXTURE", str(fixture_path)) + + cfg_path = tmp_path / "config.yaml" + cfg_path.write_text(build_config_yaml(model_block=REPLAY_MODEL_BLOCK, home=home), encoding="utf-8") + monkeypatch.setenv("DEER_FLOW_CONFIG_PATH", str(cfg_path)) + monkeypatch.setenv("DEER_FLOW_EXTENSIONS_CONFIG_PATH", str(prepare_hermetic_extras(home))) + + _reset_process_singletons(monkeypatch) + from deerflow.config import app_config as app_config_module + + cfg = app_config_module.get_app_config() + cfg.database.sqlite_dir = str(home / "db") + + from app.gateway.app import create_app + + events = drive_gateway(create_app(), prompt=fixture["prompt"], context=fixture["context"]) + + assert events, "replay produced no SSE events" + assert events[0]["event"] == "metadata", f"first event should be metadata, got {events[0]!r}" + assert events[-1]["event"] == "end", f"last event should be end (run completed), got {events[-1]!r}" + + # Regenerate the committed golden after re-recording the fixture: + # 
DEERFLOW_WRITE_GOLDEN=1 uv run pytest tests/test_replay_golden.py + if os.environ.get("DEERFLOW_WRITE_GOLDEN"): + events_path.write_text(json.dumps({"scenario": scenario, "mode": mode, "events": events}, ensure_ascii=False, indent=2), encoding="utf-8") + return + + golden = json.loads(events_path.read_text(encoding="utf-8"))["events"] + # A replay hash-miss surfaces as the run erroring mid-stream -> the event + # shape sequence diverges from the golden, so this assertion is the catch-all + # for both backend SSE drift and replay divergence. + assert events == golden, f"SSE event-shape sequence drifted from the golden.\ngot ({len(events)}): {[e['event'] for e in events]}\nwant ({len(golden)}): {[e['event'] for e in golden]}" diff --git a/frontend/playwright.real-backend.config.ts b/frontend/playwright.real-backend.config.ts new file mode 100644 index 000000000..9db673b90 --- /dev/null +++ b/frontend/playwright.real-backend.config.ts @@ -0,0 +1,60 @@ +import { defineConfig, devices } from "@playwright/test"; + +/** + * Layer 2 of the record/replay e2e: the REAL Next.js frontend rendering data + * from a REAL gateway whose LLM is the deterministic `ReplayChatModel` (no API + * key). This is separate from `playwright.config.ts` (which mocks the backend) + * so the mock-based suite is untouched. + * + * Two webServers are started: the replay gateway (:8011) and the frontend + * (:3000, pointed at the gateway). Auth uses a throwaway test account the spec + * registers at runtime — no secrets. + */ +export default defineConfig({ + testDir: "./tests/e2e-real-backend", + fullyParallel: false, + forbidOnly: !!process.env.CI, + retries: process.env.CI ? 1 : 0, + workers: 1, + reporter: process.env.CI ? 
[["github"], ["html", { open: "never" }]] : "html", + timeout: 90_000, + + use: { + baseURL: "http://localhost:3000", + trace: "on-first-retry", + }, + + projects: [{ name: "chromium", use: { ...devices["Desktop Chrome"] } }], + + webServer: [ + { + command: "uv run python scripts/run_replay_gateway.py --port 8011", + cwd: "../backend", + url: "http://localhost:8011/health", + reuseExistingServer: !process.env.CI, + timeout: 180_000, + stdout: "pipe", + stderr: "pipe", + // Mount the test-only run/message seeder used by multi-run-order.spec.ts + // (#3352). The endpoint exists only on this replay gateway, never in the + // production app. + env: { DEERFLOW_ENABLE_TEST_SEED: "1" }, + }, + { + command: "pnpm build && pnpm start", + url: "http://localhost:3000", + reuseExistingServer: !process.env.CI, + timeout: 240_000, + env: { + SKIP_ENV_VALIDATION: "1", + DEER_FLOW_AUTH_DISABLED: "1", + BETTER_AUTH_SECRET: "local-dev-secret", + // Leave NEXT_PUBLIC_* unset so the frontend uses its built-in + // next.config rewrites (same-origin proxy) instead of talking to the + // gateway cross-origin — cross-origin fetches drop the auth cookies. + // Just point that proxy at the replay gateway. + DEER_FLOW_INTERNAL_GATEWAY_BASE_URL: "http://127.0.0.1:8011", + }, + }, + ], +}); diff --git a/frontend/playwright.record.config.ts b/frontend/playwright.record.config.ts new file mode 100644 index 000000000..4e2ea8063 --- /dev/null +++ b/frontend/playwright.record.config.ts @@ -0,0 +1,58 @@ +import { defineConfig, devices } from "@playwright/test"; + +/** + * RECORD-through-browser config (Plan A): drive the REAL frontend against a + * REAL-model gateway and capture every model call so the fixture's inputs match + * exactly what the frontend produces. Manual, needs OPENAI_API_KEY/OPENAI_API_BASE + * + DEERFLOW_RECORD_OUT in the environment — never run in CI. + * + * Not committed as a test run; `tests/e2e-record/` holds the driver spec.
+ */ +export default defineConfig({ + testDir: "./tests/e2e-record", + fullyParallel: false, + workers: 1, + reporter: "list", + timeout: 200_000, + use: { baseURL: "http://localhost:3000", trace: "off" }, + projects: [{ name: "chromium", use: { ...devices["Desktop Chrome"] } }], + webServer: [ + { + command: "uv run python scripts/record_gateway.py", + cwd: "../backend", + url: "http://localhost:8012/health", + reuseExistingServer: false, + timeout: 180_000, + stdout: "pipe", + stderr: "pipe", + env: { + RECORD_PORT: "8012", + RECORD_MODEL: process.env.RECORD_MODEL ?? "gpt-5.5", + // Forwarded from the invoking shell; never hardcoded. Passed through only + // when actually set, so record_gateway.py raises a clear "missing env" + // error instead of receiving "" (which would write to Path("")). + ...(process.env.DEERFLOW_RECORD_OUT + ? { DEERFLOW_RECORD_OUT: process.env.DEERFLOW_RECORD_OUT } + : {}), + ...(process.env.OPENAI_API_KEY + ? { OPENAI_API_KEY: process.env.OPENAI_API_KEY } + : {}), + ...(process.env.OPENAI_API_BASE + ? 
{ OPENAI_API_BASE: process.env.OPENAI_API_BASE } + : {}), + }, + }, + { + command: "pnpm build && pnpm start", + url: "http://localhost:3000", + reuseExistingServer: false, + timeout: 240_000, + env: { + SKIP_ENV_VALIDATION: "1", + DEER_FLOW_AUTH_DISABLED: "1", + BETTER_AUTH_SECRET: "local-dev-secret", + DEER_FLOW_INTERNAL_GATEWAY_BASE_URL: "http://127.0.0.1:8012", + }, + }, + ], +}); diff --git a/frontend/tests/e2e-real-backend/.gitignore b/frontend/tests/e2e-real-backend/.gitignore new file mode 100644 index 000000000..dfbe5fa83 --- /dev/null +++ b/frontend/tests/e2e-real-backend/.gitignore @@ -0,0 +1,2 @@ +# OS-specific Playwright visual baselines — generated locally, not committed +*-snapshots/ diff --git a/frontend/tests/e2e-real-backend/multi-run-order.spec.ts b/frontend/tests/e2e-real-backend/multi-run-order.spec.ts new file mode 100644 index 000000000..5f40ba07d --- /dev/null +++ b/frontend/tests/e2e-real-backend/multi-run-order.spec.ts @@ -0,0 +1,101 @@ +import { expect, test } from "@playwright/test"; + +/** + * Layer 2 (cross-stack contract): reproduces upstream issue #3352 — after the + * checkpoint no longer holds the older messages (post context-compression), the + * frontend rebuilds thread history from the per-run endpoints, and the order it + * rebuilds them in must stay chronological. + * + * The dangerous class this guards: a BACKEND change to run ordering silently + * breaks a FRONTEND assumption. Backend `list_by_thread` returns runs + * NEWEST-FIRST (PR #2932); the pre-#3354 frontend iterated runs from the end and + * PREPENDED each loaded page (`core/threads/hooks.ts`), which inverts order. A + * backend-only ordering test was green the whole time #3352 was live, and the + * frontend regression unit test hardcodes "backend returns newest-first" in a + * mock — so only a real frontend against a real backend catches the desync. 
+ * + * This drives the REAL frontend against a REAL gateway with two seeded runs and + * NO checkpoint (the seeder forces the per-run reload path to be the sole source + * of truth), then asserts the first run's message renders ABOVE the second's. + * No model, no recording, no API key — the runs are seeded via a test-only + * endpoint mounted only on the replay gateway. + */ +const APP = "http://localhost:3000"; + +// Distinctive markers so getByText can't collide with UI chrome. +const ALPHA = "ALPHA-FIRST-QUESTION-7f3a2c"; +const OMEGA = "OMEGA-SECOND-QUESTION-9b21d4"; + +test.describe("multi-run thread renders chronologically (replay, no API key)", () => { + test("first run renders above second run after history rebuild (#3352)", async ({ + page, + context, + }) => { + const uniq = `${Date.now()}-${Math.floor(Math.random() * 1e6)}`; + const threadId = `e2e-multi-run-${uniq}`; + const email = `e2e-${uniq}@example.com`; + + // Register through the frontend origin (same-origin proxy) so the auth + // cookies are stored for localhost and forwarded to the gateway via the + // next.config rewrite — never cross-origin from the browser. + const reg = await context.request.post(`${APP}/api/v1/auth/register`, { + data: { email, password: "very-strong-password-123" }, + }); + expect(reg.status(), await reg.text()).toBe(201); + + const cookies = await context.cookies(); + const csrf = cookies.find((c) => c.name === "csrf_token")?.value; + expect(csrf, "register must set csrf_token cookie").toBeTruthy(); + + // Seed two runs in one thread: run-1 (ALPHA) older, run-2 (OMEGA) newer, so + // the real backend's list_by_thread returns them newest-first. No checkpoint + // is seeded — that is the #3352 precondition. + const seed = await context.request.post(`${APP}/api/test-only/seed-runs`, { + headers: { "X-CSRF-Token": csrf! 
}, + data: { + thread_id: threadId, + runs: [ + { + run_id: `${threadId}-r1`, + created_at: "2026-01-01T00:00:00+00:00", + messages: [ + { role: "human", content: ALPHA, id: `${threadId}-a-h` }, + { role: "ai", content: "ALPHA reply", id: `${threadId}-a-a` }, + ], + }, + { + run_id: `${threadId}-r2`, + created_at: "2026-01-01T00:01:00+00:00", + messages: [ + { role: "human", content: OMEGA, id: `${threadId}-o-h` }, + { role: "ai", content: "OMEGA reply", id: `${threadId}-o-a` }, + ], + }, + ], + }, + }); + expect(seed.status(), await seed.text()).toBe(200); + + // Load the thread fresh — triggers useThreadHistory's per-run reload path. + await page.goto(`/workspace/chats/${threadId}`); + + const alpha = page.getByText(ALPHA, { exact: false }); + const omega = page.getByText(OMEGA, { exact: false }); + await expect(alpha).toBeVisible({ timeout: 60_000 }); + await expect(omega).toBeVisible({ timeout: 30_000 }); + // Each marker renders exactly once (guards against accidental duplicate matches). + expect(await alpha.count(), "ALPHA should render exactly once").toBe(1); + expect(await omega.count(), "OMEGA should render exactly once").toBe(1); + + // The contract: ALPHA (first run) must render ABOVE OMEGA (second run). With + // the #3352 bug the per-run rebuild inverts this and OMEGA renders first. 
+ const alphaBox = await alpha.first().boundingBox(); + const omegaBox = await omega.first().boundingBox(); + expect(alphaBox, "ALPHA must have a layout box").toBeTruthy(); + expect(omegaBox, "OMEGA must have a layout box").toBeTruthy(); + expect( + alphaBox!.y, + `chronological order broken: ALPHA(first run) rendered at y=${alphaBox!.y}, OMEGA(second run) at y=${omegaBox!.y} — backend list_by_thread ordering and frontend history rebuild are out of sync (#3352)`, + ).toBeLessThan(omegaBox!.y); + }); +}); diff --git a/frontend/tests/e2e-real-backend/real-backend-render.spec.ts b/frontend/tests/e2e-real-backend/real-backend-render.spec.ts new file mode 100644 index 000000000..fe4446e67 --- /dev/null +++ b/frontend/tests/e2e-real-backend/real-backend-render.spec.ts @@ -0,0 +1,123 @@ +import { readFileSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import { expect, test } from "@playwright/test"; + +const here = dirname(fileURLToPath(import.meta.url)); + +/** + * Layer 2: drive the REAL frontend against the REAL gateway (replay model, no + * API key) and assert the browser renders the backend's data correctly. + * + * The prompt is read from the same fixture the gateway replays, so the input + * hash matches and the recorded turns (write_file -> auto-title -> read_file -> + * final answer) reproduce deterministically. + */ +// Register through the frontend origin (same-origin proxy) so the auth cookies +// are stored for and sent to localhost:3000 — the gateway is reached via the +// next.config rewrite, never cross-origin from the browser. 
+const APP = "http://localhost:3000"; +const fixture = JSON.parse( + readFileSync( + join( + here, + "../../../backend/tests/fixtures/replay/write_read_file.ultra.json", + ), + "utf-8", + ), +) as { + prompt: string; + turns: Array<{ output: { data: { content?: unknown } } }>; +}; + +const PROMPT = fixture.prompt; +// Derive the assertions from the fixture so a re-record auto-updates them. Both +// are model-generated strings absent from the user prompt, so a pass proves the +// replay drove the render (not a prompt echo): the first plain-text turn is the +// in-graph auto-title; the JSON-array turn is the follow-up suggestions. +const textTurns = fixture.turns + .map((t) => t.output?.data?.content) + .filter((c): c is string => typeof c === "string" && c.trim().length > 0); +const suggestionsRaw = textTurns.find((c) => c.trim().startsWith("[")); +// Guarded parse: a bracket-prefixed turn that isn't a valid JSON string array +// falls back to "" so the `not.toBe("")` assertion below fails with a clear +// message instead of a generic JSON.parse throw. +const EXPECTED_SUGGESTION = ((): string => { + if (!suggestionsRaw) return ""; + try { + const arr: unknown = JSON.parse(suggestionsRaw); + return Array.isArray(arr) && typeof arr[0] === "string" ? arr[0] : ""; + } catch { + return ""; + } +})(); +const EXPECTED_TITLE = textTurns.find((c) => !c.trim().startsWith("[")) ?? ""; + +test.describe("real backend render (replay, no API key)", () => { + test.beforeEach(async ({ context }) => { + // Throwaway test account: register sets access_token + csrf_token cookies in + // the browser context (host-scoped to localhost, shared across ports), so + // the frontend's SDK (credentials:include + X-CSRF-Token) authenticates. 
+    const email = `e2e-${Date.now()}-${Math.floor(Math.random() * 1e6)}@example.com`;
+    const resp = await context.request.post(`${APP}/api/v1/auth/register`, {
+      data: { email, password: "very-strong-password-123" },
+    });
+    expect(resp.status(), await resp.text()).toBe(201);
+  });
+
+  test("renders the replayed auto-title + suggestions from a real backend", async ({
+    page,
+  }) => {
+    // ultra mode so the context the frontend sends (is_plan_mode + subagent_enabled)
+    // matches the recorded fixture; otherwise the replay input hash would miss.
+    await page.addInitScript(() => {
+      window.localStorage.setItem(
+        "deerflow.local-settings",
+        JSON.stringify({ context: { mode: "ultra" } }),
+      );
+    });
+
+    await page.goto("/workspace/chats/new");
+
+    const textarea = page.getByPlaceholder(/how can i assist you/i);
+    await expect(textarea).toBeVisible({ timeout: 30_000 });
+    await textarea.fill(PROMPT);
+    await textarea.press("Enter");
+
+    // Replay-only DOM assertions (derived from the fixture): they render only if
+    // the recorded turns replayed AND the real frontend rendered them — the
+    // in-graph auto-title and the post-answer follow-up suggestion. Together they
+    // prove the whole pipeline (replay backend -> real frontend render).
+    expect(
+      EXPECTED_TITLE,
+      "fixture should contain an auto-title turn",
+    ).not.toBe("");
+    expect(
+      EXPECTED_SUGGESTION,
+      "fixture should contain a suggestions turn",
+    ).not.toBe("");
+    await expect(page.getByText(EXPECTED_TITLE)).toBeVisible({
+      timeout: 60_000,
+    });
+    await expect(page.getByText(EXPECTED_SUGGESTION)).toBeVisible({
+      timeout: 30_000,
+    });
+
+    // Visual regression is OS-sensitive (a macOS baseline won't match CI's
+    // Linux render), so it's a local dev gate only; in CI we capture the render
+    // as an artifact for human review instead of hard-asserting a cross-OS
+    // baseline. The DOM assertions above are the CI gate.
+    if (process.env.CI) {
+      await page.screenshot({
+        path: "test-results/real-backend-render.png",
+        fullPage: true,
+      });
+    } else {
+      await expect(page).toHaveScreenshot("real-backend-render.png", {
+        maxDiffPixelRatio: 0.02,
+        fullPage: true,
+      });
+    }
+  });
+});
diff --git a/frontend/tests/e2e-record/record-write-read-file.spec.ts b/frontend/tests/e2e-record/record-write-read-file.spec.ts
new file mode 100644
index 000000000..77f02ec85
--- /dev/null
+++ b/frontend/tests/e2e-record/record-write-read-file.spec.ts
@@ -0,0 +1,138 @@
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+
+import { expect, test } from "@playwright/test";
+
+/**
+ * RECORD driver (Plan A): drive the real frontend through the write/read-file
+ * scenario against the real-model gateway. The gateway captures every model
+ * call to DEERFLOW_RECORD_OUT; this just needs to drive the flow and wait until
+ * the captures stop arriving (main turns + in-graph title + follow-up
+ * suggestions all fired). It asserts nothing about content — it produces the
+ * fixture, it doesn't verify it.
+ */
+const APP = "http://localhost:3000";
+const SCENARIO = "write_read_file";
+const MODE = "ultra";
+const PROMPT =
+  "Using your own file tools directly, create the file /mnt/user-data/outputs/note.txt " +
+  "with exactly this content: hi from replay. Then read that same file back and reply with its " +
+  "exact contents. Do NOT delegate to a subagent and do NOT use the task tool — do it yourself. " +
+  "Do not ask any clarifying questions.";
+
+/**
+ * Count the non-blank lines of the capture file.
+ *
+ * @param path - File to inspect.
+ * @returns Number of lines containing non-whitespace text, or 0 when the file
+ *   does not exist.
+ */
+function countLines(path: string): number {
+  return existsSync(path)
+    ? readFileSync(path, "utf-8")
+        .split("\n")
+        .filter((l) => l.trim()).length
+    : 0;
+}
+
+/**
+ * Poll the capture file until its non-blank line count stops changing.
+ *
+ * @param path - Capture file to watch.
+ * @param options - `stableMs`: how long the count must stay unchanged (and be
+ *   non-zero) before it is accepted; `maxMs`: overall deadline; `pollMs`: delay
+ *   between two reads of the file.
+ * @returns The stabilized line count.
+ * @throws Error when the count has not stabilized within `maxMs`.
+ */
+async function waitForCaptureStable(
+  path: string,
+  { stableMs = 12_000, maxMs = 160_000, pollMs = 1000 } = {},
+): Promise<number> {
+  const start = Date.now();
+  let last = -1;
+  let lastChange = Date.now();
+  while (Date.now() - start < maxMs) {
+    const n = countLines(path);
+    if (n !== last) {
+      last = n;
+      lastChange = Date.now();
+    } else if (n > 0 && Date.now() - lastChange > stableMs) {
+      return n;
+    }
+    await new Promise((r) => setTimeout(r, pollMs));
+  }
+  // Hard failure on timeout: returning the last count here would let a
+  // truncated/partial recording pass silently (captured > 0). A recording must
+  // stabilize, or it is not trustworthy.
+  throw new Error(
+    `[record] captures never stabilized within ${maxMs}ms (last count=${last}); ` +
+      `the recording may be truncated — raise maxMs or check the record gateway.`,
+  );
+}
+
+test.describe.configure({ timeout: 220_000 });
+
+test("record write/read-file run through the real frontend", async ({
+  page,
+  context,
+}) => {
+  const out = process.env.DEERFLOW_RECORD_OUT;
+  // Throw instead of `expect(...).toBeTruthy()`: an explicit guard narrows `out`
+  // to `string`, so no non-null assertion is needed further down.
+  if (!out) {
+    throw new Error("DEERFLOW_RECORD_OUT must be set");
+  }
+  // The context the frontend derives for ultra mode (core/threads/hooks.ts). The
+  // backend-direct golden test (Layer 1) POSTs this so its prompt — hence the
+  // recorded input hashes — matches the browser run. thinking/reasoning don't
+  // affect the prompt; is_plan_mode + subagent_enabled add the todo/task tools.
+  const CONTEXT = {
+    is_bootstrap: false,
+    mode: MODE,
+    thinking_enabled: true,
+    is_plan_mode: true,
+    subagent_enabled: true,
+  };
+  writeFileSync(
+    `${out}.meta.json`,
+    JSON.stringify({
+      scenario: SCENARIO,
+      mode: MODE,
+      prompt: PROMPT,
+      context: CONTEXT,
+    }),
+    "utf-8",
+  );
+
+  const reg = await context.request.post(`${APP}/api/v1/auth/register`, {
+    data: {
+      email: `rec-${Date.now()}@example.com`,
+      password: "very-strong-password-123",
+    },
+  });
+  expect(reg.status(), await reg.text()).toBe(201);
+
+  // MODE is handed over as the init-script argument: the callback is evaluated
+  // in the page, so it cannot close over this file's constants.
+  await page.addInitScript((mode) => {
+    window.localStorage.setItem(
+      "deerflow.local-settings",
+      JSON.stringify({ context: { mode } }),
+    );
+  }, MODE);
+  await page.goto("/workspace/chats/new");
+
+  const textarea = page.getByPlaceholder(/how can i assist you/i);
+  await expect(textarea).toBeVisible({ timeout: 30_000 });
+  await textarea.fill(PROMPT);
+  await textarea.press("Enter");
+
+  const captured = await waitForCaptureStable(out);
+  console.log(
+    `[record] captures stabilized at ${captured} model call(s) -> ${out}`,
+  );
+  expect(
+    captured,
+    "expected at least the agent turns to be captured",
+  ).toBeGreaterThan(0);
+});