# deer-flow/backend/tests/test_replay_golden.py

"""Layer 1 of the record/replay e2e: replay a recorded trace through the **real
gateway** with a deterministic ``ReplayChatModel`` (no API key, no network) and
assert the streamed SSE event sequence matches a committed golden.

This catches backend protocol drift: if a change alters the shape/sequence of
SSE the gateway emits for the recorded scenario, this test goes red. The replay
model serves the recorded assistant turns by input hash, so the agent graph
(write_file -> auto-title -> read_file -> final answer) reproduces offline.

Fixtures are produced by ``scripts/record_gateway.py`` +
``scripts/build_fixture_from_jsonl.py`` (manual, needs a key).
"""

from __future__ import annotations

import json
import os
from pathlib import Path

import pytest
from _replay_fixture import REPLAY_MODEL_BLOCK, build_config_yaml, drive_gateway, prepare_hermetic_extras

FIXTURE_DIR = Path(__file__).parent / "fixtures" / "replay"


def _reset_process_singletons(monkeypatch: pytest.MonkeyPatch) -> None:
    """Clear process-wide caches so the test-only config and home are picked up.

    Every cached module attribute listed below is set to ``None`` through
    ``monkeypatch``, so the previous values are restored when the test ends.
    This is the same set the real-server e2e resets (see
    test_setup_agent_http_e2e_real_server).
    """
    from deerflow.config import app_config as app_config_module
    from deerflow.config import paths as paths_module
    from deerflow.persistence import engine as engine_module

    # (owning module, cached attribute names) — reset in this exact order.
    cached_state = (
        (app_config_module, ("_app_config", "_app_config_path", "_app_config_mtime")),
        (paths_module, ("_paths_singleton",)),
        (engine_module, ("_engine", "_session_factory")),
    )
    for owner, attr_names in cached_state:
        for attr_name in attr_names:
            # raising=False: tolerate an attribute that is absent on the module.
            monkeypatch.setattr(owner, attr_name, None, raising=False)


@pytest.mark.no_auto_user
def test_replay_write_read_file_ultra_matches_golden(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
    """Replay the ``write_read_file``/``ultra`` fixture and compare the SSE events to the golden.

    The test points the process at a throwaway home and config under
    ``tmp_path``, drives the gateway app with the fixture's prompt/context, and
    then checks, in order:

    1. events were produced, starting with ``metadata`` and ending with ``end``;
    2. the replay provider recorded no misses;
    3. the events equal the committed ``*.events.json`` golden.

    Setting ``DEERFLOW_WRITE_GOLDEN`` to an "on" value (anything except empty,
    ``0``, ``false``, ``no`` or ``off``) rewrites the golden from the current
    run instead of comparing against it.
    """
    scenario, mode = "write_read_file", "ultra"
    fixture_path = FIXTURE_DIR / f"{scenario}.{mode}.json"
    events_path = FIXTURE_DIR / f"{scenario}.{mode}.events.json"
    fixture = json.loads(fixture_path.read_text(encoding="utf-8"))

    # Isolated home + config so nothing from the developer's environment leaks in.
    home = tmp_path / "home"
    home.mkdir()
    monkeypatch.setenv("DEER_FLOW_HOME", str(home))
    monkeypatch.setenv("DEERFLOW_REPLAY_FIXTURE", str(fixture_path))

    cfg_path = tmp_path / "config.yaml"
    cfg_path.write_text(build_config_yaml(model_block=REPLAY_MODEL_BLOCK, home=home), encoding="utf-8")
    monkeypatch.setenv("DEER_FLOW_CONFIG_PATH", str(cfg_path))
    monkeypatch.setenv("DEER_FLOW_EXTENSIONS_CONFIG_PATH", str(prepare_hermetic_extras(home)))

    # Drop cached singletons only after the env vars above are in place, so the
    # next config load reads the test-only values.
    _reset_process_singletons(monkeypatch)
    from deerflow.config import app_config as app_config_module

    cfg = app_config_module.get_app_config()
    cfg.database.sqlite_dir = str(home / "db")

    # Fail loud on a replay miss. The gateway swallows a hash-miss into a normal
    # assistant error message, so the SSE *shapes* below stay green on a stale
    # fixture — the miss list is the only reliable signal at this layer.
    import replay_provider

    from app.gateway.app import create_app

    replay_provider.reset_replay_misses()

    events = drive_gateway(create_app(), prompt=fixture["prompt"], context=fixture["context"])

    assert events, "replay produced no SSE events"
    assert events[0]["event"] == "metadata", f"first event should be metadata, got {events[0]!r}"
    assert events[-1]["event"] == "end", f"last event should be end (run completed), got {events[-1]!r}"

    misses = replay_provider.replay_misses()
    assert not misses, f"replay miss ({len(misses)}): the fixture is stale vs the current system prompt or agent graph. Re-record it (see backend/docs/REPLAY_E2E.md). Missed hashes: {misses}"

    # Regenerate the committed golden after re-recording the fixture:
    #   DEERFLOW_WRITE_GOLDEN=1 uv run pytest tests/test_replay_golden.py
    # An explicit "off" value must not regenerate: with a bare truthiness check,
    # DEERFLOW_WRITE_GOLDEN=0 would overwrite the golden and pass vacuously.
    write_golden = os.environ.get("DEERFLOW_WRITE_GOLDEN", "").strip().lower() not in {"", "0", "false", "no", "off"}
    if write_golden:
        events_path.write_text(json.dumps({"scenario": scenario, "mode": mode, "events": events}, ensure_ascii=False, indent=2), encoding="utf-8")
        return

    # A missing golden should say how to create it, not just raise FileNotFoundError.
    assert events_path.is_file(), f"golden file {events_path} is missing. Generate it with: DEERFLOW_WRITE_GOLDEN=1 uv run pytest tests/test_replay_golden.py"

    golden = json.loads(events_path.read_text(encoding="utf-8"))["events"]
    # Guards backend SSE protocol drift: the event name + payload-key sequence
    # must match the committed golden. (Replay divergence is caught by the miss
    # assertion above, not here — a swallowed miss keeps the shapes identical.)
    assert events == golden, f"SSE event-shape sequence drifted from the golden.\ngot  ({len(events)}): {[e['event'] for e in events]}\nwant ({len(golden)}): {[e['event'] for e in golden]}"