# deer-flow/backend/tests/test_phase2b_integration.py

"""Phase 2-B integration tests.

End-to-end test: simulate a run's complete lifecycle, verify data
is correctly written to both RunStore and RunEventStore.
"""

import asyncio
from unittest.mock import MagicMock
from uuid import uuid4

import pytest

from deerflow.runtime.events.store.memory import MemoryRunEventStore
from deerflow.runtime.journal import RunJournal
from deerflow.runtime.runs.store.memory import MemoryRunStore


def _make_llm_response(content="Hello", usage=None):
    msg = MagicMock()
    msg.content = content
    msg.tool_calls = []
    msg.response_metadata = {"model_name": "test-model"}
    msg.usage_metadata = usage

    gen = MagicMock()
    gen.message = msg

    response = MagicMock()
    response.generations = [[gen]]
    return response


class TestRunLifecycle:
    """Integration tests driving RunJournal, RunManager and the in-memory stores."""

    @pytest.mark.anyio
    async def test_full_run_lifecycle(self):
        """Simulate a complete run lifecycle with RunStore + RunEventStore."""
        run_store = MemoryRunStore()
        event_store = MemoryRunEventStore()

        # 1. Create run
        await run_store.put("r1", thread_id="t1", status="pending")

        # 2. Write human_message
        await event_store.put(
            thread_id="t1",
            run_id="r1",
            event_type="human_message",
            category="message",
            content="What is AI?",
        )

        # 3. Simulate RunJournal callback sequence
        on_complete_data = {}

        def on_complete(**data):
            # Capture whatever summary the journal reports on completion.
            on_complete_data.update(data)

        journal = RunJournal("r1", "t1", event_store, on_complete=on_complete, flush_threshold=100)
        journal.set_first_human_message("What is AI?")

        # chain_start (top-level). The start and end callbacks of one run
        # share a run_id, so the same id is reused for chain_end below.
        chain_run_id = uuid4()
        journal.on_chain_start({}, {"messages": ["What is AI?"]}, run_id=chain_run_id, parent_run_id=None)

        # llm_start + llm_end
        llm_run_id = uuid4()
        journal.on_llm_start({"name": "gpt-4"}, ["prompt"], run_id=llm_run_id, tags=["lead_agent"])
        usage = {"input_tokens": 50, "output_tokens": 100, "total_tokens": 150}
        journal.on_llm_end(_make_llm_response("AI is artificial intelligence.", usage=usage), run_id=llm_run_id, tags=["lead_agent"])

        # chain_end (triggers on_complete + flush_sync which creates a task)
        journal.on_chain_end({}, run_id=chain_run_id, parent_run_id=None)
        await journal.flush()
        # Let event loop process any pending flush tasks from _flush_sync
        await asyncio.sleep(0.05)

        # 4. Verify the run record written in step 1 is still retrievable
        run_row = await run_store.get("r1")
        assert run_row is not None
        assert run_row["thread_id"] == "t1"
        assert run_row["status"] == "pending"

        # 5. Verify messages
        messages = await event_store.list_messages("t1")
        assert len(messages) == 2  # human + ai
        assert messages[0]["event_type"] == "human_message"
        assert messages[1]["event_type"] == "ai_message"
        assert messages[1]["content"] == "AI is artificial intelligence."

        # 6. Verify events
        events = await event_store.list_events("t1", "r1")
        event_types = {e["event_type"] for e in events}
        assert "run_start" in event_types
        assert "llm_start" in event_types
        assert "llm_end" in event_types
        assert "run_end" in event_types

        # 7. Verify on_complete data
        assert on_complete_data["total_tokens"] == 150
        assert on_complete_data["llm_call_count"] == 1
        assert on_complete_data["lead_agent_tokens"] == 150
        assert on_complete_data["message_count"] == 1
        assert on_complete_data["last_ai_message"] == "AI is artificial intelligence."
        assert on_complete_data["first_human_message"] == "What is AI?"

    @pytest.mark.anyio
    async def test_run_with_tool_calls(self):
        """Simulate a run that uses tools."""
        event_store = MemoryRunEventStore()
        journal = RunJournal("r1", "t1", event_store, flush_threshold=100)

        # tool_start + tool_end for the same tool invocation share one run_id.
        tool_run_id = uuid4()
        journal.on_tool_start({"name": "web_search"}, '{"query": "AI"}', run_id=tool_run_id)
        journal.on_tool_end("Search results...", run_id=tool_run_id, name="web_search")
        await journal.flush()

        events = await event_store.list_events("t1", "r1")
        assert len(events) == 2
        assert events[0]["event_type"] == "tool_start"
        assert events[1]["event_type"] == "tool_end"

    @pytest.mark.anyio
    async def test_multi_run_thread(self):
        """Multiple runs on the same thread maintain unified seq ordering."""
        event_store = MemoryRunEventStore()

        # Run 1
        await event_store.put(thread_id="t1", run_id="r1", event_type="human_message", category="message", content="Q1")
        await event_store.put(thread_id="t1", run_id="r1", event_type="ai_message", category="message", content="A1")

        # Run 2
        await event_store.put(thread_id="t1", run_id="r2", event_type="human_message", category="message", content="Q2")
        await event_store.put(thread_id="t1", run_id="r2", event_type="ai_message", category="message", content="A2")

        messages = await event_store.list_messages("t1")
        assert len(messages) == 4
        # seq keeps counting across runs on the same thread.
        assert [m["seq"] for m in messages] == [1, 2, 3, 4]
        assert messages[0]["run_id"] == "r1"
        assert messages[2]["run_id"] == "r2"

    @pytest.mark.anyio
    async def test_runmanager_with_store_backing(self):
        """RunManager persists to RunStore when one is provided."""
        # Imported inside the test, as in the original, rather than at module level.
        from deerflow.runtime.runs.manager import RunManager
        from deerflow.runtime.runs.schemas import RunStatus

        run_store = MemoryRunStore()
        mgr = RunManager(store=run_store)

        record = await mgr.create("t1", assistant_id="lead_agent")
        # Verify persisted to store
        row = await run_store.get(record.run_id)
        assert row is not None
        assert row["thread_id"] == "t1"
        assert row["status"] == "pending"

        # Status update
        await mgr.set_status(record.run_id, RunStatus.running)
        row = await run_store.get(record.run_id)
        assert row is not None
        assert row["status"] == "running"