From 9d0a42c1fbf11a112916053c11e3f51c38e51aeb Mon Sep 17 00:00:00 2001 From: rayhpeng Date: Wed, 22 Apr 2026 11:28:01 +0800 Subject: [PATCH] refactor(runtime): restructure runs module with new execution architecture Major refactoring of deerflow/runtime/: - runs/callbacks/ - new callback system (builder, events, title, tokens) - runs/internal/ - execution internals (executor, supervisor, stream_logic, registry) - runs/internal/execution/ - execution artifacts and events handling - runs/facade.py - high-level run facade - runs/observer.py - run observation protocol - runs/types.py - type definitions - runs/store/ - simplified store interfaces (create, delete, query, event) Refactor stream_bridge/: - Replace old providers with contract.py and exceptions.py - Remove async_provider.py, base.py, memory.py Add documentation: - README.md and README_zh.md for runtime module Remove deprecated: - manager.py moved to internal/ - worker.py, schemas.py - user_context.py Co-Authored-By: Claude Opus 4.5 --- .../harness/deerflow/runtime/README.md | 594 ++++++++++++++++++ .../harness/deerflow/runtime/README_zh.md | 584 +++++++++++++++++ .../harness/deerflow/runtime/__init__.py | 100 ++- .../harness/deerflow/runtime/actor_context.py | 117 ++++ .../harness/deerflow/runtime/converters.py | 2 +- .../harness/deerflow/runtime/runs/__init__.py | 50 +- .../runtime/runs/callbacks/__init__.py | 15 + .../runtime/runs/callbacks/builder.py | 138 ++++ .../deerflow/runtime/runs/callbacks/events.py | 353 +++++++++++ .../deerflow/runtime/runs/callbacks/title.py | 51 ++ .../deerflow/runtime/runs/callbacks/tokens.py | 122 ++++ .../harness/deerflow/runtime/runs/facade.py | 240 +++++++ .../runtime/runs/internal/__init__.py | 4 + .../runs/internal/execution/__init__.py | 1 + .../runs/internal/execution/artifacts.py | 64 ++ .../runtime/runs/internal/execution/events.py | 45 ++ .../runs/internal/execution/executor.py | 376 +++++++++++ .../runs/internal/execution/stream_logic.py | 93 +++ 
.../runs/internal/execution/supervisor.py | 78 +++ .../runtime/runs/{ => internal}/manager.py | 26 +- .../deerflow/runtime/runs/internal/planner.py | 42 ++ .../runtime/runs/internal/registry.py | 146 +++++ .../deerflow/runtime/runs/internal/streams.py | 76 +++ .../deerflow/runtime/runs/internal/wait.py | 95 +++ .../harness/deerflow/runtime/runs/observer.py | 203 ++++++ .../harness/deerflow/runtime/runs/schemas.py | 21 - .../deerflow/runtime/runs/store/__init__.py | 15 +- .../deerflow/runtime/runs/store/base.py | 95 --- .../runtime/runs/store/create_store.py | 13 + .../runtime/runs/store/delete_store.py | 11 + .../runtime/runs/store/event_store.py | 11 + .../deerflow/runtime/runs/store/memory.py | 98 --- .../runtime/runs/store/query_store.py | 20 + .../harness/deerflow/runtime/runs/types.py | 117 ++++ .../harness/deerflow/runtime/runs/worker.py | 493 --------------- .../harness/deerflow/runtime/serialization.py | 4 +- .../runtime/stream_bridge/__init__.py | 48 +- .../runtime/stream_bridge/async_provider.py | 52 -- .../deerflow/runtime/stream_bridge/base.py | 72 --- .../runtime/stream_bridge/contract.py | 112 ++++ .../runtime/stream_bridge/exceptions.py | 23 + .../deerflow/runtime/stream_bridge/memory.py | 133 ---- .../harness/deerflow/runtime/user_context.py | 167 ----- 43 files changed, 3928 insertions(+), 1192 deletions(-) create mode 100644 backend/packages/harness/deerflow/runtime/README.md create mode 100644 backend/packages/harness/deerflow/runtime/README_zh.md create mode 100644 backend/packages/harness/deerflow/runtime/actor_context.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/callbacks/__init__.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/callbacks/builder.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/callbacks/events.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/callbacks/title.py create mode 100644 
backend/packages/harness/deerflow/runtime/runs/callbacks/tokens.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/facade.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/__init__.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/execution/__init__.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/execution/artifacts.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/execution/events.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/execution/executor.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/execution/stream_logic.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/execution/supervisor.py rename backend/packages/harness/deerflow/runtime/runs/{ => internal}/manager.py (91%) create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/planner.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/registry.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/streams.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/internal/wait.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/observer.py delete mode 100644 backend/packages/harness/deerflow/runtime/runs/schemas.py delete mode 100644 backend/packages/harness/deerflow/runtime/runs/store/base.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/store/create_store.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/store/delete_store.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/store/event_store.py delete mode 100644 backend/packages/harness/deerflow/runtime/runs/store/memory.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/store/query_store.py create mode 100644 backend/packages/harness/deerflow/runtime/runs/types.py delete mode 
100644 backend/packages/harness/deerflow/runtime/runs/worker.py delete mode 100644 backend/packages/harness/deerflow/runtime/stream_bridge/async_provider.py delete mode 100644 backend/packages/harness/deerflow/runtime/stream_bridge/base.py create mode 100644 backend/packages/harness/deerflow/runtime/stream_bridge/contract.py create mode 100644 backend/packages/harness/deerflow/runtime/stream_bridge/exceptions.py delete mode 100644 backend/packages/harness/deerflow/runtime/stream_bridge/memory.py delete mode 100644 backend/packages/harness/deerflow/runtime/user_context.py diff --git a/backend/packages/harness/deerflow/runtime/README.md b/backend/packages/harness/deerflow/runtime/README.md new file mode 100644 index 000000000..769bc3d1c --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/README.md @@ -0,0 +1,594 @@ +# deerflow.runtime Design Overview + +This document describes the current implementation of `backend/packages/harness/deerflow/runtime`, including its overall design, boundary model, the collaboration between `runs` and `stream_bridge`, how it interacts with external infrastructure and the `app` layer, and how `actor_context` is dynamically injected to provide user isolation. + +## 1. Overall Role + +`deerflow.runtime` is the runtime kernel layer of DeerFlow. + +It sits below agents / tools / middlewares and above app / gateway / infra. Its purpose is to define runtime semantics and boundary contracts, without directly owning web endpoints, ORM models, or concrete infrastructure implementations. + +Its public surface is re-exported from [`__init__.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/__init__.py) and currently exposes four main capability areas: + +1. `runs` + - Run domain types, execution facade, lifecycle observers, and store protocols +2. `stream_bridge` + - Stream event bridge contract and public stream types +3. 
`actor_context` + - Request/task-scoped actor context and user-isolation bridge +4. `serialization` + - Runtime serialization helpers for LangChain / LangGraph data and outward-facing events + +Structurally, the current package looks like: + +```text +runtime + ├─ runs + │ ├─ facade / types / observer / store + │ ├─ internal/* + │ └─ callbacks/* + ├─ stream_bridge + │ ├─ contract + │ └─ exceptions + ├─ actor_context + └─ serialization / converters +``` + +## 2. Overall Design and Constraint Model + +### 2.1 Design Goal + +The core goal of `runtime` is to decouple runtime control-plane semantics from infrastructure implementations. + +It only cares about: + +1. What a run is and how run state changes over time +2. What lifecycle events and stream events are produced during execution +3. Which capabilities must be injected from the outside, such as checkpointer, event store, stream bridge, and durable stores +4. Who the current actor is, and how lower layers can use that for isolation + +It deliberately does not care about: + +1. Whether events are stored in memory, Redis, or another transport +2. How run / thread / feedback data is persisted +3. HTTP / SSE / FastAPI details +4. How the auth plugin resolves the request user + +### 2.2 Boundary Rules + +The current package has a fairly clear boundary model: + +1. `runs` owns execution orchestration, not ORM or SQL writes +2. `stream_bridge` defines stream semantics, not app-level bridge construction +3. `actor_context` defines runtime context, not auth-plugin behavior +4. Durable data enters only through boundary protocols: + - `RunCreateStore` + - `RunQueryStore` + - `RunDeleteStore` + - `RunEventStore` +5. Lifecycle side effects enter only through `RunObserver` +6. User isolation is not implemented ad hoc in each module; it is propagated through actor context + +In one sentence: + +`runtime` defines semantics and contracts; `app.infra` provides implementations. + +## 3. 
runs Subsystem Design + +### 3.1 Purpose + +`runtime/runs` is the run orchestration domain. It is responsible for: + +1. Defining run domain objects and status transitions +2. Organizing create / stream / wait / join / cancel / delete behavior +3. Maintaining the in-process runtime control plane +4. Emitting stream events and lifecycle events during execution +5. Collecting trace, token, title, and message data through callbacks + +### 3.2 Core Objects + +See [`runs/types.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/types.py). + +The most important types are: + +1. `RunSpec` + - Built by the app-side input layer + - The real execution input +2. `RunRecord` + - The runtime record managed by `RunRegistry` +3. `RunStatus` + - `pending`, `starting`, `running`, `success`, `error`, `interrupted`, `timeout` +4. `RunScope` + - Distinguishes stateful vs stateless execution and temporary thread behavior + +### 3.3 Current Constraints + +The current implementation explicitly limits some parts of the problem space: + +1. `multitask_strategy` currently supports only `reject` and `interrupt` on the main path +2. `enqueue`, `after_seconds`, and batch execution are not on the current primary path +3. `RunRegistry` is an in-process state source, not a durable source of truth +4. External queries may use durable stores, but the live control plane still centers on the in-memory registry + +### 3.4 Facade and Internal Components + +`RunsFacade` in [`runs/facade.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/facade.py) provides the unified API: + +1. `create_background` +2. `create_and_stream` +3. `create_and_wait` +4. `join_stream` +5. `join_wait` +6. `cancel` +7. `get_run` +8. `list_runs` +9. `delete_run` + +Internally it composes: + +1. `RunRegistry` +2. `ExecutionPlanner` +3. `RunSupervisor` +4. `RunStreamService` +5. `RunWaitService` +6. 
`RunCreateStore` / `RunQueryStore` / `RunDeleteStore` +7. `RunObserver` + +So `RunsFacade` is the public entry point, while execution and state transitions are distributed across smaller components. + +## 4. stream_bridge Design and Implementation + +### 4.1 Why stream_bridge Is a Separate Abstraction + +`StreamBridge` is defined in [`stream_bridge/contract.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/stream_bridge/contract.py). + +It exists because run execution needs an event channel that is: + +1. Subscribable +2. Replayable +3. Terminal-state aware +4. Resume-capable + +That behavior must not be hard-coupled to HTTP SSE, in-memory queues, or Redis-specific details. + +So: + +1. harness defines stream semantics +2. the app layer owns backend selection and implementation + +### 4.2 Contract Contents + +The abstract `StreamBridge` currently exposes: + +1. `publish(run_id, event, data)` +2. `publish_end(run_id)` +3. `publish_terminal(run_id, kind, data)` +4. `subscribe(run_id, last_event_id, heartbeat_interval)` +5. `cleanup(run_id, delay=0)` +6. `cancel(run_id)` +7. `mark_awaiting_input(run_id)` +8. `start()` +9. `close()` + +Public types include: + +1. `StreamEvent` +2. `StreamStatus` +3. `ResumeResult` +4. `HEARTBEAT_SENTINEL` +5. `END_SENTINEL` +6. `CANCELLED_SENTINEL` + +### 4.3 Semantic Boundary + +The contract explicitly distinguishes: + +1. `end` / `cancel` / `error` + - Real business-level terminal events for a run +2. `close()` + - Bridge-level shutdown + - Not equivalent to run cancellation + +### 4.4 Current Implementation Style + +The concrete implementation currently used is the app-layer [`MemoryStreamBridge`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/stream_bridge/adapters/memory.py). + +Its design is effectively “one in-memory event log per run”: + +1. `_RunStream` stores the event list, offset mapping, status, subscriber count, and awaiting-input state +2. 
`publish()` generates increasing event IDs and appends to the per-run log +3. `subscribe()` supports replay, heartbeat, resume, and terminal exit +4. `cleanup_loop()` handles: + - old streams + - active streams with no publish activity + - orphan terminal streams + - TTL expiration +5. `mark_awaiting_input()` extends timeout behavior for HITL flows + +The Redis implementation is still only a placeholder in [`RedisStreamBridge`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/stream_bridge/adapters/redis.py). + +### 4.5 Call Chain + +The stream bridge participates in the execution chain like this: + +```text +RunsFacade + -> RunStreamService + -> StreamBridge + -> app route converts events to SSE +``` + +More concretely: + +1. `_RunExecution._start()` publishes `metadata` +2. `_RunExecution._stream()` converts agent `astream()` output into bridge events +3. `_RunExecution._finish_success()` / `_finish_failed()` / `_finish_aborted()` publish terminal events +4. `RunWaitService` waits by subscribing for `values`, `error`, or terminal events +5. The app route layer converts those events into outward-facing SSE + +### 4.6 Future Extensions + +Likely future directions include: + +1. A real Redis bridge for cross-process / multi-instance streaming +2. Stronger Last-Event-ID gap recovery behavior +3. Richer HITL state handling +4. Cross-node run coordination and more explicit dead-letter strategies + +## 5. External Communication and Store Read/Write Boundaries + +### 5.1 Two Main Outward Boundaries + +`runtime` does not send HTTP requests directly and does not write ORM models directly, but it communicates outward through two main boundaries: + +1. `StreamBridge` + - For outward-facing stream events +2. 
`store` / `observer` + - For durable data and lifecycle side effects + +### 5.2 Store Boundary Protocols + +Under [`runs/store`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/store), the harness layer defines: + +1. `RunCreateStore` +2. `RunQueryStore` +3. `RunDeleteStore` +4. `RunEventStore` + +These are not harness-internal persistence implementations. They are app-facing contracts declared by the runtime. + +### 5.3 How the app Layer Supplies Store Implementations + +The app layer currently provides: + +1. [`AppRunCreateStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/store/create_store.py) +2. [`AppRunQueryStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/store/query_store.py) +3. [`AppRunDeleteStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/store/delete_store.py) +4. [`AppRunEventStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/storage/run_events.py) +5. [`JsonlRunEventStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/run_events/jsonl_store.py) + +The shared pattern is: + +1. harness depends only on protocols +2. the app layer owns session lifecycle, commit behavior, access control, and backend choice +3. durable data eventually lands in `store.repositories.*` or JSONL files + +### 5.4 How Run Lifecycle Data Leaves the Runtime + +The single-run executor [`_RunExecution`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/internal/execution/executor.py) does not write to the database directly. + +It exports data through three paths: + +1. bridge events + - Streamed outward to subscribers +2. callback -> `RunEventStore` + - Execution trace / message / tool / custom events are persisted in batches +3. 
lifecycle event -> `RunObserver` + - Run started, completed, failed, cancelled, and thread-status updates are emitted for app observers + +### 5.5 `RunEventStore` Backends + +The app-side factory [`app/infra/run_events/factory.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/run_events/factory.py) currently selects: + +1. `run_events.backend == "db"` + - `AppRunEventStore` +2. `run_events.backend == "jsonl"` + - `JsonlRunEventStore` + +So the runtime does not care whether events end up in a database or in files. It only requires the event-store protocol. + +## 6. Run Lifecycle Data, Callbacks, Write-Back, and Query Flow + +### 6.1 Main Single-Run Flow + +The main `_RunExecution.run()` flow is: + +1. `_start()` +2. `_prepare()` +3. `_stream()` +4. `_finish_after_stream()` +5. `finally` + - `_emit_final_thread_status()` + - `callbacks.flush()` + - `bridge.cleanup(run_id)` + +### 6.2 What the Start Phase Records + +`_start()`: + +1. sets run status to `running` +2. emits `RUN_STARTED` +3. extracts the first human message and emits `HUMAN_MESSAGE` +4. captures the pre-run checkpoint ID +5. publishes a `metadata` stream event + +### 6.3 What the Callbacks Collect + +Callbacks live under [`runs/callbacks`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/callbacks). + +The main ones are: + +1. `RunEventCallback` + - Records `run_start`, `run_end`, `llm_request`, `llm_response`, `tool_start`, `tool_end`, `tool_result`, `custom_event`, and more + - Flushes batches into `RunEventStore` +2. `RunTokenCallback` + - Aggregates token usage, LLM call counts, lead/subagent/middleware token split, message counts, first human message, and last AI message +3. `RunTitleCallback` + - Extracts thread title from title middleware output or custom events + +### 6.4 How completion_data Is Produced + +`RunTokenCallback.completion_data()` yields `RunCompletionData`, including: + +1. `total_input_tokens` +2. 
`total_output_tokens` +3. `total_tokens` +4. `llm_call_count` +5. `lead_agent_tokens` +6. `subagent_tokens` +7. `middleware_tokens` +8. `message_count` +9. `last_ai_message` +10. `first_human_message` + +The executor includes this data in lifecycle payloads on success, failure, and cancellation. + +### 6.5 How the app Layer Writes Lifecycle Results Back + +The executor emits `RunLifecycleEvent` objects through [`RunEventEmitter`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/internal/execution/events.py). + +The app-layer [`StorageRunObserver`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/storage/runs.py) then persists durable state: + +1. `RUN_STARTED` + - Marks the run as `running` +2. `RUN_COMPLETED` + - Writes completion data + - Syncs thread title if present +3. `RUN_FAILED` + - Writes error and completion data +4. `RUN_CANCELLED` + - Writes `interrupted` state and completion data +5. `THREAD_STATUS_UPDATED` + - Syncs thread status + +### 6.6 Query Paths + +`RunsFacade.get_run()` and `list_runs()` have two paths: + +1. If a `RunQueryStore` is injected, durable state is used first +2. Otherwise, the facade falls back to `RunRegistry` + +So: + +1. the in-memory registry is the control plane +2. the durable store is the preferred query surface + +## 7. How actor_context Is Dynamically Injected for User Isolation + +### 7.1 Design Goal + +`actor_context` is defined in [`actor_context.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/actor_context.py). + +Its purpose is to let the runtime and lower-level infrastructure modules depend on a stable notion of “who the current actor is” without importing the auth plugin, FastAPI request objects, or a specific user model. + +### 7.2 Current Implementation + +The current implementation is a request/task-scoped context built on top of `ContextVar`: + +1. `ActorContext` + - Currently carries only `user_id` +2. 
`_current_actor` + - A `ContextVar[ActorContext | None]` +3. `bind_actor_context(actor)` + - Binds the current actor +4. `reset_actor_context(token)` + - Restores the previous context +5. `get_actor_context()` + - Returns the current actor +6. `get_effective_user_id()` + - Returns the current user ID or `DEFAULT_USER_ID` +7. `resolve_user_id(value=AUTO | explicit | None)` + - Resolves repository/storage-facing user IDs consistently + +### 7.3 How the app Layer Injects It Dynamically + +Dynamic injection currently happens at the app/auth boundary. + +For HTTP request flows: + +1. [`app.plugins.auth.security.middleware`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/plugins/auth/security/middleware.py) + - Builds `ActorContext(user_id=...)` from the authenticated request user + - Binds and resets runtime actor context around request handling +2. [`app.plugins.auth.security.actor_context`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/plugins/auth/security/actor_context.py) + - Provides `bind_request_actor_context(request)` and `bind_user_actor_context(user_id)` + - Allows routes and non-HTTP entry points to bind runtime actor context explicitly + +For non-HTTP / external channel flows: + +1. [`app/channels/manager.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/channels/manager.py) +2. [`app/channels/feishu.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/channels/feishu.py) + +Those entry points also wrap execution with `bind_user_actor_context(user_id)` before they enter runtime-facing code. This matters because: + +1. the runtime does not need to distinguish HTTP from Feishu or other channels +2. any entry point that can resolve a user ID can inject the same isolation semantics +3. the same runtime/store/path/memory code can stay protocol-agnostic + +So the runtime itself does not know what a request is, and it does not know the auth plugin’s user model. 
It only knows whether an `ActorContext` is currently bound in the `ContextVar`. + +### 7.4 Propagation Semantics After Injection + +In practice, “dynamic injection” here does not mean manually threading `user_id` through every function signature. The app boundary binds the actor into a `ContextVar`, and runtime-facing code reads it only where isolation is actually needed. + +The current semantics are: + +1. an entry boundary calls `bind_actor_context(...)` +2. the async call chain created inside that context sees the same actor view +3. the boundary restores the previous value with `reset_actor_context(token)` when the request/task exits + +That gives two practical outcomes: + +1. most runtime interfaces do not need to carry `user_id` as an explicit parameter through every layer +2. boundaries that do need durable isolation or path isolation can still read explicitly via `resolve_user_id()` or `get_effective_user_id()` + +### 7.5 How User Isolation Actually Works + +User isolation is implemented through “dynamic injection + boundary-specific reads”. + +The main paths are: + +1. path / uploads / sandbox / memory + - Use `get_effective_user_id()` to derive per-user directories and resource scopes +2. app storage adapters + - Use `resolve_user_id(AUTO)` in `RunStoreAdapter`, `ThreadMetaStorage`, and related boundaries +3. run event store + - `AppRunEventStore` reads `get_actor_context()` and decides whether the current actor may see a thread + +So user isolation is not centralized in a single middleware and then forgotten. Instead: + +1. the app boundary dynamically binds the actor into runtime context +2. runtime and lower layers read that context when they need isolation input +3. each boundary applies the user ID according to its own responsibility + +### 7.6 Why This Approach Works Well + +The current design has several practical strengths: + +1. The runtime does not depend on a specific auth implementation +2. 
HTTP and non-HTTP entry points can reuse the same isolation mechanism +3. The same user ID propagates naturally into paths, memory, store access, and event visibility +4. Where stronger enforcement is needed, `AUTO` + `resolve_user_id()` can require a bound actor context + +### 7.7 Future Extensions + +`ActorContext` already contains explicit future-extension hints. The current pattern can be extended without changing the architecture: + +1. `tenant_id` + - For multi-tenant isolation +2. `subject_id` + - For a more stable identity key +3. `scopes` + - For finer-grained authorization +4. `auth_source` + - To track the source channel or auth mechanism + +The recommended extension model is to preserve the current shape: + +1. The app/auth boundary binds a richer `ActorContext` +2. The runtime depends only on abstract context fields, never on request/user objects +3. Lower layers read only the fields they actually need +4. Store / path / sandbox / stream / memory boundaries can gradually become tenant-aware or scope-aware + +More concretely, stronger isolation can be added incrementally at the boundaries: + +1. store boundaries + - add `tenant_id` filtering in `RunStoreAdapter`, `ThreadMetaStorage`, and feedback/event stores +2. path and sandbox boundaries + - shard directories by `tenant_id/user_id` instead of `user_id` alone +3. event-visibility boundaries + - layer `scopes` or `subject_id` checks into run-event and thread queries +4. external-channel boundaries + - populate `auth_source` so API, channel, and internal-job traffic can be distinguished + +That keeps the runtime dependent on the abstract “current actor context” concept, not on FastAPI request objects or a specific auth implementation. + +## 8. 
Interaction with the app Layer + +### 8.1 How the app Layer Wires the Runtime + +The app composition root for runs is [`app/gateway/services/runs/facade_factory.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/facade_factory.py). + +It assembles: + +1. `RunRegistry` +2. `ExecutionPlanner` +3. `RunSupervisor` +4. `RunStreamService` +5. `RunWaitService` +6. `RunsRuntime` + - `bridge` + - `checkpointer` + - `store` + - `event_store` + - `agent_factory_resolver` +7. `StorageRunObserver` +8. `AppRunCreateStore` +9. `AppRunQueryStore` +10. `AppRunDeleteStore` + +### 8.2 How app.state Provides Infrastructure + +In [`app/gateway/registrar.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/registrar.py): + +1. `init_persistence()` creates: + - `persistence` + - `checkpointer` + - `run_store` + - `thread_meta_storage` + - `run_event_store` +2. `init_runtime()` creates: + - `stream_bridge` + +Those objects are then attached to `app.state` for dependency injection and facade construction. + +### 8.3 The app Boundary for `stream_bridge` + +Concrete stream bridge construction now belongs entirely to the app layer: + +1. harness exports only the `StreamBridge` contract +2. [`app.infra.stream_bridge.build_stream_bridge`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/stream_bridge/factory.py) constructs the actual implementation + +That is a very explicit boundary: + +1. harness defines runtime semantics and interfaces +2. app selects and constructs infrastructure + +## 9. Summary + +The most accurate one-line summary of `deerflow.runtime` today is: + +It is a runtime kernel built around run orchestration, a stream bridge as the streaming boundary, actor context as the dynamic isolation bridge, and store / observer protocols as the durable and side-effect boundaries. + +More concretely: + +1. `runs` owns orchestration and lifecycle progression +2. `stream_bridge` owns stream semantics +3. 
`actor_context` owns runtime-scoped user context and isolation bridging +4. `serialization` / `converters` own outward event and message formatting +5. the app layer owns real persistence, stream infrastructure, and auth-driven context injection + +The main strengths of this structure are: + +1. Runtime semantics are decoupled from infrastructure implementations +2. Request identity is decoupled from runtime logic +3. HTTP, CLI, and channel-worker entry points can reuse the same runtime boundaries +4. The system can grow toward multi-tenancy, cross-process stream bridges, and richer durable backends without changing the core model + +The current limitations are also clear: + +1. `RunRegistry` is still an in-process control plane +2. The Redis bridge is not implemented yet +3. Some multitask strategies and batch capabilities are still outside the main path +4. `ActorContext` currently carries only `user_id`, not richer fields such as tenant, scopes, or auth source + +So the best way to understand the current code is not as a final platform, but as a runtime kernel with clear semantics and extension boundaries. diff --git a/backend/packages/harness/deerflow/runtime/README_zh.md b/backend/packages/harness/deerflow/runtime/README_zh.md new file mode 100644 index 000000000..740b938cc --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/README_zh.md @@ -0,0 +1,584 @@ +# deerflow.runtime 设计说明 + +本文基于当前代码实现,说明 `backend/packages/harness/deerflow/runtime` 的总体设计、约束边界、`stream_bridge` 与 `runs` 的协作方式、与外部基础设施和 `app` 层的交互方式,以及 `actor_context` 如何通过动态注入实现用户隔离。 + +## 1. 总体定位 + +`deerflow.runtime` 是 DeerFlow 的运行时内核层。它位于 agent / tool / middleware 之下、app / gateway / infra 之上,主要负责定义“运行时语义”和“基础边界契约”,而不直接拥有 Web 接口、数据库模型或具体基础设施实现。 + +当前 `runtime` 的公开表面由 [`__init__.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/__init__.py) 统一导出,主要包括四类能力: + +1. `runs` + - run 领域类型、执行 façade、生命周期观察者、store 协议 +2. `stream_bridge` + - 流式事件桥接契约与公共类型 +3. 
`actor_context` + - 请求/任务级的 actor 上下文与用户隔离桥 +4. `serialization` + - 运行时对外事件与 LangChain / LangGraph 数据的序列化能力 + +从结构上看,可以把当前 `runtime` 理解成: + +```text +runtime + ├─ runs + │ ├─ facade / types / observer / store + │ ├─ internal/* + │ └─ callbacks/* + ├─ stream_bridge + │ ├─ contract + │ └─ exceptions + ├─ actor_context + └─ serialization / converters +``` + +## 2. 总体设计与约束范式 + +### 2.1 设计目标 + +`runtime` 当前最核心的设计目标是把“运行时控制面”和“基础设施实现”解耦。 + +它自己只关心: + +1. run 是什么、状态如何变化 +2. 执行时会产出哪些生命周期事件和流式事件 +3. 哪些能力必须由外部注入,例如 checkpointer、event store、stream bridge、durable store +4. 当前 actor 是谁,以及下游如何据此做隔离 + +它刻意不关心: + +1. 事件是落到内存、Redis 还是别的消息介质 +2. run / thread / feedback 是怎么持久化的 +3. HTTP / SSE / FastAPI 细节 +4. 认证插件如何识别 request user + +### 2.2 约束边界 + +当前 `runtime` 的边界约束比较明确: + +1. `runs` 负责运行编排,不直接写 ORM 或 SQL。 +2. `stream_bridge` 只定义流语义,不提供 app 级基础设施装配。 +3. `actor_context` 只定义运行时上下文,不依赖 auth plugin。 +4. durable 数据只能通过协议边界接入: + - `RunCreateStore` + - `RunQueryStore` + - `RunDeleteStore` + - `RunEventStore` +5. 生命周期副作用只能通过 `RunObserver` 接入。 +6. 用户隔离不是散落在每个模块里做,而是通过 actor context 自上而下传递。 + +这套范式可以概括成一句话: + +`runtime` 定义语义和边界,`app.infra` 提供实现和装配。 + +## 3. runs 子系统的设计 + +### 3.1 作用 + +`runtime/runs` 是运行编排域。它负责: + +1. 定义 run 的领域对象与状态机 +2. 组织 create / stream / wait / join / cancel / delete 等操作 +3. 维护进程内运行控制面 +4. 在执行期间发出流式事件与生命周期事件 +5. 通过 callbacks 收集 trace、token、title、message 等运行数据 + +### 3.2 核心对象 + +见 [`runs/types.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/types.py)。 + +关键对象有: + +1. `RunSpec` + - 由 app 输入层构建,是执行器输入 +2. `RunRecord` + - 运行中的记录对象,由 `RunRegistry` 管理 +3. `RunStatus` + - `pending` / `starting` / `running` / `success` / `error` / `interrupted` / `timeout` +4. `RunScope` + - 区分 stateful / stateless 与临时 thread + +### 3.3 当前约束 + +当前 `runs` 明确限制了一些能力范围: + +1. `multitask_strategy` 当前主路径只支持 `reject` 和 `interrupt` +2. `enqueue`、`after_seconds`、批量执行等尚未进入当前主路径 +3. `RunRegistry` 是进程内状态,不是 durable source of truth +4. 
外部查询可以走 durable store,但控制面仍然以内存 registry 为中心 + +### 3.4 façade 与内部组件 + +`RunsFacade` 在 [`runs/facade.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/facade.py) 中暴露统一入口: + +1. `create_background` +2. `create_and_stream` +3. `create_and_wait` +4. `join_stream` +5. `join_wait` +6. `cancel` +7. `get_run` +8. `list_runs` +9. `delete_run` + +它底层组合了: + +1. `RunRegistry` +2. `ExecutionPlanner` +3. `RunSupervisor` +4. `RunStreamService` +5. `RunWaitService` +6. `RunCreateStore` / `RunQueryStore` / `RunDeleteStore` +7. `RunObserver` + +也就是说,`RunsFacade` 是 public entry point,而真正的执行和状态推进拆散在内部组件中。 + +## 4. stream_bridge 的设计和实现思路 + +### 4.1 为什么单独抽象 + +`StreamBridge` 在 [`stream_bridge/contract.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/stream_bridge/contract.py) 中定义。 + +把它单独抽象出来的原因是:run 执行期间需要一个“可订阅、可回放、可终止、可恢复”的事件通道,而这件事不能直接绑定到 HTTP SSE、in-memory queue 或 Redis 细节。 + +所以: + +1. harness 负责定义流语义 +2. app 层负责选择和实现流后端 + +### 4.2 契约内容 + +`StreamBridge` 当前提供这些关键方法: + +1. `publish(run_id, event, data)` +2. `publish_end(run_id)` +3. `publish_terminal(run_id, kind, data)` +4. `subscribe(run_id, last_event_id, heartbeat_interval)` +5. `cleanup(run_id, delay=0)` +6. `cancel(run_id)` +7. `mark_awaiting_input(run_id)` +8. `start()` +9. `close()` + +公共类型包括: + +1. `StreamEvent` +2. `StreamStatus` +3. `ResumeResult` +4. `HEARTBEAT_SENTINEL` +5. `END_SENTINEL` +6. `CANCELLED_SENTINEL` + +### 4.3 语义边界 + +当前契约显式区分了两类终止语义: + +1. `end` / `cancel` / `error` + - 是 run 级别的真实业务终止事件 +2. `close()` + - 是 bridge 自身关闭 + - 不应被当作 run 被取消 + +### 4.4 当前实现方式 + +当前实际使用的实现是 app 层的 [`MemoryStreamBridge`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/stream_bridge/adapters/memory.py)。 + +它的设计是“每个 run 一条内存事件日志”: + +1. `_RunStream` 保存事件列表、offset 映射、状态、subscriber 计数和 awaiting-input 标记 +2. `publish()` 生成递增事件 ID 并追加到 per-run log +3. `subscribe()` 支持 replay、heartbeat、resume、terminal 退出 +4. 
`cleanup_loop()` 处理: + - 过老 stream + - 长时间无 publish 的 active stream + - orphan terminal stream + - TTL 过期 stream +5. `mark_awaiting_input()` 为 HITL 场景延长超时 + +Redis 版本当前仍在 [`RedisStreamBridge`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/stream_bridge/adapters/redis.py) 中作为占位。 + +### 4.5 调用链路 + +stream bridge 在运行链路中的作用可以概括为: + +```text +RunsFacade + -> RunStreamService + -> StreamBridge + -> app route converts events to SSE +``` + +更具体地说: + +1. `_RunExecution._start()` 会发布 `metadata` +2. `_RunExecution._stream()` 会把 agent 的 `astream()` 输出统一转成 bridge 事件 +3. `_RunExecution._finish_success()` / `_finish_failed()` / `_finish_aborted()` 会发布 terminal 事件 +4. `RunWaitService` 通过 `subscribe()` 等待 `values` / `error` / terminal +5. app 路由层再把这些事件转换为对外 SSE + +### 4.6 后续扩展 + +后续可以沿几个方向扩展: + +1. Redis 真正落地,支持跨进程 / 多实例流桥接 +2. 更完整的 Last-Event-ID gap recovery +3. 更细粒度的 HITL 状态管理 +4. 跨节点运行协调和 dead-letter 策略 + +## 5. 如何与外部通信,store 如何读写数据 + +### 5.1 两条主要外部边界 + +`runtime` 自身不直接发 HTTP 请求,也不直接写 ORM,但通过两条主边界与外界交互: + +1. `StreamBridge` + - 对外输出流式运行事件 +2. `store` / `observer` + - 对外输出 durable 数据与生命周期副作用 + +### 5.2 store 边界协议 + +在 [`runs/store`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/store) 中定义了四个协议: + +1. `RunCreateStore` +2. `RunQueryStore` +3. `RunDeleteStore` +4. `RunEventStore` + +这些协议不是 harness 内部的数据层,而是 harness 对 app 层的依赖声明。 + +### 5.3 app 层如何提供 store 实现 + +当前 app 层提供了这些实现: + +1. [`AppRunCreateStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/store/create_store.py) +2. [`AppRunQueryStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/store/query_store.py) +3. [`AppRunDeleteStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/store/delete_store.py) +4. [`AppRunEventStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/storage/run_events.py) +5. 
[`JsonlRunEventStore`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/run_events/jsonl_store.py) + +这里的统一模式是: + +1. harness 只看协议 +2. app 层自己决定 session、commit、访问控制和后端选型 +3. durable 数据最终通过 `store.repositories.*` 落数据库,或者通过 JSONL 落盘 + +### 5.4 runs 生命周期数据是怎么写出去的 + +单次执行器 [`_RunExecution`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/internal/execution/executor.py) 不直接写数据库。 + +它把数据写出去的方式有三条: + +1. bridge 事件 + - 流式发布给订阅者 +2. callback -> `RunEventStore` + - 执行 trace / message / tool / custom event 以批次方式落地 +3. lifecycle event -> `RunObserver` + - 把 run 开始、完成、失败、取消、thread status 更新发给 app 层观察者 + +### 5.5 `RunEventStore` 的后端 + +`RunEventStore` 当前由 app 层工厂 [`app/infra/run_events/factory.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/run_events/factory.py) 统一构造: + +1. `run_events.backend == "db"` + - 走 `AppRunEventStore` +2. `run_events.backend == "jsonl"` + - 走 `JsonlRunEventStore` + +因此,`runtime` 不关心事件最终是数据库还是文件,它只要求支持 `put_batch()` 和相关读取方法。 + +## 6. runs 生命周期数据、callback 和查询回写 + +### 6.1 单次 run 的主流程 + +`_RunExecution.run()` 的主流程是: + +1. `_start()` +2. `_prepare()` +3. `_stream()` +4. `_finish_after_stream()` +5. `finally` + - `_emit_final_thread_status()` + - `callbacks.flush()` + - `bridge.cleanup(run_id)` + +### 6.2 start 阶段记录什么 + +`_start()` 会: + +1. 把 run 状态置为 `running` +2. 发出 `RUN_STARTED` +3. 抽取首条 human message,并发出 `HUMAN_MESSAGE` +4. 捕获 pre-run checkpoint id +5. 发布 `metadata` 流事件 + +### 6.3 callbacks 收集什么 + +当前 callbacks 位于 [`runs/callbacks`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/callbacks)。 + +主要有三类: + +1. `RunEventCallback` + - 记录 run_start / run_end / llm_request / llm_response / tool_start / tool_end / tool_result / custom_event 等 + - 按批 flush 到 `RunEventStore` +2. `RunTokenCallback` + - 聚合 token 使用、LLM 调用次数、lead/subagent/middleware token、message_count、首条 human message、最后一条 AI message +3. 
`RunTitleCallback` + - 从 title middleware 响应或 custom event 中提取 thread title + +### 6.4 completion_data 如何形成 + +`RunTokenCallback.completion_data()` 会得到 `RunCompletionData`,包括: + +1. `total_input_tokens` +2. `total_output_tokens` +3. `total_tokens` +4. `llm_call_count` +5. `lead_agent_tokens` +6. `subagent_tokens` +7. `middleware_tokens` +8. `message_count` +9. `last_ai_message` +10. `first_human_message` + +执行器在完成 / 失败 / 取消时都会把这份数据带入 lifecycle payload。 + +### 6.5 app 层如何回写 + +执行器通过 [`RunEventEmitter`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/runs/internal/execution/events.py) 发出 `RunLifecycleEvent`。 + +app 层 [`StorageRunObserver`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/storage/runs.py) 再根据事件类型回写 durable 状态: + +1. `RUN_STARTED` + - 更新 run 状态为 `running` +2. `RUN_COMPLETED` + - 写 completion_data + - 同步 title 到 thread metadata +3. `RUN_FAILED` + - 写 error 和 completion_data +4. `RUN_CANCELLED` + - 写 `interrupted` 状态与 completion_data +5. `THREAD_STATUS_UPDATED` + - 同步 thread status + +### 6.6 查询路径 + +`RunsFacade.get_run()` / `list_runs()` 有两条路径: + +1. 注入了 `RunQueryStore` 时,优先查 durable store +2. 否则回退到 `RunRegistry` + +这意味着: + +1. 内存 registry 负责控制面 +2. durable store 负责对外查询面 + +## 7. actor_context 如何动态注入并实现用户隔离 + +### 7.1 设计目标 + +`actor_context` 在 [`actor_context.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/packages/harness/deerflow/runtime/actor_context.py) 中定义。 + +它的目标是让 runtime 和下游基础模块可以依赖“当前 actor 是谁”这个运行时事实,但不直接依赖 auth plugin、FastAPI request 或具体用户模型。 + +### 7.2 当前实现方式 + +当前实现是一个基于 `ContextVar` 的请求/任务级上下文: + +1. `ActorContext` + - 当前只有 `user_id` +2. `_current_actor` + - `ContextVar[ActorContext | None]` +3. `bind_actor_context(actor)` + - 绑定当前 actor +4. `reset_actor_context(token)` + - 恢复之前上下文 +5. `get_actor_context()` + - 获取当前 actor +6. `get_effective_user_id()` + - 取当前 user_id,如果没有则返回 `DEFAULT_USER_ID` +7. 
`resolve_user_id(value=AUTO | explicit | None)` + - 在 repository / storage 边界统一解析 user_id + +### 7.3 app 如何动态注入 + +动态注入链路当前在 auth plugin 侧完成。 + +HTTP 请求路径: + +1. [`app.plugins.auth.security.middleware`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/plugins/auth/security/middleware.py) + - 从认证后的 request user 构造 `ActorContext(user_id=...)` + - 在请求处理期间绑定 / 重置 runtime actor context +2. [`app.plugins.auth.security.actor_context`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/plugins/auth/security/actor_context.py) + - 提供 `bind_request_actor_context(request)` 和 `bind_user_actor_context(user_id)` + - 在路由或非 HTTP 入口中显式绑定 runtime actor + +非 HTTP / 外部通道路径: + +1. [`app/channels/manager.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/channels/manager.py) +2. [`app/channels/feishu.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/channels/feishu.py) + +这些入口在把外部消息转入 runtime 前,也会用 `bind_user_actor_context(user_id)` 包住执行过程。这样做的意义是: + +1. runtime 不区分请求来自 HTTP、飞书还是别的 channel +2. 只要入口能解析出 user_id,就能把同一套隔离语义注入进去 +3. 同一份 runtime/store/path/memory 代码不需要知道上层协议来源 + +因此 runtime 自己不知道 request 是什么,也不知道 auth plugin 的 user model 长什么样;它只知道当前 `ContextVar` 中是否绑定了 `ActorContext`。 + +### 7.4 注入后的传播语义 + +这里的“动态注入”本质上不是把 `user_id` 一层层作为函数参数硬传下去,而是在 app 边界把 actor 绑定进 `ContextVar`,让当前请求/任务上下文中的 runtime 代码按需读取。 + +当前语义可以理解为: + +1. 入口边界先 `bind_actor_context(...)` +2. 在该上下文内创建的异步调用链共享同一个 actor 视图 +3. 请求结束或任务退出后用 `reset_actor_context(token)` 恢复 + +这有两个直接效果: + +1. 运行链路中的大部分接口不需要把 `user_id` 塞进每一层函数签名 +2. 真正需要 durable 隔离或路径隔离的边界,仍然可以通过 `resolve_user_id()` / `get_effective_user_id()` 显式取值 + +### 7.5 用户隔离如何生效 + +用户隔离当前是通过“动态注入 + 下游统一读取”实现的。 + +几条关键链路如下: + +1. path / uploads / sandbox / memory + - 通过 `get_effective_user_id()` 把 user_id 带入路径解析和目录隔离 +2. app storage adapter + - 通过 `resolve_user_id(AUTO)` 在 `RunStoreAdapter`、`ThreadMetaStorage` 等处做查询和写入隔离 +3. 
run event store + - `AppRunEventStore` 会读取 `get_actor_context()`,判断当前 actor 是否可见指定 thread + +也就是说,用户隔离并不是靠单一中间件“一次性做完”,而是: + +1. app 边界把 actor 动态绑定进 runtime context +2. runtime 及其下游模块在需要时读取该 context +3. 每个边界按自己的职责决定如何使用 user_id + +### 7.6 这种方式的优点 + +当前设计有几个明显优点: + +1. runtime 不依赖具体 auth 实现 +2. HTTP 和非 HTTP 入口都能复用同一套隔离机制 +3. user_id 可以自然传递到路径、memory、store、事件可见性等不同边界 +4. 需要强约束时可通过 `AUTO` + `resolve_user_id()` 强制要求 actor context 存在 + +### 7.7 后续如何扩展 + +`ActorContext` 文件里已经预留了扩展点注释,后续完全可以在不破坏当前模式的前提下继续扩展: + +1. `tenant_id` + - 用于多租户隔离 +2. `subject_id` + - 用于更稳定的主体标识 +3. `scopes` + - 用于更细粒度授权 +4. `auth_source` + - 用于记录来源渠道 + +扩展方式建议保持现有模式不变: + +1. 继续由 app/auth 边界负责绑定 richer `ActorContext` +2. runtime 只依赖抽象上下文字段,不依赖 request/user 对象 +3. 下游基础模块按需读取必要字段 +4. 在 store / path / sandbox / stream / memory 等边界逐步引入 tenant-aware 或 scope-aware 行为 + +更具体地说,后续如果要做多租户和更强隔离,推荐按边界渐进式扩展: + +1. store 边界 + - 在 `RunStoreAdapter`、`ThreadMetaStorage`、feedback/event store 中引入 `tenant_id` 过滤 +2. 路径与沙箱边界 + - 把目录分片从 `user_id` 扩展成 `tenant_id/user_id` +3. 事件可见性边界 + - 在 run event 查询和 thread 查询时叠加 `scopes` 或 `subject_id` +4. 外部通道边界 + - 为不同来源填充 `auth_source`,区分 API / channel / internal job + +这样 runtime 仍然只依赖“当前 actor 上下文”这个抽象,不会重新耦合回 FastAPI request 或某个认证实现。 + +## 8. 与 app 层的交互 + +### 8.1 app 如何装配 runtime + +当前 app 层会在 [`app/gateway/services/runs/facade_factory.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/services/runs/facade_factory.py) 装配 `RunsFacade`。 + +它会组装: + +1. `RunRegistry` +2. `ExecutionPlanner` +3. `RunSupervisor` +4. `RunStreamService` +5. `RunWaitService` +6. `RunsRuntime` + - `bridge` + - `checkpointer` + - `store` + - `event_store` + - `agent_factory_resolver` +7. `StorageRunObserver` +8. `AppRunCreateStore` +9. `AppRunQueryStore` +10. `AppRunDeleteStore` + +### 8.2 app.state 如何提供基础设施 + +在 [`app/gateway/registrar.py`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/gateway/registrar.py): + +1. 
`init_persistence()` 创建: + - `persistence` + - `checkpointer` + - `run_store` + - `thread_meta_storage` + - `run_event_store` +2. `init_runtime()` 创建: + - `stream_bridge` + +然后这些对象挂在 `app.state`,供依赖注入和 façade 构造使用。 + +### 8.3 `stream_bridge` 的 app 边界 + +当前具体 stream bridge 的装配已经完全属于 app 层: + +1. harness 只导出 `StreamBridge` 契约 +2. 具体实现由 [`app.infra.stream_bridge.build_stream_bridge`](/Users/rayhpeng/workspace/open-source/deer-flow/backend/app/infra/stream_bridge/factory.py) 构造 + +这条边界非常清晰: + +1. harness 定义运行语义和接口 +2. app 选择和构造基础设施实现 + +## 9. 设计总结 + +可以把当前 `deerflow.runtime` 总结为一句话: + +它是一个“以 run orchestration 为核心、以 stream bridge 为流式边界、以 actor context 为动态隔离桥、以 store / observer 为 durable 与副作用边界”的运行时内核层。 + +更具体地说: + +1. `runs` 负责编排和生命周期推进 +2. `stream_bridge` 负责流语义 +3. `actor_context` 负责运行时用户上下文和隔离桥 +4. `serialization` / `converters` 负责对外事件与消息格式转换 +5. app 层通过 infra 负责真正的持久化、流式基础设施和 auth 注入 + +这套结构的优势是: + +1. 运行语义与基础设施实现解耦 +2. 请求身份与 runtime 逻辑解耦 +3. HTTP、CLI、channel worker 等多种入口都可以复用同一套 runtime 边界 +4. 后续可平滑扩展到多租户、跨进程 stream bridge、更多 durable backend + +当前的主要限制也同样清楚: + +1. `RunRegistry` 仍然是进程内控制面 +2. Redis bridge 仍未落地 +3. 一些多任务策略和批量能力仍未进入主路径 +4. `actor_context` 目前只携带 `user_id`,还没有 tenant / scopes / auth_source 等 richer context + +因此,当前最准确的理解方式不是“最终态平台”,而是“已经具备清晰语义和扩展边界的 runtime kernel”。 diff --git a/backend/packages/harness/deerflow/runtime/__init__.py b/backend/packages/harness/deerflow/runtime/__init__.py index 5a3df2eb6..02a796436 100644 --- a/backend/packages/harness/deerflow/runtime/__init__.py +++ b/backend/packages/harness/deerflow/runtime/__init__.py @@ -5,42 +5,98 @@ Re-exports the public API of :mod:`~deerflow.runtime.runs` and directly from ``deerflow.runtime``. 
""" -from .checkpointer import checkpointer_context, get_checkpointer, make_checkpointer, reset_checkpointer -from .runs import ConflictError, DisconnectMode, RunContext, RunManager, RunRecord, RunStatus, UnsupportedStrategyError, run_agent +from .runs import ( + CallbackObserver, + CompositeObserver, + CancelAction, + LifecycleEventType, + NullObserver, + ObserverBinding, + ObserverLike, + RunEventCallback, + RunCreateStore, + RunDeleteStore, + RunEventStore, + RunManager, + RunRecord, + RunQueryStore, + RunScope, + RunSpec, + RunLifecycleEvent, + RunObserver, + RunResult, + RunsFacade, + RunStatus, + WaitResult, + ensure_observer, +) +from .actor_context import ( + AUTO, + ActorContext, + DEFAULT_USER_ID, + bind_actor_context, + get_actor_context, + get_effective_user_id, + require_actor_context, + reset_actor_context, + resolve_user_id, +) from .serialization import serialize, serialize_channel_values, serialize_lc_object, serialize_messages_tuple -from .store import get_store, make_store, reset_store, store_context -from .stream_bridge import END_SENTINEL, HEARTBEAT_SENTINEL, MemoryStreamBridge, StreamBridge, StreamEvent, make_stream_bridge +from .stream_bridge import ( + CANCELLED_SENTINEL, + END_SENTINEL, + HEARTBEAT_SENTINEL, + StreamBridge, + StreamEvent, + StreamStatus, +) __all__ = [ - # checkpointer - "checkpointer_context", - "get_checkpointer", - "make_checkpointer", - "reset_checkpointer", - # runs - "ConflictError", - "DisconnectMode", - "RunContext", + # runs - hooks + "RunsFacade", + "RunCreateStore", + "RunDeleteStore", + "RunEventStore", "RunManager", + "RunQueryStore", + "CallbackObserver", + "CompositeObserver", + "ensure_observer", + "LifecycleEventType", + "NullObserver", + "ObserverBinding", + "ObserverLike", + "RunEventCallback", + "RunLifecycleEvent", + "RunObserver", + "RunResult", + # runs - types + "CancelAction", + "RunScope", + "RunSpec", + "WaitResult", "RunRecord", "RunStatus", - "UnsupportedStrategyError", - "run_agent", + # actor 
context + "AUTO", + "ActorContext", + "DEFAULT_USER_ID", + "bind_actor_context", + "get_actor_context", + "get_effective_user_id", + "require_actor_context", + "reset_actor_context", + "resolve_user_id", # serialization "serialize", "serialize_channel_values", "serialize_lc_object", "serialize_messages_tuple", - # store - "get_store", - "make_store", - "reset_store", - "store_context", # stream_bridge + "CANCELLED_SENTINEL", "END_SENTINEL", "HEARTBEAT_SENTINEL", - "MemoryStreamBridge", "StreamBridge", "StreamEvent", - "make_stream_bridge", + "StreamStatus", ] diff --git a/backend/packages/harness/deerflow/runtime/actor_context.py b/backend/packages/harness/deerflow/runtime/actor_context.py new file mode 100644 index 000000000..de483e206 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/actor_context.py @@ -0,0 +1,117 @@ +"""Request/task-scoped actor context for runtime-facing user isolation. + +This module defines a runtime-owned context bridge that lower layers can +depend on without importing the auth plugin. The app/auth boundary maps +``request.user`` into :class:`ActorContext` and binds it before entering +runtime-facing code. 
+"""
+
+from __future__ import annotations
+
+from contextvars import ContextVar, Token
+from dataclasses import dataclass
+from typing import Final
+
+
+@dataclass(frozen=True)
+class ActorContext:
+    user_id: str | None = None
+    # Future extension points:
+    # subject_id: str | None = None
+    # tenant_id: str | None = None
+    # scopes: frozenset[str] = frozenset()
+    # auth_source: str | None = None
+
+
+_current_actor: Final[ContextVar[ActorContext | None]] = ContextVar(
+    "deerflow_actor_context",
+    default=None,
+)
+
+
+def bind_actor_context(actor: ActorContext) -> Token[ActorContext | None]:
+    """Bind the current actor for this async task."""
+
+    return _current_actor.set(actor)
+
+
+def reset_actor_context(token: Token[ActorContext | None]) -> None:
+    """Restore the actor context captured by ``token``."""
+
+    _current_actor.reset(token)
+
+
+def get_actor_context() -> ActorContext | None:
+    """Return the current actor context, or ``None`` if unset."""
+
+    return _current_actor.get()
+
+
+def require_actor_context() -> ActorContext:
+    """Return the current actor context, or raise if unset."""
+
+    actor = _current_actor.get()
+    if actor is None:
+        raise RuntimeError("runtime accessed without actor context")
+    return actor
+
+
+DEFAULT_USER_ID: Final[str] = "default"
+
+
+def get_effective_user_id() -> str:
+    """Return the effective user id, or ``DEFAULT_USER_ID`` if unset."""
+
+    actor = _current_actor.get()
+    if actor is None or actor.user_id is None:
+        return DEFAULT_USER_ID
+    return str(actor.user_id)
+
+
+class _AutoSentinel:
+    """Singleton marker meaning 'resolve user_id from actor context'."""
+
+    _instance: _AutoSentinel | None = None
+
+    def __new__(cls) -> _AutoSentinel:
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __repr__(self) -> str:
+        return "<AUTO>"
+
+
+AUTO: Final[_AutoSentinel] = _AutoSentinel()
+
+
+def resolve_user_id(
+    value: str | None | _AutoSentinel,
+    *,
+    method_name: str = "repository
method", +) -> str | None: + """Resolve a repository ``user_id`` argument against the current actor.""" + + if isinstance(value, _AutoSentinel): + actor = _current_actor.get() + if actor is None or actor.user_id is None: + raise RuntimeError( + f"{method_name} called with user_id=AUTO but no actor context is set; " + "pass an explicit user_id, bind ActorContext at the app/runtime boundary, " + "or opt out with user_id=None for migration/CLI paths." + ) + return str(actor.user_id) + return value + + +__all__ = [ + "AUTO", + "ActorContext", + "DEFAULT_USER_ID", + "bind_actor_context", + "get_actor_context", + "get_effective_user_id", + "require_actor_context", + "reset_actor_context", + "resolve_user_id", +] diff --git a/backend/packages/harness/deerflow/runtime/converters.py b/backend/packages/harness/deerflow/runtime/converters.py index 811031160..dc7d75bec 100644 --- a/backend/packages/harness/deerflow/runtime/converters.py +++ b/backend/packages/harness/deerflow/runtime/converters.py @@ -1,6 +1,6 @@ """Pure functions to convert LangChain message objects to OpenAI Chat Completions format. -Used by RunJournal to build content dicts for event storage. +Used by run callbacks to build content dicts for event storage. 
""" from __future__ import annotations diff --git a/backend/packages/harness/deerflow/runtime/runs/__init__.py b/backend/packages/harness/deerflow/runtime/runs/__init__.py index 9faa30c17..da7c964d9 100644 --- a/backend/packages/harness/deerflow/runtime/runs/__init__.py +++ b/backend/packages/harness/deerflow/runtime/runs/__init__.py @@ -1,16 +1,48 @@ -"""Run lifecycle management for LangGraph Platform API compatibility.""" +"""Public runs API.""" -from .manager import ConflictError, RunManager, RunRecord, UnsupportedStrategyError -from .schemas import DisconnectMode, RunStatus -from .worker import RunContext, run_agent +from .facade import RunsFacade +from .internal.manager import RunManager +from .observer import ( + CallbackObserver, + CompositeObserver, + LifecycleEventType, + NullObserver, + ObserverBinding, + ObserverLike, + RunEventCallback, + RunLifecycleEvent, + RunObserver, + RunResult, + ensure_observer, +) +from .store import RunCreateStore, RunDeleteStore, RunEventStore, RunQueryStore +from .types import CancelAction, RunRecord, RunScope, RunSpec, RunStatus, WaitResult __all__ = [ - "ConflictError", - "DisconnectMode", - "RunContext", + # facade + "RunsFacade", "RunManager", + "RunCreateStore", + "RunDeleteStore", + "RunEventStore", + "RunQueryStore", + # hooks + "CallbackObserver", + "CompositeObserver", + "LifecycleEventType", + "NullObserver", + "ObserverBinding", + "ObserverLike", + "RunEventCallback", + "RunLifecycleEvent", + "RunObserver", + "RunResult", + "ensure_observer", + # types + "CancelAction", "RunRecord", + "RunScope", + "RunSpec", + "WaitResult", "RunStatus", - "UnsupportedStrategyError", - "run_agent", ] diff --git a/backend/packages/harness/deerflow/runtime/runs/callbacks/__init__.py b/backend/packages/harness/deerflow/runtime/runs/callbacks/__init__.py new file mode 100644 index 000000000..04c0e004e --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/callbacks/__init__.py @@ -0,0 +1,15 @@ +"""Runs execution 
callbacks.""" + +from .builder import RunCallbackArtifacts, build_run_callbacks +from .events import RunEventCallback +from .title import RunTitleCallback +from .tokens import RunCompletionData, RunTokenCallback + +__all__ = [ + "RunCallbackArtifacts", + "RunCompletionData", + "RunEventCallback", + "RunTitleCallback", + "RunTokenCallback", + "build_run_callbacks", +] diff --git a/backend/packages/harness/deerflow/runtime/runs/callbacks/builder.py b/backend/packages/harness/deerflow/runtime/runs/callbacks/builder.py new file mode 100644 index 000000000..948591956 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/callbacks/builder.py @@ -0,0 +1,138 @@ +"""Callback assembly for runs execution.""" + +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass +from typing import Any + +from langchain_core.callbacks import BaseCallbackHandler + +from ..store import RunEventStore +from ..types import RunRecord +from .events import RunEventCallback +from .title import RunTitleCallback +from .tokens import RunCompletionData, RunTokenCallback + + +@dataclass +class RunCallbackArtifacts: + """Callbacks plus handles used by the executor after callbacks run.""" + + callbacks: list[BaseCallbackHandler] + event_callback: RunEventCallback | None = None + token_callback: RunTokenCallback | None = None + title_callback: RunTitleCallback | None = None + + async def flush(self) -> None: + for callback in self.callbacks: + flush = getattr(callback, "flush", None) + if flush is None: + continue + result = flush() + if hasattr(result, "__await__"): + await result + + def completion_data(self) -> RunCompletionData: + if self.token_callback is None: + return RunCompletionData() + return self.token_callback.completion_data() + + def title(self) -> str | None: + if self.title_callback is None: + return None + return self.title_callback.title() + + +def build_run_callbacks( + *, + record: RunRecord, + graph_input: dict[str, 
Any], + event_store: RunEventStore | None, + existing_callbacks: Iterable[BaseCallbackHandler] = (), +) -> RunCallbackArtifacts: + """Build execution callbacks for a run. + + Reference callbacks are intentionally not assembled here yet; they remain + in the existing artifacts path until that integration is migrated. + """ + callbacks = list(existing_callbacks) + + event_callback = None + if event_store is not None: + event_callback = RunEventCallback( + run_id=record.run_id, + thread_id=record.thread_id, + event_store=event_store, + ) + callbacks.append(event_callback) + + token_callback = RunTokenCallback(track_token_usage=True) + _set_first_human_message(token_callback, graph_input) + callbacks.append(token_callback) + + title_callback = RunTitleCallback() + callbacks.append(title_callback) + + return RunCallbackArtifacts( + callbacks=callbacks, + event_callback=event_callback, + token_callback=token_callback, + title_callback=title_callback, + ) + + +def _set_first_human_message(token_callback: RunTokenCallback, graph_input: dict[str, Any]) -> None: + messages = graph_input.get("messages") + if not isinstance(messages, list) or not messages: + return + + first = messages[0] + content = _extract_first_human_text(first) + if content: + token_callback.set_first_human_message(content) + + +def _extract_first_human_text(message: Any) -> str | None: + if isinstance(message, str): + return message + + content = getattr(message, "content", None) + if content is not None: + return _extract_text_content(content) + + if isinstance(message, dict): + return _extract_text_content(message.get("content")) + + return None + + +def _extract_text_content(content: Any) -> str | None: + if isinstance(content, str): + return content + + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, str): + parts.append(item) + continue + if not isinstance(item, dict): + continue + if item.get("type") == "text" and isinstance(item.get("text"), str): 
+ parts.append(item["text"]) + continue + if isinstance(item.get("content"), str): + parts.append(item["content"]) + joined = "".join(parts).strip() + return joined or None + + if isinstance(content, dict): + text = content.get("text") + if isinstance(text, str): + return text + nested = content.get("content") + if isinstance(nested, str): + return nested + + return None diff --git a/backend/packages/harness/deerflow/runtime/runs/callbacks/events.py b/backend/packages/harness/deerflow/runtime/runs/callbacks/events.py new file mode 100644 index 000000000..8e57e4615 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/callbacks/events.py @@ -0,0 +1,353 @@ +"""Run execution event recording callback.""" + +from __future__ import annotations + +import asyncio +import logging +import time +from datetime import UTC, datetime +from typing import Any +from uuid import UUID + +from langchain_core.callbacks import BaseCallbackHandler +from langchain_core.messages import HumanMessage + +from deerflow.runtime.converters import langchain_messages_to_openai, langchain_to_openai_completion + +from ..store import RunEventStore + +logger = logging.getLogger(__name__) + + +class RunEventCallback(BaseCallbackHandler): + """Capture LangChain execution events into the run event store.""" + + def __init__( + self, + *, + run_id: str, + thread_id: str, + event_store: RunEventStore, + flush_threshold: int = 5, + max_trace_content: int = 10240, + ) -> None: + super().__init__() + self.run_id = run_id + self.thread_id = thread_id + self._store = event_store + self._flush_threshold = flush_threshold + self._max_trace_content = max_trace_content + self._buffer: list[dict[str, Any]] = [] + self._llm_start_times: dict[str, float] = {} + self._llm_call_index = 0 + self._cached_prompts: dict[str, list[dict[str, Any]]] = {} + self._tool_call_ids: dict[str, str] = {} + self._human_message_recorded = False + + def on_chain_start(self, serialized: dict, inputs: Any, *, run_id: UUID, 
**kwargs: Any) -> None: + if kwargs.get("parent_run_id") is not None: + return + self._put( + event_type="run_start", + category="lifecycle", + metadata={"input_preview": str(inputs)[:500]}, + ) + + def on_chain_end(self, outputs: Any, *, run_id: UUID, **kwargs: Any) -> None: + if kwargs.get("parent_run_id") is not None: + return + self._put(event_type="run_end", category="lifecycle", metadata={"status": "success"}) + self._flush_sync() + + def on_chain_error(self, error: BaseException, *, run_id: UUID, **kwargs: Any) -> None: + if kwargs.get("parent_run_id") is not None: + return + self._put( + event_type="run_error", + category="lifecycle", + content=str(error), + metadata={"error_type": type(error).__name__}, + ) + self._flush_sync() + + def on_chat_model_start(self, serialized: dict, messages: list[list], *, run_id: UUID, **kwargs: Any) -> None: + rid = str(run_id) + self._llm_start_times[rid] = time.monotonic() + self._llm_call_index += 1 + + prompt_msgs = messages[0] if messages else [] + openai_msgs = langchain_messages_to_openai(prompt_msgs) + self._cached_prompts[rid] = openai_msgs + caller = self._identify_caller(kwargs) + + self._record_first_human_message(prompt_msgs, caller=caller) + + self._put( + event_type="llm_request", + category="trace", + content={"model": serialized.get("name", ""), "messages": openai_msgs}, + metadata={ + "caller": caller, + "llm_call_index": self._llm_call_index, + }, + ) + + def on_llm_start(self, serialized: dict, prompts: list[str], *, run_id: UUID, **kwargs: Any) -> None: + self._llm_start_times[str(run_id)] = time.monotonic() + + def on_llm_end(self, response: Any, *, run_id: UUID, **kwargs: Any) -> None: + try: + message = response.generations[0][0].message + except (IndexError, AttributeError): + logger.debug("on_llm_end: could not extract message from response") + return + + rid = str(run_id) + start = self._llm_start_times.pop(rid, None) + latency_ms = int((time.monotonic() - start) * 1000) if start else None + usage 
= dict(getattr(message, "usage_metadata", None) or {})
        # (continuation of on_llm_end from the previous chunk; `usage` is the
        # normalized usage_metadata dict of the final AI message)
        caller = self._identify_caller(kwargs)

        # Only advance the LLM call counter when this run_id had no cached
        # prompt, so the prompt/response pair shares one llm_call_index.
        call_index = self._llm_call_index
        if rid not in self._cached_prompts:
            self._llm_call_index += 1
            call_index = self._llm_call_index
        self._cached_prompts.pop(rid, None)

        self._put(
            event_type="llm_response",
            category="trace",
            content=langchain_to_openai_completion(message),
            metadata={
                "caller": caller,
                "usage": usage,
                "latency_ms": latency_ms,
                "llm_call_index": call_index,
            },
        )

        # Only lead-agent output is mirrored into the "message" category.
        content = getattr(message, "content", "")
        tool_calls = getattr(message, "tool_calls", None) or []
        if caller != "lead_agent":
            return
        if tool_calls:
            self._put(
                event_type="ai_tool_call",
                category="message",
                content=message.model_dump(),
                metadata={"finish_reason": "tool_calls"},
            )
        elif isinstance(content, str) and content:
            self._put(
                event_type="ai_message",
                category="message",
                content=message.model_dump(),
                metadata={"finish_reason": "stop"},
            )

    def on_llm_error(self, error: BaseException, *, run_id: UUID, **kwargs: Any) -> None:
        """Record an LLM failure as a trace event and drop its start timestamp."""
        self._llm_start_times.pop(str(run_id), None)
        self._put(event_type="llm_error", category="trace", content=str(error))

    def on_tool_start(self, serialized: dict, input_str: str, *, run_id: UUID, **kwargs: Any) -> None:
        """Record a tool invocation; cache tool_call_id by run_id for the end/error hooks."""
        tool_call_id = kwargs.get("tool_call_id")
        if tool_call_id:
            self._tool_call_ids[str(run_id)] = tool_call_id
        self._put(
            event_type="tool_start",
            category="trace",
            metadata={
                "tool_name": serialized.get("name", ""),
                "tool_call_id": tool_call_id,
                # Args are truncated to keep trace rows bounded.
                "args": str(input_str)[:2000],
            },
        )

    def on_tool_end(self, output: Any, *, run_id: UUID, **kwargs: Any) -> None:
        """Record tool completion as both a trace event and a tool_result message."""
        from langchain_core.messages import ToolMessage

        if isinstance(output, ToolMessage):
            # Prefer identifiers carried on the message itself, then kwargs,
            # then the id cached in on_tool_start.
            tool_call_id = output.tool_call_id or kwargs.get("tool_call_id") or self._tool_call_ids.pop(str(run_id), None)
            tool_name = output.name or kwargs.get("name", "")
            status = getattr(output, "status", "success") or "success"
            content_str = output.content if isinstance(output.content, str) else str(output.content)
            msg_content = output.model_dump()
            if msg_content.get("tool_call_id") != tool_call_id:
                msg_content["tool_call_id"] = tool_call_id
        else:
            # Non-ToolMessage output: synthesize an equivalent ToolMessage payload.
            tool_call_id = kwargs.get("tool_call_id") or self._tool_call_ids.pop(str(run_id), None)
            tool_name = kwargs.get("name", "")
            status = "success"
            content_str = str(output)
            msg_content = ToolMessage(
                content=content_str,
                tool_call_id=tool_call_id or "",
                name=tool_name,
                status=status,
            ).model_dump()

        self._put(
            event_type="tool_end",
            category="trace",
            content=content_str,
            metadata={
                "tool_name": tool_name,
                "tool_call_id": tool_call_id,
                "status": status,
            },
        )
        self._put(
            event_type="tool_result",
            category="message",
            content=msg_content,
            metadata={"tool_name": tool_name, "status": status},
        )

    def on_tool_error(self, error: BaseException, *, run_id: UUID, **kwargs: Any) -> None:
        """Record a tool failure as a trace event plus an error-status tool_result message."""
        from langchain_core.messages import ToolMessage

        tool_call_id = kwargs.get("tool_call_id") or self._tool_call_ids.pop(str(run_id), None)
        tool_name = kwargs.get("name", "")
        self._put(
            event_type="tool_error",
            category="trace",
            content=str(error),
            metadata={"tool_name": tool_name, "tool_call_id": tool_call_id},
        )
        self._put(
            event_type="tool_result",
            category="message",
            content=ToolMessage(
                content=str(error),
                tool_call_id=tool_call_id or "",
                name=tool_name,
                status="error",
            ).model_dump(),
            metadata={"tool_name": tool_name, "status": "error"},
        )

    def on_custom_event(self, name: str, data: Any, *, run_id: UUID, **kwargs: Any) -> None:
        """Route custom events: "summarization" gets dedicated trace + middleware
        events; every other event becomes a generic trace row."""
        from deerflow.runtime.serialization import serialize_lc_object

        if name == "summarization":
            data_dict = data if isinstance(data, dict) else {}
            self._put(
                event_type="summarization",
                category="trace",
                content=data_dict.get("summary", ""),
                metadata={
                    "replaced_message_ids": data_dict.get("replaced_message_ids", []),
                    "replaced_count": data_dict.get("replaced_count", 0),
                },
            )
            self._put(
                event_type="middleware:summarize",
                category="middleware",
                content={"role": "system", "content": data_dict.get("summary", "")},
                metadata={"replaced_count": data_dict.get("replaced_count", 0)},
            )
            return

        event_data = serialize_lc_object(data) if not isinstance(data, dict) else data
        self._put(
            event_type=name,
            category="trace",
            metadata=event_data if isinstance(event_data, dict) else {"data": event_data},
        )

    async def flush(self) -> None:
        """Flush any buffered events to the store (awaitable variant for shutdown paths)."""
        if self._buffer:
            batch = self._buffer.copy()
            self._buffer.clear()
            await self._store.put_batch(batch)

    def _put(
        self,
        *,
        event_type: str,
        category: str,
        content: Any = "",
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Append one event row to the buffer, truncating oversized non-message content."""
        normalized_metadata = dict(metadata or {})
        # Message-category payloads are kept whole; trace/middleware string
        # content is capped and the truncation is recorded in metadata.
        if category != "message" and isinstance(content, str) and len(content) > self._max_trace_content:
            normalized_metadata["content_truncated"] = True
            normalized_metadata["original_content_length"] = len(content)
            content = content[: self._max_trace_content]

        self._buffer.append(
            {
                "thread_id": self.thread_id,
                "run_id": self.run_id,
                "event_type": event_type,
                "category": category,
                "content": content,
                "metadata": normalized_metadata,
                "created_at": datetime.now(UTC).isoformat(),
            }
        )
        if len(self._buffer) >= self._flush_threshold:
            self._flush_sync()

    def _flush_sync(self) -> None:
        """Schedule an async flush from sync callback context.

        No-op when there is no running event loop: events stay buffered and a
        later flush() picks them up.
        """
        if not self._buffer:
            return
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            return
        batch = self._buffer.copy()
        self._buffer.clear()
        task = loop.create_task(self._flush_async(batch))
        task.add_done_callback(self._on_flush_done)

    async def _flush_async(self, batch: list[dict[str, Any]]) -> None:
        """Write one batch; on failure, re-prepend it so event order is preserved for retry."""
        try:
            await self._store.put_batch(batch)
        except Exception:
            logger.warning(
                "Failed to flush %d events for run %s; returning to buffer",
                len(batch),
                self.run_id,
                exc_info=True,
            )
            self._buffer = batch + self._buffer

    @staticmethod
    def _on_flush_done(task: asyncio.Task) -> None:
        """Surface background flush failures without raising into the event loop."""
        if task.cancelled():
            return
        exc = task.exception()
        if exc:
            logger.warning("Run event flush task failed: %s", exc)

    def _identify_caller(self, kwargs: dict[str, Any]) -> str:
        """Map LangChain run tags to a caller label; untagged calls default to lead_agent."""
        for tag in kwargs.get("tags") or []:
            if isinstance(tag, str) and (
                tag.startswith("subagent:")
                or tag.startswith("middleware:")
                or tag == "lead_agent"
            ):
                return tag
        return "lead_agent"

    def _record_first_human_message(self, messages: list[Any], *, caller: str) -> None:
        """Record the first real human message once per run, skipping messages
        named "summary" (produced by summarization, not the user)."""
        if self._human_message_recorded:
            return

        for message in messages:
            if not isinstance(message, HumanMessage):
                continue
            if message.name == "summary":
                continue
            self._put(
                event_type="human_message",
                category="message",
                content=message.model_dump(),
                metadata={
                    "caller": caller,
                    "source": "chat_model_start",
                },
            )
            self._human_message_recorded = True
            return
# --- file: backend/packages/harness/deerflow/runtime/runs/callbacks/title.py (new file) ---
"""Title capture callback for runs."""

from __future__ import annotations

from typing import Any
from uuid import UUID

from langchain_core.callbacks import BaseCallbackHandler


class RunTitleCallback(BaseCallbackHandler):
    """Capture title generated by title middleware LLM calls or custom events."""

    def __init__(self) -> None:
        super().__init__()
        # Last title observed wins; None until one is captured.
        self._title: str | None = None

    def on_llm_end(self, response: Any, *, run_id: UUID, **kwargs: Any) -> None:
        """Capture the title from a title-middleware LLM response."""
        if self._identify_caller(kwargs) != "middleware:title":
            return
        try:
            message = response.generations[0][0].message
        except (IndexError, AttributeError):
            return
        content = getattr(message, "content", "")
        if isinstance(content, str) and content:
            # Strip surrounding quotes the model may emit; cap at 200 chars.
            self._title = content.strip().strip('"').strip("'")[:200]

    def on_custom_event(self, name: str, data: Any, *, run_id: UUID, **kwargs: Any) -> None:
        """Capture a title delivered via a custom event (str or {"title": str})."""
        if name not in {"title", "thread_title", "middleware:title"}:
            return
        if isinstance(data, str):
            self._title = data.strip()[:200]
            return
        if isinstance(data, dict):
            title = data.get("title")
            if isinstance(title, str):
                self._title = title.strip()[:200]

    def title(self) -> str | None:
        """Return the captured title, or None if none was seen."""
        return self._title

    def _identify_caller(self, kwargs: dict[str, Any]) -> str:
        """Map LangChain run tags to a caller label; untagged calls default to lead_agent."""
        for tag in kwargs.get("tags") or []:
            if isinstance(tag, str) and (
                tag.startswith("subagent:")
                or tag.startswith("middleware:")
                or tag == "lead_agent"
            ):
                return tag
        return "lead_agent"


# --- file: backend/packages/harness/deerflow/runtime/runs/callbacks/tokens.py (new file) ---
"""Token and message summary callback for runs."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any
from uuid import UUID

from langchain_core.callbacks import BaseCallbackHandler


@dataclass(frozen=True)
class RunCompletionData:
    """Immutable per-run token/message totals produced at run completion."""

    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_tokens: int = 0
    llm_call_count: int = 0
    lead_agent_tokens: int = 0
    subagent_tokens: int = 0
    middleware_tokens: int = 0
    message_count: int = 0
    last_ai_message: str | None = None
    first_human_message: str | None = None

    def to_dict(self) -> dict[str, object]:
        """Return a plain-dict representation for storage/serialization."""
        return {
            "total_input_tokens": self.total_input_tokens,
            "total_output_tokens": self.total_output_tokens,
            "total_tokens": self.total_tokens,
            "llm_call_count": self.llm_call_count,
            "lead_agent_tokens": self.lead_agent_tokens,
            "subagent_tokens": self.subagent_tokens,
            "middleware_tokens": self.middleware_tokens,
            "message_count": self.message_count,
            "last_ai_message": self.last_ai_message,
            "first_human_message": self.first_human_message,
        }


class RunTokenCallback(BaseCallbackHandler):
    """Aggregate token and message summary data for one run."""

    def __init__(self, *, track_token_usage: bool = True) -> None:
        super().__init__()
        self._track_token_usage = track_token_usage
        self._total_input_tokens = 0
        self._total_output_tokens = 0
        self._total_tokens = 0
        self._llm_call_count = 0
        self._lead_agent_tokens = 0
        self._subagent_tokens = 0
        self._middleware_tokens = 0
        self._message_count = 0
        self._last_ai_message: str | None = None
        self._first_human_message: str | None = None

    def set_first_human_message(self, content: str) -> None:
        """Record the run's first human message (truncated to 2000 chars)."""
        self._first_human_message = content[:2000] if content else None

    def on_llm_end(self, response: Any, *, run_id: UUID, **kwargs: Any) -> None:
        """Accumulate usage from one LLM response and track lead-agent AI messages."""
        try:
            message = response.generations[0][0].message
        except (IndexError, AttributeError):
            return

        self._record_ai_message(message, kwargs)
        if not self._track_token_usage:
            return

        usage = dict(getattr(message, "usage_metadata", None) or {})
        input_tk = usage.get("input_tokens", 0) or 0
        output_tk = usage.get("output_tokens", 0) or 0
        # Fall back to input+output when total_tokens is missing or zero.
        total_tk = usage.get("total_tokens", 0) or input_tk + output_tk
        if total_tk <= 0:
            return

        self._total_input_tokens += input_tk
        self._total_output_tokens += output_tk
        self._total_tokens += total_tk
        self._llm_call_count += 1

        # Attribute the call's tokens to whichever component made it.
        caller = self._identify_caller(kwargs)
        if caller.startswith("subagent:"):
            self._subagent_tokens += total_tk
        elif caller.startswith("middleware:"):
            self._middleware_tokens += total_tk
        else:
            self._lead_agent_tokens += total_tk

    def completion_data(self) -> RunCompletionData:
        """Snapshot the accumulated totals as an immutable RunCompletionData."""
        return RunCompletionData(
            total_input_tokens=self._total_input_tokens,
            total_output_tokens=self._total_output_tokens,
            total_tokens=self._total_tokens,
            llm_call_count=self._llm_call_count,
            lead_agent_tokens=self._lead_agent_tokens,
            subagent_tokens=self._subagent_tokens,
            middleware_tokens=self._middleware_tokens,
            message_count=self._message_count,
            last_ai_message=self._last_ai_message,
            first_human_message=self._first_human_message,
        )

    def _record_ai_message(self, message: Any, kwargs: dict[str, Any]) -> None:
        """Track the last plain-text lead-agent AI message (tool-call turns excluded)."""
        if self._identify_caller(kwargs) != "lead_agent":
            return
        if getattr(message, "tool_calls", None):
            return
        content = getattr(message, "content", "")
        if isinstance(content, str) and content:
            self._last_ai_message = content[:2000]
            self._message_count += 1

    def _identify_caller(self, kwargs: dict[str, Any]) -> str:
        """Map LangChain run tags to a caller label; untagged calls default to lead_agent."""
        for tag in kwargs.get("tags") or []:
            if isinstance(tag, str) and (
                tag.startswith("subagent:")
                or tag.startswith("middleware:")
                or tag == "lead_agent"
            ):
                return tag
        return "lead_agent"


# --- file: backend/packages/harness/deerflow/runtime/runs/facade.py (new file) ---
"""Public runs facade."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, AsyncIterator, Callable

from deerflow.runtime.stream_bridge import StreamEvent

from .internal.execution.executor import _RunExecution
from .internal.execution.supervisor import RunSupervisor
from .internal.planner import ExecutionPlanner
from .internal.registry import RunRegistry
from .internal.streams import RunStreamService
from .internal.wait import RunWaitService, WaitErrorResult
from .observer import ObserverLike
from .store import RunCreateStore, RunDeleteStore, RunEventStore, RunQueryStore
from .types import CancelAction, RunRecord, RunSpec


class MultitaskRejectError(Exception):
    """Raised when multitask_strategy is reject and thread has inflight runs."""

    pass
@dataclass(frozen=True)
class RunsRuntime:
    """Runtime dependencies needed to execute a run."""

    bridge: Any
    checkpointer: Any
    store: Any | None
    event_store: RunEventStore | None
    # Resolves an assistant_id (or None) to an agent factory.
    agent_factory_resolver: Callable[[str | None], Any]


class _RegistryStatusAdapter:
    """Minimal adapter so execution can update registry-backed run status."""

    def __init__(self, registry: RunRegistry) -> None:
        self._registry = registry

    async def set_status(self, run_id: str, status: Any, *, error: str | None = None) -> None:
        """Forward status updates to the backing registry."""
        await self._registry.set_status(run_id, status, error=error)


class RunsFacade:
    """
    Phase 1 runs domain facade.

    Provides unified interface for:
    - create_background
    - create_and_stream
    - create_and_wait
    - join_stream
    - join_wait

    Orchestrates registry, planner, supervisor, stream, and wait services.
    Execution now flows through ExecutionPlanner + RunSupervisor rather than
    the legacy RunManager create/start path.
    """

    def __init__(
        self,
        registry: RunRegistry,
        planner: ExecutionPlanner,
        supervisor: RunSupervisor,
        stream_service: RunStreamService,
        wait_service: RunWaitService,
        runtime: RunsRuntime,
        observer: ObserverLike = None,
        query_store: RunQueryStore | None = None,
        create_store: RunCreateStore | None = None,
        delete_store: RunDeleteStore | None = None,
    ) -> None:
        self._registry = registry
        self._planner = planner
        self._supervisor = supervisor
        self._stream = stream_service
        self._wait = wait_service
        self._runtime = runtime
        self._observer = observer
        # Optional durable stores; when absent, the in-process registry is
        # the source of truth for queries.
        self._query_store = query_store
        self._create_store = create_store
        self._delete_store = delete_store

    async def create_background(self, spec: RunSpec) -> RunRecord:
        """
        Create a run in background mode.

        Returns immediately with the run record.
        The run executes asynchronously.
        """
        return await self._create_run(spec)

    async def create_and_stream(
        self,
        spec: RunSpec,
    ) -> tuple[RunRecord, AsyncIterator[StreamEvent]]:
        """
        Create a run and return stream.

        Returns (record, stream_iterator).
        """
        record = await self._create_run(spec)

        stream = self._stream.subscribe(record.run_id)
        return record, stream

    async def create_and_wait(
        self,
        spec: RunSpec,
    ) -> tuple[RunRecord, dict[str, Any] | WaitErrorResult | None]:
        """
        Create a run and wait for completion.

        Returns (record, final_values_or_error).
        """
        record = await self._create_run(spec)

        result = await self._wait.wait_for_values_or_error(record.run_id)
        return record, result

    async def join_stream(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
    ) -> AsyncIterator[StreamEvent]:
        """
        Join an existing run stream.

        Supports resumption via last_event_id.
        """
        return self._stream.subscribe(run_id, last_event_id=last_event_id)

    async def join_wait(
        self,
        run_id: str,
        *,
        last_event_id: str | None = None,
    ) -> dict[str, Any] | WaitErrorResult | None:
        """
        Join an existing run and wait for completion.
        """
        return await self._wait.wait_for_values_or_error(
            run_id,
            last_event_id=last_event_id,
        )

    async def cancel(
        self,
        run_id: str,
        *,
        action: CancelAction = "interrupt",
    ) -> bool:
        """Request cancellation for an active run."""
        return await self._supervisor.cancel(run_id, action=action)

    async def get_run(self, run_id: str) -> RunRecord | None:
        """Get run record by ID."""
        if self._query_store is not None:
            return await self._query_store.get_run(run_id)
        # NOTE(review): unlike the other registry calls in this class, this is
        # not awaited — presumably RunRegistry.get is synchronous; confirm.
        return self._registry.get(run_id)

    async def list_runs(self, thread_id: str) -> list[RunRecord]:
        """List runs for a thread."""
        if self._query_store is not None:
            return await self._query_store.list_runs(thread_id)
        return await self._registry.list_by_thread(thread_id)

    async def delete_run(self, run_id: str) -> bool:
        """Delete a run from durable storage and local runtime state."""
        record = await self.get_run(run_id)
        if record is None:
            return False

        # Stop any in-flight execution before removing state.
        await self._supervisor.cancel(run_id, action="interrupt")
        await self._registry.delete(run_id)

        if self._delete_store is not None:
            return await self._delete_store.delete_run(run_id)

        return True

    async def _create_run(self, spec: RunSpec) -> RunRecord:
        """Create a run record and hand it to the execution backend."""
        await self._apply_multitask_strategy(spec)
        record = await self._registry.create(spec)
        if self._create_store is not None:
            await self._create_store.create_run(record)
        await self._start_execution(record, spec)
        return record

    async def _apply_multitask_strategy(self, spec: RunSpec) -> None:
        """Apply multitask strategy before creating run.

        Raises:
            MultitaskRejectError: strategy is "reject" and the thread already
                has inflight runs.
        """
        has_inflight = await self._registry.has_inflight(spec.scope.thread_id)

        if not has_inflight:
            return

        if spec.multitask_strategy == "reject":
            raise MultitaskRejectError(
                f"Thread {spec.scope.thread_id} has inflight runs"
            )
        elif spec.multitask_strategy == "interrupt":
            # Mark inflight runs interrupted in the registry, then cancel
            # their execution via the supervisor.
            interrupted = await self._registry.interrupt_inflight(spec.scope.thread_id)
            for run_id in interrupted:
                await self._supervisor.cancel(run_id, action="interrupt")

    async def _start_execution(self, record: RunRecord, spec: RunSpec) -> None:
        """Start run execution via planner + supervisor."""
        # Update status to starting
        await self._registry.set_status(record.run_id, "starting")

        plan = self._planner.build(record, spec)
        status_adapter = _RegistryStatusAdapter(self._registry)
        agent_factory = self._runtime.agent_factory_resolver(spec.assistant_id)

        async def _runner(handle) -> Any:
            # One _RunExecution per run; the supervisor supplies the handle
            # used for cooperative cancellation.
            return await _RunExecution(
                bridge=self._runtime.bridge,
                run_manager=status_adapter,  # type: ignore[arg-type]
                record=record,
                checkpointer=self._runtime.checkpointer,
                store=self._runtime.store,
                event_store=self._runtime.event_store,
                agent_factory=agent_factory,
                graph_input=plan.graph_input,
                config=plan.runnable_config,
                observer=self._observer,
                stream_modes=plan.stream_modes,
                stream_subgraphs=plan.stream_subgraphs,
                interrupt_before=plan.interrupt_before,
                interrupt_after=plan.interrupt_after,
                handle=handle,
            ).run()

        await self._supervisor.launch(record.run_id, runner=_runner)
# --- file: backend/packages/harness/deerflow/runtime/runs/internal/__init__.py (new file) ---
"""Internal runs implementation modules.

These modules are implementation details behind the public runs surface.
"""

# --- file: backend/packages/harness/deerflow/runtime/runs/internal/execution/__init__.py (new file) ---
"""Internal execution components for runs domain."""

# --- file: backend/packages/harness/deerflow/runtime/runs/internal/execution/artifacts.py (new file) ---
"""Execution preparation helpers for a single run."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.runnables import RunnableConfig
from langgraph.runtime import Runtime

from deerflow.runtime.stream_bridge import StreamBridge


@dataclass
class RunBuildArtifacts:
    """Assembled agent runtime pieces for a single run."""

    agent: Any
    runnable_config: dict[str, Any]
    reference_store: Any | None = None


def build_run_artifacts(
    *,
    thread_id: str,
    run_id: str,
    checkpointer: Any | None,
    store: Any | None,
    agent_factory: Any,
    config: dict[str, Any],
    bridge: StreamBridge,
    interrupt_before: list[str] | None = None,
    interrupt_after: list[str] | None = None,
    callbacks: list[BaseCallbackHandler] | None = None,
) -> RunBuildArtifacts:
    """Assemble all components needed for agent execution.

    Mutates the passed ``config`` in place (callers pass a copy): injects the
    LangGraph Runtime under configurable["__pregel_runtime"] and appends
    ``callbacks``.

    NOTE(review): ``run_id`` and ``bridge`` are not referenced in this body —
    presumably kept for interface parity; confirm.
    """
    runtime = Runtime(context={"thread_id": thread_id}, store=store)
    if "context" in config and isinstance(config["context"], dict):
        config["context"].setdefault("thread_id", thread_id)
    config.setdefault("configurable", {})["__pregel_runtime"] = runtime

    config_callbacks = config.setdefault("callbacks", [])
    if callbacks:
        config_callbacks.extend(callbacks)

    runnable_config = RunnableConfig(**config)
    agent = agent_factory(config=runnable_config)

    # Attach persistence only when provided so the factory's defaults survive.
    if checkpointer is not None:
        agent.checkpointer = checkpointer
    if store is not None:
        agent.store = store

    if interrupt_before:
        agent.interrupt_before_nodes = interrupt_before
    if interrupt_after:
        agent.interrupt_after_nodes = interrupt_after

    return RunBuildArtifacts(
        agent=agent,
        runnable_config=dict(runnable_config),
        reference_store=store,
    )


# --- file: backend/packages/harness/deerflow/runtime/runs/internal/execution/events.py (new file) ---
"""Lifecycle event helpers for run execution."""

from __future__ import annotations

from datetime import UTC, datetime
from typing import Any

from ...observer import LifecycleEventType, RunLifecycleEvent, RunObserver


class RunEventEmitter:
    """Build and dispatch lifecycle events for a single run."""

    def __init__(
        self,
        *,
        run_id: str,
        thread_id: str,
        observer: RunObserver,
    ) -> None:
        self._run_id = run_id
        self._thread_id = thread_id
        self._observer = observer
        # Monotonic per-run sequence number, starting at 1 for the first event.
        self._sequence = 0

    @property
    def sequence(self) -> int:
        """Number of lifecycle events emitted so far."""
        return self._sequence

    async def emit(
        self,
        event_type: LifecycleEventType,
        payload: dict[str, Any] | None = None,
    ) -> None:
        """Emit one lifecycle event to the observer with a unique event_id."""
        self._sequence += 1
        event = RunLifecycleEvent(
            event_id=f"{self._run_id}:{event_type.value}:{self._sequence}",
            event_type=event_type,
            run_id=self._run_id,
            thread_id=self._thread_id,
            sequence=self._sequence,
            occurred_at=datetime.now(UTC),
            payload=payload or {},
        )
        await self._observer.on_event(event)
# --- file: backend/packages/harness/deerflow/runtime/runs/internal/execution/executor.py (new file) ---
"""Single-run execution orchestrator and execution-local helpers."""

from __future__ import annotations

import asyncio
import logging
from typing import Any, Literal

from langchain_core.runnables import RunnableConfig

from deerflow.runtime.serialization import serialize
from deerflow.runtime.stream_bridge import StreamBridge, StreamStatus

from ...callbacks.builder import RunCallbackArtifacts, build_run_callbacks
from ...observer import LifecycleEventType, RunObserver, RunResult
from ...store import RunEventStore
from ...types import RunStatus
from .artifacts import build_run_artifacts
from .events import RunEventEmitter
from .stream_logic import external_stream_event_name, normalize_stream_modes, should_filter_event, unpack_stream_item
from .supervisor import RunHandle

logger = logging.getLogger(__name__)


class _RunExecution:
    """Encapsulate the lifecycle of a single run.

    Drives start -> prepare -> stream -> finish, publishing chunks to the
    stream bridge, recording lifecycle events through the observer, and
    updating run status via the injected run_manager.
    """

    def __init__(
        self,
        *,
        bridge: StreamBridge,
        run_manager: Any,
        record: Any,
        checkpointer: Any | None = None,
        store: Any | None = None,
        event_store: RunEventStore | None = None,
        ctx: Any | None = None,
        agent_factory: Any,
        graph_input: dict,
        config: dict,
        observer: RunObserver,
        stream_modes: list[str] | None,
        stream_subgraphs: bool,
        interrupt_before: list[str] | Literal["*"] | None,
        interrupt_after: list[str] | Literal["*"] | None,
        handle: RunHandle | None = None,
    ) -> None:
        # A legacy ctx object, when given, overrides checkpointer/store.
        if ctx is not None:
            checkpointer = getattr(ctx, "checkpointer", checkpointer)
            store = getattr(ctx, "store", store)

        self.bridge = bridge
        self.run_manager = run_manager
        self.record = record
        self.checkpointer = checkpointer
        self.store = store
        self.event_store = event_store
        self.agent_factory = agent_factory
        self.graph_input = graph_input
        self.config = config
        self.observer = observer
        self.stream_modes = stream_modes
        self.stream_subgraphs = stream_subgraphs
        self.interrupt_before = interrupt_before
        self.interrupt_after = interrupt_after
        self.handle = handle

        self.run_id = record.run_id
        self.thread_id = record.thread_id
        # Checkpoint to roll back to if the user cancels with action=rollback.
        self._pre_run_checkpoint_id: str | None = None
        self._emitter = RunEventEmitter(
            run_id=self.run_id,
            thread_id=self.thread_id,
            observer=observer,
        )
        self.result = RunResult(
            run_id=self.run_id,
            thread_id=self.thread_id,
            status=RunStatus.pending,
        )
        # Populated by _prepare().
        self._agent: Any = None
        self._runnable_config: dict[str, Any] = {}
        self._lg_modes: list[str] = []
        self._callback_artifacts: RunCallbackArtifacts | None = None

    @property
    def _event_sequence(self) -> int:
        return self._emitter.sequence

    async def _emit(
        self,
        event_type: LifecycleEventType,
        payload: dict[str, Any] | None = None,
    ) -> None:
        await self._emitter.emit(event_type, payload)

    async def _start(self) -> None:
        """Mark the run running, emit start events, and publish stream metadata."""
        await self.run_manager.set_status(self.run_id, RunStatus.running)
        await self._emit(LifecycleEventType.RUN_STARTED, {})

        human_msg = self._extract_human_message()
        if human_msg is not None:
            await self._emit(
                LifecycleEventType.HUMAN_MESSAGE,
                {"message": human_msg.model_dump()},
            )

        await self._capture_pre_run_checkpoint()
        await self.bridge.publish(
            self.run_id,
            "metadata",
            {"run_id": self.run_id, "thread_id": self.thread_id},
        )

    def _extract_human_message(self) -> Any:
        """Best-effort extraction of the latest human message from graph input.

        Accepts a HumanMessage, a bare string, any object with ``.content``,
        or a dict with a "content" key; returns None otherwise.
        """
        from langchain_core.messages import HumanMessage

        messages = self.graph_input.get("messages")
        if not messages:
            return None
        last = messages[-1] if isinstance(messages, list) else messages
        if isinstance(last, HumanMessage):
            return last
        if isinstance(last, str):
            return HumanMessage(content=last) if last else None
        if hasattr(last, "content"):
            return HumanMessage(content=last.content)
        if isinstance(last, dict):
            content = last.get("content", "")
            return HumanMessage(content=content) if content else None
        return None

    async def _capture_pre_run_checkpoint(self) -> None:
        """Remember the latest checkpoint id so a rollback-cancel can target it."""
        try:
            config_for_check = {"configurable": {"thread_id": self.thread_id, "checkpoint_ns": ""}}
            ckpt_tuple = await self.checkpointer.aget_tuple(config_for_check)
            if ckpt_tuple is not None:
                self._pre_run_checkpoint_id = (
                    getattr(ckpt_tuple, "config", {})
                    .get("configurable", {})
                    .get("checkpoint_id")
                )
        except Exception:
            # Best-effort only; rollback metadata is then unavailable.
            logger.debug("Could not get pre-run checkpoint_id for run %s", self.run_id)

    async def _prepare(self) -> None:
        """Build callbacks and agent artifacts, and normalize stream modes."""
        config = dict(self.config)
        existing_callbacks = config.pop("callbacks", [])
        if existing_callbacks is None:
            existing_callbacks = []
        elif not isinstance(existing_callbacks, list):
            existing_callbacks = [existing_callbacks]

        self._callback_artifacts = build_run_callbacks(
            record=self.record,
            graph_input=self.graph_input,
            event_store=self.event_store,
            existing_callbacks=existing_callbacks,
        )

        artifacts = build_run_artifacts(
            thread_id=self.thread_id,
            run_id=self.run_id,
            checkpointer=self.checkpointer,
            store=self.store,
            agent_factory=self.agent_factory,
            config=config,
            bridge=self.bridge,
            interrupt_before=self.interrupt_before,
            interrupt_after=self.interrupt_after,
            callbacks=self._callback_artifacts.callbacks,
        )

        self._agent = artifacts.agent
        self._runnable_config = artifacts.runnable_config
        self._lg_modes = normalize_stream_modes(self.stream_modes)
        logger.info(
            "Run %s: streaming with modes %s (requested: %s)",
            self.run_id,
            self._lg_modes,
            self.stream_modes,
        )

    async def _finish_success(self) -> None:
        """Record success, publish the terminal stream event, and emit RUN_COMPLETED."""
        await self.run_manager.set_status(self.run_id, RunStatus.success)
        await self.bridge.publish_terminal(self.run_id, StreamStatus.ENDED)
        self.result.status = RunStatus.success
        completion_data = self._completion_data()
        # Prefer the title captured by callbacks; fall back to the checkpoint.
        title = self._callback_title() or await self._extract_title_from_checkpoint()
        self.result.title = title
        self.result.completion_data = completion_data
        await self._emit(
            LifecycleEventType.RUN_COMPLETED,
            {
                "title": title,
                "completion_data": completion_data,
            },
        )

    async def _finish_aborted(self, cancel_mode: str) -> None:
        """Record a cancellation; rollback is surfaced as an error status."""
        payload = {
            "cancel_mode": cancel_mode,
            "pre_run_checkpoint_id": self._pre_run_checkpoint_id,
            "completion_data": self._completion_data(),
        }

        if cancel_mode == "rollback":
            await self.run_manager.set_status(
                self.run_id,
                RunStatus.error,
                error="Rolled back by user",
            )
            await self.bridge.publish_terminal(
                self.run_id,
                StreamStatus.CANCELLED,
                {"cancel_mode": "rollback", "message": "Rolled back by user"},
            )
            self.result.status = RunStatus.error
            self.result.error = "Rolled back by user"
            logger.info("Run %s rolled back", self.run_id)
        else:
            await self.run_manager.set_status(self.run_id, RunStatus.interrupted)
            await self.bridge.publish_terminal(
                self.run_id,
                StreamStatus.CANCELLED,
                {"cancel_mode": cancel_mode},
            )
            self.result.status = RunStatus.interrupted
            logger.info("Run %s cancelled (mode=%s)", self.run_id, cancel_mode)

        await self._emit(LifecycleEventType.RUN_CANCELLED, payload)

    async def _finish_failed(self, exc: Exception) -> None:
        """Record a failure on status, stream, result, and lifecycle events."""
        error_msg = str(exc)
        logger.exception("Run %s failed: %s", self.run_id, error_msg)

        await self.run_manager.set_status(self.run_id, RunStatus.error, error=error_msg)
        await self.bridge.publish_terminal(
            self.run_id,
            StreamStatus.ERRORED,
            {"message": error_msg, "name": type(exc).__name__},
        )
        self.result.status = RunStatus.error
        self.result.error = error_msg

        await self._emit(
            LifecycleEventType.RUN_FAILED,
            {
                "error": error_msg,
                "error_type": type(exc).__name__,
                "completion_data": self._completion_data(),
            },
        )

    def _completion_data(self) -> dict[str, object]:
        if self._callback_artifacts is None:
            return {}
        return self._callback_artifacts.completion_data().to_dict()

    def _callback_title(self) -> str | None:
        if self._callback_artifacts is None:
            return None
        return self._callback_artifacts.title()

    async def _extract_title_from_checkpoint(self) -> str | None:
        """Fallback title source: the "title" channel of the latest checkpoint."""
        if self.checkpointer is None:
            return None
        try:
            ckpt_config = {"configurable": {"thread_id": self.thread_id, "checkpoint_ns": ""}}
            ckpt_tuple = await self.checkpointer.aget_tuple(ckpt_config)
            if ckpt_tuple is not None:
                ckpt = getattr(ckpt_tuple, "checkpoint", {}) or {}
                return ckpt.get("channel_values", {}).get("title")
        except Exception:
            logger.debug("Failed to extract title from checkpoint for thread %s", self.thread_id)
        return None

    def _map_run_status_to_thread_status(self, status: RunStatus) -> str:
        """Translate a terminal run status into the thread-level status string."""
        if status == RunStatus.success:
            return "idle"
        if status == RunStatus.interrupted:
            return "interrupted"
        if status in (RunStatus.error, RunStatus.timeout):
            return "error"
        return "running"

    def _abort_requested(self) -> bool:
        # Supervisor handle takes precedence; fall back to the record's event.
        if self.handle is not None:
            return self.handle.cancel_event.is_set()
        return self.record.abort_event.is_set()

    def _abort_action(self) -> str:
        if self.handle is not None:
            return self.handle.cancel_action
        return self.record.abort_action

    async def _stream(self) -> None:
        """Stream agent output to the bridge, honoring abort requests.

        Uses the single-mode astream() shape when exactly one mode is
        requested without subgraphs; otherwise the multi-mode/subgraph shape.
        """
        runnable_config = RunnableConfig(**self._runnable_config)

        if len(self._lg_modes) == 1 and not self.stream_subgraphs:
            single_mode = self._lg_modes[0]
            async for chunk in self._agent.astream(
                self.graph_input,
                config=runnable_config,
                stream_mode=single_mode,
            ):
                if self._abort_requested():
                    logger.info("Run %s abort requested - stopping", self.run_id)
                    break
                if should_filter_event(single_mode, chunk):
                    continue
                await self.bridge.publish(
                    self.run_id,
                    external_stream_event_name(single_mode),
                    serialize(chunk, mode=single_mode),
                )
            return

        async for item in self._agent.astream(
            self.graph_input,
            config=runnable_config,
            stream_mode=self._lg_modes,
            subgraphs=self.stream_subgraphs,
        ):
            if self._abort_requested():
                logger.info("Run %s abort requested - stopping", self.run_id)
                break

            mode, chunk = unpack_stream_item(item, self._lg_modes, stream_subgraphs=self.stream_subgraphs)
            if mode is None:
                continue
            if should_filter_event(mode, chunk):
                continue
            await self.bridge.publish(
                self.run_id,
                external_stream_event_name(mode),
                serialize(chunk, mode=mode),
            )

    async def _finish_after_stream(self) -> None:
        """Pick the terminal path: aborted (interrupt/rollback) or success."""
        if self._abort_requested():
            action = self._abort_action()
            cancel_mode = "rollback" if action == "rollback" else "interrupt"
            await self._finish_aborted(cancel_mode)
            return

        await self._finish_success()

    async def _emit_final_thread_status(self) -> None:
        final_thread_status = self._map_run_status_to_thread_status(self.result.status)
        await self._emit(
            LifecycleEventType.THREAD_STATUS_UPDATED,
            {"status": final_thread_status},
        )

    async def run(self) -> RunResult:
        """Execute the full run lifecycle and return the RunResult.

        NOTE(review): asyncio.CancelledError is absorbed (treated as
        "task_cancelled") rather than re-raised — confirm this matches the
        supervisor's shutdown expectations.
        """
        try:
            await self._start()
            await self._prepare()
            await self._stream()
            await self._finish_after_stream()
        except asyncio.CancelledError:
            await self._finish_aborted("task_cancelled")
        except Exception as exc:
            await self._finish_failed(exc)
        finally:
            # Always: final thread status, flush buffered events, release stream.
            await self._emit_final_thread_status()
            if self._callback_artifacts is not None:
                await self._callback_artifacts.flush()
            await self.bridge.cleanup(self.run_id)

        return self.result


__all__ = ["_RunExecution"]
dataclasses import dataclass +from typing import Any + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class StreamItem: + """Normalized stream item from LangGraph.""" + + mode: str + chunk: Any + + +_FILTERED_NODES = frozenset({"__start__", "__end__"}) +_VALID_LG_MODES = {"values", "updates", "checkpoints", "tasks", "debug", "messages", "custom"} + + +def normalize_stream_modes(requested_modes: list[str] | None) -> list[str]: + """Normalize requested stream modes to valid LangGraph modes.""" + input_modes: list[str] = list(requested_modes or ["values"]) + + lg_modes: list[str] = [] + for mode in input_modes: + if mode == "messages-tuple": + lg_modes.append("messages") + elif mode == "events": + logger.info("'events' stream_mode not supported (requires astream_events). Skipping.") + continue + elif mode in _VALID_LG_MODES: + lg_modes.append(mode) + + if not lg_modes: + lg_modes = ["values"] + + seen: set[str] = set() + deduped: list[str] = [] + for mode in lg_modes: + if mode not in seen: + seen.add(mode) + deduped.append(mode) + + return deduped + + +def unpack_stream_item( + item: Any, + lg_modes: list[str], + *, + stream_subgraphs: bool, +) -> tuple[str | None, Any]: + """Unpack a multi-mode or subgraph stream item into ``(mode, chunk)``.""" + if stream_subgraphs: + if isinstance(item, tuple) and len(item) == 3: + _namespace, mode, chunk = item + return str(mode), chunk + if isinstance(item, tuple) and len(item) == 2: + mode, chunk = item + return str(mode), chunk + return None, None + + if isinstance(item, tuple) and len(item) == 2: + mode, chunk = item + return str(mode), chunk + + return lg_modes[0] if lg_modes else None, item + + +def should_filter_event(mode: str, chunk: Any) -> bool: + """Determine whether a stream event should be filtered before publish.""" + if mode == "updates" and isinstance(chunk, dict): + node_names = set(chunk.keys()) + if node_names & _FILTERED_NODES: + return True + + if mode == "messages" and isinstance(chunk, 
tuple) and len(chunk) == 2: + _, metadata = chunk + if isinstance(metadata, dict): + node = metadata.get("langgraph_node", "") + if node in _FILTERED_NODES: + return True + + return False + + +def external_stream_event_name(mode: str) -> str: + """Map LangGraph internal modes to the external SSE event contract.""" + return mode diff --git a/backend/packages/harness/deerflow/runtime/runs/internal/execution/supervisor.py b/backend/packages/harness/deerflow/runtime/runs/internal/execution/supervisor.py new file mode 100644 index 000000000..bd7a19f4b --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/internal/execution/supervisor.py @@ -0,0 +1,78 @@ +"""Active execution handle management for runs domain.""" + +from __future__ import annotations + +import asyncio +from collections.abc import Awaitable, Callable +from dataclasses import dataclass, field +from typing import Any + +from ...types import CancelAction + + +@dataclass +class RunHandle: + """In-process control handle for an active run.""" + + run_id: str + task: asyncio.Task[Any] | None = None + cancel_event: asyncio.Event = field(default_factory=asyncio.Event) + cancel_action: CancelAction = "interrupt" + + +class RunSupervisor: + """Own and control active run handles within the current process.""" + + def __init__(self) -> None: + self._handles: dict[str, RunHandle] = {} + self._lock = asyncio.Lock() + + async def launch( + self, + run_id: str, + *, + runner: Callable[[RunHandle], Awaitable[Any]], + ) -> RunHandle: + """Create a handle and start a background task for it.""" + handle = RunHandle(run_id=run_id) + + async with self._lock: + if run_id in self._handles: + raise RuntimeError(f"Run {run_id} is already active") + self._handles[run_id] = handle + + task = asyncio.create_task(runner(handle)) + handle.task = task + task.add_done_callback(lambda _: asyncio.create_task(self.cleanup(run_id))) + return handle + + async def cancel( + self, + run_id: str, + *, + action: CancelAction = 
"interrupt", + ) -> bool: + """Signal cancellation for an active handle.""" + async with self._lock: + handle = self._handles.get(run_id) + if handle is None: + return False + + handle.cancel_action = action + handle.cancel_event.set() + if handle.task is not None and not handle.task.done(): + handle.task.cancel() + + return True + + def get_handle(self, run_id: str) -> RunHandle | None: + """Return the active handle for a run, if any.""" + return self._handles.get(run_id) + + async def cleanup(self, run_id: str, *, delay: float = 0) -> None: + """Remove a handle after optional delay.""" + if delay > 0: + await asyncio.sleep(delay) + + async with self._lock: + self._handles.pop(run_id, None) diff --git a/backend/packages/harness/deerflow/runtime/runs/manager.py b/backend/packages/harness/deerflow/runtime/runs/internal/manager.py similarity index 91% rename from backend/packages/harness/deerflow/runtime/runs/manager.py rename to backend/packages/harness/deerflow/runtime/runs/internal/manager.py index a54a408b8..990ba01ba 100644 --- a/backend/packages/harness/deerflow/runtime/runs/manager.py +++ b/backend/packages/harness/deerflow/runtime/runs/internal/manager.py @@ -7,12 +7,9 @@ import logging import uuid from dataclasses import dataclass, field from datetime import UTC, datetime -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Literal -from .schemas import DisconnectMode, RunStatus - -if TYPE_CHECKING: - from deerflow.runtime.runs.store.base import RunStore +from ..types import RunStatus logger = logging.getLogger(__name__) @@ -29,7 +26,7 @@ class RunRecord: thread_id: str assistant_id: str | None status: RunStatus - on_disconnect: DisconnectMode + on_disconnect: Literal["cancel", "continue"] multitask_strategy: str = "reject" metadata: dict = field(default_factory=dict) kwargs: dict = field(default_factory=dict) @@ -49,12 +46,12 @@ class RunManager: that run history survives process restarts. 
""" - def __init__(self, store: RunStore | None = None) -> None: + def __init__(self, store: Any | None = None) -> None: self._runs: dict[str, RunRecord] = {} self._lock = asyncio.Lock() self._store = store - async def _persist_to_store(self, record: RunRecord) -> None: + async def _persist_to_store(self, record: RunRecord, *, follow_up_to_run_id: str | None = None) -> None: """Best-effort persist run record to backing store.""" if self._store is None: return @@ -68,6 +65,7 @@ class RunManager: metadata=record.metadata or {}, kwargs=record.kwargs or {}, created_at=record.created_at, + follow_up_to_run_id=follow_up_to_run_id, ) except Exception: logger.warning("Failed to persist run %s to store", record.run_id, exc_info=True) @@ -85,10 +83,11 @@ class RunManager: thread_id: str, assistant_id: str | None = None, *, - on_disconnect: DisconnectMode = DisconnectMode.cancel, + on_disconnect: Literal["cancel", "continue"] = "cancel", metadata: dict | None = None, kwargs: dict | None = None, multitask_strategy: str = "reject", + follow_up_to_run_id: str | None = None, ) -> RunRecord: """Create a new pending run and register it.""" run_id = str(uuid.uuid4()) @@ -107,7 +106,7 @@ class RunManager: ) async with self._lock: self._runs[run_id] = record - await self._persist_to_store(record) + await self._persist_to_store(record, follow_up_to_run_id=follow_up_to_run_id) logger.info("Run created: run_id=%s thread_id=%s", run_id, thread_id) return record @@ -120,7 +119,7 @@ class RunManager: async with self._lock: # Dict insertion order matches creation order, so reversing it gives # us deterministic newest-first results even when timestamps tie. 
- return [r for r in self._runs.values() if r.thread_id == thread_id] + return [r for r in reversed(self._runs.values()) if r.thread_id == thread_id] async def set_status(self, run_id: str, status: RunStatus, *, error: str | None = None) -> None: """Transition a run to a new status.""" @@ -170,10 +169,11 @@ class RunManager: thread_id: str, assistant_id: str | None = None, *, - on_disconnect: DisconnectMode = DisconnectMode.cancel, + on_disconnect: Literal["cancel", "continue"] = "cancel", metadata: dict | None = None, kwargs: dict | None = None, multitask_strategy: str = "reject", + follow_up_to_run_id: str | None = None, ) -> RunRecord: """Atomically check for inflight runs and create a new one. @@ -227,7 +227,7 @@ class RunManager: ) self._runs[run_id] = record - await self._persist_to_store(record) + await self._persist_to_store(record, follow_up_to_run_id=follow_up_to_run_id) logger.info("Run created: run_id=%s thread_id=%s", run_id, thread_id) return record diff --git a/backend/packages/harness/deerflow/runtime/runs/internal/planner.py b/backend/packages/harness/deerflow/runtime/runs/internal/planner.py new file mode 100644 index 000000000..e93793f66 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/internal/planner.py @@ -0,0 +1,42 @@ +"""Execution plan builder for runs domain.""" + +from __future__ import annotations + +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, Literal + +from ..types import RunRecord, RunSpec + + +@dataclass(frozen=True) +class ExecutionPlan: + """Normalized execution inputs derived from a run record and spec.""" + + record: RunRecord + graph_input: dict[str, Any] + runnable_config: dict[str, Any] + stream_modes: list[str] + stream_subgraphs: bool + interrupt_before: list[str] | Literal["*"] | None + interrupt_after: list[str] | Literal["*"] | None + + +class ExecutionPlanner: + """Build executor-ready plans from public run specs.""" + + def build(self, record: RunRecord, spec: 
RunSpec) -> ExecutionPlan: + return ExecutionPlan( + record=record, + graph_input=self._normalize_graph_input(spec.input), + runnable_config=deepcopy(spec.runnable_config), + stream_modes=list(spec.stream_modes), + stream_subgraphs=spec.stream_subgraphs, + interrupt_before=spec.interrupt_before, + interrupt_after=spec.interrupt_after, + ) + + def _normalize_graph_input(self, raw_input: dict[str, Any] | None) -> dict[str, Any]: + if raw_input is None: + return {} + return deepcopy(raw_input) diff --git a/backend/packages/harness/deerflow/runtime/runs/internal/registry.py b/backend/packages/harness/deerflow/runtime/runs/internal/registry.py new file mode 100644 index 000000000..319512a25 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/internal/registry.py @@ -0,0 +1,146 @@ +"""In-memory run registry for runs domain state.""" + +from __future__ import annotations + +import asyncio +import uuid +from datetime import datetime, timezone +from typing import Any + +from ..types import INFLIGHT_STATUSES, RunRecord, RunSpec, RunStatus + + +class RunRegistry: + """In-memory source of truth for run records and their status.""" + + def __init__(self) -> None: + self._records: dict[str, RunRecord] = {} + self._thread_index: dict[str, set[str]] = {} # thread_id -> set[run_id] + self._lock = asyncio.Lock() + + async def create(self, spec: RunSpec) -> RunRecord: + """Create a new RunRecord from RunSpec.""" + run_id = str(uuid.uuid4()) + now = datetime.now(timezone.utc).isoformat() + + record = RunRecord( + run_id=run_id, + thread_id=spec.scope.thread_id, + assistant_id=spec.assistant_id, + status="pending", + temporary=spec.scope.temporary, + multitask_strategy=spec.multitask_strategy, + metadata=dict(spec.metadata), + follow_up_to_run_id=spec.follow_up_to_run_id, + created_at=now, + updated_at=now, + ) + + async with self._lock: + self._records[run_id] = record + # Update thread index + if spec.scope.thread_id not in self._thread_index: + 
self._thread_index[spec.scope.thread_id] = set() + self._thread_index[spec.scope.thread_id].add(run_id) + + return record + + def get(self, run_id: str) -> RunRecord | None: + """Get RunRecord by run_id.""" + return self._records.get(run_id) + + async def list_by_thread(self, thread_id: str) -> list[RunRecord]: + """List all RunRecords for a thread.""" + async with self._lock: + run_ids = self._thread_index.get(thread_id, set()) + return [self._records[rid] for rid in run_ids if rid in self._records] + + async def set_status( + self, + run_id: str, + status: RunStatus, + *, + error: str | None = None, + started_at: str | None = None, + ended_at: str | None = None, + ) -> None: + """Update run status and optional fields.""" + async with self._lock: + record = self._records.get(run_id) + if record is None: + return + + record.status = status + record.updated_at = datetime.now(timezone.utc).isoformat() + + if error is not None: + record.error = error + if started_at is not None: + record.started_at = started_at + if ended_at is not None: + record.ended_at = ended_at + + async def has_inflight(self, thread_id: str) -> bool: + """Check if thread has any inflight runs.""" + async with self._lock: + run_ids = self._thread_index.get(thread_id, set()) + for rid in run_ids: + record = self._records.get(rid) + if record and record.status in INFLIGHT_STATUSES: + return True + return False + + async def interrupt_inflight(self, thread_id: str) -> list[str]: + """ + Mark all inflight runs for a thread as interrupted. + + Returns list of interrupted run_ids. 
+ """ + interrupted: list[str] = [] + now = datetime.now(timezone.utc).isoformat() + + async with self._lock: + run_ids = self._thread_index.get(thread_id, set()) + for rid in run_ids: + record = self._records.get(rid) + if record and record.status in INFLIGHT_STATUSES: + record.status = "interrupted" + record.updated_at = now + record.ended_at = now + interrupted.append(rid) + + return interrupted + + async def update_metadata(self, run_id: str, metadata: dict[str, Any]) -> None: + """Update run metadata.""" + async with self._lock: + record = self._records.get(run_id) + if record is not None: + record.metadata.update(metadata) + record.updated_at = datetime.now(timezone.utc).isoformat() + + async def delete(self, run_id: str) -> bool: + """Delete a run record. Returns True if deleted.""" + async with self._lock: + record = self._records.pop(run_id, None) + if record is None: + return False + + # Update thread index + thread_runs = self._thread_index.get(record.thread_id) + if thread_runs: + thread_runs.discard(run_id) + + return True + + def count(self) -> int: + """Return total number of records.""" + return len(self._records) + + def count_by_status(self, status: RunStatus) -> int: + """Return count of records with given status.""" + return sum(1 for r in self._records.values() if r.status == status) + + +# Compatibility alias during the refactor. 
+RuntimeRunRegistry = RunRegistry diff --git a/backend/packages/harness/deerflow/runtime/runs/internal/streams.py b/backend/packages/harness/deerflow/runtime/runs/internal/streams.py new file mode 100644 index 000000000..6c7656edf --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/internal/streams.py @@ -0,0 +1,76 @@ +"""Internal run stream adapter over StreamBridge.""" + +from __future__ import annotations + +from collections.abc import AsyncIterator +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from deerflow.runtime.stream_bridge import JSONValue, StreamBridge, StreamEvent + +from deerflow.runtime.stream_bridge import StreamStatus + + +class RunStreamService: + """Thin runs-domain adapter over the harness stream bridge contract.""" + + def __init__(self, bridge: "StreamBridge") -> None: + self._bridge = bridge + + async def publish_event( + self, + run_id: str, + *, + event: str, + data: "JSONValue", + ) -> str: + """Publish a replayable run event.""" + return await self._bridge.publish(run_id, event, data) + + async def publish_end(self, run_id: str) -> str: + """Publish a successful terminal signal.""" + return await self._bridge.publish_terminal(run_id, StreamStatus.ENDED) + + async def publish_cancelled( + self, + run_id: str, + *, + data: "JSONValue" = None, + ) -> str: + """Publish a cancelled terminal signal.""" + return await self._bridge.publish_terminal( + run_id, + StreamStatus.CANCELLED, + data, + ) + + async def publish_error( + self, + run_id: str, + *, + data: "JSONValue", + ) -> str: + """Publish a failed terminal signal.""" + return await self._bridge.publish_terminal( + run_id, + StreamStatus.ERRORED, + data, + ) + + def subscribe( + self, + run_id: str, + *, + last_event_id: str | None = None, + heartbeat_interval: float = 15.0, + ) -> AsyncIterator[StreamEvent]: + """Subscribe to a run stream with resume support.""" + return self._bridge.subscribe( + run_id, + last_event_id=last_event_id, + 
heartbeat_interval=heartbeat_interval, + ) + + async def cleanup(self, run_id: str, *, delay: float = 0) -> None: + """Release per-run bridge resources after completion.""" + await self._bridge.cleanup(run_id, delay=delay) diff --git a/backend/packages/harness/deerflow/runtime/runs/internal/wait.py b/backend/packages/harness/deerflow/runtime/runs/internal/wait.py new file mode 100644 index 000000000..ae4cc205c --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/internal/wait.py @@ -0,0 +1,95 @@ +"""Internal run wait helpers based on stream events.""" + +from __future__ import annotations + +from typing import Any + +from deerflow.runtime.stream_bridge import StreamEvent + +from .streams import RunStreamService + + +class WaitTimeoutError(TimeoutError): + """Raised when wait times out.""" + + pass + + +class WaitErrorResult: + """Represents an error result from wait.""" + + def __init__(self, error: str, details: dict[str, Any] | None = None) -> None: + self.error = error + self.details = details or {} + + def to_dict(self) -> dict[str, Any]: + return {"error": self.error, **self.details} + + +class RunWaitService: + """ + Wait service for runs domain. + + Based on RunStreamService.subscribe(), implements wait semantics. 
+ + Phase 1 behavior: + - Records last 'values' event + - On 'error', returns unified error structure + - On 'end' only, returns last values + """ + + TERMINAL_EVENTS = frozenset({"end", "error", "cancel"}) + + def __init__(self, stream_service: RunStreamService) -> None: + self._stream_service = stream_service + + async def wait_for_terminal( + self, + run_id: str, + *, + last_event_id: str | None = None, + ) -> StreamEvent | None: + """Block until the next terminal event for a run is observed.""" + async for event in self._stream_service.subscribe( + run_id, + last_event_id=last_event_id, + ): + if event.event in self.TERMINAL_EVENTS: + return event + + return None + + async def wait_for_values_or_error( + self, + run_id: str, + *, + last_event_id: str | None = None, + ) -> dict[str, Any] | WaitErrorResult | None: + """ + Wait for run to complete and return final values or error. + + Returns: + - dict: Final values if successful + - WaitErrorResult: If run failed + - None: If no values were produced + """ + last_values: dict[str, Any] | None = None + + async for event in self._stream_service.subscribe( + run_id, + last_event_id=last_event_id, + ): + if event.event == "values": + last_values = event.data + + elif event.event == "error": + return WaitErrorResult( + error=str(event.data) if event.data else "Unknown error", + details={"run_id": run_id}, + ) + + elif event.event in self.TERMINAL_EVENTS: + # Stream ended, return last values + break + + return last_values diff --git a/backend/packages/harness/deerflow/runtime/runs/observer.py b/backend/packages/harness/deerflow/runtime/runs/observer.py new file mode 100644 index 000000000..228702af3 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/observer.py @@ -0,0 +1,203 @@ +"""Run lifecycle observer types for decoupled observation. + +Defines the RunObserver protocol and lifecycle event types that allow +the harness layer to emit notifications without directly calling +storage implementations. 
+ +The app layer provides concrete observers (e.g., StorageObserver) that +map lifecycle events to persistence operations. +""" + +from __future__ import annotations + +import logging +from collections.abc import Awaitable, Callable, Mapping +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +from typing import Any, Protocol, runtime_checkable + +from .types import RunStatus + +# Callback type for lightweight observer registration +type RunEventCallback = Callable[["RunLifecycleEvent"], Awaitable[None]] + + +class LifecycleEventType(str, Enum): + """Lifecycle event types emitted during run execution.""" + + # Run lifecycle + RUN_STARTED = "run_started" + RUN_COMPLETED = "run_completed" + RUN_FAILED = "run_failed" + RUN_CANCELLED = "run_cancelled" + + # Human message (for event store) + HUMAN_MESSAGE = "human_message" + + # Thread status updates + THREAD_STATUS_UPDATED = "thread_status_updated" + + +@dataclass(frozen=True) +class RunLifecycleEvent: + """A single lifecycle event emitted during run execution. + + Attributes: + event_type: The type of lifecycle event. + run_id: The run that emitted this event. + thread_id: The thread this run belongs to. + payload: Event-specific data (varies by event_type). + """ + + event_id: str + event_type: LifecycleEventType + run_id: str + thread_id: str + sequence: int + occurred_at: datetime + payload: Mapping[str, Any] = field(default_factory=dict) + + +@dataclass +class RunResult: + """Minimal result returned after run execution. + + Contains only the data needed for the caller to understand + what happened. Detailed events are delivered via observer. + + Attributes: + run_id: The run ID. + thread_id: The thread ID. + status: Final status (success, error, interrupted, etc.). + error: Error message if status is error. + completion_data: Token usage and message counts from journal. + title: Thread title extracted from checkpoint (if available). 
+ """ + + run_id: str + thread_id: str + status: RunStatus + error: str | None = None + completion_data: dict[str, Any] = field(default_factory=dict) + title: str | None = None + + +@runtime_checkable +class RunObserver(Protocol): + """Protocol for observing run lifecycle events. + + Implementations receive events as they occur during execution + and can perform side effects (storage, logging, metrics, etc.) + without coupling the worker to specific implementations. + + Methods are async to support IO-bound operations like database writes. + """ + + async def on_event(self, event: RunLifecycleEvent) -> None: + """Called when a lifecycle event occurs. + + Args: + event: The lifecycle event with type, IDs, and payload. + + Implementations should be explicit about failure handling. + CompositeObserver can be configured to either swallow or raise + observer failures based on each binding's ``required`` flag. + """ + ... + + +@dataclass(frozen=True) +class ObserverBinding: + """Observer registration with failure policy. + + Attributes: + observer: Observer instance to invoke. + required: When True, observer failures are raised to the caller. + When False, failures are logged and dispatch continues. + """ + + observer: RunObserver + required: bool = False + + +class CompositeObserver: + """Observer that delegates to multiple child observers. + + Useful for combining storage, metrics, and logging observers. + Optional observers are logged on failure; required observers raise. 
+ """ + + def __init__( + self, + observers: list[RunObserver | ObserverBinding] | None = None, + ) -> None: + self._observers: list[ObserverBinding] = [ + obs if isinstance(obs, ObserverBinding) else ObserverBinding(obs) + for obs in (observers or []) + ] + + def add(self, observer: RunObserver, *, required: bool = False) -> None: + """Add an observer to the composite.""" + self._observers.append(ObserverBinding(observer=observer, required=required)) + + async def on_event(self, event: RunLifecycleEvent) -> None: + """Dispatch event to all child observers.""" + logger = logging.getLogger(__name__) + for binding in self._observers: + try: + await binding.observer.on_event(event) + except Exception: + if binding.required: + raise + logger.warning( + "Observer %s failed on event %s", + type(binding.observer).__name__, + event.event_type.value, + exc_info=True, + ) + + +class NullObserver: + """No-op observer for when no observation is needed.""" + + async def on_event(self, event: RunLifecycleEvent) -> None: + """Do nothing.""" + pass + + +@dataclass(slots=True) +class CallbackObserver: + """Adapter that wraps a callback function as a RunObserver. + + Allows lightweight callback functions to participate in the + observer protocol without defining a full class. + """ + + callback: RunEventCallback + + async def on_event(self, event: RunLifecycleEvent) -> None: + """Invoke the wrapped callback with the event.""" + await self.callback(event) + + +type ObserverLike = RunObserver | RunEventCallback | None + + +def ensure_observer(observer: ObserverLike) -> RunObserver: + """Normalize an observer-like value to a RunObserver. + + Args: + observer: Can be: + - None: returns NullObserver + - A callable: wraps in CallbackObserver + - A RunObserver: returns as-is + + Returns: + A RunObserver instance. 
+ """ + if observer is None: + return NullObserver() + if callable(observer) and not isinstance(observer, RunObserver): + return CallbackObserver(observer) + return observer diff --git a/backend/packages/harness/deerflow/runtime/runs/schemas.py b/backend/packages/harness/deerflow/runtime/runs/schemas.py deleted file mode 100644 index 622d8b70b..000000000 --- a/backend/packages/harness/deerflow/runtime/runs/schemas.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Run status and disconnect mode enums.""" - -from enum import StrEnum - - -class RunStatus(StrEnum): - """Lifecycle status of a single run.""" - - pending = "pending" - running = "running" - success = "success" - error = "error" - timeout = "timeout" - interrupted = "interrupted" - - -class DisconnectMode(StrEnum): - """Behaviour when the SSE consumer disconnects.""" - - cancel = "cancel" - continue_ = "continue" diff --git a/backend/packages/harness/deerflow/runtime/runs/store/__init__.py b/backend/packages/harness/deerflow/runtime/runs/store/__init__.py index 265a6fffb..bbd264ec7 100644 --- a/backend/packages/harness/deerflow/runtime/runs/store/__init__.py +++ b/backend/packages/harness/deerflow/runtime/runs/store/__init__.py @@ -1,4 +1,13 @@ -from deerflow.runtime.runs.store.base import RunStore -from deerflow.runtime.runs.store.memory import MemoryRunStore +"""Store boundary protocols for runs.""" -__all__ = ["MemoryRunStore", "RunStore"] +from .create_store import RunCreateStore +from .delete_store import RunDeleteStore +from .event_store import RunEventStore +from .query_store import RunQueryStore + +__all__ = [ + "RunCreateStore", + "RunDeleteStore", + "RunEventStore", + "RunQueryStore", +] diff --git a/backend/packages/harness/deerflow/runtime/runs/store/base.py b/backend/packages/harness/deerflow/runtime/runs/store/base.py deleted file mode 100644 index 518a1903c..000000000 --- a/backend/packages/harness/deerflow/runtime/runs/store/base.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Abstract interface for run metadata 
storage. - -RunManager depends on this interface. Implementations: -- MemoryRunStore: in-memory dict (development, tests) -- Future: RunRepository backed by SQLAlchemy ORM - -All methods accept an optional user_id for user isolation. -When user_id is None, no user filtering is applied (single-user mode). -""" - -from __future__ import annotations - -import abc -from typing import Any - - -class RunStore(abc.ABC): - @abc.abstractmethod - async def put( - self, - run_id: str, - *, - thread_id: str, - assistant_id: str | None = None, - user_id: str | None = None, - status: str = "pending", - multitask_strategy: str = "reject", - metadata: dict[str, Any] | None = None, - kwargs: dict[str, Any] | None = None, - error: str | None = None, - created_at: str | None = None, - ) -> None: - pass - - @abc.abstractmethod - async def get(self, run_id: str) -> dict[str, Any] | None: - pass - - @abc.abstractmethod - async def list_by_thread( - self, - thread_id: str, - *, - user_id: str | None = None, - limit: int = 100, - ) -> list[dict[str, Any]]: - pass - - @abc.abstractmethod - async def update_status( - self, - run_id: str, - status: str, - *, - error: str | None = None, - ) -> None: - pass - - @abc.abstractmethod - async def delete(self, run_id: str) -> None: - pass - - @abc.abstractmethod - async def update_run_completion( - self, - run_id: str, - *, - status: str, - total_input_tokens: int = 0, - total_output_tokens: int = 0, - total_tokens: int = 0, - llm_call_count: int = 0, - lead_agent_tokens: int = 0, - subagent_tokens: int = 0, - middleware_tokens: int = 0, - message_count: int = 0, - last_ai_message: str | None = None, - first_human_message: str | None = None, - error: str | None = None, - ) -> None: - pass - - @abc.abstractmethod - async def list_pending(self, *, before: str | None = None) -> list[dict[str, Any]]: - pass - - @abc.abstractmethod - async def aggregate_tokens_by_thread(self, thread_id: str) -> dict[str, Any]: - """Aggregate token usage for completed 
runs in a thread. - - Returns a dict with keys: total_tokens, total_input_tokens, - total_output_tokens, total_runs, by_model (model_name → {tokens, runs}), - by_caller ({lead_agent, subagent, middleware}). - """ - pass diff --git a/backend/packages/harness/deerflow/runtime/runs/store/create_store.py b/backend/packages/harness/deerflow/runtime/runs/store/create_store.py new file mode 100644 index 000000000..03b85af30 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/store/create_store.py @@ -0,0 +1,13 @@ +"""Create-side boundary for durable run initialization.""" + +from __future__ import annotations + +from typing import Protocol + +from ..types import RunRecord + + +class RunCreateStore(Protocol): + """Persist the initial durable row for a newly created run.""" + + async def create_run(self, record: RunRecord) -> None: ... diff --git a/backend/packages/harness/deerflow/runtime/runs/store/delete_store.py b/backend/packages/harness/deerflow/runtime/runs/store/delete_store.py new file mode 100644 index 000000000..039ac0a39 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/store/delete_store.py @@ -0,0 +1,11 @@ +"""Delete-side durable boundary for runs.""" + +from __future__ import annotations + +from typing import Protocol + + +class RunDeleteStore(Protocol): + """Minimal protocol for removing durable run records.""" + + async def delete_run(self, run_id: str) -> bool: ... 
diff --git a/backend/packages/harness/deerflow/runtime/runs/store/event_store.py b/backend/packages/harness/deerflow/runtime/runs/store/event_store.py new file mode 100644 index 000000000..f8223369c --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/store/event_store.py @@ -0,0 +1,11 @@ +"""Run event store boundary used by runs callbacks.""" + +from __future__ import annotations + +from typing import Any, Protocol + + +class RunEventStore(Protocol): + """Minimal append-only event store protocol for execution callbacks.""" + + async def put_batch(self, events: list[dict[str, Any]]) -> list[dict[str, Any]]: ... diff --git a/backend/packages/harness/deerflow/runtime/runs/store/memory.py b/backend/packages/harness/deerflow/runtime/runs/store/memory.py deleted file mode 100644 index 5a14af3df..000000000 --- a/backend/packages/harness/deerflow/runtime/runs/store/memory.py +++ /dev/null @@ -1,98 +0,0 @@ -"""In-memory RunStore. Used when database.backend=memory (default) and in tests. - -Equivalent to the original RunManager._runs dict behavior. 
-""" - -from __future__ import annotations - -from datetime import UTC, datetime -from typing import Any - -from deerflow.runtime.runs.store.base import RunStore - - -class MemoryRunStore(RunStore): - def __init__(self) -> None: - self._runs: dict[str, dict[str, Any]] = {} - - async def put( - self, - run_id, - *, - thread_id, - assistant_id=None, - user_id=None, - status="pending", - multitask_strategy="reject", - metadata=None, - kwargs=None, - error=None, - created_at=None, - ): - now = datetime.now(UTC).isoformat() - self._runs[run_id] = { - "run_id": run_id, - "thread_id": thread_id, - "assistant_id": assistant_id, - "user_id": user_id, - "status": status, - "multitask_strategy": multitask_strategy, - "metadata": metadata or {}, - "kwargs": kwargs or {}, - "error": error, - "created_at": created_at or now, - "updated_at": now, - } - - async def get(self, run_id): - return self._runs.get(run_id) - - async def list_by_thread(self, thread_id, *, user_id=None, limit=100): - results = [r for r in self._runs.values() if r["thread_id"] == thread_id and (user_id is None or r.get("user_id") == user_id)] - results.sort(key=lambda r: r["created_at"], reverse=True) - return results[:limit] - - async def update_status(self, run_id, status, *, error=None): - if run_id in self._runs: - self._runs[run_id]["status"] = status - if error is not None: - self._runs[run_id]["error"] = error - self._runs[run_id]["updated_at"] = datetime.now(UTC).isoformat() - - async def delete(self, run_id): - self._runs.pop(run_id, None) - - async def update_run_completion(self, run_id, *, status, **kwargs): - if run_id in self._runs: - self._runs[run_id]["status"] = status - for key, value in kwargs.items(): - if value is not None: - self._runs[run_id][key] = value - self._runs[run_id]["updated_at"] = datetime.now(UTC).isoformat() - - async def list_pending(self, *, before=None): - now = before or datetime.now(UTC).isoformat() - results = [r for r in self._runs.values() if r["status"] == 
"pending" and r["created_at"] <= now] - results.sort(key=lambda r: r["created_at"]) - return results - - async def aggregate_tokens_by_thread(self, thread_id: str) -> dict[str, Any]: - completed = [r for r in self._runs.values() if r["thread_id"] == thread_id and r.get("status") in ("success", "error")] - by_model: dict[str, dict] = {} - for r in completed: - model = r.get("model_name") or "unknown" - entry = by_model.setdefault(model, {"tokens": 0, "runs": 0}) - entry["tokens"] += r.get("total_tokens", 0) - entry["runs"] += 1 - return { - "total_tokens": sum(r.get("total_tokens", 0) for r in completed), - "total_input_tokens": sum(r.get("total_input_tokens", 0) for r in completed), - "total_output_tokens": sum(r.get("total_output_tokens", 0) for r in completed), - "total_runs": len(completed), - "by_model": by_model, - "by_caller": { - "lead_agent": sum(r.get("lead_agent_tokens", 0) for r in completed), - "subagent": sum(r.get("subagent_tokens", 0) for r in completed), - "middleware": sum(r.get("middleware_tokens", 0) for r in completed), - }, - } diff --git a/backend/packages/harness/deerflow/runtime/runs/store/query_store.py b/backend/packages/harness/deerflow/runtime/runs/store/query_store.py new file mode 100644 index 000000000..c46bd1f19 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/runs/store/query_store.py @@ -0,0 +1,20 @@ +"""Read-side boundary for durable run queries.""" + +from __future__ import annotations + +from typing import Protocol + +from ..types import RunRecord + + +class RunQueryStore(Protocol): + """Read durable run records for public query APIs.""" + + async def get_run(self, run_id: str) -> RunRecord | None: ... + + async def list_runs( + self, + thread_id: str, + *, + limit: int = 100, + ) -> list[RunRecord]: ... 
diff --git a/backend/packages/harness/deerflow/runtime/runs/types.py b/backend/packages/harness/deerflow/runtime/runs/types.py
new file mode 100644
index 000000000..eada83502
--- /dev/null
+++ b/backend/packages/harness/deerflow/runtime/runs/types.py
@@ -0,0 +1,117 @@
+"""Public runs domain types."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import StrEnum
+from typing import Any, Literal
+
+# Intent: what the caller is asking the runtime to do with this run.
+RunIntent = Literal[
+    "create_background",
+    "create_and_stream",
+    "create_and_wait",
+    "join_stream",
+    "join_wait",
+]
+
+# Scope kind: stateful (requires a thread_id) vs stateless (temporary thread).
+RunScopeKind = Literal["stateful", "stateless"]
+
+class RunStatus(StrEnum):
+    pending = "pending"
+    starting = "starting"
+    running = "running"
+    success = "success"
+    error = "error"
+    interrupted = "interrupted"
+    timeout = "timeout"
+
+CancelAction = Literal["interrupt", "rollback"]
+
+
+@dataclass(frozen=True)
+class RunScope:
+    """Scope of a run - stateful requires a thread_id; stateless auto-creates a temporary thread."""
+
+    kind: RunScopeKind
+    thread_id: str
+    temporary: bool = False
+
+
+@dataclass(frozen=True)
+class CheckpointRequest:
+    """Checkpoint restore request - phase 1 accepts it but does not implement restore."""
+
+    checkpoint_id: str | None = None
+    checkpoint: dict[str, Any] | None = None
+
+
+@dataclass(frozen=True)
+class RunSpec:
+    """
+    Run specification - built by the app input layer; it is the executor's input.
+
+    Phase 1 limitations:
+    - multitask_strategy only supports reject/interrupt
+    - enqueue/rollback/after_seconds/batch are not supported
+    """
+
+    intent: RunIntent
+    scope: RunScope
+    assistant_id: str | None
+    input: dict[str, Any] | None
+    command: dict[str, Any] | None
+    runnable_config: dict[str, Any]
+    context: dict[str, Any] | None
+    metadata: dict[str, Any]
+    stream_modes: list[str]
+    stream_subgraphs: bool
+    stream_resumable: bool
+    on_disconnect: Literal["cancel", "continue"]
+    on_completion: Literal["delete", "keep"]
+    multitask_strategy: Literal["reject", "interrupt"]
+    
interrupt_before: list[str] | Literal["*"] | None
+    interrupt_after: list[str] | Literal["*"] | None
+    checkpoint_request: CheckpointRequest | None
+    follow_up_to_run_id: str | None = None
+    webhook: str | None = None
+    feedback_keys: list[str] | None = None
+
+
+type WaitResult = dict[str, Any] | None
+
+
+@dataclass
+class RunRecord:
+    """
+    Runtime run record - managed by RuntimeRunRegistry.
+
+    Decoupled from the ORM model; maintained in memory only.
+    """
+
+    run_id: str
+    thread_id: str
+    assistant_id: str | None
+    status: RunStatus
+    temporary: bool
+    multitask_strategy: str
+    metadata: dict[str, Any] = field(default_factory=dict)
+    follow_up_to_run_id: str | None = None
+    created_at: str = ""
+    updated_at: str = ""
+    started_at: str | None = None
+    ended_at: str | None = None
+    error: str | None = None
+
+    def __post_init__(self) -> None:
+        # Fill either timestamp that the caller left empty; updated_at must
+        # never stay "" even when created_at was supplied explicitly.
+        now = datetime.now(timezone.utc).isoformat()
+        if not self.created_at:
+            self.created_at = now
+        self.updated_at = self.updated_at or now
+
+
+# Terminal statuses for quick checks; a timed-out run is finished, hence terminal.
+TERMINAL_STATUSES: frozenset[RunStatus] = frozenset({RunStatus.success, RunStatus.error, RunStatus.interrupted, RunStatus.timeout})
+INFLIGHT_STATUSES: frozenset[RunStatus] = frozenset({RunStatus.pending, RunStatus.starting, RunStatus.running})
diff --git a/backend/packages/harness/deerflow/runtime/runs/worker.py b/backend/packages/harness/deerflow/runtime/runs/worker.py
deleted file mode 100644
index c018bcabd..000000000
--- a/backend/packages/harness/deerflow/runtime/runs/worker.py
+++ /dev/null
@@ -1,493 +0,0 @@
-"""Background agent execution.
-
-Runs an agent graph inside an ``asyncio.Task``, publishing events to
-a :class:`StreamBridge` as they are produced.
-
-Uses ``graph.astream(stream_mode=[...])`` which gives correct full-state
-snapshots for ``values`` mode, proper ``{node: writes}`` for ``updates``,
-and ``(chunk, metadata)`` tuples for ``messages`` mode.
-
-Note: ``events`` mode is not supported through the gateway — it requires
-``graph.astream_events()`` which cannot simultaneously produce ``values``
-snapshots.
The JS open-source LangGraph API server works around this via -internal checkpoint callbacks that are not exposed in the Python public API. -""" - -from __future__ import annotations - -import asyncio -import copy -import inspect -import logging -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Literal - -if TYPE_CHECKING: - from langchain_core.messages import HumanMessage - -from deerflow.runtime.serialization import serialize -from deerflow.runtime.stream_bridge import StreamBridge - -from .manager import RunManager, RunRecord -from .schemas import RunStatus - -logger = logging.getLogger(__name__) - -# Valid stream_mode values for LangGraph's graph.astream() -_VALID_LG_MODES = {"values", "updates", "checkpoints", "tasks", "debug", "messages", "custom"} - - -@dataclass(frozen=True) -class RunContext: - """Infrastructure dependencies for a single agent run. - - Groups checkpointer, store, and persistence-related singletons so that - ``run_agent`` (and any future callers) receive one object instead of a - growing list of keyword arguments. - """ - - checkpointer: Any - store: Any | None = field(default=None) - event_store: Any | None = field(default=None) - run_events_config: Any | None = field(default=None) - thread_store: Any | None = field(default=None) - - -async def run_agent( - bridge: StreamBridge, - run_manager: RunManager, - record: RunRecord, - *, - ctx: RunContext, - agent_factory: Any, - graph_input: dict, - config: dict, - stream_modes: list[str] | None = None, - stream_subgraphs: bool = False, - interrupt_before: list[str] | Literal["*"] | None = None, - interrupt_after: list[str] | Literal["*"] | None = None, -) -> None: - """Execute an agent in the background, publishing events to *bridge*.""" - - # Unpack infrastructure dependencies from RunContext. 
- checkpointer = ctx.checkpointer - store = ctx.store - event_store = ctx.event_store - run_events_config = ctx.run_events_config - thread_store = ctx.thread_store - - run_id = record.run_id - thread_id = record.thread_id - requested_modes: set[str] = set(stream_modes or ["values"]) - pre_run_checkpoint_id: str | None = None - pre_run_snapshot: dict[str, Any] | None = None - snapshot_capture_failed = False - - journal = None - - journal = None - - # Track whether "events" was requested but skipped - if "events" in requested_modes: - logger.info( - "Run %s: 'events' stream_mode not supported in gateway (requires astream_events + checkpoint callbacks). Skipping.", - run_id, - ) - - try: - # Initialize RunJournal + write human_message event. - # These are inside the try block so any exception (e.g. a DB - # error writing the event) flows through the except/finally - # path that publishes an "end" event to the SSE bridge — - # otherwise a failure here would leave the stream hanging - # with no terminator. - if event_store is not None: - from deerflow.runtime.journal import RunJournal - - journal = RunJournal( - run_id=run_id, - thread_id=thread_id, - event_store=event_store, - track_token_usage=getattr(run_events_config, "track_token_usage", True), - ) - - # 1. Mark running - await run_manager.set_status(run_id, RunStatus.running) - - # Snapshot the latest pre-run checkpoint so rollback can restore it. 
- if checkpointer is not None: - try: - config_for_check = {"configurable": {"thread_id": thread_id, "checkpoint_ns": ""}} - ckpt_tuple = await checkpointer.aget_tuple(config_for_check) - if ckpt_tuple is not None: - ckpt_config = getattr(ckpt_tuple, "config", {}).get("configurable", {}) - pre_run_checkpoint_id = ckpt_config.get("checkpoint_id") - pre_run_snapshot = { - "checkpoint_ns": ckpt_config.get("checkpoint_ns", ""), - "checkpoint": copy.deepcopy(getattr(ckpt_tuple, "checkpoint", {})), - "metadata": copy.deepcopy(getattr(ckpt_tuple, "metadata", {})), - "pending_writes": copy.deepcopy(getattr(ckpt_tuple, "pending_writes", []) or []), - } - except Exception: - snapshot_capture_failed = True - logger.warning("Could not capture pre-run checkpoint snapshot for run %s", run_id, exc_info=True) - - # 2. Publish metadata — useStream needs both run_id AND thread_id - await bridge.publish( - run_id, - "metadata", - { - "run_id": run_id, - "thread_id": thread_id, - }, - ) - - # 3. Build the agent - from langchain_core.runnables import RunnableConfig - from langgraph.runtime import Runtime - - # Inject runtime context so middlewares can access thread_id - # (langgraph-cli does this automatically; we must do it manually) - runtime = Runtime(context={"thread_id": thread_id, "run_id": run_id}, store=store) - # If the caller already set a ``context`` key (LangGraph >= 0.6.0 - # prefers it over ``configurable`` for thread-level data), make - # sure ``thread_id`` is available there too. - if "context" in config and isinstance(config["context"], dict): - config["context"].setdefault("thread_id", thread_id) - config["context"].setdefault("run_id", run_id) - config.setdefault("configurable", {})["__pregel_runtime"] = runtime - - # Inject RunJournal as a LangChain callback handler. - # on_llm_end captures token usage; on_chain_start/end captures lifecycle. 
- if journal is not None: - config.setdefault("callbacks", []).append(journal) - - runnable_config = RunnableConfig(**config) - agent = agent_factory(config=runnable_config) - - # 4. Attach checkpointer and store - if checkpointer is not None: - agent.checkpointer = checkpointer - if store is not None: - agent.store = store - - # 5. Set interrupt nodes - if interrupt_before: - agent.interrupt_before_nodes = interrupt_before - if interrupt_after: - agent.interrupt_after_nodes = interrupt_after - - # 6. Build LangGraph stream_mode list - # "events" is NOT a valid astream mode — skip it - # "messages-tuple" maps to LangGraph's "messages" mode - lg_modes: list[str] = [] - for m in requested_modes: - if m == "messages-tuple": - lg_modes.append("messages") - elif m == "events": - # Skipped — see log above - continue - elif m in _VALID_LG_MODES: - lg_modes.append(m) - if not lg_modes: - lg_modes = ["values"] - - # Deduplicate while preserving order - seen: set[str] = set() - deduped: list[str] = [] - for m in lg_modes: - if m not in seen: - seen.add(m) - deduped.append(m) - lg_modes = deduped - - logger.info("Run %s: streaming with modes %s (requested: %s)", run_id, lg_modes, requested_modes) - - # 7. 
Stream using graph.astream - if len(lg_modes) == 1 and not stream_subgraphs: - # Single mode, no subgraphs: astream yields raw chunks - single_mode = lg_modes[0] - async for chunk in agent.astream(graph_input, config=runnable_config, stream_mode=single_mode): - if record.abort_event.is_set(): - logger.info("Run %s abort requested — stopping", run_id) - break - sse_event = _lg_mode_to_sse_event(single_mode) - await bridge.publish(run_id, sse_event, serialize(chunk, mode=single_mode)) - else: - # Multiple modes or subgraphs: astream yields tuples - async for item in agent.astream( - graph_input, - config=runnable_config, - stream_mode=lg_modes, - subgraphs=stream_subgraphs, - ): - if record.abort_event.is_set(): - logger.info("Run %s abort requested — stopping", run_id) - break - - mode, chunk = _unpack_stream_item(item, lg_modes, stream_subgraphs) - if mode is None: - continue - - sse_event = _lg_mode_to_sse_event(mode) - await bridge.publish(run_id, sse_event, serialize(chunk, mode=mode)) - - # 8. 
Final status - if record.abort_event.is_set(): - action = record.abort_action - if action == "rollback": - await run_manager.set_status(run_id, RunStatus.error, error="Rolled back by user") - try: - await _rollback_to_pre_run_checkpoint( - checkpointer=checkpointer, - thread_id=thread_id, - run_id=run_id, - pre_run_checkpoint_id=pre_run_checkpoint_id, - pre_run_snapshot=pre_run_snapshot, - snapshot_capture_failed=snapshot_capture_failed, - ) - logger.info("Run %s rolled back to pre-run checkpoint %s", run_id, pre_run_checkpoint_id) - except Exception: - logger.warning("Failed to rollback checkpoint for run %s", run_id, exc_info=True) - else: - await run_manager.set_status(run_id, RunStatus.interrupted) - else: - await run_manager.set_status(run_id, RunStatus.success) - - except asyncio.CancelledError: - action = record.abort_action - if action == "rollback": - await run_manager.set_status(run_id, RunStatus.error, error="Rolled back by user") - try: - await _rollback_to_pre_run_checkpoint( - checkpointer=checkpointer, - thread_id=thread_id, - run_id=run_id, - pre_run_checkpoint_id=pre_run_checkpoint_id, - pre_run_snapshot=pre_run_snapshot, - snapshot_capture_failed=snapshot_capture_failed, - ) - logger.info("Run %s was cancelled and rolled back", run_id) - except Exception: - logger.warning("Run %s cancellation rollback failed", run_id, exc_info=True) - else: - await run_manager.set_status(run_id, RunStatus.interrupted) - logger.info("Run %s was cancelled", run_id) - - except Exception as exc: - error_msg = f"{exc}" - logger.exception("Run %s failed: %s", run_id, error_msg) - await run_manager.set_status(run_id, RunStatus.error, error=error_msg) - await bridge.publish( - run_id, - "error", - { - "message": error_msg, - "name": type(exc).__name__, - }, - ) - - finally: - # Flush any buffered journal events and persist completion data - if journal is not None: - try: - await journal.flush() - except Exception: - logger.warning("Failed to flush journal for run %s", 
run_id, exc_info=True) - - try: - # Persist token usage + convenience fields to RunStore - completion = journal.get_completion_data() - await run_manager.update_run_completion(run_id, status=record.status.value, **completion) - except Exception: - logger.warning("Failed to persist run completion for %s (non-fatal)", run_id, exc_info=True) - - # Sync title from checkpoint to threads_meta.display_name - if checkpointer is not None and thread_store is not None: - try: - ckpt_config = {"configurable": {"thread_id": thread_id, "checkpoint_ns": ""}} - ckpt_tuple = await checkpointer.aget_tuple(ckpt_config) - if ckpt_tuple is not None: - ckpt = getattr(ckpt_tuple, "checkpoint", {}) or {} - title = ckpt.get("channel_values", {}).get("title") - if title: - await thread_store.update_display_name(thread_id, title) - except Exception: - logger.debug("Failed to sync title for thread %s (non-fatal)", thread_id) - - # Update threads_meta status based on run outcome - if thread_store is not None: - try: - final_status = "idle" if record.status == RunStatus.success else record.status.value - await thread_store.update_status(thread_id, final_status) - except Exception: - logger.debug("Failed to update thread_meta status for %s (non-fatal)", thread_id) - - await bridge.publish_end(run_id) - asyncio.create_task(bridge.cleanup(run_id, delay=60)) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -async def _call_checkpointer_method(checkpointer: Any, async_name: str, sync_name: str, *args: Any, **kwargs: Any) -> Any: - """Call a checkpointer method, supporting async and sync variants.""" - method = getattr(checkpointer, async_name, None) or getattr(checkpointer, sync_name, None) - if method is None: - raise AttributeError(f"Missing checkpointer method: {async_name}/{sync_name}") - result = method(*args, **kwargs) - if inspect.isawaitable(result): - return await 
result - return result - - -async def _rollback_to_pre_run_checkpoint( - *, - checkpointer: Any, - thread_id: str, - run_id: str, - pre_run_checkpoint_id: str | None, - pre_run_snapshot: dict[str, Any] | None, - snapshot_capture_failed: bool, -) -> None: - """Restore thread state to the checkpoint snapshot captured before run start.""" - if checkpointer is None: - logger.info("Run %s rollback requested but no checkpointer is configured", run_id) - return - - if snapshot_capture_failed: - logger.warning("Run %s rollback skipped: pre-run checkpoint snapshot capture failed", run_id) - return - - if pre_run_snapshot is None: - await _call_checkpointer_method(checkpointer, "adelete_thread", "delete_thread", thread_id) - logger.info("Run %s rollback reset thread %s to empty state", run_id, thread_id) - return - - checkpoint_to_restore = None - metadata_to_restore: dict[str, Any] = {} - checkpoint_ns = "" - checkpoint = pre_run_snapshot.get("checkpoint") - if not isinstance(checkpoint, dict): - logger.warning("Run %s rollback skipped: invalid pre-run checkpoint snapshot", run_id) - return - checkpoint_to_restore = checkpoint - if checkpoint_to_restore.get("id") is None and pre_run_checkpoint_id is not None: - checkpoint_to_restore = {**checkpoint_to_restore, "id": pre_run_checkpoint_id} - if checkpoint_to_restore.get("id") is None: - logger.warning("Run %s rollback skipped: pre-run checkpoint has no checkpoint id", run_id) - return - metadata = pre_run_snapshot.get("metadata", {}) - metadata_to_restore = metadata if isinstance(metadata, dict) else {} - raw_checkpoint_ns = pre_run_snapshot.get("checkpoint_ns") - checkpoint_ns = raw_checkpoint_ns if isinstance(raw_checkpoint_ns, str) else "" - - channel_versions = checkpoint_to_restore.get("channel_versions") - new_versions = dict(channel_versions) if isinstance(channel_versions, dict) else {} - - restore_config = {"configurable": {"thread_id": thread_id, "checkpoint_ns": checkpoint_ns}} - restored_config = await 
_call_checkpointer_method( - checkpointer, - "aput", - "put", - restore_config, - checkpoint_to_restore, - metadata_to_restore if isinstance(metadata_to_restore, dict) else {}, - new_versions, - ) - if not isinstance(restored_config, dict): - raise RuntimeError(f"Run {run_id} rollback restore returned invalid config: expected dict") - restored_configurable = restored_config.get("configurable", {}) - if not isinstance(restored_configurable, dict): - raise RuntimeError(f"Run {run_id} rollback restore returned invalid config payload") - restored_checkpoint_id = restored_configurable.get("checkpoint_id") - if not restored_checkpoint_id: - raise RuntimeError(f"Run {run_id} rollback restore did not return checkpoint_id") - - pending_writes = pre_run_snapshot.get("pending_writes", []) - if not pending_writes: - return - - writes_by_task: dict[str, list[tuple[str, Any]]] = {} - for item in pending_writes: - if not isinstance(item, (tuple, list)) or len(item) != 3: - raise RuntimeError(f"Run {run_id} rollback failed: pending_write is not a 3-tuple: {item!r}") - task_id, channel, value = item - if not isinstance(channel, str): - raise RuntimeError(f"Run {run_id} rollback failed: pending_write has non-string channel: task_id={task_id!r}, channel={channel!r}") - writes_by_task.setdefault(str(task_id), []).append((channel, value)) - - for task_id, writes in writes_by_task.items(): - await _call_checkpointer_method( - checkpointer, - "aput_writes", - "put_writes", - restored_config, - writes, - task_id=task_id, - ) - - -def _lg_mode_to_sse_event(mode: str) -> str: - """Map LangGraph internal stream_mode name to SSE event name. - - LangGraph's ``astream(stream_mode="messages")`` produces message - tuples. The SSE protocol calls this ``messages-tuple`` when the - client explicitly requests it, but the default SSE event name used - by LangGraph Platform is simply ``"messages"``. 
- """ - # All LG modes map 1:1 to SSE event names — "messages" stays "messages" - return mode - - -def _extract_human_message(graph_input: dict) -> HumanMessage | None: - """Extract or construct a HumanMessage from graph_input for event recording. - - Returns a LangChain HumanMessage so callers can use .model_dump() to get - the checkpoint-aligned serialization format. - """ - from langchain_core.messages import HumanMessage - - messages = graph_input.get("messages") - if not messages: - return None - last = messages[-1] if isinstance(messages, list) else messages - if isinstance(last, HumanMessage): - return last - if isinstance(last, str): - return HumanMessage(content=last) if last else None - if hasattr(last, "content"): - content = last.content - return HumanMessage(content=content) - if isinstance(last, dict): - content = last.get("content", "") - return HumanMessage(content=content) if content else None - return None - - -def _unpack_stream_item( - item: Any, - lg_modes: list[str], - stream_subgraphs: bool, -) -> tuple[str | None, Any]: - """Unpack a multi-mode or subgraph stream item into (mode, chunk). - - Returns ``(None, None)`` if the item cannot be parsed. 
- """ - if stream_subgraphs: - if isinstance(item, tuple) and len(item) == 3: - _ns, mode, chunk = item - return str(mode), chunk - if isinstance(item, tuple) and len(item) == 2: - mode, chunk = item - return str(mode), chunk - return None, None - - if isinstance(item, tuple) and len(item) == 2: - mode, chunk = item - return str(mode), chunk - - # Fallback: single-element output from first mode - return lg_modes[0] if lg_modes else None, item diff --git a/backend/packages/harness/deerflow/runtime/serialization.py b/backend/packages/harness/deerflow/runtime/serialization.py index 48853dfb3..6f75b67f7 100644 --- a/backend/packages/harness/deerflow/runtime/serialization.py +++ b/backend/packages/harness/deerflow/runtime/serialization.py @@ -4,8 +4,8 @@ Provides a single source of truth for converting LangChain message objects, Pydantic models, and LangGraph state dicts into plain JSON-serialisable Python structures. -Consumers: ``deerflow.runtime.runs.worker`` (SSE publishing) and -``app.gateway.routers.threads`` (REST responses). +Consumers: runs execution internals (SSE publishing) and +gateway thread state/history responses. """ from __future__ import annotations diff --git a/backend/packages/harness/deerflow/runtime/stream_bridge/__init__.py b/backend/packages/harness/deerflow/runtime/stream_bridge/__init__.py index 435520c48..af17d3ed4 100644 --- a/backend/packages/harness/deerflow/runtime/stream_bridge/__init__.py +++ b/backend/packages/harness/deerflow/runtime/stream_bridge/__init__.py @@ -1,21 +1,47 @@ -"""Stream bridge — decouples agent workers from SSE endpoints. +"""Stream bridge public surface. -A ``StreamBridge`` sits between the background task that runs an agent -(producer) and the HTTP endpoint that pushes Server-Sent Events to -the client (consumer). This package provides an abstract protocol -(:class:`StreamBridge`) plus a default in-memory implementation backed -by :mod:`asyncio.Queue`. 
+The harness package owns the stream abstraction and event semantics. +Concrete backends are intentionally not part of the public API here so +applications can inject infra-specific implementations. """ -from .async_provider import make_stream_bridge -from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent -from .memory import MemoryStreamBridge +from .contract import ( + CANCELLED_SENTINEL, + END_SENTINEL, + HEARTBEAT_SENTINEL, + JSONScalar, + JSONValue, + TERMINAL_STATES, + ResumeResult, + StreamBridge, + StreamEvent, + StreamStatus, +) +from .exceptions import ( + BridgeClosedError, + StreamBridgeError, + StreamCapacityExceededError, + StreamNotFoundError, + StreamTerminatedError, +) __all__ = [ + # Sentinels + "CANCELLED_SENTINEL", "END_SENTINEL", "HEARTBEAT_SENTINEL", - "MemoryStreamBridge", + # Types + "JSONScalar", + "JSONValue", + "ResumeResult", "StreamBridge", "StreamEvent", - "make_stream_bridge", + "StreamStatus", + "TERMINAL_STATES", + # Exceptions + "BridgeClosedError", + "StreamBridgeError", + "StreamCapacityExceededError", + "StreamNotFoundError", + "StreamTerminatedError", ] diff --git a/backend/packages/harness/deerflow/runtime/stream_bridge/async_provider.py b/backend/packages/harness/deerflow/runtime/stream_bridge/async_provider.py deleted file mode 100644 index f35b7d639..000000000 --- a/backend/packages/harness/deerflow/runtime/stream_bridge/async_provider.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Async stream bridge factory. - -Provides an **async context manager** aligned with -:func:`deerflow.runtime.checkpointer.async_provider.make_checkpointer`. - -Usage (e.g. 
FastAPI lifespan):: - - from deerflow.agents.stream_bridge import make_stream_bridge - - async with make_stream_bridge() as bridge: - app.state.stream_bridge = bridge -""" - -from __future__ import annotations - -import contextlib -import logging -from collections.abc import AsyncIterator - -from deerflow.config.stream_bridge_config import get_stream_bridge_config - -from .base import StreamBridge - -logger = logging.getLogger(__name__) - - -@contextlib.asynccontextmanager -async def make_stream_bridge(config=None) -> AsyncIterator[StreamBridge]: - """Async context manager that yields a :class:`StreamBridge`. - - Falls back to :class:`MemoryStreamBridge` when no configuration is - provided and nothing is set globally. - """ - if config is None: - config = get_stream_bridge_config() - - if config is None or config.type == "memory": - from deerflow.runtime.stream_bridge.memory import MemoryStreamBridge - - maxsize = config.queue_maxsize if config is not None else 256 - bridge = MemoryStreamBridge(queue_maxsize=maxsize) - logger.info("Stream bridge initialised: memory (queue_maxsize=%d)", maxsize) - try: - yield bridge - finally: - await bridge.close() - return - - if config.type == "redis": - raise NotImplementedError("Redis stream bridge planned for Phase 2") - - raise ValueError(f"Unknown stream bridge type: {config.type!r}") diff --git a/backend/packages/harness/deerflow/runtime/stream_bridge/base.py b/backend/packages/harness/deerflow/runtime/stream_bridge/base.py deleted file mode 100644 index c34353a08..000000000 --- a/backend/packages/harness/deerflow/runtime/stream_bridge/base.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Abstract stream bridge protocol. - -StreamBridge decouples agent workers (producers) from SSE endpoints -(consumers), aligning with LangGraph Platform's Queue + StreamManager -architecture. 
-""" - -from __future__ import annotations - -import abc -from collections.abc import AsyncIterator -from dataclasses import dataclass -from typing import Any - - -@dataclass(frozen=True) -class StreamEvent: - """Single stream event. - - Attributes: - id: Monotonically increasing event ID (used as SSE ``id:`` field, - supports ``Last-Event-ID`` reconnection). - event: SSE event name, e.g. ``"metadata"``, ``"updates"``, - ``"events"``, ``"error"``, ``"end"``. - data: JSON-serialisable payload. - """ - - id: str - event: str - data: Any - - -HEARTBEAT_SENTINEL = StreamEvent(id="", event="__heartbeat__", data=None) -END_SENTINEL = StreamEvent(id="", event="__end__", data=None) - - -class StreamBridge(abc.ABC): - """Abstract base for stream bridges.""" - - @abc.abstractmethod - async def publish(self, run_id: str, event: str, data: Any) -> None: - """Enqueue a single event for *run_id* (producer side).""" - - @abc.abstractmethod - async def publish_end(self, run_id: str) -> None: - """Signal that no more events will be produced for *run_id*.""" - - @abc.abstractmethod - def subscribe( - self, - run_id: str, - *, - last_event_id: str | None = None, - heartbeat_interval: float = 15.0, - ) -> AsyncIterator[StreamEvent]: - """Async iterator that yields events for *run_id* (consumer side). - - Yields :data:`HEARTBEAT_SENTINEL` when no event arrives within - *heartbeat_interval* seconds. Yields :data:`END_SENTINEL` once - the producer calls :meth:`publish_end`. - """ - - @abc.abstractmethod - async def cleanup(self, run_id: str, *, delay: float = 0) -> None: - """Release resources associated with *run_id*. - - If *delay* > 0 the implementation should wait before releasing, - giving late subscribers a chance to drain remaining events. - """ - - async def close(self) -> None: - """Release backend resources. 
Default is a no-op.""" diff --git a/backend/packages/harness/deerflow/runtime/stream_bridge/contract.py b/backend/packages/harness/deerflow/runtime/stream_bridge/contract.py new file mode 100644 index 000000000..a0c14ccf6 --- /dev/null +++ b/backend/packages/harness/deerflow/runtime/stream_bridge/contract.py @@ -0,0 +1,112 @@ +"""Stream bridge contract and public types.""" + +from __future__ import annotations + +import abc +from collections.abc import AsyncIterator +from dataclasses import dataclass +from enum import Enum +from typing import Literal + +type JSONScalar = None | bool | int | float | str +type JSONValue = JSONScalar | list["JSONValue"] | dict[str, "JSONValue"] + + +class StreamStatus(str, Enum): + """Stream lifecycle states.""" + + ACTIVE = "active" + ENDED = "ended" + CANCELLED = "cancelled" + ERRORED = "errored" + CLOSED = "closed" + + +TERMINAL_STATES = frozenset({ + StreamStatus.ENDED, + StreamStatus.CANCELLED, + StreamStatus.ERRORED, +}) + + +@dataclass(frozen=True, slots=True) +class StreamEvent: + """Single stream event.""" + + id: str + event: str + data: JSONValue + + +@dataclass(frozen=True, slots=True) +class ResumeResult: + """Result of resolving Last-Event-ID.""" + + next_offset: int + status: Literal["fresh", "resumed", "evicted", "invalid", "unknown"] + gap_count: int = 0 + + +HEARTBEAT_SENTINEL = StreamEvent(id="", event="__heartbeat__", data=None) +END_SENTINEL = StreamEvent(id="", event="__end__", data=None) +CANCELLED_SENTINEL = StreamEvent(id="", event="__cancelled__", data=None) + + +class StreamBridge(abc.ABC): + """Abstract base for stream bridges. + + ``StreamBridge`` defines runtime stream semantics, not storage semantics. + Concrete backends may live outside the harness package and be injected by + the application composition root. + + Important boundary rules: + - Terminal run events (``end``/``cancel``/``error``) are real replayable + events and belong to run-level semantics. 
+    - ``close()`` is bridge-level shutdown and must not be treated as a run
+      cancellation signal.
+    """
+
+    @abc.abstractmethod
+    async def publish(self, run_id: str, event: str, data: JSONValue) -> str:
+        """Enqueue a single event for *run_id* and return its event ID."""
+
+    @abc.abstractmethod
+    async def publish_end(self, run_id: str) -> str:
+        """Signal that no more events will be produced for *run_id*."""
+
+    async def publish_terminal(
+        self,
+        run_id: str,
+        kind: StreamStatus,
+        data: JSONValue = None,
+    ) -> str:
+        """Publish a terminal event (end/cancel/error); returns its event ID."""
+        last_id = await self.publish_end(run_id)
+        return last_id
+
+    @abc.abstractmethod
+    def subscribe(
+        self,
+        run_id: str,
+        *,
+        last_event_id: str | None = None,
+        heartbeat_interval: float = 15.0,
+    ) -> AsyncIterator[StreamEvent]:
+        """Yield replayable stream events for *run_id*."""
+
+    @abc.abstractmethod
+    async def cleanup(self, run_id: str, *, delay: float = 0) -> None:
+        """Release resources associated with *run_id*."""
+
+    async def cancel(self, run_id: str) -> None:
+        """Cancel a run and notify all subscribers."""
+        await self.publish_terminal(run_id, StreamStatus.CANCELLED)
+
+    async def mark_awaiting_input(self, run_id: str) -> None:
+        """Mark stream as awaiting human input."""
+
+    async def start(self) -> None:
+        """Start background tasks, if needed."""
+
+    async def close(self) -> None:
+        """Release bridge-level backend resources."""
diff --git a/backend/packages/harness/deerflow/runtime/stream_bridge/exceptions.py b/backend/packages/harness/deerflow/runtime/stream_bridge/exceptions.py
new file mode 100644
index 000000000..fbdfdc1f4
--- /dev/null
+++ b/backend/packages/harness/deerflow/runtime/stream_bridge/exceptions.py
@@ -0,0 +1,23 @@
+"""Stream bridge exceptions."""
+
+from __future__ import annotations
+
+
+class StreamBridgeError(Exception):
+    """Base exception for stream bridge errors."""
+
+
+class BridgeClosedError(StreamBridgeError):
+    """Raised when operating on a closed bridge."""
+
+ +class StreamCapacityExceededError(StreamBridgeError): + """Raised when max_active_streams is reached and eviction is not possible.""" + + +class StreamTerminatedError(StreamBridgeError): + """Raised when publishing to a terminal stream.""" + + +class StreamNotFoundError(StreamBridgeError): + """Raised when referencing a non-existent stream.""" diff --git a/backend/packages/harness/deerflow/runtime/stream_bridge/memory.py b/backend/packages/harness/deerflow/runtime/stream_bridge/memory.py deleted file mode 100644 index cb5b8d1f9..000000000 --- a/backend/packages/harness/deerflow/runtime/stream_bridge/memory.py +++ /dev/null @@ -1,133 +0,0 @@ -"""In-memory stream bridge backed by an in-process event log.""" - -from __future__ import annotations - -import asyncio -import logging -import time -from collections.abc import AsyncIterator -from dataclasses import dataclass, field -from typing import Any - -from .base import END_SENTINEL, HEARTBEAT_SENTINEL, StreamBridge, StreamEvent - -logger = logging.getLogger(__name__) - - -@dataclass -class _RunStream: - events: list[StreamEvent] = field(default_factory=list) - condition: asyncio.Condition = field(default_factory=asyncio.Condition) - ended: bool = False - start_offset: int = 0 - - -class MemoryStreamBridge(StreamBridge): - """Per-run in-memory event log implementation. - - Events are retained for a bounded time window per run so late subscribers - and reconnecting clients can replay buffered events from ``Last-Event-ID``. 
- """ - - def __init__(self, *, queue_maxsize: int = 256) -> None: - self._maxsize = queue_maxsize - self._streams: dict[str, _RunStream] = {} - self._counters: dict[str, int] = {} - - # -- helpers --------------------------------------------------------------- - - def _get_or_create_stream(self, run_id: str) -> _RunStream: - if run_id not in self._streams: - self._streams[run_id] = _RunStream() - self._counters[run_id] = 0 - return self._streams[run_id] - - def _next_id(self, run_id: str) -> str: - self._counters[run_id] = self._counters.get(run_id, 0) + 1 - ts = int(time.time() * 1000) - seq = self._counters[run_id] - 1 - return f"{ts}-{seq}" - - def _resolve_start_offset(self, stream: _RunStream, last_event_id: str | None) -> int: - if last_event_id is None: - return stream.start_offset - - for index, entry in enumerate(stream.events): - if entry.id == last_event_id: - return stream.start_offset + index + 1 - - if stream.events: - logger.warning( - "last_event_id=%s not found in retained buffer; replaying from earliest retained event", - last_event_id, - ) - return stream.start_offset - - # -- StreamBridge API ------------------------------------------------------ - - async def publish(self, run_id: str, event: str, data: Any) -> None: - stream = self._get_or_create_stream(run_id) - entry = StreamEvent(id=self._next_id(run_id), event=event, data=data) - async with stream.condition: - stream.events.append(entry) - if len(stream.events) > self._maxsize: - overflow = len(stream.events) - self._maxsize - del stream.events[:overflow] - stream.start_offset += overflow - stream.condition.notify_all() - - async def publish_end(self, run_id: str) -> None: - stream = self._get_or_create_stream(run_id) - async with stream.condition: - stream.ended = True - stream.condition.notify_all() - - async def subscribe( - self, - run_id: str, - *, - last_event_id: str | None = None, - heartbeat_interval: float = 15.0, - ) -> AsyncIterator[StreamEvent]: - stream = 
self._get_or_create_stream(run_id) - async with stream.condition: - next_offset = self._resolve_start_offset(stream, last_event_id) - - while True: - async with stream.condition: - if next_offset < stream.start_offset: - logger.warning( - "subscriber for run %s fell behind retained buffer; resuming from offset %s", - run_id, - stream.start_offset, - ) - next_offset = stream.start_offset - - local_index = next_offset - stream.start_offset - if 0 <= local_index < len(stream.events): - entry = stream.events[local_index] - next_offset += 1 - elif stream.ended: - entry = END_SENTINEL - else: - try: - await asyncio.wait_for(stream.condition.wait(), timeout=heartbeat_interval) - except TimeoutError: - entry = HEARTBEAT_SENTINEL - else: - continue - - if entry is END_SENTINEL: - yield END_SENTINEL - return - yield entry - - async def cleanup(self, run_id: str, *, delay: float = 0) -> None: - if delay > 0: - await asyncio.sleep(delay) - self._streams.pop(run_id, None) - self._counters.pop(run_id, None) - - async def close(self) -> None: - self._streams.clear() - self._counters.clear() diff --git a/backend/packages/harness/deerflow/runtime/user_context.py b/backend/packages/harness/deerflow/runtime/user_context.py deleted file mode 100644 index ffe4be690..000000000 --- a/backend/packages/harness/deerflow/runtime/user_context.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Request-scoped user context for user-based authorization. - -This module holds a :class:`~contextvars.ContextVar` that the gateway's -auth middleware sets after a successful authentication. Repository -methods read the contextvar via a sentinel default parameter, letting -routers stay free of ``user_id`` boilerplate. - -Three-state semantics for the repository ``user_id`` parameter (the -consumer side of this module lives in ``deerflow.persistence.*``): - -- ``_AUTO`` (module-private sentinel, default): read from contextvar; - raise :class:`RuntimeError` if unset. 
-- Explicit ``str``: use the provided value, overriding contextvar. -- Explicit ``None``: no WHERE clause — used only by migration scripts - and admin CLIs that intentionally bypass isolation. - -Dependency direction --------------------- -``persistence`` (lower layer) reads from this module; ``gateway.auth`` -(higher layer) writes to it. ``CurrentUser`` is defined here as a -:class:`typing.Protocol` so that ``persistence`` never needs to import -the concrete ``User`` class from ``gateway.auth.models``. Any object -with an ``.id: str`` attribute structurally satisfies the protocol. - -Asyncio semantics ------------------ -``ContextVar`` is task-local under asyncio, not thread-local. Each -FastAPI request runs in its own task, so the context is naturally -isolated. ``asyncio.create_task`` and ``asyncio.to_thread`` inherit the -parent task's context, which is typically the intended behaviour; if -a background task must *not* see the foreground user, wrap it with -``contextvars.copy_context()`` to get a clean copy. -""" - -from __future__ import annotations - -from contextvars import ContextVar, Token -from typing import Final, Protocol, runtime_checkable - - -@runtime_checkable -class CurrentUser(Protocol): - """Structural type for the current authenticated user. - - Any object with an ``.id: str`` attribute satisfies this protocol. - Concrete implementations live in ``app.gateway.auth.models.User``. - """ - - id: str - - -_current_user: Final[ContextVar[CurrentUser | None]] = ContextVar("deerflow_current_user", default=None) - - -def set_current_user(user: CurrentUser) -> Token[CurrentUser | None]: - """Set the current user for this async task. - - Returns a reset token that should be passed to - :func:`reset_current_user` in a ``finally`` block to restore the - previous context. 
- """ - return _current_user.set(user) - - -def reset_current_user(token: Token[CurrentUser | None]) -> None: - """Restore the context to the state captured by ``token``.""" - _current_user.reset(token) - - -def get_current_user() -> CurrentUser | None: - """Return the current user, or ``None`` if unset. - - Safe to call in any context. Used by code paths that can proceed - without a user (e.g. migration scripts, public endpoints). - """ - return _current_user.get() - - -def require_current_user() -> CurrentUser: - """Return the current user, or raise :class:`RuntimeError`. - - Used by repository code that must not be called outside a - request-authenticated context. The error message is phrased so - that a caller debugging a stack trace can locate the offending - code path. - """ - user = _current_user.get() - if user is None: - raise RuntimeError("repository accessed without user context") - return user - - -# --------------------------------------------------------------------------- -# Effective user_id helpers (filesystem isolation) -# --------------------------------------------------------------------------- - -DEFAULT_USER_ID: Final[str] = "default" - - -def get_effective_user_id() -> str: - """Return the current user's id as a string, or DEFAULT_USER_ID if unset. - - Unlike :func:`require_current_user` this never raises — it is designed - for filesystem-path resolution where a valid user bucket is always needed. - """ - user = _current_user.get() - if user is None: - return DEFAULT_USER_ID - return str(user.id) - - -# --------------------------------------------------------------------------- -# Sentinel-based user_id resolution -# --------------------------------------------------------------------------- -# -# Repository methods accept a ``user_id`` keyword-only argument that -# defaults to ``AUTO``. The three possible values drive distinct -# behaviours; see the docstring on :func:`resolve_user_id`. 
- - -class _AutoSentinel: - """Singleton marker meaning 'resolve user_id from contextvar'.""" - - _instance: _AutoSentinel | None = None - - def __new__(cls) -> _AutoSentinel: - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __repr__(self) -> str: - return "" - - -AUTO: Final[_AutoSentinel] = _AutoSentinel() - - -def resolve_user_id( - value: str | None | _AutoSentinel, - *, - method_name: str = "repository method", -) -> str | None: - """Resolve the user_id parameter passed to a repository method. - - Three-state semantics: - - - :data:`AUTO` (default): read from contextvar; raise - :class:`RuntimeError` if no user is in context. This is the - common case for request-scoped calls. - - Explicit ``str``: use the provided id verbatim, overriding any - contextvar value. Useful for tests and admin-override flows. - - Explicit ``None``: no filter — the repository should skip the - user_id WHERE clause entirely. Reserved for migration scripts - and CLI tools that intentionally bypass isolation. - """ - if isinstance(value, _AutoSentinel): - user = _current_user.get() - if user is None: - raise RuntimeError(f"{method_name} called with user_id=AUTO but no user context is set; pass an explicit user_id, set the contextvar via auth middleware, or opt out with user_id=None for migration/CLI paths.") - # Coerce to ``str`` at the boundary: ``User.id`` is typed as - # ``UUID`` for the API surface, but the persistence layer - # stores ``user_id`` as ``String(64)`` and aiosqlite cannot - # bind a raw UUID object to a VARCHAR column ("type 'UUID' is - # not supported"). Honour the documented return type here - # rather than ripple a type change through every caller. - return str(user.id) - return value