"""Subagent execution engine.""" import asyncio import atexit import logging import threading import uuid from collections.abc import Callable, Coroutine from concurrent.futures import Future, ThreadPoolExecutor from concurrent.futures import TimeoutError as FuturesTimeoutError from contextvars import Context, copy_context from dataclasses import dataclass, field from datetime import datetime from enum import Enum from typing import Any from langchain.agents import create_agent from langchain.tools import BaseTool from langchain_core.messages import AIMessage, HumanMessage, SystemMessage from langchain_core.runnables import RunnableConfig from deerflow.agents.thread_state import SandboxState, ThreadDataState, ThreadState from deerflow.config import get_app_config from deerflow.config.app_config import AppConfig from deerflow.models import create_chat_model from deerflow.skills.tool_policy import filter_tools_by_skill_allowed_tools from deerflow.skills.types import Skill from deerflow.subagents.config import SubagentConfig, resolve_subagent_model_name logger = logging.getLogger(__name__) _previous_shutdown_isolated_subagent_loop = globals().get("_shutdown_isolated_subagent_loop") if callable(_previous_shutdown_isolated_subagent_loop): atexit.unregister(_previous_shutdown_isolated_subagent_loop) _previous_shutdown_isolated_subagent_loop() class SubagentStatus(Enum): """Status of a subagent execution.""" PENDING = "pending" RUNNING = "running" COMPLETED = "completed" FAILED = "failed" CANCELLED = "cancelled" TIMED_OUT = "timed_out" @dataclass class SubagentResult: """Result of a subagent execution. Attributes: task_id: Unique identifier for this execution. trace_id: Trace ID for distributed tracing (links parent and subagent logs). status: Current status of the execution. result: The final result message (if completed). error: Error message (if failed). started_at: When execution started. completed_at: When execution completed. 
ai_messages: List of complete AI messages (as dicts) generated during execution. """ task_id: str trace_id: str status: SubagentStatus result: str | None = None error: str | None = None started_at: datetime | None = None completed_at: datetime | None = None ai_messages: list[dict[str, Any]] | None = None cancel_event: threading.Event = field(default_factory=threading.Event, repr=False) def __post_init__(self): """Initialize mutable defaults.""" if self.ai_messages is None: self.ai_messages = [] # Global storage for background task results _background_tasks: dict[str, SubagentResult] = {} _background_tasks_lock = threading.Lock() # Thread pool for background task scheduling and orchestration _scheduler_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-scheduler-") # Persistent event loop for isolated subagent executions triggered from an # already-running parent loop. Reusing one long-lived loop avoids creating a # fresh loop per execution and then closing async resources bound to it. 
_isolated_subagent_loop: asyncio.AbstractEventLoop | None = None
_isolated_subagent_loop_thread: threading.Thread | None = None
_isolated_subagent_loop_started: threading.Event | None = None
_isolated_subagent_loop_lock = threading.Lock()


def _run_isolated_subagent_loop(
    loop: asyncio.AbstractEventLoop,
    started_event: threading.Event,
) -> None:
    """Run the persistent isolated subagent loop in a dedicated daemon thread.

    Args:
        loop: The event loop to install on this thread and run forever.
        started_event: Set from inside the loop once it is actually running,
            and cleared again when ``run_forever`` returns.
    """
    asyncio.set_event_loop(loop)
    # Scheduled rather than set directly so the event only fires once the loop
    # is really processing callbacks.
    loop.call_soon(started_event.set)
    try:
        loop.run_forever()
    finally:
        started_event.clear()


def _shutdown_isolated_subagent_loop() -> None:
    """Stop and close the persistent isolated subagent loop.

    Safe to call when no loop was ever started. The loop is only closed when
    both the loop and its thread have actually stopped; otherwise a warning is
    logged and the loop is left open.
    """
    global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started

    # Detach the globals under the lock, then do the (potentially slow)
    # stop/join work without holding it.
    with _isolated_subagent_loop_lock:
        loop = _isolated_subagent_loop
        thread = _isolated_subagent_loop_thread
        _isolated_subagent_loop = None
        _isolated_subagent_loop_thread = None
        _isolated_subagent_loop_started = None

    if loop is None:
        return

    if loop.is_running():
        loop.call_soon_threadsafe(loop.stop)

    # Never join the current thread (shutdown may be invoked from the loop thread).
    if thread is not None and thread.is_alive() and thread is not threading.current_thread():
        thread.join(timeout=1)

    thread_stopped = thread is None or not thread.is_alive()
    loop_stopped = not loop.is_running()

    if not loop.is_closed():
        if thread_stopped and loop_stopped:
            loop.close()
        else:
            logger.warning(
                "Skipping close of isolated subagent loop because shutdown did not complete within timeout (thread_alive=%s, loop_running=%s)",
                thread is not None and thread.is_alive(),
                loop.is_running(),
            )


atexit.register(_shutdown_isolated_subagent_loop)


def _get_isolated_subagent_loop() -> asyncio.AbstractEventLoop:
    """Return the persistent event loop used by isolated subagent executions.

    The loop and its daemon thread are created lazily, and re-created when the
    existing loop is closed, not running, or its thread has died.

    Returns:
        A running event loop owned by the ``subagent-persistent-loop`` thread.

    Raises:
        RuntimeError: If the loop does not start within 5 seconds, or the loop
            is unexpectedly uninitialized.
    """
    global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started

    with _isolated_subagent_loop_lock:
        thread_is_alive = _isolated_subagent_loop_thread is not None and _isolated_subagent_loop_thread.is_alive()
        loop_is_usable = (
            _isolated_subagent_loop is not None
            and not _isolated_subagent_loop.is_closed()
            and _isolated_subagent_loop.is_running()
            and thread_is_alive
        )

        if not loop_is_usable:
            loop = asyncio.new_event_loop()
            started_event = threading.Event()
            thread = threading.Thread(
                target=_run_isolated_subagent_loop,
                args=(loop, started_event),
                name="subagent-persistent-loop",
                daemon=True,
            )
            thread.start()
            if not started_event.wait(timeout=5):
                loop.call_soon_threadsafe(loop.stop)
                thread.join(timeout=1)
                # Closing a loop that is still running raises RuntimeError and
                # would mask the descriptive error below, so only close once
                # the thread has really stopped.
                if not thread.is_alive() and not loop.is_running():
                    loop.close()
                raise RuntimeError("Timed out starting isolated subagent event loop")

            _isolated_subagent_loop = loop
            _isolated_subagent_loop_thread = thread
            _isolated_subagent_loop_started = started_event

        if _isolated_subagent_loop is None:
            raise RuntimeError("Isolated subagent event loop is not initialized")

        return _isolated_subagent_loop


def _submit_to_isolated_loop_in_context(
    context: Context,
    coro_factory: Callable[[], Coroutine[Any, Any, SubagentResult]],
) -> Future[SubagentResult]:
    """Submit a coroutine to the isolated loop while preserving ContextVar state.

    Args:
        context: Context (typically ``copy_context()`` of the caller) in which
            the coroutine is created and scheduled.
        coro_factory: Zero-argument callable producing the coroutine to run.

    Returns:
        A ``concurrent.futures.Future`` resolving to the coroutine's result.

    Raises:
        RuntimeError: If the isolated loop cannot be obtained.
    """

    def _submit() -> Future[SubagentResult]:
        # Acquire the loop first: if that fails, no coroutine object has been
        # created yet, so nothing is left un-awaited.
        loop = _get_isolated_subagent_loop()
        return asyncio.run_coroutine_threadsafe(coro_factory(), loop)

    return context.run(_submit)


def _filter_tools(
    all_tools: list[BaseTool],
    allowed: list[str] | None,
    disallowed: list[str] | None,
) -> list[BaseTool]:
    """Filter tools based on subagent configuration.

    Args:
        all_tools: List of all available tools.
        allowed: Optional allowlist of tool names. If provided, only these tools are included.
        disallowed: Optional denylist of tool names. These tools are always excluded.

    Returns:
        Filtered list of tools. When neither list is given, ``all_tools``
        itself is returned (not a copy).
    """
    filtered = all_tools

    # Apply allowlist if specified
    if allowed is not None:
        allowed_set = set(allowed)
        filtered = [t for t in filtered if t.name in allowed_set]

    # Apply denylist
    if disallowed is not None:
        disallowed_set = set(disallowed)
        filtered = [t for t in filtered if t.name not in disallowed_set]

    return filtered


def _content_to_text(content: Any) -> str:
    """Flatten message content into plain text for the final result.

    Args:
        content: Message content: a string, a list of content blocks
            (strings and/or dicts with an optional ``"text"`` key), or any
            other object.

    Returns:
        The string unchanged; for a list, the text of its blocks joined with
        newlines (adjacent raw string chunks are concatenated directly), or
        ``"No text content in response"`` when no text is found; ``str()`` of
        anything else.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    text_parts: list[str] = []
    pending_str_parts: list[str] = []
    for block in content:
        if isinstance(block, str):
            # Raw string chunks are fragments of one text; glue them together.
            pending_str_parts.append(block)
        elif isinstance(block, dict):
            if pending_str_parts:
                text_parts.append("".join(pending_str_parts))
                pending_str_parts.clear()
            text_val = block.get("text")
            if isinstance(text_val, str):
                text_parts.append(text_val)
    if pending_str_parts:
        text_parts.append("".join(pending_str_parts))

    return "\n".join(text_parts) if text_parts else "No text content in response"


class SubagentExecutor:
    """Executor for running subagents."""

    def __init__(
        self,
        config: SubagentConfig,
        tools: list[BaseTool],
        app_config: AppConfig | None = None,
        parent_model: str | None = None,
        sandbox_state: SandboxState | None = None,
        thread_data: ThreadDataState | None = None,
        thread_id: str | None = None,
        trace_id: str | None = None,
    ):
        """Initialize the executor.

        Args:
            config: Subagent configuration.
            tools: List of all available tools (will be filtered).
            app_config: Resolved AppConfig. When None, ``_create_agent`` falls back to
                ``get_app_config()`` (matches the lead-agent factory's pattern).
            parent_model: The parent agent's model name for inheritance.
            sandbox_state: Sandbox state from parent agent.
            thread_data: Thread data from parent agent.
            thread_id: Thread ID for sandbox operations.
            trace_id: Trace ID from parent for distributed tracing.
        """
        self.config = config
        self.app_config = app_config
        self.parent_model = parent_model
        # Resolve eagerly only when it does not require loading config.yaml; otherwise defer
        # to _create_agent (which already loads app_config) so unit tests can construct
        # executors without a config file present.
        if config.model != "inherit" or parent_model is not None or app_config is not None:
            self.model_name: str | None = resolve_subagent_model_name(config, parent_model, app_config=app_config)
        else:
            self.model_name = None
        self.sandbox_state = sandbox_state
        self.thread_data = thread_data
        self.thread_id = thread_id
        # Generate trace_id if not provided (for top-level calls)
        self.trace_id = trace_id or str(uuid.uuid4())[:8]

        self._base_tools = _filter_tools(
            tools,
            config.tools,
            config.disallowed_tools,
        )
        self.tools = self._base_tools

        logger.info(f"[trace={self.trace_id}] SubagentExecutor initialized: {config.name} with {len(self.tools)} tools")

    def _create_agent(self, tools: list[BaseTool] | None = None):
        """Create the agent instance.

        Args:
            tools: Tools to give the agent; defaults to ``self.tools``.

        Returns:
            The agent built by ``create_agent``.
        """
        app_config = self.app_config or get_app_config()
        if self.model_name is None:
            # Deferred resolution (see __init__).
            self.model_name = resolve_subagent_model_name(self.config, self.parent_model, app_config=app_config)
        model = create_chat_model(name=self.model_name, thinking_enabled=False, app_config=app_config)

        from deerflow.agents.middlewares.tool_error_handling_middleware import build_subagent_runtime_middlewares

        # Reuse shared middleware composition with lead agent.
        middlewares = build_subagent_runtime_middlewares(app_config=app_config, model_name=self.model_name, lazy_init=True)

        return create_agent(
            model=model,
            tools=tools if tools is not None else self.tools,
            middleware=middlewares,
            system_prompt=self.config.system_prompt,
            state_schema=ThreadState,
        )

    async def _load_skills(self) -> list[Skill]:
        """Load enabled skill metadata based on config.skills.

        Returns:
            The enabled skills, restricted to ``config.skills`` when that
            whitelist is set; an empty list when the whitelist is ``[]`` or no
            enabled skills exist.

        Raises:
            Exception: Any error from skill storage is logged and re-raised.
        """
        if self.config.skills is not None and len(self.config.skills) == 0:
            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} skills=[] — skipping skill loading")
            return []

        try:
            from deerflow.skills.storage import get_or_new_skill_storage

            storage_kwargs = {"app_config": self.app_config} if self.app_config is not None else {}
            storage = await asyncio.to_thread(get_or_new_skill_storage, **storage_kwargs)
            # Use asyncio.to_thread to avoid blocking the event loop (LangGraph ASGI requirement)
            all_skills = await asyncio.to_thread(storage.load_skills, enabled_only=True)
            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} loaded {len(all_skills)} enabled skills from disk")
        except Exception:
            logger.exception(f"[trace={self.trace_id}] Failed to load skills for subagent {self.config.name}")
            raise

        if not all_skills:
            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} no enabled skills found")
            return []

        # Filter by config.skills whitelist
        if self.config.skills is not None:
            allowed = set(self.config.skills)
            return [s for s in all_skills if s.name in allowed]
        return all_skills

    def _apply_skill_allowed_tools(self, skills: list[Skill]) -> list[BaseTool]:
        """Return the base tools narrowed by the loaded skills' allowed-tools policy."""
        return filter_tools_by_skill_allowed_tools(self._base_tools, skills)

    async def _load_skill_messages(self, skills: list[Skill]) -> list[SystemMessage]:
        """Load skill content as conversation items based on config.skills.

        Aligned with Codex's pattern: each subagent loads its own skills
        per-session and injects them as conversation items (developer messages),
        not as system prompt text. The config.skills whitelist controls which
        skills are loaded:
        - None: load all enabled skills
        - []: no skills
        - ["skill-a", "skill-b"]: only these skills

        Args:
            skills: Skills whose ``skill_file`` content should be injected.

        Returns:
            List of SystemMessages containing skill content. Skills whose file
            is empty or cannot be read are skipped (best effort).
        """
        if not skills:
            return []

        # Read each skill's SKILL.md content and create conversation items
        messages = []
        for skill in skills:
            try:
                content = await asyncio.to_thread(skill.skill_file.read_text, encoding="utf-8")
                content = content.strip()
                if content:
                    messages.append(SystemMessage(content=f'\n{content}\n'))
                    logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} loaded skill: {skill.name}")
            except Exception:
                # Deliberate best effort: one unreadable skill must not abort the run.
                logger.debug(f"[trace={self.trace_id}] Failed to read skill {skill.name}", exc_info=True)
        return messages

    async def _build_initial_state(self, task: str) -> tuple[dict[str, Any], list[BaseTool]]:
        """Build the initial state for agent execution.

        Args:
            task: The task description.

        Returns:
            Initial state dictionary and tools filtered by loaded skill metadata.
        """
        # Load skills as conversation items (Codex pattern)
        skills = await self._load_skills()
        filtered_tools = self._apply_skill_allowed_tools(skills)
        skill_messages = await self._load_skill_messages(skills)

        messages: list[Any] = []
        # Skill content injected as developer/system messages before the task
        messages.extend(skill_messages)
        # Then the actual task
        messages.append(HumanMessage(content=task))

        state: dict[str, Any] = {
            "messages": messages,
        }

        # Pass through sandbox and thread data from parent
        if self.sandbox_state is not None:
            state["sandbox"] = self.sandbox_state
        if self.thread_data is not None:
            state["thread_data"] = self.thread_data

        return state, filtered_tools

    @staticmethod
    def _mark_cancelled(result: SubagentResult) -> None:
        """Record cooperative cancellation on ``result``.

        The status is only changed while it is still RUNNING, so a terminal
        status written by another thread (e.g. TIMED_OUT) is preserved.
        """
        with _background_tasks_lock:
            if result.status == SubagentStatus.RUNNING:
                result.status = SubagentStatus.CANCELLED
                result.error = "Cancelled by user"
                result.completed_at = datetime.now()

    async def _aexecute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
        """Execute a task asynchronously.

        Args:
            task: The task description for the subagent.
            result_holder: Optional pre-created result object to update during execution.

        Returns:
            SubagentResult with the execution result. Failures are captured in
            the result (status FAILED) rather than raised.
        """
        if result_holder is not None:
            # Use the provided result holder (for async execution with real-time updates)
            result = result_holder
        else:
            # Create a new result for synchronous execution
            result = SubagentResult(
                task_id=str(uuid.uuid4())[:8],
                trace_id=self.trace_id,
                status=SubagentStatus.RUNNING,
                started_at=datetime.now(),
            )

        ai_messages = result.ai_messages
        if ai_messages is None:
            ai_messages = []
            result.ai_messages = ai_messages

        try:
            state, filtered_tools = await self._build_initial_state(task)
            agent = self._create_agent(filtered_tools)

            # Build config with thread_id for sandbox access and recursion limit
            run_config: RunnableConfig = {
                "recursion_limit": self.config.max_turns,
            }
            context: dict[str, Any] = {}
            if self.thread_id:
                run_config["configurable"] = {"thread_id": self.thread_id}
                context["thread_id"] = self.thread_id
            if self.app_config is not None:
                context["app_config"] = self.app_config

            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution with max_turns={self.config.max_turns}")

            # Use stream instead of invoke to get real-time updates
            # This allows us to collect AI messages as they are generated
            final_state = None

            # Pre-check: bail out immediately if already cancelled before streaming starts
            if result.cancel_event.is_set():
                logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} cancelled before streaming")
                self._mark_cancelled(result)
                return result

            async for chunk in agent.astream(state, config=run_config, context=context, stream_mode="values"):  # type: ignore[arg-type]
                # Cooperative cancellation: check if parent requested stop.
                # Note: cancellation is only detected at astream iteration boundaries,
                # so long-running tool calls within a single iteration will not be
                # interrupted until the next chunk is yielded.
                if result.cancel_event.is_set():
                    logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} cancelled by parent")
                    self._mark_cancelled(result)
                    return result

                final_state = chunk

                # Extract AI messages from the current state
                messages = chunk.get("messages", [])
                if not messages:
                    continue
                last_message = messages[-1]
                # Check if this is a new AI message
                if not isinstance(last_message, AIMessage):
                    continue

                # Convert message to dict for serialization
                message_dict = last_message.model_dump()
                # Only add if it's not already in the list (avoid duplicates)
                # Check by comparing message IDs if available, otherwise compare full dict
                message_id = message_dict.get("id")
                if message_id:
                    is_duplicate = any(msg.get("id") == message_id for msg in ai_messages)
                else:
                    is_duplicate = message_dict in ai_messages
                if not is_duplicate:
                    ai_messages.append(message_dict)
                    logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} captured AI message #{len(ai_messages)}")

            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} completed async execution")

            if final_state is None:
                logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no final state")
                result.result = "No response generated"
            else:
                # Extract the final message - find the last AIMessage
                messages = final_state.get("messages", [])
                logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} final messages count: {len(messages)}")

                # Find the last AIMessage in the conversation
                last_ai_message = next((msg for msg in reversed(messages) if isinstance(msg, AIMessage)), None)

                if last_ai_message is not None:
                    result.result = _content_to_text(last_ai_message.content)
                elif messages:
                    # Fallback: use the last message if no AIMessage found
                    last_message = messages[-1]
                    logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no AIMessage found, using last message: {type(last_message)}")
                    raw_content = last_message.content if hasattr(last_message, "content") else str(last_message)
                    result.result = _content_to_text(raw_content)
                else:
                    logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no messages in final state")
                    result.result = "No response generated"

            # A timeout recorded by the waiting thread is terminal; do not
            # resurrect the task as COMPLETED if we finished just after it.
            with _background_tasks_lock:
                if result.status != SubagentStatus.TIMED_OUT:
                    result.status = SubagentStatus.COMPLETED
                    result.completed_at = datetime.now()

        except Exception as e:
            logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} async execution failed")
            # Same rule as above: keep an already-recorded timeout.
            with _background_tasks_lock:
                if result.status != SubagentStatus.TIMED_OUT:
                    result.status = SubagentStatus.FAILED
                    result.error = str(e)
                    result.completed_at = datetime.now()

        return result

    def _execute_in_isolated_loop(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
        """Execute the subagent on the persistent isolated event loop.

        This method is used by the sync ``execute()`` path when the caller is
        already running inside an event loop. Because ``execute()`` is a sync API,
        this path blocks the caller while the actual coroutine runs on the
        long-lived isolated loop. Reusing that loop keeps shared async clients from
        being tied to a short-lived loop that gets closed per execution.

        Args:
            task: The task description for the subagent.
            result_holder: Optional pre-created result object to update during execution.

        Returns:
            SubagentResult produced by ``_aexecute``.

        Raises:
            concurrent.futures.TimeoutError: If the execution exceeds
                ``config.timeout_seconds``; cancellation is requested first.
            Exception: Any failure to submit to, or run on, the isolated loop.
        """
        future: Future[SubagentResult] | None = None
        parent_context = copy_context()

        try:
            future = _submit_to_isolated_loop_in_context(
                parent_context,
                lambda: self._aexecute(task, result_holder),
            )
            return future.result(timeout=self.config.timeout_seconds)
        except FuturesTimeoutError:
            # Ask the coroutine to stop cooperatively and cancel its task.
            if result_holder is not None:
                result_holder.cancel_event.set()
            if future is not None:
                future.cancel()
            raise
        except Exception:
            if future is None:
                logger.debug(
                    f"[trace={self.trace_id}] Failed to submit subagent {self.config.name} to the isolated event loop",
                    exc_info=True,
                )
            else:
                logger.debug(
                    f"[trace={self.trace_id}] Subagent {self.config.name} failed while executing on the isolated event loop",
                    exc_info=True,
                )
            raise

    def execute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
        """Execute a task synchronously (wrapper around async execution).

        This method runs the async execution in a new event loop, allowing
        asynchronous tools (like MCP tools) to be used within the thread pool.

        When called from within an already-running event loop (e.g., when the
        parent agent is async), this method synchronously waits on the persistent
        isolated loop to avoid event loop conflicts with shared async primitives
        like httpx clients.

        Args:
            task: The task description for the subagent.
            result_holder: Optional pre-created result object to update during execution.

        Returns:
            SubagentResult with the execution result. Errors never propagate:
            a wait timeout yields status TIMED_OUT, any other failure FAILED.
        """
        try:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = None

            if loop is not None and loop.is_running():
                logger.debug(f"[trace={self.trace_id}] Subagent {self.config.name} detected running event loop, using isolated loop")
                return self._execute_in_isolated_loop(task, result_holder)

            # Standard path: no running event loop, use asyncio.run
            return asyncio.run(self._aexecute(task, result_holder))
        except FuturesTimeoutError:
            # str(TimeoutError()) is empty, so falling through to the generic
            # handler would report FAILED with a blank error. Report the
            # timeout explicitly, mirroring execute_async.
            logger.error(f"[trace={self.trace_id}] Subagent {self.config.name} execution timed out after {self.config.timeout_seconds}s")
            if result_holder is not None:
                result = result_holder
            else:
                result = SubagentResult(
                    task_id=str(uuid.uuid4())[:8],
                    trace_id=self.trace_id,
                    status=SubagentStatus.TIMED_OUT,
                )
            result.status = SubagentStatus.TIMED_OUT
            result.error = f"Execution timed out after {self.config.timeout_seconds} seconds"
            result.completed_at = datetime.now()
            return result
        except Exception as e:
            logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} execution failed")
            # Create a result with error if we don't have one
            if result_holder is not None:
                result = result_holder
            else:
                result = SubagentResult(
                    task_id=str(uuid.uuid4())[:8],
                    trace_id=self.trace_id,
                    status=SubagentStatus.FAILED,
                )
            result.status = SubagentStatus.FAILED
            result.error = str(e)
            result.completed_at = datetime.now()
            return result

    def execute_async(self, task: str, task_id: str | None = None) -> str:
        """Start a task execution in the background.

        Args:
            task: The task description for the subagent.
            task_id: Optional task ID to use. If not provided, a random UUID will be generated.

        Returns:
            Task ID that can be used to check status later.
        """
        # Use provided task_id or generate a new one
        if task_id is None:
            task_id = str(uuid.uuid4())[:8]

        # Create initial pending result
        result = SubagentResult(
            task_id=task_id,
            trace_id=self.trace_id,
            status=SubagentStatus.PENDING,
        )

        logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}, timeout={self.config.timeout_seconds}s")

        with _background_tasks_lock:
            _background_tasks[task_id] = result

        parent_context = copy_context()

        # Submit to scheduler pool
        def run_task() -> None:
            # All updates go through the captured ``result`` object instead of
            # re-reading ``_background_tasks[task_id]``: ``_aexecute`` publishes
            # a terminal status on this same object, so a poller may already
            # have cleaned the registry entry up by the time we write back.
            with _background_tasks_lock:
                result.status = SubagentStatus.RUNNING
                result.started_at = datetime.now()

            try:
                # Submit execution directly to the persistent isolated loop so the
                # background path does not create a temporary loop via execute().
                execution_future = _submit_to_isolated_loop_in_context(
                    parent_context,
                    lambda: self._aexecute(task, result),
                )
                try:
                    # Wait for execution with timeout
                    exec_result = execution_future.result(timeout=self.config.timeout_seconds)
                    with _background_tasks_lock:
                        result.status = exec_result.status
                        result.result = exec_result.result
                        result.error = exec_result.error
                        result.completed_at = datetime.now()
                        result.ai_messages = exec_result.ai_messages
                except FuturesTimeoutError:
                    logger.error(f"[trace={self.trace_id}] Subagent {self.config.name} execution timed out after {self.config.timeout_seconds}s")
                    with _background_tasks_lock:
                        if result.status == SubagentStatus.RUNNING:
                            result.status = SubagentStatus.TIMED_OUT
                            result.error = f"Execution timed out after {self.config.timeout_seconds} seconds"
                            result.completed_at = datetime.now()
                    # Signal cooperative cancellation and cancel the future
                    result.cancel_event.set()
                    execution_future.cancel()
            except Exception as e:
                logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} async execution failed")
                with _background_tasks_lock:
                    result.status = SubagentStatus.FAILED
                    result.error = str(e)
                    result.completed_at = datetime.now()

        _scheduler_pool.submit(run_task)
        return task_id


MAX_CONCURRENT_SUBAGENTS = 3


def request_cancel_background_task(task_id: str) -> None:
    """Signal a running background task to stop.

    Sets the cancel_event on the task, which is checked cooperatively
    by ``_aexecute`` during ``agent.astream()`` iteration. This allows
    subagent threads — which cannot be force-killed via ``Future.cancel()``
    — to stop at the next iteration boundary.

    Unknown task IDs are ignored.

    Args:
        task_id: The task ID to cancel.
    """
    with _background_tasks_lock:
        result = _background_tasks.get(task_id)
        if result is not None:
            result.cancel_event.set()
            logger.info("Requested cancellation for background task %s", task_id)


def get_background_task_result(task_id: str) -> SubagentResult | None:
    """Get the result of a background task.

    Args:
        task_id: The task ID returned by execute_async.

    Returns:
        SubagentResult if found, None otherwise. The returned object is the
        live registry entry, not a copy.
    """
    with _background_tasks_lock:
        return _background_tasks.get(task_id)


def list_background_tasks() -> list[SubagentResult]:
    """List all background tasks.

    Returns:
        List of all SubagentResult instances (a snapshot of the registry).
    """
    with _background_tasks_lock:
        return list(_background_tasks.values())


def cleanup_background_task(task_id: str) -> None:
    """Remove a completed task from background tasks.

    Should be called by task_tool after it finishes polling and returns the result.
    This prevents memory leaks from accumulated completed tasks.

    Only removes tasks that are in a terminal state
    (COMPLETED/FAILED/CANCELLED/TIMED_OUT) or that already have a
    ``completed_at`` timestamp, to avoid race conditions with the background
    executor still updating the task entry. Unknown task IDs are ignored.

    Args:
        task_id: The task ID to remove.
    """
    with _background_tasks_lock:
        result = _background_tasks.get(task_id)
        if result is None:
            # Nothing to clean up; may have been removed already.
            logger.debug("Requested cleanup for unknown background task %s", task_id)
            return

        # Only clean up tasks that are in a terminal state to avoid races with
        # the background executor still updating the task entry.
        is_terminal_status = result.status in {
            SubagentStatus.COMPLETED,
            SubagentStatus.FAILED,
            SubagentStatus.CANCELLED,
            SubagentStatus.TIMED_OUT,
        }
        if is_terminal_status or result.completed_at is not None:
            del _background_tasks[task_id]
            logger.debug("Cleaned up background task: %s", task_id)
        else:
            logger.debug(
                "Skipping cleanup for non-terminal background task %s (status=%s)",
                task_id,
                result.status.value if hasattr(result.status, "value") else result.status,
            )