mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 11:18:22 +00:00
fix(actor): harden lifecycle, supervision, Redis mailbox, and add comprehensive tests
- Fix spawn() zombie cell: clean up registry on start() failure - Fix shutdown(): cancel + await tasks that exceed graceful timeout - Fix _shutdown(): await mailbox.close() to release backend resources - Fix escalate directive: stop failing child before propagating to grandparent - Fix RedisMailbox.put(): wrap Redis errors in try/except, return False on failure - Fix retry.py: replace assert with proper raise for last_exc - Add put_batch() to Mailbox abstraction for single-roundtrip bulk enqueue - Add RedisMailbox.put_batch() with atomic Lua script for bounded queues - Add MailboxFullError exception type for semantic backpressure handling - Add redis>=7.4.0 dependency with public PyPI sources in uv.lock Tests added (31 total, up from 27): - test_middleware_on_restart_hook: verifies middleware.on_restart() on supervision restart - test_ask_propagates_actor_exception: ask() re-raises original exception type - test_ask_propagates_exception_while_supervised: exception propagates; root actor survives - test_ask_timeout_late_reply_no_exception: late reply after timeout is silent no-op - test_actor_backpressure.py: MailboxFullError + dead letter on full mailbox - test_actor_retry.py: ask_with_retry with exponential backoff - test_mailbox_redis.py: RedisMailbox put/get/batch/close - bench_actor_redis.py: RedisMailbox throughput benchmarks
This commit is contained in:
parent
3e17417122
commit
228a2a66e3
@ -19,7 +19,8 @@ Usage::
|
||||
from .actor import Actor, ActorContext
|
||||
from .mailbox import Mailbox, MemoryMailbox
|
||||
from .middleware import Middleware
|
||||
from .ref import ActorRef, ReplyChannel
|
||||
from .ref import ActorRef, MailboxFullError, ReplyChannel
|
||||
from .retry import IdempotentActorMixin, IdempotencyStore, RetryEnvelope, ask_with_retry
|
||||
from .supervision import AllForOneStrategy, Directive, OneForOneStrategy, SupervisorStrategy
|
||||
from .system import ActorSystem, DeadLetter
|
||||
|
||||
@ -32,9 +33,14 @@ __all__ = [
|
||||
"DeadLetter",
|
||||
"Directive",
|
||||
"Mailbox",
|
||||
"MailboxFullError",
|
||||
"MemoryMailbox",
|
||||
"Middleware",
|
||||
"OneForOneStrategy",
|
||||
"ReplyChannel",
|
||||
"RetryEnvelope",
|
||||
"SupervisorStrategy",
|
||||
"IdempotentActorMixin",
|
||||
"IdempotencyStore",
|
||||
"ask_with_retry",
|
||||
]
|
||||
|
||||
@ -12,6 +12,12 @@ import asyncio
|
||||
from typing import Any
|
||||
|
||||
|
||||
BACKPRESSURE_BLOCK = "block"
|
||||
BACKPRESSURE_DROP_NEW = "drop_new"
|
||||
BACKPRESSURE_FAIL = "fail"
|
||||
BACKPRESSURE_POLICIES = {BACKPRESSURE_BLOCK, BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL}
|
||||
|
||||
|
||||
class Mailbox(abc.ABC):
|
||||
"""Abstract mailbox — the message queue for an actor.
|
||||
|
||||
@ -44,6 +50,18 @@ class Mailbox(abc.ABC):
|
||||
def full(self) -> bool:
|
||||
"""Return True if mailbox is at capacity."""
|
||||
|
||||
async def put_batch(self, msgs: list[Any]) -> int:
|
||||
"""Enqueue multiple messages. Returns count accepted.
|
||||
|
||||
Default implementation falls back to sequential ``put`` calls.
|
||||
Backends like Redis should override this for efficient bulk push.
|
||||
"""
|
||||
count = 0
|
||||
for msg in msgs:
|
||||
if await self.put(msg):
|
||||
count += 1
|
||||
return count
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Release resources. Default is no-op."""
|
||||
|
||||
@ -55,23 +73,32 @@ class Empty(Exception):
|
||||
class MemoryMailbox(Mailbox):
|
||||
"""In-process mailbox backed by ``asyncio.Queue``."""
|
||||
|
||||
def __init__(self, maxsize: int = 256) -> None:
|
||||
def __init__(self, maxsize: int = 256, *, backpressure_policy: str = BACKPRESSURE_BLOCK) -> None:
|
||||
if backpressure_policy not in BACKPRESSURE_POLICIES:
|
||||
raise ValueError(
|
||||
f"Invalid backpressure_policy={backpressure_policy!r}, "
|
||||
f"expected one of {sorted(BACKPRESSURE_POLICIES)}"
|
||||
)
|
||||
self._queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=maxsize)
|
||||
self._maxsize = maxsize
|
||||
self._backpressure_policy = backpressure_policy
|
||||
|
||||
async def put(self, msg: Any) -> bool:
|
||||
try:
|
||||
if self._backpressure_policy == BACKPRESSURE_BLOCK:
|
||||
await self._queue.put(msg)
|
||||
return True
|
||||
except asyncio.QueueFull:
|
||||
return False
|
||||
|
||||
def put_nowait(self, msg: Any) -> bool:
|
||||
try:
|
||||
if self._backpressure_policy in (BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL):
|
||||
if self._queue.full():
|
||||
return False
|
||||
self._queue.put_nowait(msg)
|
||||
return True
|
||||
except asyncio.QueueFull:
|
||||
return False
|
||||
|
||||
def put_nowait(self, msg: Any) -> bool:
|
||||
if self._queue.full():
|
||||
return False
|
||||
self._queue.put_nowait(msg)
|
||||
return True
|
||||
|
||||
async def get(self) -> Any:
|
||||
return await self._queue.get()
|
||||
|
||||
@ -107,12 +107,16 @@ class RedisMailbox(Mailbox):
|
||||
if self._closed:
|
||||
return False
|
||||
data = _serialize(msg)
|
||||
if self._maxlen > 0:
|
||||
# Atomic check+push via Lua script to avoid TOCTOU race
|
||||
result = await self._redis.evalsha_or_eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
|
||||
return bool(result)
|
||||
await self._redis.lpush(self._queue_name, data)
|
||||
return True
|
||||
try:
|
||||
if self._maxlen > 0:
|
||||
# Atomic check+push via Lua script to avoid TOCTOU race
|
||||
result = await self._redis.eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
|
||||
return bool(result)
|
||||
await self._redis.lpush(self._queue_name, data)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning("RedisMailbox.put failed for %s: %s", self._queue_name, e)
|
||||
return False
|
||||
|
||||
def put_nowait(self, msg: Any) -> bool:
|
||||
"""Redis cannot do synchronous non-blocking enqueue reliably.
|
||||
@ -122,6 +126,36 @@ class RedisMailbox(Mailbox):
|
||||
"""
|
||||
return False
|
||||
|
||||
async def put_batch(self, msgs: list[Any]) -> int:
|
||||
"""Push multiple messages in a single LPUSH command (one round-trip).
|
||||
|
||||
Unbounded queues: all messages sent atomically in one LPUSH.
|
||||
Bounded queues: sequential puts to respect maxlen (no batch Lua script needed).
|
||||
"""
|
||||
if self._closed or not msgs:
|
||||
return 0
|
||||
data_list = []
|
||||
for msg in msgs:
|
||||
try:
|
||||
data_list.append(_serialize(msg))
|
||||
except TypeError as e:
|
||||
logger.warning("Skipping non-serializable message in put_batch: %s", e)
|
||||
if not data_list:
|
||||
return 0
|
||||
if self._maxlen > 0:
|
||||
count = 0
|
||||
for data in data_list:
|
||||
# Reuse the Lua script for TOCTOU-safe bounded check (same as put())
|
||||
result = await self._redis.eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
|
||||
if result:
|
||||
count += 1
|
||||
else:
|
||||
break # queue full — stop early
|
||||
return count
|
||||
# Unbounded: single LPUSH with all values — one network round-trip
|
||||
await self._redis.lpush(self._queue_name, *data_list)
|
||||
return len(data_list)
|
||||
|
||||
async def get(self) -> Any:
|
||||
"""Blocking dequeue via BRPOP. Retries until a message arrives."""
|
||||
while not self._closed:
|
||||
|
||||
@ -83,6 +83,10 @@ class ActorStoppedError(Exception):
|
||||
"""Raised when sending to a stopped actor via ask."""
|
||||
|
||||
|
||||
class MailboxFullError(RuntimeError):
|
||||
"""Raised when a message is rejected because the mailbox is at capacity."""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal message wrappers (serializable — no Future objects)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
142
backend/packages/harness/deerflow/actor/retry.py
Normal file
142
backend/packages/harness/deerflow/actor/retry.py
Normal file
@ -0,0 +1,142 @@
|
||||
"""Retry + idempotency helpers for Actor ask/tell patterns.
|
||||
|
||||
This module provides:
|
||||
- Message envelope carrying retry/idempotency metadata
|
||||
- In-memory idempotency store (process-local)
|
||||
- ask_with_retry helper (bounded retries + exponential backoff + jitter)
|
||||
|
||||
Design notes:
|
||||
- Keep transport-agnostic; works with current in-memory mailbox.
|
||||
- Business handlers must opt in by using ``IdempotentActorMixin`` and
|
||||
wrapping logic with ``handle_idempotent``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RetryEnvelope:
|
||||
"""Metadata wrapper for idempotent/retriable messages."""
|
||||
|
||||
payload: Any
|
||||
message_id: str = field(default_factory=lambda: uuid.uuid4().hex)
|
||||
idempotency_key: str | None = None
|
||||
attempt: int = 1
|
||||
max_attempts: int = 1
|
||||
created_at_ms: int = field(default_factory=lambda: int(time.time() * 1000))
|
||||
|
||||
@classmethod
|
||||
def wrap(
|
||||
cls,
|
||||
payload: Any,
|
||||
*,
|
||||
idempotency_key: str | None = None,
|
||||
attempt: int = 1,
|
||||
max_attempts: int = 1,
|
||||
) -> "RetryEnvelope":
|
||||
return cls(
|
||||
payload=payload,
|
||||
idempotency_key=idempotency_key,
|
||||
attempt=attempt,
|
||||
max_attempts=max_attempts,
|
||||
)
|
||||
|
||||
|
||||
class IdempotencyStore:
|
||||
"""Process-local idempotency result store."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._results: dict[str, Any] = {}
|
||||
|
||||
def has(self, key: str) -> bool:
|
||||
return key in self._results
|
||||
|
||||
def get(self, key: str) -> Any:
|
||||
return self._results[key]
|
||||
|
||||
def set(self, key: str, value: Any) -> None:
|
||||
self._results[key] = value
|
||||
|
||||
|
||||
class IdempotentActorMixin:
|
||||
"""Mixin adding idempotent handling utility for actors.
|
||||
|
||||
Usage in actor::
|
||||
|
||||
class MyActor(IdempotentActorMixin, Actor):
|
||||
async def on_receive(self, message):
|
||||
return await self.handle_idempotent(message, self._handle)
|
||||
|
||||
async def _handle(self, payload):
|
||||
...
|
||||
"""
|
||||
|
||||
def _idempotency_store(self) -> IdempotencyStore:
|
||||
store = getattr(self, "_idem_store", None)
|
||||
if store is None:
|
||||
store = IdempotencyStore()
|
||||
setattr(self, "_idem_store", store)
|
||||
return store
|
||||
|
||||
async def handle_idempotent(self, message: Any, handler):
|
||||
if not isinstance(message, RetryEnvelope):
|
||||
return await handler(message)
|
||||
|
||||
key = message.idempotency_key
|
||||
if not key:
|
||||
return await handler(message.payload)
|
||||
|
||||
store = self._idempotency_store()
|
||||
if store.has(key):
|
||||
return store.get(key)
|
||||
|
||||
result = await handler(message.payload)
|
||||
store.set(key, result)
|
||||
return result
|
||||
|
||||
|
||||
async def ask_with_retry(
|
||||
ref,
|
||||
payload: Any,
|
||||
*,
|
||||
timeout: float = 5.0,
|
||||
max_attempts: int = 3,
|
||||
base_backoff_s: float = 0.1,
|
||||
max_backoff_s: float = 5.0,
|
||||
jitter_ratio: float = 0.3,
|
||||
retry_exceptions: tuple[type[BaseException], ...] = (asyncio.TimeoutError,),
|
||||
idempotency_key: str | None = None,
|
||||
) -> Any:
|
||||
"""Ask actor with bounded retries and envelope metadata."""
|
||||
if max_attempts < 1:
|
||||
raise ValueError("max_attempts must be >= 1")
|
||||
|
||||
key = idempotency_key or uuid.uuid4().hex
|
||||
last_exc: BaseException | None = None
|
||||
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
msg = RetryEnvelope.wrap(
|
||||
payload,
|
||||
idempotency_key=key,
|
||||
attempt=attempt,
|
||||
max_attempts=max_attempts,
|
||||
)
|
||||
try:
|
||||
return await ref.ask(msg, timeout=timeout)
|
||||
except retry_exceptions as exc:
|
||||
last_exc = exc
|
||||
if attempt >= max_attempts:
|
||||
break
|
||||
|
||||
backoff = min(max_backoff_s, base_backoff_s * (2 ** (attempt - 1)))
|
||||
jitter = backoff * jitter_ratio * random.random()
|
||||
await asyncio.sleep(backoff + jitter)
|
||||
|
||||
raise last_exc # type: ignore[misc] # always set: loop runs ≥1 time and sets on last iteration
|
||||
@ -11,7 +11,7 @@ from typing import Any
|
||||
from .actor import Actor, ActorContext
|
||||
from .mailbox import Empty, Mailbox, MemoryMailbox
|
||||
from .middleware import ActorMailboxContext, Middleware, NextFn, build_middleware_chain
|
||||
from .ref import ActorRef, ActorStoppedError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
|
||||
from .ref import ActorRef, ActorStoppedError, MailboxFullError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
|
||||
from .supervision import Directive, SupervisorStrategy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -87,7 +87,11 @@ class ActorSystem:
|
||||
middlewares=middlewares or [],
|
||||
)
|
||||
self._root_cells[name] = cell
|
||||
await cell.start()
|
||||
try:
|
||||
await cell.start()
|
||||
except Exception:
|
||||
del self._root_cells[name]
|
||||
raise
|
||||
return cell.ref
|
||||
|
||||
async def shutdown(self, *, timeout: float = 10.0) -> None:
|
||||
@ -99,7 +103,12 @@ class ActorSystem:
|
||||
if cell.task is not None:
|
||||
tasks.append(cell.task)
|
||||
if tasks:
|
||||
await asyncio.wait(tasks, timeout=timeout)
|
||||
_, pending = await asyncio.wait(tasks, timeout=timeout)
|
||||
# Cancel tasks that didn't finish within the timeout to prevent zombie tasks
|
||||
for t in pending:
|
||||
t.cancel()
|
||||
if pending:
|
||||
await asyncio.wait(pending, timeout=2.0)
|
||||
self._root_cells.clear()
|
||||
self._replies.reject_all(ActorStoppedError("ActorSystem shutting down"))
|
||||
await self._reply_channel.stop_listener()
|
||||
@ -188,16 +197,25 @@ class _ActorCell:
|
||||
self.task = asyncio.create_task(self._run(), name=f"actor:{self.path}")
|
||||
|
||||
async def enqueue(self, msg: _Envelope | _Stop) -> None:
|
||||
if not self.mailbox.put_nowait(msg):
|
||||
# Try non-blocking first (fast path for MemoryMailbox)
|
||||
if self.mailbox.put_nowait(msg):
|
||||
return
|
||||
# Fallback to async put (required for Redis and other async backends)
|
||||
if not await self.mailbox.put(msg):
|
||||
if isinstance(msg, _Envelope) and msg.correlation_id is not None:
|
||||
self.system._replies.reject(msg.correlation_id, RuntimeError(f"Mailbox full: {self.path}"))
|
||||
self.system._replies.reject(msg.correlation_id, MailboxFullError(f"Mailbox full: {self.path}"))
|
||||
elif isinstance(msg, _Envelope):
|
||||
self.system._dead_letter(self.ref, msg.payload, msg.sender)
|
||||
|
||||
def request_stop(self) -> None:
|
||||
"""Request graceful shutdown. Falls back to task.cancel() if mailbox full."""
|
||||
"""Request graceful shutdown.
|
||||
|
||||
Tries put_nowait first. If that fails (full or unsupported backend),
|
||||
cancels the task directly so _run exits via CancelledError → finally → _shutdown.
|
||||
"""
|
||||
if not self.stopped:
|
||||
if not self.mailbox.put_nowait(_Stop()):
|
||||
# Redis/async backends can't put_nowait — cancel the task
|
||||
if self.task is not None and not self.task.done():
|
||||
self.task.cancel()
|
||||
else:
|
||||
@ -223,7 +241,11 @@ class _ActorCell:
|
||||
middlewares=middlewares or [],
|
||||
)
|
||||
self.children[name] = child
|
||||
await child.start()
|
||||
try:
|
||||
await child.start()
|
||||
except Exception:
|
||||
del self.children[name]
|
||||
raise
|
||||
return child.ref
|
||||
|
||||
# -- Processing loop -------------------------------------------------------
|
||||
@ -310,6 +332,11 @@ class _ActorCell:
|
||||
# Remove from parent
|
||||
if self.parent is not None:
|
||||
self.parent.children.pop(self.name, None)
|
||||
# Close mailbox to release backend resources (e.g. Redis connections)
|
||||
try:
|
||||
await self.mailbox.close()
|
||||
except Exception:
|
||||
logger.exception("Error closing mailbox for %s", self.path)
|
||||
|
||||
# -- Supervision -----------------------------------------------------------
|
||||
|
||||
@ -337,8 +364,16 @@ class _ActorCell:
|
||||
return
|
||||
|
||||
if directive == Directive.escalate:
|
||||
logger.info("Supervisor %s: escalate %s", self.path, type(error).__name__)
|
||||
raise error
|
||||
# Stop the failing child, then propagate failure up the supervision chain.
|
||||
# We cannot use `raise error` here — that would crash the child's _run
|
||||
# loop instead of notifying the grandparent's supervisor.
|
||||
child.request_stop()
|
||||
if self.parent is not None:
|
||||
logger.info("Supervisor %s: escalate %s to grandparent %s", self.path, type(error).__name__, self.parent.path)
|
||||
await self.parent._handle_child_failure(self, error)
|
||||
else:
|
||||
logger.error("Uncaught escalation at root actor %s: %s", self.path, error)
|
||||
return
|
||||
|
||||
if directive == Directive.restart:
|
||||
for name in affected:
|
||||
|
||||
@ -19,7 +19,11 @@ dependencies = [
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = ["pytest>=8.0.0", "ruff>=0.14.11"]
|
||||
dev = [
|
||||
"pytest>=8.0.0",
|
||||
"redis>=7.4.0",
|
||||
"ruff>=0.14.11",
|
||||
]
|
||||
|
||||
[tool.uv.workspace]
|
||||
members = ["packages/harness"]
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import statistics
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem, Middleware
|
||||
|
||||
@ -17,7 +16,11 @@ class CounterActor(Actor):
|
||||
self.count = 0
|
||||
|
||||
async def on_receive(self, message):
|
||||
self.count += 1
|
||||
if message == "inc":
|
||||
self.count += 1
|
||||
return self.count
|
||||
if message == "get":
|
||||
return self.count
|
||||
return self.count
|
||||
|
||||
|
||||
@ -69,6 +72,8 @@ async def bench_tell_throughput(n=100_000):
|
||||
await ref.tell("inc")
|
||||
# Wait for all messages to be processed
|
||||
count = await ref.ask("get", timeout=30.0)
|
||||
if count != n:
|
||||
print(f" warning: expected {n} processed, got {count}")
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
273
backend/tests/bench_actor_redis.py
Normal file
273
backend/tests/bench_actor_redis.py
Normal file
@ -0,0 +1,273 @@
|
||||
"""RedisMailbox benchmark: throughput, latency, concurrency, backpressure."""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem
|
||||
from deerflow.actor.mailbox_redis import RedisMailbox
|
||||
|
||||
|
||||
class EchoActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
return message
|
||||
|
||||
|
||||
class CounterActor(Actor):
|
||||
async def on_started(self):
|
||||
self.count = 0
|
||||
|
||||
async def on_receive(self, message):
|
||||
if message == "inc":
|
||||
self.count += 1
|
||||
return self.count
|
||||
if message == "get":
|
||||
return self.count
|
||||
return self.count
|
||||
|
||||
|
||||
def fmt(n):
|
||||
if n >= 1_000_000:
|
||||
return f"{n/1_000_000:.1f}M"
|
||||
if n >= 1_000:
|
||||
return f"{n/1_000:.0f}K"
|
||||
return str(n)
|
||||
|
||||
|
||||
async def _redis_client():
|
||||
client = redis.Redis(host="127.0.0.1", port=6379, decode_responses=False)
|
||||
await client.ping()
|
||||
return client
|
||||
|
||||
|
||||
async def bench_redis_ask_throughput(n=20_000):
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:ask"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis")
|
||||
ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
|
||||
|
||||
start = time.perf_counter()
|
||||
for _ in range(n):
|
||||
await ref.ask("ping", timeout=5.0)
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
rate = n / elapsed
|
||||
print(f" redis ask throughput: {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s")
|
||||
|
||||
|
||||
async def bench_redis_tell_throughput(n=50_000):
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:tell"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis")
|
||||
ref = await system.spawn(CounterActor, "counter", mailbox=mailbox)
|
||||
|
||||
start = time.perf_counter()
|
||||
for _ in range(n):
|
||||
await ref.tell("inc")
|
||||
count = await ref.ask("get", timeout=30.0)
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
rate = n / elapsed
|
||||
loss = n - count
|
||||
print(f" redis tell throughput: {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s (loss: {loss})")
|
||||
|
||||
|
||||
async def bench_redis_ask_latency(n=5_000):
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:latency"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis")
|
||||
ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
|
||||
|
||||
for _ in range(100):
|
||||
await ref.ask("warmup", timeout=5.0)
|
||||
|
||||
latencies = []
|
||||
for _ in range(n):
|
||||
t0 = time.perf_counter()
|
||||
await ref.ask("ping", timeout=5.0)
|
||||
latencies.append((time.perf_counter() - t0) * 1_000_000)
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
latencies.sort()
|
||||
p50 = latencies[len(latencies) // 2]
|
||||
p99 = latencies[int(len(latencies) * 0.99)]
|
||||
p999 = latencies[int(len(latencies) * 0.999)]
|
||||
print(f" redis ask latency: p50={p50:.0f}µs p99={p99:.0f}µs p99.9={p999:.0f}µs")
|
||||
|
||||
|
||||
async def bench_redis_concurrent_actors(num_actors=200, msgs_per_actor=100):
|
||||
client = await _redis_client()
|
||||
system = ActorSystem("bench-redis")
|
||||
refs = []
|
||||
|
||||
for i in range(num_actors):
|
||||
q = f"deerflow:bench:redis:conc:{i}"
|
||||
await client.delete(q)
|
||||
mailbox = RedisMailbox(client.connection_pool, q, brpop_timeout=0.05)
|
||||
refs.append(await system.spawn(CounterActor, f"a{i}", mailbox=mailbox))
|
||||
|
||||
start = time.perf_counter()
|
||||
|
||||
async def send_batch(ref, n):
|
||||
for i in range(n):
|
||||
await ref.tell("inc")
|
||||
if i % 50 == 49:
|
||||
await asyncio.sleep(0)
|
||||
return await ref.ask("get", timeout=30.0)
|
||||
|
||||
results = await asyncio.gather(*[send_batch(r, msgs_per_actor) for r in refs])
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
total = num_actors * msgs_per_actor
|
||||
delivered = sum(results)
|
||||
rate = total / elapsed
|
||||
loss = total - delivered
|
||||
print(
|
||||
f" redis concurrency: {num_actors} actors × {msgs_per_actor} msgs = {fmt(total)} in {elapsed:.2f}s = {fmt(int(rate))}/s (loss: {loss})"
|
||||
)
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
|
||||
async def bench_redis_maxlen_backpressure(total_messages=20_000, maxlen=100, ask_timeout=0.01, ask_concurrency=200):
|
||||
client = await _redis_client()
|
||||
|
||||
queue_tell = "deerflow:bench:redis:bp:tell"
|
||||
await client.delete(queue_tell)
|
||||
mailbox_tell = RedisMailbox(client.connection_pool, queue_tell, maxlen=maxlen, brpop_timeout=0.05)
|
||||
|
||||
system_tell = ActorSystem("bench-redis-bp-tell")
|
||||
ref_tell = await system_tell.spawn(CounterActor, "counter", mailbox=mailbox_tell)
|
||||
|
||||
# Saturate with tell: dropped messages become dead letters
|
||||
for _ in range(total_messages):
|
||||
await ref_tell.tell("inc")
|
||||
|
||||
await asyncio.sleep(0.2)
|
||||
processed = await ref_tell.ask("get", timeout=10.0)
|
||||
dropped = len(system_tell.dead_letters)
|
||||
drop_rate = dropped / total_messages if total_messages else 0.0
|
||||
|
||||
print(
|
||||
f" redis maxlen tell: maxlen={maxlen}, sent={fmt(total_messages)}, processed={fmt(processed)}, dropped={fmt(dropped)} ({drop_rate:.1%})"
|
||||
)
|
||||
|
||||
await system_tell.shutdown()
|
||||
|
||||
# Ask timeout rate under pressure
|
||||
queue_ask = "deerflow:bench:redis:bp:ask"
|
||||
await client.delete(queue_ask)
|
||||
mailbox_ask = RedisMailbox(client.connection_pool, queue_ask, maxlen=maxlen, brpop_timeout=0.05)
|
||||
|
||||
system_ask = ActorSystem("bench-redis-bp-ask")
|
||||
ref_ask = await system_ask.spawn(EchoActor, "echo", mailbox=mailbox_ask)
|
||||
|
||||
async def one_ask(i):
|
||||
try:
|
||||
await ref_ask.ask(i, timeout=ask_timeout)
|
||||
return True, None
|
||||
except asyncio.TimeoutError:
|
||||
return False, "timeout"
|
||||
except Exception: # MailboxFullError or other rejection
|
||||
return False, "rejected"
|
||||
|
||||
sem = asyncio.Semaphore(ask_concurrency)
|
||||
|
||||
async def one_ask_limited(i):
|
||||
async with sem:
|
||||
return await one_ask(i)
|
||||
|
||||
results = await asyncio.gather(*[one_ask_limited(i) for i in range(total_messages)])
|
||||
ok = sum(1 for r, _ in results if r)
|
||||
timeout_count = sum(1 for _, reason in results if reason == "timeout")
|
||||
rejected_count = sum(1 for _, reason in results if reason == "rejected")
|
||||
fail_rate = (total_messages - ok) / total_messages if total_messages else 0.0
|
||||
|
||||
print(
|
||||
f" redis maxlen ask: maxlen={maxlen}, total={fmt(total_messages)}, ok={fmt(ok)}, "
|
||||
f"timeout={fmt(timeout_count)}, rejected={fmt(rejected_count)} (fail: {fail_rate:.1%}), "
|
||||
f"ask_timeout={ask_timeout}s, concurrency={ask_concurrency}"
|
||||
)
|
||||
|
||||
await system_ask.shutdown()
|
||||
|
||||
|
||||
async def bench_redis_put_batch(n=50_000, batch_size=100):
|
||||
"""put_batch: N messages in N/batch_size round-trips instead of N."""
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:batch"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis-batch")
|
||||
ref = await system.spawn(CounterActor, "counter", mailbox=mailbox)
|
||||
|
||||
from deerflow.actor.ref import _Envelope
|
||||
|
||||
batches = [
|
||||
[_Envelope(payload="inc") for _ in range(batch_size)]
|
||||
for _ in range(n // batch_size)
|
||||
]
|
||||
|
||||
t0 = time.perf_counter()
|
||||
for batch in batches:
|
||||
await mailbox.put_batch(batch)
|
||||
enqueue_elapsed = time.perf_counter() - t0
|
||||
|
||||
count = await ref.ask("get", timeout=60.0)
|
||||
total_elapsed = time.perf_counter() - t0
|
||||
|
||||
loss = n - count
|
||||
enqueue_rate = n / enqueue_elapsed
|
||||
print(
|
||||
f" redis put_batch push: {fmt(n)} msgs in {enqueue_elapsed:.3f}s = {fmt(int(enqueue_rate))}/s "
|
||||
f"(batch={batch_size}, round-trips={n // batch_size})"
|
||||
)
|
||||
print(
|
||||
f" redis put_batch total: end-to-end {total_elapsed:.2f}s = {fmt(int(n / total_elapsed))}/s "
|
||||
f"(consume bottleneck, loss={loss})"
|
||||
)
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
|
||||
async def main():
|
||||
print("=" * 72)
|
||||
print(" RedisMailbox Benchmarks")
|
||||
print("=" * 72)
|
||||
print()
|
||||
|
||||
await bench_redis_tell_throughput()
|
||||
await bench_redis_ask_throughput()
|
||||
await bench_redis_ask_latency()
|
||||
await bench_redis_concurrent_actors()
|
||||
await bench_redis_put_batch()
|
||||
await bench_redis_maxlen_backpressure()
|
||||
|
||||
print()
|
||||
print("=" * 72)
|
||||
print(" Done")
|
||||
print("=" * 72)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@ -440,3 +440,95 @@ class TestMiddleware:
|
||||
# tell goes through middleware too
|
||||
assert any("before:" in entry for entry in mw.log) is False
|
||||
await system.shutdown()
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_middleware_on_restart_hook(self):
|
||||
"""on_restart is called on the middleware when a child actor is restarted."""
|
||||
|
||||
class RestartTrackingMiddleware(Middleware):
|
||||
def __init__(self):
|
||||
self.restart_errors: list[Exception] = []
|
||||
|
||||
async def on_restart(self, actor_ref, error):
|
||||
self.restart_errors.append(error)
|
||||
|
||||
mw = RestartTrackingMiddleware()
|
||||
|
||||
class ChildSpawningParent(Actor):
|
||||
async def on_receive(self, message):
|
||||
if message == "spawn":
|
||||
ref = await self.context.spawn(CrashActor, "child", middlewares=[mw])
|
||||
return ref
|
||||
|
||||
system = ActorSystem("test")
|
||||
parent = await system.spawn(ChildSpawningParent, "parent")
|
||||
child = await parent.ask("spawn")
|
||||
|
||||
# Crash the child — parent supervisor will restart it
|
||||
try:
|
||||
await child.ask("crash")
|
||||
except ValueError:
|
||||
pass
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
assert len(mw.restart_errors) == 1
|
||||
assert isinstance(mw.restart_errors[0], ValueError)
|
||||
await system.shutdown()
|
||||
|
||||
|
||||
class TestAskErrorPropagation:
|
||||
@pytest.mark.anyio
|
||||
async def test_ask_propagates_actor_exception(self):
|
||||
"""ask() re-raises the original exception type when on_receive crashes."""
|
||||
|
||||
class BoomActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
raise ValueError("intentional crash")
|
||||
|
||||
system = ActorSystem("test")
|
||||
ref = await system.spawn(BoomActor, "boom")
|
||||
with pytest.raises(ValueError, match="intentional crash"):
|
||||
await ref.ask("trigger")
|
||||
await system.shutdown()
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_ask_propagates_exception_while_supervised(self):
|
||||
"""ask() gets the exception even when the actor is supervised (not stopped)."""
|
||||
|
||||
class SometimesCrashActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
if message == "crash":
|
||||
raise RuntimeError("supervised crash")
|
||||
return "ok"
|
||||
|
||||
system = ActorSystem("test")
|
||||
ref = await system.spawn(SometimesCrashActor, "sca")
|
||||
with pytest.raises(RuntimeError, match="supervised crash"):
|
||||
await ref.ask("crash")
|
||||
# Root actor keeps running after a crash (consecutive_failures, not restart)
|
||||
result = await ref.ask("hello", timeout=2.0)
|
||||
assert result == "ok"
|
||||
await system.shutdown()
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_ask_timeout_late_reply_no_exception(self):
|
||||
"""Late reply arriving after ask() timeout is silently dropped — no exception, no orphaned future."""
|
||||
|
||||
class SlowActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
await asyncio.sleep(0.3)
|
||||
return "late"
|
||||
|
||||
system = ActorSystem("test")
|
||||
ref = await system.spawn(SlowActor, "slow")
|
||||
|
||||
with pytest.raises(asyncio.TimeoutError):
|
||||
await ref.ask("go", timeout=0.05)
|
||||
|
||||
# Wait for actor to finish processing — late reply arrives, should be a no-op
|
||||
await asyncio.sleep(0.4)
|
||||
# System still functional: no orphaned futures, no leaked state
|
||||
assert ref.is_alive
|
||||
result = await ref.ask("go", timeout=2.0)
|
||||
assert result == "late"
|
||||
await system.shutdown()
|
||||
|
||||
89
backend/tests/test_actor_backpressure.py
Normal file
89
backend/tests/test_actor_backpressure.py
Normal file
@ -0,0 +1,89 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem, MailboxFullError
|
||||
from deerflow.actor.mailbox import BACKPRESSURE_BLOCK, BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL, MemoryMailbox
|
||||
|
||||
|
||||
class SlowActor(Actor):
|
||||
async def on_started(self):
|
||||
self.count = 0
|
||||
|
||||
async def on_receive(self, message):
|
||||
if message == 'inc':
|
||||
await asyncio.sleep(0.01)
|
||||
self.count += 1
|
||||
return None
|
||||
if message == 'get':
|
||||
return self.count
|
||||
return None
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_memory_mailbox_drop_new_policy_drops_tell_to_dead_letters():
    """Under drop_new, overflowing tells are discarded and land in dead letters."""
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_DROP_NEW),
    )

    # Burst far past the capacity-1 mailbox before the actor can drain it.
    for _ in range(20):
        await ref.tell('inc')

    await asyncio.sleep(0.4)
    processed = await ref.ask('get', timeout=2.0)
    await system.shutdown()

    # drop_new must have discarded at least one message...
    assert processed < 20
    # ...and the discards must be observable in the dead-letter queue.
    assert len(system.dead_letters) > 0
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_memory_mailbox_fail_policy_rejects_ask_when_full():
    """Under the fail policy, ask() on a full mailbox raises MailboxFullError."""
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_FAIL),
    )

    # Occupy the single slot via tell so subsequent asks contend for space.
    await ref.tell('inc')

    rejected = False
    for _ in range(30):
        try:
            await ref.ask('inc', timeout=0.02)
        except MailboxFullError:
            rejected = True
            break
        except asyncio.TimeoutError:
            # Message was accepted but not answered in time — keep probing.
            pass

    await system.shutdown()
    assert rejected
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_memory_mailbox_block_policy_eventually_accepts():
    """The block policy must never drop tells — the sender waits for space."""
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_BLOCK),
    )

    for _ in range(10):
        await ref.tell('inc')

    await asyncio.sleep(0.25)
    processed = await ref.ask('get', timeout=2.0)
    await system.shutdown()

    # All 10 messages must survive: blocking applies backpressure, not loss.
    assert processed == 10
|
||||
62
backend/tests/test_actor_retry.py
Normal file
62
backend/tests/test_actor_retry.py
Normal file
@ -0,0 +1,62 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem, IdempotentActorMixin, RetryEnvelope, ask_with_retry
|
||||
|
||||
|
||||
class FlakyIdempotentActor(IdempotentActorMixin, Actor):
    """Actor whose very first 'flaky' request is slow; all others answer fast."""

    async def on_started(self):
        # Counts invocations of the real handler, i.e. idempotency cache misses.
        self.calls = 0

    async def on_receive(self, message):
        # Delegate to the mixin so RetryEnvelope-wrapped messages are deduped.
        return await self.handle_idempotent(message, self._handle)

    async def _handle(self, payload):
        self.calls += 1
        if self.calls == 1 and payload == 'flaky':
            # Simulate a slow first attempt that outlives short ask timeouts.
            await asyncio.sleep(0.02)
            return 'late'
        return f"ok:{payload}"
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_ask_with_retry_timeout_raises():
    """ask_with_retry ultimately re-raises TimeoutError when attempts keep timing out."""
    system = ActorSystem('retry')
    ref = await system.spawn(FlakyIdempotentActor, 'a')

    # Each attempt's 5ms budget is shorter than the actor's 20ms 'flaky' first
    # reply; the helper retries on timeout, but if every attempt expires it
    # must surface the TimeoutError to the caller.
    with pytest.raises(asyncio.TimeoutError):
        await ask_with_retry(
            ref,
            'flaky',
            timeout=0.005,
            max_attempts=3,
            base_backoff_s=0.001,
            max_backoff_s=0.005,
            jitter_ratio=0.0,
            idempotency_key='k1',
        )

    # Timing out the caller must not have killed the actor itself.
    assert ref.is_alive
    await system.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_idempotent_envelope_returns_cached_result():
    """Two envelopes sharing an idempotency key run the handler exactly once."""
    system = ActorSystem('retry')
    ref = await system.spawn(FlakyIdempotentActor, 'a')

    first = RetryEnvelope.wrap('x', idempotency_key='same-key')
    retry = RetryEnvelope.wrap('x', idempotency_key='same-key', attempt=2, max_attempts=3)

    # Both asks observe the same answer...
    assert await ref.ask(first, timeout=1.0) == 'ok:x'
    assert await ref.ask(retry, timeout=1.0) == 'ok:x'

    # ...but the second must be served from the idempotency cache, so the
    # underlying handler only ever ran for the first envelope.
    actor = ref._cell.actor
    assert actor.calls == 1

    await system.shutdown()
|
||||
83
backend/tests/test_mailbox_redis.py
Normal file
83
backend/tests/test_mailbox_redis.py
Normal file
@ -0,0 +1,83 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
redis = pytest.importorskip("redis.asyncio")
|
||||
|
||||
from deerflow.actor.mailbox_redis import RedisMailbox
|
||||
from deerflow.actor.ref import _Envelope, _Stop
|
||||
|
||||
|
||||
pytestmark = pytest.mark.anyio
|
||||
|
||||
|
||||
async def _make_mailbox(queue_name: str, *, maxlen: int = 0) -> RedisMailbox:
    """Connect to a local Redis, clear *queue_name*, and return a mailbox on it.

    A short brpop timeout keeps get() loops responsive in tests. Skipped
    implicitly at import time if redis.asyncio is unavailable (importorskip).
    """
    client = redis.Redis(host="127.0.0.1", port=6379, decode_responses=False)
    await client.ping()              # fail fast if no local Redis is running
    await client.delete(queue_name)  # start every test from an empty queue
    return RedisMailbox(client.connection_pool, queue_name, maxlen=maxlen, brpop_timeout=0.2)
|
||||
|
||||
|
||||
async def test_roundtrip_envelope_and_stop():
    """An _Envelope and a _Stop survive a put/get round trip through Redis."""
    queue = "deerflow:test:redis-mailbox:roundtrip"
    mailbox = await _make_mailbox(queue)
    try:
        sent = _Envelope(payload={"k": "v"}, correlation_id="c1", reply_to="sysA")
        assert await mailbox.put(sent) is True

        # Every field must survive serialization to and from the Redis list.
        received = await mailbox.get()
        assert isinstance(received, _Envelope)
        assert received.payload == {"k": "v"}
        assert received.correlation_id == "c1"
        assert received.reply_to == "sysA"

        # Control messages must round-trip as well, preserving their type.
        assert await mailbox.put(_Stop()) is True
        assert isinstance(await mailbox.get(), _Stop)
    finally:
        await mailbox.close()
|
||||
|
||||
|
||||
async def test_bounded_queue_rejects_when_full():
    """A maxlen=1 mailbox accepts exactly one message and refuses the next."""
    queue = "deerflow:test:redis-mailbox:bounded"
    mailbox = await _make_mailbox(queue, maxlen=1)
    try:
        assert await mailbox.put(_Envelope("m1")) is True
        # The second put must be rejected rather than silently overflowing.
        assert await mailbox.put(_Envelope("m2")) is False
    finally:
        await mailbox.close()
|
||||
|
||||
|
||||
async def test_put_nowait_and_get_nowait_contract():
    """Redis mailboxes cannot satisfy the synchronous fast-path contract."""
    queue = "deerflow:test:redis-mailbox:nowait"
    mailbox = await _make_mailbox(queue)
    try:
        # put_nowait signals "use the async path instead" by returning False.
        assert mailbox.put_nowait(_Envelope("x")) is False
        # get_nowait has no sensible synchronous semantics and must raise.
        with pytest.raises(Exception, match="does not support synchronous get_nowait"):
            mailbox.get_nowait()
    finally:
        await mailbox.close()
|
||||
|
||||
|
||||
async def test_system_enqueue_fallback_with_async_mailbox():
    """ask() works through a Redis mailbox via the cell's async enqueue fallback."""
    from deerflow.actor import Actor, ActorSystem

    class EchoActor(Actor):
        async def on_receive(self, message):
            return message

    queue = "deerflow:test:redis-mailbox:system-fallback"
    mailbox = await _make_mailbox(queue)

    system = ActorSystem("redis-test")
    ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
    try:
        # Exercises the _ActorCell.enqueue fallback path: put_nowait() returns
        # False for a Redis mailbox, forcing the awaited put() to be used.
        assert await ref.ask("hello", timeout=3.0) == "hello"
    finally:
        await system.shutdown()
|
||||
4535
backend/uv.lock
generated
4535
backend/uv.lock
generated
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user