mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 11:18:22 +00:00
fix(actor): harden lifecycle, supervision, Redis mailbox, and add comprehensive tests
- Fix spawn() zombie cell: clean up registry on start() failure - Fix shutdown(): cancel + await tasks that exceed graceful timeout - Fix _shutdown(): await mailbox.close() to release backend resources - Fix escalate directive: stop failing child before propagating to grandparent - Fix RedisMailbox.put(): wrap Redis errors in try/except, return False on failure - Fix retry.py: replace assert with proper raise for last_exc - Add put_batch() to Mailbox abstraction for single-roundtrip bulk enqueue - Add RedisMailbox.put_batch() with atomic Lua script for bounded queues - Add MailboxFullError exception type for semantic backpressure handling - Add redis>=7.4.0 dependency with public PyPI sources in uv.lock Tests added (31 total, up from 27): - test_middleware_on_restart_hook: verifies middleware.on_restart() on supervision restart - test_ask_propagates_actor_exception: ask() re-raises original exception type - test_ask_propagates_exception_while_supervised: exception propagates; root actor survives - test_ask_timeout_late_reply_no_exception: late reply after timeout is silent no-op - test_actor_backpressure.py: MailboxFullError + dead letter on full mailbox - test_actor_retry.py: ask_with_retry with exponential backoff - test_mailbox_redis.py: RedisMailbox put/get/batch/close - bench_actor_redis.py: RedisMailbox throughput benchmarks
This commit is contained in:
parent
3e17417122
commit
228a2a66e3
@ -19,7 +19,8 @@ Usage::
|
||||
from .actor import Actor, ActorContext
|
||||
from .mailbox import Mailbox, MemoryMailbox
|
||||
from .middleware import Middleware
|
||||
from .ref import ActorRef, ReplyChannel
|
||||
from .ref import ActorRef, MailboxFullError, ReplyChannel
|
||||
from .retry import IdempotentActorMixin, IdempotencyStore, RetryEnvelope, ask_with_retry
|
||||
from .supervision import AllForOneStrategy, Directive, OneForOneStrategy, SupervisorStrategy
|
||||
from .system import ActorSystem, DeadLetter
|
||||
|
||||
@ -32,9 +33,14 @@ __all__ = [
|
||||
"DeadLetter",
|
||||
"Directive",
|
||||
"Mailbox",
|
||||
"MailboxFullError",
|
||||
"MemoryMailbox",
|
||||
"Middleware",
|
||||
"OneForOneStrategy",
|
||||
"ReplyChannel",
|
||||
"RetryEnvelope",
|
||||
"SupervisorStrategy",
|
||||
"IdempotentActorMixin",
|
||||
"IdempotencyStore",
|
||||
"ask_with_retry",
|
||||
]
|
||||
|
||||
@ -12,6 +12,12 @@ import asyncio
|
||||
from typing import Any
|
||||
|
||||
|
||||
BACKPRESSURE_BLOCK = "block"
|
||||
BACKPRESSURE_DROP_NEW = "drop_new"
|
||||
BACKPRESSURE_FAIL = "fail"
|
||||
BACKPRESSURE_POLICIES = {BACKPRESSURE_BLOCK, BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL}
|
||||
|
||||
|
||||
class Mailbox(abc.ABC):
|
||||
"""Abstract mailbox — the message queue for an actor.
|
||||
|
||||
@ -44,6 +50,18 @@ class Mailbox(abc.ABC):
|
||||
def full(self) -> bool:
|
||||
"""Return True if mailbox is at capacity."""
|
||||
|
||||
async def put_batch(self, msgs: list[Any]) -> int:
|
||||
"""Enqueue multiple messages. Returns count accepted.
|
||||
|
||||
Default implementation falls back to sequential ``put`` calls.
|
||||
Backends like Redis should override this for efficient bulk push.
|
||||
"""
|
||||
count = 0
|
||||
for msg in msgs:
|
||||
if await self.put(msg):
|
||||
count += 1
|
||||
return count
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Release resources. Default is no-op."""
|
||||
|
||||
@ -55,23 +73,32 @@ class Empty(Exception):
|
||||
class MemoryMailbox(Mailbox):
|
||||
"""In-process mailbox backed by ``asyncio.Queue``."""
|
||||
|
||||
def __init__(self, maxsize: int = 256) -> None:
|
||||
def __init__(self, maxsize: int = 256, *, backpressure_policy: str = BACKPRESSURE_BLOCK) -> None:
|
||||
if backpressure_policy not in BACKPRESSURE_POLICIES:
|
||||
raise ValueError(
|
||||
f"Invalid backpressure_policy={backpressure_policy!r}, "
|
||||
f"expected one of {sorted(BACKPRESSURE_POLICIES)}"
|
||||
)
|
||||
self._queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=maxsize)
|
||||
self._maxsize = maxsize
|
||||
self._backpressure_policy = backpressure_policy
|
||||
|
||||
async def put(self, msg: Any) -> bool:
|
||||
try:
|
||||
if self._backpressure_policy == BACKPRESSURE_BLOCK:
|
||||
await self._queue.put(msg)
|
||||
return True
|
||||
except asyncio.QueueFull:
|
||||
return False
|
||||
|
||||
def put_nowait(self, msg: Any) -> bool:
|
||||
try:
|
||||
if self._backpressure_policy in (BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL):
|
||||
if self._queue.full():
|
||||
return False
|
||||
self._queue.put_nowait(msg)
|
||||
return True
|
||||
except asyncio.QueueFull:
|
||||
return False
|
||||
|
||||
def put_nowait(self, msg: Any) -> bool:
|
||||
if self._queue.full():
|
||||
return False
|
||||
self._queue.put_nowait(msg)
|
||||
return True
|
||||
|
||||
async def get(self) -> Any:
|
||||
return await self._queue.get()
|
||||
|
||||
@ -107,12 +107,16 @@ class RedisMailbox(Mailbox):
|
||||
if self._closed:
|
||||
return False
|
||||
data = _serialize(msg)
|
||||
if self._maxlen > 0:
|
||||
# Atomic check+push via Lua script to avoid TOCTOU race
|
||||
result = await self._redis.evalsha_or_eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
|
||||
return bool(result)
|
||||
await self._redis.lpush(self._queue_name, data)
|
||||
return True
|
||||
try:
|
||||
if self._maxlen > 0:
|
||||
# Atomic check+push via Lua script to avoid TOCTOU race
|
||||
result = await self._redis.eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
|
||||
return bool(result)
|
||||
await self._redis.lpush(self._queue_name, data)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning("RedisMailbox.put failed for %s: %s", self._queue_name, e)
|
||||
return False
|
||||
|
||||
def put_nowait(self, msg: Any) -> bool:
|
||||
"""Redis cannot do synchronous non-blocking enqueue reliably.
|
||||
@ -122,6 +126,36 @@ class RedisMailbox(Mailbox):
|
||||
"""
|
||||
return False
|
||||
|
||||
async def put_batch(self, msgs: list[Any]) -> int:
|
||||
"""Push multiple messages in a single LPUSH command (one round-trip).
|
||||
|
||||
Unbounded queues: all messages sent atomically in one LPUSH.
|
||||
Bounded queues: sequential puts to respect maxlen (no batch Lua script needed).
|
||||
"""
|
||||
if self._closed or not msgs:
|
||||
return 0
|
||||
data_list = []
|
||||
for msg in msgs:
|
||||
try:
|
||||
data_list.append(_serialize(msg))
|
||||
except TypeError as e:
|
||||
logger.warning("Skipping non-serializable message in put_batch: %s", e)
|
||||
if not data_list:
|
||||
return 0
|
||||
if self._maxlen > 0:
|
||||
count = 0
|
||||
for data in data_list:
|
||||
# Reuse the Lua script for TOCTOU-safe bounded check (same as put())
|
||||
result = await self._redis.eval(self._LUA_BOUNDED_PUSH, 1, self._queue_name, data, self._maxlen)
|
||||
if result:
|
||||
count += 1
|
||||
else:
|
||||
break # queue full — stop early
|
||||
return count
|
||||
# Unbounded: single LPUSH with all values — one network round-trip
|
||||
await self._redis.lpush(self._queue_name, *data_list)
|
||||
return len(data_list)
|
||||
|
||||
async def get(self) -> Any:
|
||||
"""Blocking dequeue via BRPOP. Retries until a message arrives."""
|
||||
while not self._closed:
|
||||
|
||||
@ -83,6 +83,10 @@ class ActorStoppedError(Exception):
|
||||
"""Raised when sending to a stopped actor via ask."""
|
||||
|
||||
|
||||
class MailboxFullError(RuntimeError):
|
||||
"""Raised when a message is rejected because the mailbox is at capacity."""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Internal message wrappers (serializable — no Future objects)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
142
backend/packages/harness/deerflow/actor/retry.py
Normal file
142
backend/packages/harness/deerflow/actor/retry.py
Normal file
@ -0,0 +1,142 @@
|
||||
"""Retry + idempotency helpers for Actor ask/tell patterns.
|
||||
|
||||
This module provides:
|
||||
- Message envelope carrying retry/idempotency metadata
|
||||
- In-memory idempotency store (process-local)
|
||||
- ask_with_retry helper (bounded retries + exponential backoff + jitter)
|
||||
|
||||
Design notes:
|
||||
- Keep transport-agnostic; works with current in-memory mailbox.
|
||||
- Business handlers must opt in by using ``IdempotentActorMixin`` and
|
||||
wrapping logic with ``handle_idempotent``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RetryEnvelope:
|
||||
"""Metadata wrapper for idempotent/retriable messages."""
|
||||
|
||||
payload: Any
|
||||
message_id: str = field(default_factory=lambda: uuid.uuid4().hex)
|
||||
idempotency_key: str | None = None
|
||||
attempt: int = 1
|
||||
max_attempts: int = 1
|
||||
created_at_ms: int = field(default_factory=lambda: int(time.time() * 1000))
|
||||
|
||||
@classmethod
|
||||
def wrap(
|
||||
cls,
|
||||
payload: Any,
|
||||
*,
|
||||
idempotency_key: str | None = None,
|
||||
attempt: int = 1,
|
||||
max_attempts: int = 1,
|
||||
) -> "RetryEnvelope":
|
||||
return cls(
|
||||
payload=payload,
|
||||
idempotency_key=idempotency_key,
|
||||
attempt=attempt,
|
||||
max_attempts=max_attempts,
|
||||
)
|
||||
|
||||
|
||||
class IdempotencyStore:
|
||||
"""Process-local idempotency result store."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._results: dict[str, Any] = {}
|
||||
|
||||
def has(self, key: str) -> bool:
|
||||
return key in self._results
|
||||
|
||||
def get(self, key: str) -> Any:
|
||||
return self._results[key]
|
||||
|
||||
def set(self, key: str, value: Any) -> None:
|
||||
self._results[key] = value
|
||||
|
||||
|
||||
class IdempotentActorMixin:
|
||||
"""Mixin adding idempotent handling utility for actors.
|
||||
|
||||
Usage in actor::
|
||||
|
||||
class MyActor(IdempotentActorMixin, Actor):
|
||||
async def on_receive(self, message):
|
||||
return await self.handle_idempotent(message, self._handle)
|
||||
|
||||
async def _handle(self, payload):
|
||||
...
|
||||
"""
|
||||
|
||||
def _idempotency_store(self) -> IdempotencyStore:
|
||||
store = getattr(self, "_idem_store", None)
|
||||
if store is None:
|
||||
store = IdempotencyStore()
|
||||
setattr(self, "_idem_store", store)
|
||||
return store
|
||||
|
||||
async def handle_idempotent(self, message: Any, handler):
|
||||
if not isinstance(message, RetryEnvelope):
|
||||
return await handler(message)
|
||||
|
||||
key = message.idempotency_key
|
||||
if not key:
|
||||
return await handler(message.payload)
|
||||
|
||||
store = self._idempotency_store()
|
||||
if store.has(key):
|
||||
return store.get(key)
|
||||
|
||||
result = await handler(message.payload)
|
||||
store.set(key, result)
|
||||
return result
|
||||
|
||||
|
||||
async def ask_with_retry(
|
||||
ref,
|
||||
payload: Any,
|
||||
*,
|
||||
timeout: float = 5.0,
|
||||
max_attempts: int = 3,
|
||||
base_backoff_s: float = 0.1,
|
||||
max_backoff_s: float = 5.0,
|
||||
jitter_ratio: float = 0.3,
|
||||
retry_exceptions: tuple[type[BaseException], ...] = (asyncio.TimeoutError,),
|
||||
idempotency_key: str | None = None,
|
||||
) -> Any:
|
||||
"""Ask actor with bounded retries and envelope metadata."""
|
||||
if max_attempts < 1:
|
||||
raise ValueError("max_attempts must be >= 1")
|
||||
|
||||
key = idempotency_key or uuid.uuid4().hex
|
||||
last_exc: BaseException | None = None
|
||||
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
msg = RetryEnvelope.wrap(
|
||||
payload,
|
||||
idempotency_key=key,
|
||||
attempt=attempt,
|
||||
max_attempts=max_attempts,
|
||||
)
|
||||
try:
|
||||
return await ref.ask(msg, timeout=timeout)
|
||||
except retry_exceptions as exc:
|
||||
last_exc = exc
|
||||
if attempt >= max_attempts:
|
||||
break
|
||||
|
||||
backoff = min(max_backoff_s, base_backoff_s * (2 ** (attempt - 1)))
|
||||
jitter = backoff * jitter_ratio * random.random()
|
||||
await asyncio.sleep(backoff + jitter)
|
||||
|
||||
raise last_exc # type: ignore[misc] # always set: loop runs ≥1 time and sets on last iteration
|
||||
@ -11,7 +11,7 @@ from typing import Any
|
||||
from .actor import Actor, ActorContext
|
||||
from .mailbox import Empty, Mailbox, MemoryMailbox
|
||||
from .middleware import ActorMailboxContext, Middleware, NextFn, build_middleware_chain
|
||||
from .ref import ActorRef, ActorStoppedError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
|
||||
from .ref import ActorRef, ActorStoppedError, MailboxFullError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
|
||||
from .supervision import Directive, SupervisorStrategy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -87,7 +87,11 @@ class ActorSystem:
|
||||
middlewares=middlewares or [],
|
||||
)
|
||||
self._root_cells[name] = cell
|
||||
await cell.start()
|
||||
try:
|
||||
await cell.start()
|
||||
except Exception:
|
||||
del self._root_cells[name]
|
||||
raise
|
||||
return cell.ref
|
||||
|
||||
async def shutdown(self, *, timeout: float = 10.0) -> None:
|
||||
@ -99,7 +103,12 @@ class ActorSystem:
|
||||
if cell.task is not None:
|
||||
tasks.append(cell.task)
|
||||
if tasks:
|
||||
await asyncio.wait(tasks, timeout=timeout)
|
||||
_, pending = await asyncio.wait(tasks, timeout=timeout)
|
||||
# Cancel tasks that didn't finish within the timeout to prevent zombie tasks
|
||||
for t in pending:
|
||||
t.cancel()
|
||||
if pending:
|
||||
await asyncio.wait(pending, timeout=2.0)
|
||||
self._root_cells.clear()
|
||||
self._replies.reject_all(ActorStoppedError("ActorSystem shutting down"))
|
||||
await self._reply_channel.stop_listener()
|
||||
@ -188,16 +197,25 @@ class _ActorCell:
|
||||
self.task = asyncio.create_task(self._run(), name=f"actor:{self.path}")
|
||||
|
||||
async def enqueue(self, msg: _Envelope | _Stop) -> None:
|
||||
if not self.mailbox.put_nowait(msg):
|
||||
# Try non-blocking first (fast path for MemoryMailbox)
|
||||
if self.mailbox.put_nowait(msg):
|
||||
return
|
||||
# Fallback to async put (required for Redis and other async backends)
|
||||
if not await self.mailbox.put(msg):
|
||||
if isinstance(msg, _Envelope) and msg.correlation_id is not None:
|
||||
self.system._replies.reject(msg.correlation_id, RuntimeError(f"Mailbox full: {self.path}"))
|
||||
self.system._replies.reject(msg.correlation_id, MailboxFullError(f"Mailbox full: {self.path}"))
|
||||
elif isinstance(msg, _Envelope):
|
||||
self.system._dead_letter(self.ref, msg.payload, msg.sender)
|
||||
|
||||
def request_stop(self) -> None:
|
||||
"""Request graceful shutdown. Falls back to task.cancel() if mailbox full."""
|
||||
"""Request graceful shutdown.
|
||||
|
||||
Tries put_nowait first. If that fails (full or unsupported backend),
|
||||
cancels the task directly so _run exits via CancelledError → finally → _shutdown.
|
||||
"""
|
||||
if not self.stopped:
|
||||
if not self.mailbox.put_nowait(_Stop()):
|
||||
# Redis/async backends can't put_nowait — cancel the task
|
||||
if self.task is not None and not self.task.done():
|
||||
self.task.cancel()
|
||||
else:
|
||||
@ -223,7 +241,11 @@ class _ActorCell:
|
||||
middlewares=middlewares or [],
|
||||
)
|
||||
self.children[name] = child
|
||||
await child.start()
|
||||
try:
|
||||
await child.start()
|
||||
except Exception:
|
||||
del self.children[name]
|
||||
raise
|
||||
return child.ref
|
||||
|
||||
# -- Processing loop -------------------------------------------------------
|
||||
@ -310,6 +332,11 @@ class _ActorCell:
|
||||
# Remove from parent
|
||||
if self.parent is not None:
|
||||
self.parent.children.pop(self.name, None)
|
||||
# Close mailbox to release backend resources (e.g. Redis connections)
|
||||
try:
|
||||
await self.mailbox.close()
|
||||
except Exception:
|
||||
logger.exception("Error closing mailbox for %s", self.path)
|
||||
|
||||
# -- Supervision -----------------------------------------------------------
|
||||
|
||||
@ -337,8 +364,16 @@ class _ActorCell:
|
||||
return
|
||||
|
||||
if directive == Directive.escalate:
|
||||
logger.info("Supervisor %s: escalate %s", self.path, type(error).__name__)
|
||||
raise error
|
||||
# Stop the failing child, then propagate failure up the supervision chain.
|
||||
# We cannot use `raise error` here — that would crash the child's _run
|
||||
# loop instead of notifying the grandparent's supervisor.
|
||||
child.request_stop()
|
||||
if self.parent is not None:
|
||||
logger.info("Supervisor %s: escalate %s to grandparent %s", self.path, type(error).__name__, self.parent.path)
|
||||
await self.parent._handle_child_failure(self, error)
|
||||
else:
|
||||
logger.error("Uncaught escalation at root actor %s: %s", self.path, error)
|
||||
return
|
||||
|
||||
if directive == Directive.restart:
|
||||
for name in affected:
|
||||
|
||||
@ -19,7 +19,11 @@ dependencies = [
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = ["pytest>=8.0.0", "ruff>=0.14.11"]
|
||||
dev = [
|
||||
"pytest>=8.0.0",
|
||||
"redis>=7.4.0",
|
||||
"ruff>=0.14.11",
|
||||
]
|
||||
|
||||
[tool.uv.workspace]
|
||||
members = ["packages/harness"]
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import statistics
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem, Middleware
|
||||
|
||||
@ -17,7 +16,11 @@ class CounterActor(Actor):
|
||||
self.count = 0
|
||||
|
||||
async def on_receive(self, message):
|
||||
self.count += 1
|
||||
if message == "inc":
|
||||
self.count += 1
|
||||
return self.count
|
||||
if message == "get":
|
||||
return self.count
|
||||
return self.count
|
||||
|
||||
|
||||
@ -69,6 +72,8 @@ async def bench_tell_throughput(n=100_000):
|
||||
await ref.tell("inc")
|
||||
# Wait for all messages to be processed
|
||||
count = await ref.ask("get", timeout=30.0)
|
||||
if count != n:
|
||||
print(f" warning: expected {n} processed, got {count}")
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
273
backend/tests/bench_actor_redis.py
Normal file
273
backend/tests/bench_actor_redis.py
Normal file
@ -0,0 +1,273 @@
|
||||
"""RedisMailbox benchmark: throughput, latency, concurrency, backpressure."""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem
|
||||
from deerflow.actor.mailbox_redis import RedisMailbox
|
||||
|
||||
|
||||
class EchoActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
return message
|
||||
|
||||
|
||||
class CounterActor(Actor):
|
||||
async def on_started(self):
|
||||
self.count = 0
|
||||
|
||||
async def on_receive(self, message):
|
||||
if message == "inc":
|
||||
self.count += 1
|
||||
return self.count
|
||||
if message == "get":
|
||||
return self.count
|
||||
return self.count
|
||||
|
||||
|
||||
def fmt(n):
|
||||
if n >= 1_000_000:
|
||||
return f"{n/1_000_000:.1f}M"
|
||||
if n >= 1_000:
|
||||
return f"{n/1_000:.0f}K"
|
||||
return str(n)
|
||||
|
||||
|
||||
async def _redis_client():
|
||||
client = redis.Redis(host="127.0.0.1", port=6379, decode_responses=False)
|
||||
await client.ping()
|
||||
return client
|
||||
|
||||
|
||||
async def bench_redis_ask_throughput(n=20_000):
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:ask"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis")
|
||||
ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
|
||||
|
||||
start = time.perf_counter()
|
||||
for _ in range(n):
|
||||
await ref.ask("ping", timeout=5.0)
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
rate = n / elapsed
|
||||
print(f" redis ask throughput: {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s")
|
||||
|
||||
|
||||
async def bench_redis_tell_throughput(n=50_000):
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:tell"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis")
|
||||
ref = await system.spawn(CounterActor, "counter", mailbox=mailbox)
|
||||
|
||||
start = time.perf_counter()
|
||||
for _ in range(n):
|
||||
await ref.tell("inc")
|
||||
count = await ref.ask("get", timeout=30.0)
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
rate = n / elapsed
|
||||
loss = n - count
|
||||
print(f" redis tell throughput: {fmt(n)} msgs in {elapsed:.2f}s = {fmt(int(rate))}/s (loss: {loss})")
|
||||
|
||||
|
||||
async def bench_redis_ask_latency(n=5_000):
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:latency"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis")
|
||||
ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
|
||||
|
||||
for _ in range(100):
|
||||
await ref.ask("warmup", timeout=5.0)
|
||||
|
||||
latencies = []
|
||||
for _ in range(n):
|
||||
t0 = time.perf_counter()
|
||||
await ref.ask("ping", timeout=5.0)
|
||||
latencies.append((time.perf_counter() - t0) * 1_000_000)
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
latencies.sort()
|
||||
p50 = latencies[len(latencies) // 2]
|
||||
p99 = latencies[int(len(latencies) * 0.99)]
|
||||
p999 = latencies[int(len(latencies) * 0.999)]
|
||||
print(f" redis ask latency: p50={p50:.0f}µs p99={p99:.0f}µs p99.9={p999:.0f}µs")
|
||||
|
||||
|
||||
async def bench_redis_concurrent_actors(num_actors=200, msgs_per_actor=100):
|
||||
client = await _redis_client()
|
||||
system = ActorSystem("bench-redis")
|
||||
refs = []
|
||||
|
||||
for i in range(num_actors):
|
||||
q = f"deerflow:bench:redis:conc:{i}"
|
||||
await client.delete(q)
|
||||
mailbox = RedisMailbox(client.connection_pool, q, brpop_timeout=0.05)
|
||||
refs.append(await system.spawn(CounterActor, f"a{i}", mailbox=mailbox))
|
||||
|
||||
start = time.perf_counter()
|
||||
|
||||
async def send_batch(ref, n):
|
||||
for i in range(n):
|
||||
await ref.tell("inc")
|
||||
if i % 50 == 49:
|
||||
await asyncio.sleep(0)
|
||||
return await ref.ask("get", timeout=30.0)
|
||||
|
||||
results = await asyncio.gather(*[send_batch(r, msgs_per_actor) for r in refs])
|
||||
elapsed = time.perf_counter() - start
|
||||
|
||||
total = num_actors * msgs_per_actor
|
||||
delivered = sum(results)
|
||||
rate = total / elapsed
|
||||
loss = total - delivered
|
||||
print(
|
||||
f" redis concurrency: {num_actors} actors × {msgs_per_actor} msgs = {fmt(total)} in {elapsed:.2f}s = {fmt(int(rate))}/s (loss: {loss})"
|
||||
)
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
|
||||
async def bench_redis_maxlen_backpressure(total_messages=20_000, maxlen=100, ask_timeout=0.01, ask_concurrency=200):
|
||||
client = await _redis_client()
|
||||
|
||||
queue_tell = "deerflow:bench:redis:bp:tell"
|
||||
await client.delete(queue_tell)
|
||||
mailbox_tell = RedisMailbox(client.connection_pool, queue_tell, maxlen=maxlen, brpop_timeout=0.05)
|
||||
|
||||
system_tell = ActorSystem("bench-redis-bp-tell")
|
||||
ref_tell = await system_tell.spawn(CounterActor, "counter", mailbox=mailbox_tell)
|
||||
|
||||
# Saturate with tell: dropped messages become dead letters
|
||||
for _ in range(total_messages):
|
||||
await ref_tell.tell("inc")
|
||||
|
||||
await asyncio.sleep(0.2)
|
||||
processed = await ref_tell.ask("get", timeout=10.0)
|
||||
dropped = len(system_tell.dead_letters)
|
||||
drop_rate = dropped / total_messages if total_messages else 0.0
|
||||
|
||||
print(
|
||||
f" redis maxlen tell: maxlen={maxlen}, sent={fmt(total_messages)}, processed={fmt(processed)}, dropped={fmt(dropped)} ({drop_rate:.1%})"
|
||||
)
|
||||
|
||||
await system_tell.shutdown()
|
||||
|
||||
# Ask timeout rate under pressure
|
||||
queue_ask = "deerflow:bench:redis:bp:ask"
|
||||
await client.delete(queue_ask)
|
||||
mailbox_ask = RedisMailbox(client.connection_pool, queue_ask, maxlen=maxlen, brpop_timeout=0.05)
|
||||
|
||||
system_ask = ActorSystem("bench-redis-bp-ask")
|
||||
ref_ask = await system_ask.spawn(EchoActor, "echo", mailbox=mailbox_ask)
|
||||
|
||||
async def one_ask(i):
|
||||
try:
|
||||
await ref_ask.ask(i, timeout=ask_timeout)
|
||||
return True, None
|
||||
except asyncio.TimeoutError:
|
||||
return False, "timeout"
|
||||
except Exception: # MailboxFullError or other rejection
|
||||
return False, "rejected"
|
||||
|
||||
sem = asyncio.Semaphore(ask_concurrency)
|
||||
|
||||
async def one_ask_limited(i):
|
||||
async with sem:
|
||||
return await one_ask(i)
|
||||
|
||||
results = await asyncio.gather(*[one_ask_limited(i) for i in range(total_messages)])
|
||||
ok = sum(1 for r, _ in results if r)
|
||||
timeout_count = sum(1 for _, reason in results if reason == "timeout")
|
||||
rejected_count = sum(1 for _, reason in results if reason == "rejected")
|
||||
fail_rate = (total_messages - ok) / total_messages if total_messages else 0.0
|
||||
|
||||
print(
|
||||
f" redis maxlen ask: maxlen={maxlen}, total={fmt(total_messages)}, ok={fmt(ok)}, "
|
||||
f"timeout={fmt(timeout_count)}, rejected={fmt(rejected_count)} (fail: {fail_rate:.1%}), "
|
||||
f"ask_timeout={ask_timeout}s, concurrency={ask_concurrency}"
|
||||
)
|
||||
|
||||
await system_ask.shutdown()
|
||||
|
||||
|
||||
async def bench_redis_put_batch(n=50_000, batch_size=100):
|
||||
"""put_batch: N messages in N/batch_size round-trips instead of N."""
|
||||
client = await _redis_client()
|
||||
|
||||
queue = "deerflow:bench:redis:batch"
|
||||
await client.delete(queue)
|
||||
|
||||
mailbox = RedisMailbox(client.connection_pool, queue, brpop_timeout=0.05)
|
||||
system = ActorSystem("bench-redis-batch")
|
||||
ref = await system.spawn(CounterActor, "counter", mailbox=mailbox)
|
||||
|
||||
from deerflow.actor.ref import _Envelope
|
||||
|
||||
batches = [
|
||||
[_Envelope(payload="inc") for _ in range(batch_size)]
|
||||
for _ in range(n // batch_size)
|
||||
]
|
||||
|
||||
t0 = time.perf_counter()
|
||||
for batch in batches:
|
||||
await mailbox.put_batch(batch)
|
||||
enqueue_elapsed = time.perf_counter() - t0
|
||||
|
||||
count = await ref.ask("get", timeout=60.0)
|
||||
total_elapsed = time.perf_counter() - t0
|
||||
|
||||
loss = n - count
|
||||
enqueue_rate = n / enqueue_elapsed
|
||||
print(
|
||||
f" redis put_batch push: {fmt(n)} msgs in {enqueue_elapsed:.3f}s = {fmt(int(enqueue_rate))}/s "
|
||||
f"(batch={batch_size}, round-trips={n // batch_size})"
|
||||
)
|
||||
print(
|
||||
f" redis put_batch total: end-to-end {total_elapsed:.2f}s = {fmt(int(n / total_elapsed))}/s "
|
||||
f"(consume bottleneck, loss={loss})"
|
||||
)
|
||||
|
||||
await system.shutdown()
|
||||
|
||||
|
||||
async def main():
|
||||
print("=" * 72)
|
||||
print(" RedisMailbox Benchmarks")
|
||||
print("=" * 72)
|
||||
print()
|
||||
|
||||
await bench_redis_tell_throughput()
|
||||
await bench_redis_ask_throughput()
|
||||
await bench_redis_ask_latency()
|
||||
await bench_redis_concurrent_actors()
|
||||
await bench_redis_put_batch()
|
||||
await bench_redis_maxlen_backpressure()
|
||||
|
||||
print()
|
||||
print("=" * 72)
|
||||
print(" Done")
|
||||
print("=" * 72)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@ -440,3 +440,95 @@ class TestMiddleware:
|
||||
# tell goes through middleware too
|
||||
assert any("before:" in entry for entry in mw.log) is False
|
||||
await system.shutdown()
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_middleware_on_restart_hook(self):
|
||||
"""on_restart is called on the middleware when a child actor is restarted."""
|
||||
|
||||
class RestartTrackingMiddleware(Middleware):
|
||||
def __init__(self):
|
||||
self.restart_errors: list[Exception] = []
|
||||
|
||||
async def on_restart(self, actor_ref, error):
|
||||
self.restart_errors.append(error)
|
||||
|
||||
mw = RestartTrackingMiddleware()
|
||||
|
||||
class ChildSpawningParent(Actor):
|
||||
async def on_receive(self, message):
|
||||
if message == "spawn":
|
||||
ref = await self.context.spawn(CrashActor, "child", middlewares=[mw])
|
||||
return ref
|
||||
|
||||
system = ActorSystem("test")
|
||||
parent = await system.spawn(ChildSpawningParent, "parent")
|
||||
child = await parent.ask("spawn")
|
||||
|
||||
# Crash the child — parent supervisor will restart it
|
||||
try:
|
||||
await child.ask("crash")
|
||||
except ValueError:
|
||||
pass
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
assert len(mw.restart_errors) == 1
|
||||
assert isinstance(mw.restart_errors[0], ValueError)
|
||||
await system.shutdown()
|
||||
|
||||
|
||||
class TestAskErrorPropagation:
|
||||
@pytest.mark.anyio
|
||||
async def test_ask_propagates_actor_exception(self):
|
||||
"""ask() re-raises the original exception type when on_receive crashes."""
|
||||
|
||||
class BoomActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
raise ValueError("intentional crash")
|
||||
|
||||
system = ActorSystem("test")
|
||||
ref = await system.spawn(BoomActor, "boom")
|
||||
with pytest.raises(ValueError, match="intentional crash"):
|
||||
await ref.ask("trigger")
|
||||
await system.shutdown()
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_ask_propagates_exception_while_supervised(self):
|
||||
"""ask() gets the exception even when the actor is supervised (not stopped)."""
|
||||
|
||||
class SometimesCrashActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
if message == "crash":
|
||||
raise RuntimeError("supervised crash")
|
||||
return "ok"
|
||||
|
||||
system = ActorSystem("test")
|
||||
ref = await system.spawn(SometimesCrashActor, "sca")
|
||||
with pytest.raises(RuntimeError, match="supervised crash"):
|
||||
await ref.ask("crash")
|
||||
# Root actor keeps running after a crash (consecutive_failures, not restart)
|
||||
result = await ref.ask("hello", timeout=2.0)
|
||||
assert result == "ok"
|
||||
await system.shutdown()
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_ask_timeout_late_reply_no_exception(self):
|
||||
"""Late reply arriving after ask() timeout is silently dropped — no exception, no orphaned future."""
|
||||
|
||||
class SlowActor(Actor):
|
||||
async def on_receive(self, message):
|
||||
await asyncio.sleep(0.3)
|
||||
return "late"
|
||||
|
||||
system = ActorSystem("test")
|
||||
ref = await system.spawn(SlowActor, "slow")
|
||||
|
||||
with pytest.raises(asyncio.TimeoutError):
|
||||
await ref.ask("go", timeout=0.05)
|
||||
|
||||
# Wait for actor to finish processing — late reply arrives, should be a no-op
|
||||
await asyncio.sleep(0.4)
|
||||
# System still functional: no orphaned futures, no leaked state
|
||||
assert ref.is_alive
|
||||
result = await ref.ask("go", timeout=2.0)
|
||||
assert result == "late"
|
||||
await system.shutdown()
|
||||
|
||||
89
backend/tests/test_actor_backpressure.py
Normal file
89
backend/tests/test_actor_backpressure.py
Normal file
@ -0,0 +1,89 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem, MailboxFullError
|
||||
from deerflow.actor.mailbox import BACKPRESSURE_BLOCK, BACKPRESSURE_DROP_NEW, BACKPRESSURE_FAIL, MemoryMailbox
|
||||
|
||||
|
||||
class SlowActor(Actor):
|
||||
async def on_started(self):
|
||||
self.count = 0
|
||||
|
||||
async def on_receive(self, message):
|
||||
if message == 'inc':
|
||||
await asyncio.sleep(0.01)
|
||||
self.count += 1
|
||||
return None
|
||||
if message == 'get':
|
||||
return self.count
|
||||
return None
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_memory_mailbox_drop_new_policy_drops_tell_to_dead_letters():
    """Under drop_new, overflowing tells are discarded and land in dead letters."""
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_DROP_NEW),
    )

    # Burst far past the capacity-1 mailbox before the actor can drain it.
    for _ in range(20):
        await ref.tell('inc')

    await asyncio.sleep(0.4)
    processed = await ref.ask('get', timeout=2.0)
    await system.shutdown()

    # drop_new must have discarded at least one message...
    assert processed < 20
    # ...and the discards must be observable in the dead-letter queue.
    assert len(system.dead_letters) > 0
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_memory_mailbox_fail_policy_rejects_ask_when_full():
    """Under the fail policy, ask() on a full mailbox raises MailboxFullError."""
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_FAIL),
    )

    # Occupy the single slot via tell so subsequent asks contend for space.
    await ref.tell('inc')

    rejected = False
    for _ in range(30):
        try:
            await ref.ask('inc', timeout=0.02)
        except MailboxFullError:
            rejected = True
            break
        except asyncio.TimeoutError:
            # Message was accepted but not answered in time — keep probing.
            pass

    await system.shutdown()
    assert rejected
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_memory_mailbox_block_policy_eventually_accepts():
    """The block policy must never drop tells — the sender waits for space."""
    system = ActorSystem('bp')
    ref = await system.spawn(
        SlowActor,
        'slow',
        mailbox=MemoryMailbox(1, backpressure_policy=BACKPRESSURE_BLOCK),
    )

    for _ in range(10):
        await ref.tell('inc')

    await asyncio.sleep(0.25)
    processed = await ref.ask('get', timeout=2.0)
    await system.shutdown()

    # All 10 messages must survive: blocking applies backpressure, not loss.
    assert processed == 10
|
||||
62
backend/tests/test_actor_retry.py
Normal file
62
backend/tests/test_actor_retry.py
Normal file
@ -0,0 +1,62 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from deerflow.actor import Actor, ActorSystem, IdempotentActorMixin, RetryEnvelope, ask_with_retry
|
||||
|
||||
|
||||
class FlakyIdempotentActor(IdempotentActorMixin, Actor):
    """Actor whose very first 'flaky' request is slow; all others answer fast."""

    async def on_started(self):
        # Counts invocations of the real handler, i.e. idempotency cache misses.
        self.calls = 0

    async def on_receive(self, message):
        # Delegate to the mixin so RetryEnvelope-wrapped messages are deduped.
        return await self.handle_idempotent(message, self._handle)

    async def _handle(self, payload):
        self.calls += 1
        if self.calls == 1 and payload == 'flaky':
            # Simulate a slow first attempt that outlives short ask timeouts.
            await asyncio.sleep(0.02)
            return 'late'
        return f"ok:{payload}"
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_ask_with_retry_timeout_raises():
    """ask_with_retry ultimately re-raises TimeoutError when attempts keep timing out."""
    system = ActorSystem('retry')
    ref = await system.spawn(FlakyIdempotentActor, 'a')

    # Each attempt's 5ms budget is shorter than the actor's 20ms 'flaky' first
    # reply; the helper retries on timeout, but if every attempt expires it
    # must surface the TimeoutError to the caller.
    with pytest.raises(asyncio.TimeoutError):
        await ask_with_retry(
            ref,
            'flaky',
            timeout=0.005,
            max_attempts=3,
            base_backoff_s=0.001,
            max_backoff_s=0.005,
            jitter_ratio=0.0,
            idempotency_key='k1',
        )

    # Timing out the caller must not have killed the actor itself.
    assert ref.is_alive
    await system.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_idempotent_envelope_returns_cached_result():
    """Two envelopes sharing an idempotency key run the handler exactly once."""
    system = ActorSystem('retry')
    ref = await system.spawn(FlakyIdempotentActor, 'a')

    first = RetryEnvelope.wrap('x', idempotency_key='same-key')
    retry = RetryEnvelope.wrap('x', idempotency_key='same-key', attempt=2, max_attempts=3)

    # Both asks observe the same answer...
    assert await ref.ask(first, timeout=1.0) == 'ok:x'
    assert await ref.ask(retry, timeout=1.0) == 'ok:x'

    # ...but the second must be served from the idempotency cache, so the
    # underlying handler only ever ran for the first envelope.
    actor = ref._cell.actor
    assert actor.calls == 1

    await system.shutdown()
|
||||
83
backend/tests/test_mailbox_redis.py
Normal file
83
backend/tests/test_mailbox_redis.py
Normal file
@ -0,0 +1,83 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
redis = pytest.importorskip("redis.asyncio")
|
||||
|
||||
from deerflow.actor.mailbox_redis import RedisMailbox
|
||||
from deerflow.actor.ref import _Envelope, _Stop
|
||||
|
||||
|
||||
pytestmark = pytest.mark.anyio
|
||||
|
||||
|
||||
async def _make_mailbox(queue_name: str, *, maxlen: int = 0) -> RedisMailbox:
    """Connect to a local Redis, clear *queue_name*, and return a mailbox on it.

    A short brpop timeout keeps get() loops responsive in tests. Skipped
    implicitly at import time if redis.asyncio is unavailable (importorskip).
    """
    client = redis.Redis(host="127.0.0.1", port=6379, decode_responses=False)
    await client.ping()              # fail fast if no local Redis is running
    await client.delete(queue_name)  # start every test from an empty queue
    return RedisMailbox(client.connection_pool, queue_name, maxlen=maxlen, brpop_timeout=0.2)
|
||||
|
||||
|
||||
async def test_roundtrip_envelope_and_stop():
    """An _Envelope and a _Stop survive a put/get round trip through Redis."""
    queue = "deerflow:test:redis-mailbox:roundtrip"
    mailbox = await _make_mailbox(queue)
    try:
        sent = _Envelope(payload={"k": "v"}, correlation_id="c1", reply_to="sysA")
        assert await mailbox.put(sent) is True

        # Every field must survive serialization to and from the Redis list.
        received = await mailbox.get()
        assert isinstance(received, _Envelope)
        assert received.payload == {"k": "v"}
        assert received.correlation_id == "c1"
        assert received.reply_to == "sysA"

        # Control messages must round-trip as well, preserving their type.
        assert await mailbox.put(_Stop()) is True
        assert isinstance(await mailbox.get(), _Stop)
    finally:
        await mailbox.close()
|
||||
|
||||
|
||||
async def test_bounded_queue_rejects_when_full():
    """A maxlen=1 mailbox accepts exactly one message and refuses the next."""
    queue = "deerflow:test:redis-mailbox:bounded"
    mailbox = await _make_mailbox(queue, maxlen=1)
    try:
        assert await mailbox.put(_Envelope("m1")) is True
        # The second put must be rejected rather than silently overflowing.
        assert await mailbox.put(_Envelope("m2")) is False
    finally:
        await mailbox.close()
|
||||
|
||||
|
||||
async def test_put_nowait_and_get_nowait_contract():
    """Redis mailboxes cannot satisfy the synchronous fast-path contract."""
    queue = "deerflow:test:redis-mailbox:nowait"
    mailbox = await _make_mailbox(queue)
    try:
        # put_nowait signals "use the async path instead" by returning False.
        assert mailbox.put_nowait(_Envelope("x")) is False
        # get_nowait has no sensible synchronous semantics and must raise.
        with pytest.raises(Exception, match="does not support synchronous get_nowait"):
            mailbox.get_nowait()
    finally:
        await mailbox.close()
|
||||
|
||||
|
||||
async def test_system_enqueue_fallback_with_async_mailbox():
    """ask() works through a Redis mailbox via the cell's async enqueue fallback."""
    from deerflow.actor import Actor, ActorSystem

    class EchoActor(Actor):
        async def on_receive(self, message):
            return message

    queue = "deerflow:test:redis-mailbox:system-fallback"
    mailbox = await _make_mailbox(queue)

    system = ActorSystem("redis-test")
    ref = await system.spawn(EchoActor, "echo", mailbox=mailbox)
    try:
        # Exercises the _ActorCell.enqueue fallback path: put_nowait() returns
        # False for a Redis mailbox, forcing the awaited put() to be used.
        assert await ref.ask("hello", timeout=3.0) == "hello"
    finally:
        await system.shutdown()
|
||||
4535
backend/uv.lock
generated
4535
backend/uv.lock
generated
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user