deer-flow/backend/tests/test_run_worker_rollback.py
luo jiyin 35f141fc48
feat: implement full checkpoint rollback on user cancellation (#1867)
* feat: implement full checkpoint rollback on user cancellation

- Capture pre-run checkpoint snapshot including checkpoint state, metadata, and pending_writes
- Add _rollback_to_pre_run_checkpoint() function to restore thread state
- Implement _call_checkpointer_method() helper to support both async and sync checkpointer methods
- Rollback now properly restores checkpoint, metadata, channel_versions, and pending_writes
- Remove obsolete TODO comment (Phase 2) as rollback is now complete

This resolves the TODO(Phase 2) comment and enables full thread state
restoration when a run is cancelled by the user.

* fix: address rollback review feedback

* fix: strengthen checkpoint rollback validation and error handling

- Validate restored_config structure and checkpoint_id before use
- Raise RuntimeError on malformed pending_writes instead of silent skip
- Normalize None checkpoint_ns to empty string instead of "None"
- Move delete_thread to only execute when pre_run_snapshot is None
- Add docstring noting non-atomic rollback as known limitation

This addresses review feedback on PR #1867 regarding data integrity
in the checkpoint rollback implementation.

* test: add comprehensive coverage for checkpoint rollback edge cases

- test_rollback_restores_snapshot_without_deleting_thread
- test_rollback_deletes_thread_when_no_snapshot_exists
- test_rollback_raises_when_restore_config_has_no_checkpoint_id
- test_rollback_normalizes_none_checkpoint_ns_to_root_namespace
- test_rollback_raises_on_malformed_pending_write_not_a_tuple
- test_rollback_raises_on_malformed_pending_write_non_string_channel
- test_rollback_propagates_aput_writes_failure

Covers all scenarios from PR #1867 review feedback.

* test: format rollback worker tests
2026-04-09 17:56:36 +08:00

215 lines
7.8 KiB
Python

from unittest.mock import AsyncMock, call
import pytest
from deerflow.runtime.runs.worker import _rollback_to_pre_run_checkpoint
class FakeCheckpointer:
def __init__(self, *, put_result):
self.adelete_thread = AsyncMock()
self.aput = AsyncMock(return_value=put_result)
self.aput_writes = AsyncMock()
@pytest.mark.anyio
async def test_rollback_restores_snapshot_without_deleting_thread():
checkpointer = FakeCheckpointer(put_result={"configurable": {"thread_id": "thread-1", "checkpoint_ns": "", "checkpoint_id": "restored-1"}})
await _rollback_to_pre_run_checkpoint(
checkpointer=checkpointer,
thread_id="thread-1",
run_id="run-1",
pre_run_checkpoint_id="ckpt-1",
pre_run_snapshot={
"checkpoint_ns": "",
"checkpoint": {
"id": "ckpt-1",
"channel_versions": {"messages": 3},
"channel_values": {"messages": ["before"]},
},
"metadata": {"source": "input"},
"pending_writes": [
("task-a", "messages", {"content": "first"}),
("task-a", "status", "done"),
("task-b", "events", {"type": "tool"}),
],
},
snapshot_capture_failed=False,
)
checkpointer.adelete_thread.assert_not_awaited()
checkpointer.aput.assert_awaited_once_with(
{"configurable": {"thread_id": "thread-1", "checkpoint_ns": ""}},
{
"id": "ckpt-1",
"channel_versions": {"messages": 3},
"channel_values": {"messages": ["before"]},
},
{"source": "input"},
{"messages": 3},
)
assert checkpointer.aput_writes.await_args_list == [
call(
{"configurable": {"thread_id": "thread-1", "checkpoint_ns": "", "checkpoint_id": "restored-1"}},
[("messages", {"content": "first"}), ("status", "done")],
task_id="task-a",
),
call(
{"configurable": {"thread_id": "thread-1", "checkpoint_ns": "", "checkpoint_id": "restored-1"}},
[("events", {"type": "tool"})],
task_id="task-b",
),
]
@pytest.mark.anyio
async def test_rollback_deletes_thread_when_no_snapshot_exists():
checkpointer = FakeCheckpointer(put_result=None)
await _rollback_to_pre_run_checkpoint(
checkpointer=checkpointer,
thread_id="thread-1",
run_id="run-1",
pre_run_checkpoint_id=None,
pre_run_snapshot=None,
snapshot_capture_failed=False,
)
checkpointer.adelete_thread.assert_awaited_once_with("thread-1")
checkpointer.aput.assert_not_awaited()
checkpointer.aput_writes.assert_not_awaited()
@pytest.mark.anyio
async def test_rollback_raises_when_restore_config_has_no_checkpoint_id():
checkpointer = FakeCheckpointer(put_result={"configurable": {"thread_id": "thread-1", "checkpoint_ns": ""}})
with pytest.raises(RuntimeError, match="did not return checkpoint_id"):
await _rollback_to_pre_run_checkpoint(
checkpointer=checkpointer,
thread_id="thread-1",
run_id="run-1",
pre_run_checkpoint_id="ckpt-1",
pre_run_snapshot={
"checkpoint_ns": "",
"checkpoint": {"id": "ckpt-1", "channel_versions": {}},
"metadata": {},
"pending_writes": [("task-a", "messages", "value")],
},
snapshot_capture_failed=False,
)
checkpointer.adelete_thread.assert_not_awaited()
checkpointer.aput.assert_awaited_once()
checkpointer.aput_writes.assert_not_awaited()
@pytest.mark.anyio
async def test_rollback_normalizes_none_checkpoint_ns_to_root_namespace():
checkpointer = FakeCheckpointer(put_result={"configurable": {"thread_id": "thread-1", "checkpoint_ns": "", "checkpoint_id": "restored-1"}})
await _rollback_to_pre_run_checkpoint(
checkpointer=checkpointer,
thread_id="thread-1",
run_id="run-1",
pre_run_checkpoint_id="ckpt-1",
pre_run_snapshot={
"checkpoint_ns": None,
"checkpoint": {"id": "ckpt-1", "channel_versions": {}},
"metadata": {},
"pending_writes": [],
},
snapshot_capture_failed=False,
)
checkpointer.aput.assert_awaited_once_with(
{"configurable": {"thread_id": "thread-1", "checkpoint_ns": ""}},
{"id": "ckpt-1", "channel_versions": {}},
{},
{},
)
@pytest.mark.anyio
async def test_rollback_raises_on_malformed_pending_write_not_a_tuple():
"""pending_writes containing a non-3-tuple item should raise RuntimeError."""
checkpointer = FakeCheckpointer(put_result={"configurable": {"thread_id": "thread-1", "checkpoint_ns": "", "checkpoint_id": "restored-1"}})
with pytest.raises(RuntimeError, match="rollback failed: pending_write is not a 3-tuple"):
await _rollback_to_pre_run_checkpoint(
checkpointer=checkpointer,
thread_id="thread-1",
run_id="run-1",
pre_run_checkpoint_id="ckpt-1",
pre_run_snapshot={
"checkpoint_ns": "",
"checkpoint": {"id": "ckpt-1", "channel_versions": {}},
"metadata": {},
"pending_writes": [
("task-a", "messages", "valid"), # valid
["only", "two"], # malformed: only 2 elements
],
},
snapshot_capture_failed=False,
)
# aput succeeded but aput_writes should not be called due to malformed data
checkpointer.aput.assert_awaited_once()
checkpointer.aput_writes.assert_not_awaited()
@pytest.mark.anyio
async def test_rollback_raises_on_malformed_pending_write_non_string_channel():
"""pending_writes containing a non-string channel should raise RuntimeError."""
checkpointer = FakeCheckpointer(put_result={"configurable": {"thread_id": "thread-1", "checkpoint_ns": "", "checkpoint_id": "restored-1"}})
with pytest.raises(RuntimeError, match="rollback failed: pending_write has non-string channel"):
await _rollback_to_pre_run_checkpoint(
checkpointer=checkpointer,
thread_id="thread-1",
run_id="run-1",
pre_run_checkpoint_id="ckpt-1",
pre_run_snapshot={
"checkpoint_ns": "",
"checkpoint": {"id": "ckpt-1", "channel_versions": {}},
"metadata": {},
"pending_writes": [
("task-a", 123, "value"), # malformed: channel is not a string
],
},
snapshot_capture_failed=False,
)
checkpointer.aput.assert_awaited_once()
checkpointer.aput_writes.assert_not_awaited()
@pytest.mark.anyio
async def test_rollback_propagates_aput_writes_failure():
"""If aput_writes fails, the exception should propagate (not be swallowed)."""
checkpointer = FakeCheckpointer(put_result={"configurable": {"thread_id": "thread-1", "checkpoint_ns": "", "checkpoint_id": "restored-1"}})
# Simulate aput_writes failure
checkpointer.aput_writes.side_effect = RuntimeError("Database connection lost")
with pytest.raises(RuntimeError, match="Database connection lost"):
await _rollback_to_pre_run_checkpoint(
checkpointer=checkpointer,
thread_id="thread-1",
run_id="run-1",
pre_run_checkpoint_id="ckpt-1",
pre_run_snapshot={
"checkpoint_ns": "",
"checkpoint": {"id": "ckpt-1", "channel_versions": {}},
"metadata": {},
"pending_writes": [
("task-a", "messages", "value"),
],
},
snapshot_capture_failed=False,
)
# aput succeeded, aput_writes was called but failed
checkpointer.aput.assert_awaited_once()
checkpointer.aput_writes.assert_awaited_once()