From 90299e2710bf82079e6db5c88e227f75e75913fb Mon Sep 17 00:00:00 2001 From: Willem Jiang Date: Fri, 10 Apr 2026 20:40:30 +0800 Subject: [PATCH] feat(provisioner): add optional PVC support for sandbox volumes (#2020) * feat(provisioner): add optional PVC support for sandbox volumes (#1978) Add SKILLS_PVC_NAME and USERDATA_PVC_NAME env vars to allow sandbox Pods to use PersistentVolumeClaims instead of hostPath volumes. This prevents data loss in production when pods are rescheduled across nodes. When USERDATA_PVC_NAME is set, a subPath of threads/{thread_id}/user-data is used so a single PVC can serve multiple threads. Falls back to hostPath when the new env vars are not set, preserving backward compatibility. * add unit test for provisioner pvc volumes * refactor: extract shared provisioner_module fixture to conftest.py Agent-Logs-Url: https://github.com/bytedance/deer-flow/sessions/e7ccf708-c6ba-40e4-844a-b526bdb249dd Co-authored-by: WillemJiang <219644+WillemJiang@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: JeffJiang --- backend/tests/conftest.py | 21 +++ backend/tests/test_provisioner_kubeconfig.py | 30 +--- backend/tests/test_provisioner_pvc_volumes.py | 158 ++++++++++++++++++ docker/docker-compose-dev.yaml | 5 + docker/provisioner/README.md | 6 +- docker/provisioner/app.py | 90 ++++++---- 6 files changed, 255 insertions(+), 55 deletions(-) create mode 100644 backend/tests/test_provisioner_pvc_volumes.py diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index eb9703d45..997f42577 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -4,10 +4,13 @@ Sets up sys.path and pre-mocks modules that would cause circular import issues when unit-testing lightweight config/registry code in isolation. """ +import importlib.util import sys from pathlib import Path from unittest.mock import MagicMock +import pytest + # Make 'app' and 'deerflow' importable from any working directory sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts")) @@ -32,3 +35,21 @@ _executor_mock.MAX_CONCURRENT_SUBAGENTS = 3 _executor_mock.get_background_task_result = MagicMock() sys.modules["deerflow.subagents.executor"] = _executor_mock + + +@pytest.fixture() +def provisioner_module(): + """Load docker/provisioner/app.py as an importable test module. + + Shared by test_provisioner_kubeconfig and test_provisioner_pvc_volumes so + that any change to the provisioner entry-point path or module name only + needs to be updated in one place. + """ + repo_root = Path(__file__).resolve().parents[2] + module_path = repo_root / "docker" / "provisioner" / "app.py" + spec = importlib.util.spec_from_file_location("provisioner_app_test", module_path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module diff --git a/backend/tests/test_provisioner_kubeconfig.py b/backend/tests/test_provisioner_kubeconfig.py index ebc944eac..cbfa50dcb 100644 --- a/backend/tests/test_provisioner_kubeconfig.py +++ b/backend/tests/test_provisioner_kubeconfig.py @@ -2,25 +2,9 @@ from __future__ import annotations -import importlib.util -from pathlib import Path - -def _load_provisioner_module(): - """Load docker/provisioner/app.py as an importable test module.""" - repo_root = Path(__file__).resolve().parents[2] - module_path = repo_root / "docker" / "provisioner" / "app.py" - spec = importlib.util.spec_from_file_location("provisioner_app_test", module_path) - assert spec is not None - assert spec.loader is not None - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def test_wait_for_kubeconfig_rejects_directory(tmp_path): +def test_wait_for_kubeconfig_rejects_directory(tmp_path, provisioner_module): """Directory mount at kubeconfig path should fail fast with clear error.""" - provisioner_module = _load_provisioner_module() kubeconfig_dir = tmp_path / "config_dir" kubeconfig_dir.mkdir() @@ -33,9 +17,8 @@ def test_wait_for_kubeconfig_rejects_directory(tmp_path): assert "directory" in str(exc) -def test_wait_for_kubeconfig_accepts_file(tmp_path): +def test_wait_for_kubeconfig_accepts_file(tmp_path, provisioner_module): """Regular file mount should pass readiness wait.""" - provisioner_module = _load_provisioner_module() kubeconfig_file = tmp_path / "config" kubeconfig_file.write_text("apiVersion: v1\n") @@ -45,9 +28,8 @@ def test_wait_for_kubeconfig_accepts_file(tmp_path): provisioner_module._wait_for_kubeconfig(timeout=1) -def test_init_k8s_client_rejects_directory_path(tmp_path): +def test_init_k8s_client_rejects_directory_path(tmp_path, provisioner_module): """KUBECONFIG_PATH that resolves to a directory should be rejected.""" - provisioner_module = _load_provisioner_module() kubeconfig_dir = tmp_path / "config_dir" kubeconfig_dir.mkdir() @@ -60,9 +42,8 @@ def test_init_k8s_client_rejects_directory_path(tmp_path): assert "expected a file" in str(exc) -def test_init_k8s_client_uses_file_kubeconfig(tmp_path, monkeypatch): +def test_init_k8s_client_uses_file_kubeconfig(tmp_path, monkeypatch, provisioner_module): """When file exists, provisioner should load kubeconfig file path.""" - provisioner_module = _load_provisioner_module() kubeconfig_file = tmp_path / "config" kubeconfig_file.write_text("apiVersion: v1\n") @@ -90,9 +71,8 @@ def test_init_k8s_client_uses_file_kubeconfig(tmp_path, monkeypatch): assert result == "core-v1" -def test_init_k8s_client_falls_back_to_incluster_when_missing(tmp_path, monkeypatch): +def test_init_k8s_client_falls_back_to_incluster_when_missing(tmp_path, monkeypatch, provisioner_module): """When kubeconfig file is missing, in-cluster config should be attempted.""" - provisioner_module = _load_provisioner_module() missing_path = tmp_path / "missing-config" calls: dict[str, int] = {"incluster": 0} diff --git a/backend/tests/test_provisioner_pvc_volumes.py b/backend/tests/test_provisioner_pvc_volumes.py new file mode 100644 index 000000000..5566f63bd --- /dev/null +++ b/backend/tests/test_provisioner_pvc_volumes.py @@ -0,0 +1,158 @@ +"""Regression tests for provisioner PVC volume support.""" + + +# ── _build_volumes ───────────────────────────────────────────────────── + + +class TestBuildVolumes: + """Tests for _build_volumes: PVC vs hostPath selection.""" + + def test_default_uses_hostpath_for_skills(self, provisioner_module): + """When SKILLS_PVC_NAME is empty, skills volume should use hostPath.""" + provisioner_module.SKILLS_PVC_NAME = "" + volumes = provisioner_module._build_volumes("thread-1") + skills_vol = volumes[0] + assert skills_vol.host_path is not None + assert skills_vol.host_path.path == provisioner_module.SKILLS_HOST_PATH + assert skills_vol.host_path.type == "Directory" + assert skills_vol.persistent_volume_claim is None + + def test_default_uses_hostpath_for_userdata(self, provisioner_module): + """When USERDATA_PVC_NAME is empty, user-data volume should use hostPath.""" + provisioner_module.USERDATA_PVC_NAME = "" + volumes = provisioner_module._build_volumes("thread-1") + userdata_vol = volumes[1] + assert userdata_vol.host_path is not None + assert userdata_vol.persistent_volume_claim is None + + def test_hostpath_userdata_includes_thread_id(self, provisioner_module): + """hostPath user-data path should include thread_id.""" + provisioner_module.USERDATA_PVC_NAME = "" + volumes = provisioner_module._build_volumes("my-thread-42") + userdata_vol = volumes[1] + path = userdata_vol.host_path.path + assert "my-thread-42" in path + assert path.endswith("user-data") + assert userdata_vol.host_path.type == "DirectoryOrCreate" + + def test_skills_pvc_overrides_hostpath(self, provisioner_module): + """When SKILLS_PVC_NAME is set, skills volume should use PVC.""" + provisioner_module.SKILLS_PVC_NAME = "my-skills-pvc" + volumes = provisioner_module._build_volumes("thread-1") + skills_vol = volumes[0] + assert skills_vol.persistent_volume_claim is not None + assert skills_vol.persistent_volume_claim.claim_name == "my-skills-pvc" + assert skills_vol.persistent_volume_claim.read_only is True + assert skills_vol.host_path is None + + def test_userdata_pvc_overrides_hostpath(self, provisioner_module): + """When USERDATA_PVC_NAME is set, user-data volume should use PVC.""" + provisioner_module.USERDATA_PVC_NAME = "my-userdata-pvc" + volumes = provisioner_module._build_volumes("thread-1") + userdata_vol = volumes[1] + assert userdata_vol.persistent_volume_claim is not None + assert userdata_vol.persistent_volume_claim.claim_name == "my-userdata-pvc" + assert userdata_vol.host_path is None + + def test_both_pvc_set(self, provisioner_module): + """When both PVC names are set, both volumes use PVC.""" + provisioner_module.SKILLS_PVC_NAME = "skills-pvc" + provisioner_module.USERDATA_PVC_NAME = "userdata-pvc" + volumes = provisioner_module._build_volumes("thread-1") + assert volumes[0].persistent_volume_claim is not None + assert volumes[1].persistent_volume_claim is not None + + def test_returns_two_volumes(self, provisioner_module): + """Should always return exactly two volumes.""" + provisioner_module.SKILLS_PVC_NAME = "" + provisioner_module.USERDATA_PVC_NAME = "" + assert len(provisioner_module._build_volumes("t")) == 2 + + provisioner_module.SKILLS_PVC_NAME = "a" + provisioner_module.USERDATA_PVC_NAME = "b" + assert len(provisioner_module._build_volumes("t")) == 2 + + def test_volume_names_are_stable(self, provisioner_module): + """Volume names must stay 'skills' and 'user-data'.""" + volumes = provisioner_module._build_volumes("thread-1") + assert volumes[0].name == "skills" + assert volumes[1].name == "user-data" + + +# ── _build_volume_mounts ─────────────────────────────────────────────── + + +class TestBuildVolumeMounts: + """Tests for _build_volume_mounts: mount paths and subPath behavior.""" + + def test_default_no_subpath(self, provisioner_module): + """hostPath mode should not set sub_path on user-data mount.""" + provisioner_module.USERDATA_PVC_NAME = "" + mounts = provisioner_module._build_volume_mounts("thread-1") + userdata_mount = mounts[1] + assert userdata_mount.sub_path is None + + def test_pvc_sets_subpath(self, provisioner_module): + """PVC mode should set sub_path to threads/{thread_id}/user-data.""" + provisioner_module.USERDATA_PVC_NAME = "my-pvc" + mounts = provisioner_module._build_volume_mounts("thread-42") + userdata_mount = mounts[1] + assert userdata_mount.sub_path == "threads/thread-42/user-data" + + def test_skills_mount_read_only(self, provisioner_module): + """Skills mount should always be read-only.""" + mounts = provisioner_module._build_volume_mounts("thread-1") + assert mounts[0].read_only is True + + def test_userdata_mount_read_write(self, provisioner_module): + """User-data mount should always be read-write.""" + mounts = provisioner_module._build_volume_mounts("thread-1") + assert mounts[1].read_only is False + + def test_mount_paths_are_stable(self, provisioner_module): + """Mount paths must stay /mnt/skills and /mnt/user-data.""" + mounts = provisioner_module._build_volume_mounts("thread-1") + assert mounts[0].mount_path == "/mnt/skills" + assert mounts[1].mount_path == "/mnt/user-data" + + def test_mount_names_match_volumes(self, provisioner_module): + """Mount names should match the volume names.""" + mounts = provisioner_module._build_volume_mounts("thread-1") + assert mounts[0].name == "skills" + assert mounts[1].name == "user-data" + + def test_returns_two_mounts(self, provisioner_module): + """Should always return exactly two mounts.""" + assert len(provisioner_module._build_volume_mounts("t")) == 2 + + +# ── _build_pod integration ───────────────────────────────────────────── + + +class TestBuildPodVolumes: + """Integration: _build_pod should wire volumes and mounts correctly.""" + + def test_pod_spec_has_volumes(self, provisioner_module): + """Pod spec should contain exactly 2 volumes.""" + provisioner_module.SKILLS_PVC_NAME = "" + provisioner_module.USERDATA_PVC_NAME = "" + pod = provisioner_module._build_pod("sandbox-1", "thread-1") + assert len(pod.spec.volumes) == 2 + + def test_pod_spec_has_volume_mounts(self, provisioner_module): + """Container should have exactly 2 volume mounts.""" + provisioner_module.SKILLS_PVC_NAME = "" + provisioner_module.USERDATA_PVC_NAME = "" + pod = provisioner_module._build_pod("sandbox-1", "thread-1") + assert len(pod.spec.containers[0].volume_mounts) == 2 + + def test_pod_pvc_mode(self, provisioner_module): + """Pod should use PVC volumes when PVC names are configured.""" + provisioner_module.SKILLS_PVC_NAME = "skills-pvc" + provisioner_module.USERDATA_PVC_NAME = "userdata-pvc" + pod = provisioner_module._build_pod("sandbox-1", "thread-1") + assert pod.spec.volumes[0].persistent_volume_claim is not None + assert pod.spec.volumes[1].persistent_volume_claim is not None + # subPath should be set on user-data mount + userdata_mount = pod.spec.containers[0].volume_mounts[1] + assert userdata_mount.sub_path == "threads/thread-1/user-data" diff --git a/docker/docker-compose-dev.yaml b/docker/docker-compose-dev.yaml index 285f89779..87d19abbe 100644 --- a/docker/docker-compose-dev.yaml +++ b/docker/docker-compose-dev.yaml @@ -36,6 +36,11 @@ services: # export DEER_FLOW_ROOT=/absolute/path/to/deer-flow - SKILLS_HOST_PATH=${DEER_FLOW_ROOT}/skills - THREADS_HOST_PATH=${DEER_FLOW_ROOT}/backend/.deer-flow/threads + # Production: use PVC instead of hostPath to avoid data loss on node failure. + # When set, hostPath vars above are ignored for the corresponding volume. + # USERDATA_PVC_NAME uses subPath (threads/{thread_id}/user-data) automatically. + # - SKILLS_PVC_NAME=deer-flow-skills-pvc + # - USERDATA_PVC_NAME=deer-flow-userdata-pvc - KUBECONFIG_PATH=/root/.kube/config - NODE_HOST=host.docker.internal # Override K8S API server URL since kubeconfig uses 127.0.0.1 diff --git a/docker/provisioner/README.md b/docker/provisioner/README.md index f7b824358..557ad6cfd 100644 --- a/docker/provisioner/README.md +++ b/docker/provisioner/README.md @@ -137,6 +137,8 @@ The provisioner is configured via environment variables (set in [docker-compose- | `SANDBOX_IMAGE` | `enterprise-public-cn-beijing.cr.volces.com/vefaas-public/all-in-one-sandbox:latest` | Container image for sandbox Pods | | `SKILLS_HOST_PATH` | - | **Host machine** path to skills directory (must be absolute) | | `THREADS_HOST_PATH` | - | **Host machine** path to threads data directory (must be absolute) | +| `SKILLS_PVC_NAME` | empty (use hostPath) | PVC name for skills volume; when set, sandbox Pods use PVC instead of hostPath | +| `USERDATA_PVC_NAME` | empty (use hostPath) | PVC name for user-data volume; when set, uses PVC with `subPath: threads/{thread_id}/user-data` | | `KUBECONFIG_PATH` | `/root/.kube/config` | Path to kubeconfig **inside** the provisioner container | | `NODE_HOST` | `host.docker.internal` | Hostname that backend containers use to reach host NodePorts | | `K8S_API_SERVER` | (from kubeconfig) | Override K8s API server URL (e.g., `https://host.docker.internal:26443`) | @@ -309,7 +311,7 @@ docker exec deer-flow-gateway curl -s $SANDBOX_URL/v1/sandbox ## Security Considerations -1. **HostPath Volumes**: The provisioner mounts host directories into sandbox Pods. Ensure these paths contain only trusted data. +1. **HostPath Volumes**: The provisioner mounts host directories into sandbox Pods by default. Ensure these paths contain only trusted data. For production, prefer PVC-based volumes (set `SKILLS_PVC_NAME` and `USERDATA_PVC_NAME`) to avoid node-specific data loss risks. 2. **Resource Limits**: Each sandbox Pod has CPU, memory, and storage limits to prevent resource exhaustion. @@ -322,7 +324,7 @@ docker exec deer-flow-gateway curl -s $SANDBOX_URL/v1/sandbox ## Future Enhancements - [ ] Support for custom resource requests/limits per sandbox -- [ ] PersistentVolume support for larger data requirements +- [x] PersistentVolume support for larger data requirements - [ ] Automatic cleanup of stale sandboxes (timeout-based) - [ ] Metrics and monitoring (Prometheus integration) - [ ] Multi-cluster support (route to different K8s clusters) diff --git a/docker/provisioner/app.py b/docker/provisioner/app.py index ba2dbb37b..11e1e424f 100644 --- a/docker/provisioner/app.py +++ b/docker/provisioner/app.py @@ -60,6 +60,8 @@ SANDBOX_IMAGE = os.environ.get( ) SKILLS_HOST_PATH = os.environ.get("SKILLS_HOST_PATH", "/skills") THREADS_HOST_PATH = os.environ.get("THREADS_HOST_PATH", "/.deer-flow/threads") +SKILLS_PVC_NAME = os.environ.get("SKILLS_PVC_NAME", "") +USERDATA_PVC_NAME = os.environ.get("USERDATA_PVC_NAME", "") SAFE_THREAD_ID_PATTERN = r"^[A-Za-z0-9_\-]+$" # Path to the kubeconfig *inside* the provisioner container. @@ -243,6 +245,64 @@ def _sandbox_url(node_port: int) -> str: return f"http://{NODE_HOST}:{node_port}" +def _build_volumes(thread_id: str) -> list[k8s_client.V1Volume]: + """Build volume list: PVC when configured, otherwise hostPath.""" + if SKILLS_PVC_NAME: + skills_vol = k8s_client.V1Volume( + name="skills", + persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( + claim_name=SKILLS_PVC_NAME, + read_only=True, + ), + ) + else: + skills_vol = k8s_client.V1Volume( + name="skills", + host_path=k8s_client.V1HostPathVolumeSource( + path=SKILLS_HOST_PATH, + type="Directory", + ), + ) + + if USERDATA_PVC_NAME: + userdata_vol = k8s_client.V1Volume( + name="user-data", + persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource( + claim_name=USERDATA_PVC_NAME, + ), + ) + else: + userdata_vol = k8s_client.V1Volume( + name="user-data", + host_path=k8s_client.V1HostPathVolumeSource( + path=join_host_path(THREADS_HOST_PATH, thread_id, "user-data"), + type="DirectoryOrCreate", + ), + ) + + return [skills_vol, userdata_vol] + + +def _build_volume_mounts(thread_id: str) -> list[k8s_client.V1VolumeMount]: + """Build volume mount list, using subPath for PVC user-data.""" + userdata_mount = k8s_client.V1VolumeMount( + name="user-data", + mount_path="/mnt/user-data", + read_only=False, + ) + if USERDATA_PVC_NAME: + userdata_mount.sub_path = f"threads/{thread_id}/user-data" + + return [ + k8s_client.V1VolumeMount( + name="skills", + mount_path="/mnt/skills", + read_only=True, + ), + userdata_mount, + ] + + def _build_pod(sandbox_id: str, thread_id: str) -> k8s_client.V1Pod: """Construct a Pod manifest for a single sandbox.""" thread_id = _validate_thread_id(thread_id) @@ -302,40 +362,14 @@ def _build_pod(sandbox_id: str, thread_id: str) -> k8s_client.V1Pod: "ephemeral-storage": "500Mi", }, ), - volume_mounts=[ - k8s_client.V1VolumeMount( - name="skills", - mount_path="/mnt/skills", - read_only=True, - ), - k8s_client.V1VolumeMount( - name="user-data", - mount_path="/mnt/user-data", - read_only=False, - ), - ], + volume_mounts=_build_volume_mounts(thread_id), security_context=k8s_client.V1SecurityContext( privileged=False, allow_privilege_escalation=True, ), ) ], - volumes=[ - k8s_client.V1Volume( - name="skills", - host_path=k8s_client.V1HostPathVolumeSource( - path=SKILLS_HOST_PATH, - type="Directory", - ), - ), - k8s_client.V1Volume( - name="user-data", - host_path=k8s_client.V1HostPathVolumeSource( - path=join_host_path(THREADS_HOST_PATH, thread_id, "user-data"), - type="DirectoryOrCreate", - ), - ), - ], + volumes=_build_volumes(thread_id), restart_policy="Always", ), )