diff --git a/backend/packages/harness/deerflow/community/jina_ai/jina_client.py b/backend/packages/harness/deerflow/community/jina_ai/jina_client.py index c4fc1ac81..8c79e0569 100644 --- a/backend/packages/harness/deerflow/community/jina_ai/jina_client.py +++ b/backend/packages/harness/deerflow/community/jina_ai/jina_client.py @@ -9,7 +9,7 @@ _api_key_warned = False class JinaClient: - async def crawl(self, url: str, return_format: str = "html", timeout: int = 10) -> str: + async def crawl(self, url: str, return_format: str = "html", timeout: int = 10, proxy: str | None = None, trust_env: bool = True) -> str: global _api_key_warned headers = { "Content-Type": "application/json", @@ -23,7 +23,10 @@ class JinaClient: logger.warning("Jina API key is not set. Provide your own key to access a higher rate limit. See https://jina.ai/reader for more information.") data = {"url": url} try: - async with httpx.AsyncClient() as client: + client_kwargs: dict[str, object] = {"trust_env": trust_env} + if proxy: + client_kwargs["proxy"] = proxy + async with httpx.AsyncClient(**client_kwargs) as client: response = await client.post("https://r.jina.ai/", headers=headers, json=data, timeout=timeout) if response.status_code != 200: diff --git a/backend/packages/harness/deerflow/community/jina_ai/tools.py b/backend/packages/harness/deerflow/community/jina_ai/tools.py index 760e6a3b6..81c8370d6 100644 --- a/backend/packages/harness/deerflow/community/jina_ai/tools.py +++ b/backend/packages/harness/deerflow/community/jina_ai/tools.py @@ -9,6 +9,38 @@ from deerflow.utils.readability import ReadabilityExtractor readability_extractor = ReadabilityExtractor() +def _coerce_bool(value: object, default: bool) -> bool: + if isinstance(value, bool): + return value + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"1", "true", "yes", "on"}: + return True + if normalized in {"0", "false", "no", "off"}: + return False + return default + + +def 
_coerce_timeout(value: object, default: int) -> int: + if isinstance(value, bool): + return default + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return default + return default + + +def _coerce_proxy(value: object) -> str | None: + if not isinstance(value, str): + return None + proxy = value.strip() + return proxy or None + + @tool("web_fetch", parse_docstring=True) async def web_fetch_tool(url: str) -> str: """Fetch the contents of a web page at a given URL. @@ -22,10 +54,14 @@ async def web_fetch_tool(url: str) -> str: """ jina_client = JinaClient() timeout = 10 + proxy = None + trust_env = True config = get_app_config().get_tool_config("web_fetch") - if config is not None and "timeout" in config.model_extra: - timeout = config.model_extra.get("timeout") - html_content = await jina_client.crawl(url, return_format="html", timeout=timeout) + if config is not None: + timeout = _coerce_timeout(config.model_extra.get("timeout"), timeout) + proxy = _coerce_proxy(config.model_extra.get("proxy")) + trust_env = _coerce_bool(config.model_extra.get("trust_env"), trust_env) + html_content = await jina_client.crawl(url, return_format="html", timeout=timeout, proxy=proxy, trust_env=trust_env) if isinstance(html_content, str) and html_content.startswith("Error:"): return html_content article = await asyncio.to_thread(readability_extractor.extract_article, html_content) diff --git a/backend/tests/test_jina_client.py b/backend/tests/test_jina_client.py index b1856e4ae..b9845260c 100644 --- a/backend/tests/test_jina_client.py +++ b/backend/tests/test_jina_client.py @@ -8,7 +8,12 @@ import pytest import deerflow.community.jina_ai.jina_client as jina_client_module from deerflow.community.jina_ai.jina_client import JinaClient -from deerflow.community.jina_ai.tools import web_fetch_tool +from deerflow.community.jina_ai.tools import ( + _coerce_bool, + _coerce_proxy, + _coerce_timeout, + web_fetch_tool, +) 
@pytest.fixture @@ -117,6 +122,59 @@ async def test_crawl_passes_headers(jina_client, monkeypatch): assert captured_headers["X-Timeout"] == "30" +@pytest.mark.anyio +async def test_crawl_passes_proxy_to_httpx_client(jina_client, monkeypatch): + """Explicit proxy config should be passed to httpx.AsyncClient.""" + captured_client_kwargs = {} + + class MockAsyncClient: + def __init__(self, **kwargs): + captured_client_kwargs.update(kwargs) + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return None + + async def post(self, url, **kwargs): + return httpx.Response(200, text="ok", request=httpx.Request("POST", url)) + + monkeypatch.setattr(httpx, "AsyncClient", MockAsyncClient) + + result = await jina_client.crawl("https://example.com", proxy="http://127.0.0.1:7890") + + assert result == "ok" + assert captured_client_kwargs["proxy"] == "http://127.0.0.1:7890" + assert captured_client_kwargs["trust_env"] is True + + +@pytest.mark.anyio +async def test_crawl_can_disable_trust_env(jina_client, monkeypatch): + """Callers can disable environment proxy lookup for deterministic networking.""" + captured_client_kwargs = {} + + class MockAsyncClient: + def __init__(self, **kwargs): + captured_client_kwargs.update(kwargs) + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return None + + async def post(self, url, **kwargs): + return httpx.Response(200, text="ok", request=httpx.Request("POST", url)) + + monkeypatch.setattr(httpx, "AsyncClient", MockAsyncClient) + + result = await jina_client.crawl("https://example.com", trust_env=False) + + assert result == "ok" + assert captured_client_kwargs == {"trust_env": False} + + @pytest.mark.anyio async def test_crawl_includes_api_key_when_set(jina_client, monkeypatch): """Test that Authorization header is set when JINA_API_KEY is available.""" @@ -199,6 +257,60 @@ async def test_web_fetch_tool_returns_markdown_on_success(monkeypatch): assert 
not result.startswith("Error:") +@pytest.mark.anyio +async def test_web_fetch_tool_forwards_proxy_and_trust_env(monkeypatch): + """web_fetch tool config should be forwarded to JinaClient.crawl.""" + captured_crawl_kwargs = {} + + async def mock_crawl(self, url, **kwargs): + captured_crawl_kwargs.update(kwargs) + return "

Hello world

" + + mock_config = MagicMock() + mock_tool_config = MagicMock() + mock_tool_config.model_extra = { + "timeout": "20", + "proxy": "http://host.docker.internal:7890", + "trust_env": "false", + } + mock_config.get_tool_config.return_value = mock_tool_config + monkeypatch.setattr("deerflow.community.jina_ai.tools.get_app_config", lambda: mock_config) + monkeypatch.setattr(JinaClient, "crawl", mock_crawl) + + result = await web_fetch_tool.ainvoke("https://example.com") + + assert "Hello world" in result + assert captured_crawl_kwargs == { + "return_format": "html", + "timeout": 20, + "proxy": "http://host.docker.internal:7890", + "trust_env": False, + } + + +@pytest.mark.anyio +async def test_web_fetch_tool_ignores_empty_proxy(monkeypatch): + """Empty proxy values from unresolved env vars should not be passed to httpx.""" + captured_crawl_kwargs = {} + + async def mock_crawl(self, url, **kwargs): + captured_crawl_kwargs.update(kwargs) + return "

Hello world

" + + mock_config = MagicMock() + mock_tool_config = MagicMock() + mock_tool_config.model_extra = {"proxy": " ", "trust_env": True} + mock_config.get_tool_config.return_value = mock_tool_config + monkeypatch.setattr("deerflow.community.jina_ai.tools.get_app_config", lambda: mock_config) + monkeypatch.setattr(JinaClient, "crawl", mock_crawl) + + result = await web_fetch_tool.ainvoke("https://example.com") + + assert "Hello world" in result + assert captured_crawl_kwargs["proxy"] is None + assert captured_crawl_kwargs["trust_env"] is True + + @pytest.mark.anyio async def test_web_fetch_tool_offloads_extraction_to_thread(monkeypatch): """Test that readability extraction is offloaded via asyncio.to_thread to avoid blocking the event loop.""" @@ -224,3 +336,60 @@ async def test_web_fetch_tool_offloads_extraction_to_thread(monkeypatch): result = await web_fetch_tool.ainvoke("https://example.com") assert to_thread_called, "extract_article must be called via asyncio.to_thread to avoid blocking the event loop" assert "threaded" in result + + +@pytest.mark.parametrize( + ("value", "default", "expected"), + [ + (True, False, True), + (False, True, False), + ("true", False, True), + ("YES", False, True), + (" on ", False, True), + ("1", False, True), + ("false", True, False), + ("No", True, False), + ("off", True, False), + ("0", True, False), + ("maybe", True, True), + ("maybe", False, False), + (None, True, True), + (123, False, False), + ], +) +def test_coerce_bool(value, default, expected): + """_coerce_bool normalizes booleans, known strings, and falls back to the default.""" + assert _coerce_bool(value, default) is expected + + +@pytest.mark.parametrize( + ("value", "default", "expected"), + [ + (30, 10, 30), + ("45", 10, 45), + ("not-a-number", 10, 10), + (True, 10, 10), + (False, 10, 10), + (None, 10, 10), + (1.5, 10, 10), + ], +) +def test_coerce_timeout(value, default, expected): + """_coerce_timeout accepts ints and numeric strings, rejecting bools and junk.""" + 
assert _coerce_timeout(value, default) == expected + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ("http://127.0.0.1:7890", "http://127.0.0.1:7890"), + (" http://proxy:8080 ", "http://proxy:8080"), + ("", None), + (" ", None), + (None, None), + (123, None), + ], +) +def test_coerce_proxy(value, expected): + """_coerce_proxy trims strings and treats empty/non-string values as None.""" + assert _coerce_proxy(value) == expected diff --git a/config.example.yaml b/config.example.yaml index 1c7d2a115..5de11e226 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -537,6 +537,10 @@ tools: group: web use: deerflow.community.jina_ai.tools:web_fetch_tool timeout: 10 + # Optional proxy for restricted networks / Docker / WSL. + # Use host.docker.internal instead of 127.0.0.1 when the proxy runs on the host. + # proxy: $HTTPS_PROXY + # trust_env: true # Web fetch tool (uses InfoQuest) # - name: web_fetch diff --git a/docker/docker-compose-dev.yaml b/docker/docker-compose-dev.yaml index 233d22c55..d36981a27 100644 --- a/docker/docker-compose-dev.yaml +++ b/docker/docker-compose-dev.yaml @@ -172,6 +172,10 @@ services: - DEER_FLOW_HOST_BASE_DIR=${DEER_FLOW_ROOT}/backend/.deer-flow - DEER_FLOW_HOST_SKILLS_PATH=${DEER_FLOW_ROOT}/skills - DEER_FLOW_SANDBOX_HOST=host.docker.internal + # Proxy values (HTTP_PROXY/HTTPS_PROXY/ALL_PROXY) are inherited from ../.env via env_file. + # Only NO_PROXY is declared here so internal service hostnames are always exempt from the proxy. 
+ - NO_PROXY=${NO_PROXY:-}${NO_PROXY:+,}localhost,127.0.0.1,::1,gateway,frontend,nginx,provisioner,host.docker.internal + - no_proxy=${no_proxy:-}${no_proxy:+,}localhost,127.0.0.1,::1,gateway,frontend,nginx,provisioner,host.docker.internal env_file: - ../.env extra_hosts: diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 169e8f3d9..7455902e9 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -107,6 +107,10 @@ services: - DEER_FLOW_HOST_BASE_DIR=${DEER_FLOW_HOME} - DEER_FLOW_HOST_SKILLS_PATH=${DEER_FLOW_REPO_ROOT}/skills - DEER_FLOW_SANDBOX_HOST=host.docker.internal + # Proxy values (HTTP_PROXY/HTTPS_PROXY/ALL_PROXY) are inherited from ../.env via env_file. + # Only NO_PROXY is declared here so internal service hostnames are always exempt from the proxy. + - NO_PROXY=${NO_PROXY:-}${NO_PROXY:+,}localhost,127.0.0.1,::1,gateway,frontend,nginx,provisioner,host.docker.internal + - no_proxy=${no_proxy:-}${no_proxy:+,}localhost,127.0.0.1,::1,gateway,frontend,nginx,provisioner,host.docker.internal env_file: - ../.env extra_hosts: diff --git a/scripts/docker.sh b/scripts/docker.sh index 6b37b6b51..9db74a4f5 100755 --- a/scripts/docker.sh +++ b/scripts/docker.sh @@ -15,6 +15,34 @@ DOCKER_DIR="$PROJECT_ROOT/docker" # Docker Compose command with project name COMPOSE_CMD="docker compose -p deer-flow-dev -f docker-compose-dev.yaml" +# Export proxy variables found in $PROJECT_ROOT/.env unless the calling shell already sets them (an empty value counts as set). +load_proxy_env_from_dotenv() { + local env_file="$PROJECT_ROOT/.env" + local var + local line + local value + + if [ ! 
-f "$env_file" ]; then + return + fi + + for var in HTTP_PROXY HTTPS_PROXY ALL_PROXY NO_PROXY http_proxy https_proxy all_proxy no_proxy; do + if [ -z "${!var+x}" ]; then + line="$(grep -E "^[[:space:]]*${var}=" "$env_file" | tail -n 1 || true)" + if [ -n "$line" ]; then + value="${line#*=}" + # Drop a trailing CR (CRLF .env files) before unquoting; otherwise the closing quote is not the last character and survives. + value="${value%$'\r'}" + value="${value%\"}" + value="${value#\"}" + value="${value%\'}" + value="${value#\'}" + export "${var}=${value}" + fi + fi + done +} + detect_sandbox_mode() { local config_file="$PROJECT_ROOT/config.yaml" local sandbox_use="" @@ -220,6 +248,8 @@ start() { fi fi + load_proxy_env_from_dotenv + echo "Building and starting containers..." cd "$DOCKER_DIR" && $COMPOSE_CMD up --build -d --remove-orphans $services echo ""