From 1df389b9d04d41caa56a73a0ad748f439d8b7a80 Mon Sep 17 00:00:00 2001 From: lesliewangwyc-dev Date: Mon, 13 Apr 2026 21:15:24 +0800 Subject: [PATCH] fix: wrap blocking readability call with asyncio.to_thread in web_fetch (#2157) * fix: wrap blocking readability call with asyncio.to_thread in web_fetch The readability extractor internally spawns a Node.js subprocess via readabilipy, which blocks the async event loop and causes a BlockingError when web_fetch is invoked inside LangGraph's async runtime. Wrap the synchronous extract_article call with asyncio.to_thread to offload it to a thread pool, unblocking the event loop. Note: community/infoquest/tools.py has the same latent issue and should be addressed in a follow-up PR. Closes #2152 Co-Authored-By: Claude Opus 4.6 * test: verify web_fetch offloads extraction via asyncio.to_thread Add a regression test that monkeypatches asyncio.to_thread to confirm readability extraction is offloaded to a worker thread, preventing future refactors from reintroducing the blocking call. Addresses Copilot review feedback on #2157. 
Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Willem Jiang --- .../deerflow/community/jina_ai/tools.py | 4 ++- backend/tests/test_jina_client.py | 27 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/backend/packages/harness/deerflow/community/jina_ai/tools.py b/backend/packages/harness/deerflow/community/jina_ai/tools.py index 9f243ecd9..760e6a3b6 100644 --- a/backend/packages/harness/deerflow/community/jina_ai/tools.py +++ b/backend/packages/harness/deerflow/community/jina_ai/tools.py @@ -1,3 +1,5 @@ +import asyncio + from langchain.tools import tool from deerflow.community.jina_ai.jina_client import JinaClient @@ -26,5 +28,5 @@ async def web_fetch_tool(url: str) -> str: html_content = await jina_client.crawl(url, return_format="html", timeout=timeout) if isinstance(html_content, str) and html_content.startswith("Error:"): return html_content - article = readability_extractor.extract_article(html_content) + article = await asyncio.to_thread(readability_extractor.extract_article, html_content) return article.to_markdown()[:4096] diff --git a/backend/tests/test_jina_client.py b/backend/tests/test_jina_client.py index 037436f73..5a1d6f6fa 100644 --- a/backend/tests/test_jina_client.py +++ b/backend/tests/test_jina_client.py @@ -175,3 +175,30 @@ async def test_web_fetch_tool_returns_markdown_on_success(monkeypatch): result = await web_fetch_tool.ainvoke("https://example.com") assert "Hello world" in result assert not result.startswith("Error:") + + +@pytest.mark.anyio +async def test_web_fetch_tool_offloads_extraction_to_thread(monkeypatch): + """Test that readability extraction is offloaded via asyncio.to_thread to avoid blocking the event loop.""" + import asyncio + + async def mock_crawl(self, url, **kwargs): + return "

<html><body><p>threaded</p></body></html>

" + + mock_config = MagicMock() + mock_config.get_tool_config.return_value = None + monkeypatch.setattr("deerflow.community.jina_ai.tools.get_app_config", lambda: mock_config) + monkeypatch.setattr(JinaClient, "crawl", mock_crawl) + + to_thread_called = False + original_to_thread = asyncio.to_thread + + async def tracking_to_thread(func, *args, **kwargs): + nonlocal to_thread_called + to_thread_called = True + return await original_to_thread(func, *args, **kwargs) + + monkeypatch.setattr("deerflow.community.jina_ai.tools.asyncio.to_thread", tracking_to_thread) + result = await web_fetch_tool.ainvoke("https://example.com") + assert to_thread_called, "extract_article must be called via asyncio.to_thread to avoid blocking the event loop" + assert "threaded" in result