From 10c1d9f41754401f05b5cee3704fe6b37c763404 Mon Sep 17 00:00:00 2001
From: Nan Gao
Date: Mon, 8 Jun 2026 01:59:50 +0200
Subject: [PATCH] fix(search): fix DDGS Wikipedia region handling (#3423)

---
Review note: _normalize_setting() now falls back to the default for
whitespace-only values (it previously returned ""), so it agrees with
_normalize_backend(). The change is a single-line edit; hunk line counts
are unchanged.

 .../deerflow/community/ddg_search/tools.py    | 97 ++++++++++++++++++-
 backend/tests/test_ddg_search_tools.py        | 75 ++++++++++++++
 config.example.yaml                           |  3 +
 3 files changed, 170 insertions(+), 5 deletions(-)
 create mode 100644 backend/tests/test_ddg_search_tools.py

diff --git a/backend/packages/harness/deerflow/community/ddg_search/tools.py b/backend/packages/harness/deerflow/community/ddg_search/tools.py
index 7639fe8ec..2d03b5ad0 100644
--- a/backend/packages/harness/deerflow/community/ddg_search/tools.py
+++ b/backend/packages/harness/deerflow/community/ddg_search/tools.py
@@ -11,12 +11,85 @@ from deerflow.config import get_app_config
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_BACKEND = "auto"
+DEFAULT_REGION = "wt-wt"
+DEFAULT_SAFESEARCH = "moderate"
+DEFAULT_WIKIPEDIA_REGION = "us-en"
+
+WIKIPEDIA_BACKENDS = {"auto", "all", "wikipedia"}
+WIKIPEDIA_LANGUAGE_ALIASES = {
+    "jp": "ja",
+    "kr": "ko",
+    "tzh": "zh",
+    "wt": "en",
+}
+
+
+def _normalize_backend(backend: str | list[str] | tuple[str, ...] | None) -> str:
+    if backend is None:
+        return DEFAULT_BACKEND
+    if isinstance(backend, (list, tuple)):
+        return ",".join(str(part).strip() for part in backend if str(part).strip()) or DEFAULT_BACKEND
+    return str(backend).strip() or DEFAULT_BACKEND
+
+
+def _normalize_setting(value: str | None, default: str) -> str:
+    return (str(value).strip() if value else "") or default  # unset or blank -> default
+
+
+def _backend_includes_wikipedia(backend: str | list[str] | tuple[str, ...] | None) -> bool:
+    backend = _normalize_backend(backend)
+    return any(part.strip().lower() in WIKIPEDIA_BACKENDS for part in backend.split(","))
+
+
+def _contains_codepoint(query: str, ranges: tuple[tuple[int, int], ...]) -> bool:
+    return any(start <= ord(char) <= end for char in query for start, end in ranges)
+
+
+def _infer_wikipedia_region(query: str) -> str:
+    """Pick a valid Wikipedia language region when DDGS' worldwide region is used."""
+    if _contains_codepoint(query, ((0x3040, 0x30FF), (0x31F0, 0x31FF))):
+        return "jp-ja"
+    if _contains_codepoint(query, ((0xAC00, 0xD7AF), (0x1100, 0x11FF), (0x3130, 0x318F))):
+        return "kr-ko"
+    if _contains_codepoint(query, ((0x3400, 0x9FFF),)):
+        return "cn-zh"
+    if _contains_codepoint(query, ((0x0400, 0x04FF),)):
+        return "ru-ru"
+    if _contains_codepoint(query, ((0x0370, 0x03FF),)):
+        return "gr-el"
+    if _contains_codepoint(query, ((0x0590, 0x05FF),)):
+        return "il-he"
+    if _contains_codepoint(query, ((0x0600, 0x06FF),)):
+        return "xa-ar"
+    return DEFAULT_WIKIPEDIA_REGION
+
+
+def _resolve_ddgs_region(query: str, region: str | None, backend: str | list[str] | tuple[str, ...] | None) -> str:
+    """
+    DDGS' wikipedia engine treats the second part of region as a Wikipedia
+    subdomain. Its default worldwide region, wt-wt, becomes wt.wikipedia.org.
+    """
+    normalized_region = _normalize_setting(region, DEFAULT_REGION).lower()
+    if not _backend_includes_wikipedia(backend):
+        return normalized_region
+
+    if normalized_region == DEFAULT_REGION:
+        return _infer_wikipedia_region(query)
+
+    if "-" not in normalized_region:
+        return DEFAULT_WIKIPEDIA_REGION
+
+    country, language = normalized_region.split("-", 1)
+    return f"{country}-{WIKIPEDIA_LANGUAGE_ALIASES.get(language, language)}"
+
 
 def _search_text(
     query: str,
     max_results: int = 5,
-    region: str = "wt-wt",
-    safesearch: str = "moderate",
+    region: str | None = DEFAULT_REGION,
+    safesearch: str | None = DEFAULT_SAFESEARCH,
+    backend: str | list[str] | tuple[str, ...] | None = DEFAULT_BACKEND,
 ) -> list[dict]:
     """
     Execute text search using DuckDuckGo.
@@ -26,6 +99,7 @@ def _search_text(
         max_results: Maximum number of results
         region: Search region
         safesearch: Safe search level
+        backend: DDGS backend(s), e.g. "auto", "duckduckgo", or "duckduckgo,brave"
 
     Returns:
         List of search results
@@ -39,11 +113,15 @@ def _search_text(
     ddgs = DDGS(timeout=30)
 
     try:
+        backend = _normalize_backend(backend)
+        safesearch = _normalize_setting(safesearch, DEFAULT_SAFESEARCH)
+        effective_region = _resolve_ddgs_region(query, region, backend)
         results = ddgs.text(
             query,
-            region=region,
+            region=effective_region,
             safesearch=safesearch,
             max_results=max_results,
+            backend=backend,
         )
         return list(results) if results else []
 
@@ -64,14 +142,23 @@ def web_search_tool(
         max_results: Maximum number of results to return. Default is 5.
     """
     config = get_app_config().get_tool_config("web_search")
+    region = DEFAULT_REGION
+    safesearch = DEFAULT_SAFESEARCH
+    backend = DEFAULT_BACKEND
 
-    # Override max_results from config if set
-    if config is not None and "max_results" in config.model_extra:
+    if config is not None:
+        # Override tool call defaults from config if set.
         max_results = config.model_extra.get("max_results", max_results)
+        region = config.model_extra.get("region", region)
+        safesearch = config.model_extra.get("safesearch", safesearch)
+        backend = config.model_extra.get("backend", backend)
 
     results = _search_text(
         query=query,
         max_results=max_results,
+        region=region,
+        safesearch=safesearch,
+        backend=backend,
     )
 
     if not results:
diff --git a/backend/tests/test_ddg_search_tools.py b/backend/tests/test_ddg_search_tools.py
new file mode 100644
index 000000000..734ea29b3
--- /dev/null
+++ b/backend/tests/test_ddg_search_tools.py
@@ -0,0 +1,75 @@
+"""Unit tests for the DDGS community web search tool."""
+
+import json
+import sys
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+from deerflow.community.ddg_search import tools
+
+
+def test_resolve_ddgs_region_maps_worldwide_chinese_query_for_wikipedia() -> None:
+    assert tools._resolve_ddgs_region("\u4e16\u754c\u676f\u65b0\u95fb 2026", "wt-wt", "auto") == "cn-zh"
+
+
+def test_resolve_ddgs_region_uses_english_fallback_for_worldwide_query() -> None:
+    assert tools._resolve_ddgs_region("latest world cup news", "wt-wt", "auto") == "us-en"
+
+
+def test_resolve_ddgs_region_preserves_worldwide_for_non_wikipedia_backend() -> None:
+    assert tools._resolve_ddgs_region("latest world cup news", "wt-wt", "duckduckgo") == "wt-wt"
+
+
+def test_resolve_ddgs_region_maps_common_ddg_locale_aliases() -> None:
+    assert tools._resolve_ddgs_region("\u65e5\u672c \u30cb\u30e5\u30fc\u30b9", "jp-jp", "auto") == "jp-ja"
+    assert tools._resolve_ddgs_region("\ud55c\uad6d \ub274\uc2a4", "kr-kr", "auto") == "kr-ko"
+    assert tools._resolve_ddgs_region("\u53f0\u7063\u65b0\u805e", "tw-tzh", "auto") == "tw-zh"
+
+
+def test_search_text_passes_wikipedia_safe_region_to_ddgs(monkeypatch) -> None:
+    calls = {}
+
+    class FakeDDGS:
+        def __init__(self, timeout: int) -> None:
+            calls["timeout"] = timeout
+
+        def text(self, query: str, **kwargs):
+            calls["query"] = query
+            calls.update(kwargs)
+            return [{"title": "Result", "href": "https://example.com", "body": "Snippet"}]
+
+    monkeypatch.setitem(sys.modules, "ddgs", SimpleNamespace(DDGS=FakeDDGS))
+
+    results = tools._search_text("\u4e16\u754c\u676f\u65b0\u95fb 2026", backend="auto")
+
+    assert results == [{"title": "Result", "href": "https://example.com", "body": "Snippet"}]
+    assert calls["timeout"] == 30
+    assert calls["region"] == "cn-zh"
+    assert calls["backend"] == "auto"
+
+
+def test_web_search_tool_reads_ddgs_options_from_config() -> None:
+    with patch("deerflow.community.ddg_search.tools.get_app_config") as mock_config:
+        tool_config = MagicMock()
+        tool_config.model_extra = {
+            "max_results": 3,
+            "region": "us-en",
+            "safesearch": "off",
+            "backend": "auto",
+        }
+        mock_config.return_value.get_tool_config.return_value = tool_config
+
+        with patch("deerflow.community.ddg_search.tools._search_text") as mock_search:
+            mock_search.return_value = [{"title": "Result", "href": "https://example.com", "body": "Snippet"}]
+
+            result = tools.web_search_tool.invoke({"query": "latest news", "max_results": 8})
+            parsed = json.loads(result)
+
+    assert parsed["total_results"] == 1
+    mock_search.assert_called_once_with(
+        query="latest news",
+        max_results=3,
+        region="us-en",
+        safesearch="off",
+        backend="auto",
+    )
diff --git a/config.example.yaml b/config.example.yaml
index b3e54892a..0a0026d87 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -436,6 +436,9 @@ tools:
     group: web
     use: deerflow.community.ddg_search.tools:web_search_tool
     max_results: 5
+    # backend: auto  # DDGS backend(s): auto, duckduckgo, brave, wikipedia, etc.
+    # region: wt-wt  # wt-wt is normalized for Wikipedia when backend includes auto/all/wikipedia.
+    # safesearch: moderate  # on, moderate, off
 
   # Web search tool (uses Serper - Google Search API, requires SERPER_API_KEY)
   # Serper provides real-time Google Search results. Sign up at https://serper.dev