mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-09 17:12:01 +00:00
fix(search): fix DDGS Wikipedia region handling (#3423)
This commit is contained in:
parent
7679f21edf
commit
10c1d9f417
@ -11,12 +11,85 @@ from deerflow.config import get_app_config
|
||||
|
||||
logger = logging.getLogger(__name__)

# Fallback DDGS options used when a value is missing or empty.
DEFAULT_BACKEND = "auto"
DEFAULT_REGION = "wt-wt"
DEFAULT_SAFESEARCH = "moderate"
# Region returned for Wikipedia-capable backends when nothing better can be
# derived from the configured region or the query text.
DEFAULT_WIKIPEDIA_REGION = "us-en"

# Backend names that _backend_includes_wikipedia() treats as able to reach
# the Wikipedia engine.
WIKIPEDIA_BACKENDS = {"auto", "all", "wikipedia"}
# Rewrites the language half of a region code (the part after the first "-")
# before it is handed to DDGS, e.g. "jp-jp" -> "jp-ja", "tw-tzh" -> "tw-zh".
WIKIPEDIA_LANGUAGE_ALIASES = {
    "jp": "ja",
    "kr": "ko",
    "tzh": "zh",
    "wt": "en",
}
|
||||
|
||||
|
||||
def _normalize_backend(backend: str | list[str] | tuple[str, ...] | None) -> str:
|
||||
if backend is None:
|
||||
return DEFAULT_BACKEND
|
||||
if isinstance(backend, (list, tuple)):
|
||||
return ",".join(str(part).strip() for part in backend if str(part).strip()) or DEFAULT_BACKEND
|
||||
return str(backend).strip() or DEFAULT_BACKEND
|
||||
|
||||
|
||||
def _normalize_setting(value: str | None, default: str) -> str:
|
||||
return str(value).strip() if value else default
|
||||
|
||||
|
||||
def _backend_includes_wikipedia(backend: str | list[str] | tuple[str, ...] | None) -> bool:
    """
    Report whether any requested backend is listed in WIKIPEDIA_BACKENDS.

    Args:
        backend: Backend setting in any form accepted by _normalize_backend.

    Returns:
        True if at least one comma-separated backend name, compared
        case-insensitively and ignoring surrounding whitespace, is in
        WIKIPEDIA_BACKENDS.
    """
    backend_names = _normalize_backend(backend).split(",")
    for name in backend_names:
        if name.strip().lower() in WIKIPEDIA_BACKENDS:
            return True
    return False
|
||||
|
||||
|
||||
def _contains_codepoint(query: str, ranges: tuple[tuple[int, int], ...]) -> bool:
|
||||
return any(start <= ord(char) <= end for char in query for start, end in ranges)
|
||||
|
||||
|
||||
def _infer_wikipedia_region(query: str) -> str:
    """
    Pick a valid Wikipedia language region when DDGS' worldwide region is used.

    The query is checked against a fixed list of script ranges, in order; the
    first script found decides the region. Order matters: kana is tested
    before the CJK ideograph range so that a query containing both resolves
    to Japanese rather than Chinese.

    Args:
        query: The search query text.

    Returns:
        The region for the first matching script, or DEFAULT_WIKIPEDIA_REGION
        when no listed script is present.
    """
    script_regions = (
        # Hiragana + Katakana, Katakana Phonetic Extensions
        (((0x3040, 0x30FF), (0x31F0, 0x31FF)), "jp-ja"),
        # Hangul Syllables, Hangul Jamo, Hangul Compatibility Jamo
        (((0xAC00, 0xD7AF), (0x1100, 0x11FF), (0x3130, 0x318F)), "kr-ko"),
        # CJK Extension A through CJK Unified Ideographs
        (((0x3400, 0x9FFF),), "cn-zh"),
        # Cyrillic
        (((0x0400, 0x04FF),), "ru-ru"),
        # Greek and Coptic
        (((0x0370, 0x03FF),), "gr-el"),
        # Hebrew
        (((0x0590, 0x05FF),), "il-he"),
        # Arabic
        (((0x0600, 0x06FF),), "xa-ar"),
    )
    for codepoint_ranges, wikipedia_region in script_regions:
        if _contains_codepoint(query, codepoint_ranges):
            return wikipedia_region
    return DEFAULT_WIKIPEDIA_REGION
|
||||
|
||||
|
||||
def _resolve_ddgs_region(query: str, region: str | None, backend: str | list[str] | tuple[str, ...] | None) -> str:
    """
    Resolve the region to pass to DDGS so that Wikipedia lookups stay valid.

    DDGS' wikipedia engine treats the second part of region as a Wikipedia
    subdomain. Its default worldwide region, wt-wt, becomes wt.wikipedia.org.

    Args:
        query: The search query; used to infer a language when the region is
            the worldwide default.
        region: Configured region, or None/empty to use DEFAULT_REGION.
        backend: Backend setting in any form accepted by _normalize_backend.

    Returns:
        The lower-cased region unchanged when no Wikipedia-capable backend is
        requested. Otherwise a region whose language half is usable as a
        Wikipedia subdomain: inferred from the query for the worldwide
        default, DEFAULT_WIKIPEDIA_REGION for a region without a language
        half, or the configured region with language aliases applied.
    """
    normalized_region = _normalize_setting(region, DEFAULT_REGION).lower()
    if not _backend_includes_wikipedia(backend):
        return normalized_region

    if normalized_region == DEFAULT_REGION:
        return _infer_wikipedia_region(query)

    country, separator, language = normalized_region.partition("-")
    # No "-" at all, or nothing after it (e.g. "us-"): there is no language to
    # use as a Wikipedia subdomain, so fall back to a known-good region.
    if not separator or not language:
        return DEFAULT_WIKIPEDIA_REGION

    return f"{country}-{WIKIPEDIA_LANGUAGE_ALIASES.get(language, language)}"
|
||||
|
||||
|
||||
def _search_text(
|
||||
query: str,
|
||||
max_results: int = 5,
|
||||
region: str = "wt-wt",
|
||||
safesearch: str = "moderate",
|
||||
region: str | None = DEFAULT_REGION,
|
||||
safesearch: str | None = DEFAULT_SAFESEARCH,
|
||||
backend: str | list[str] | tuple[str, ...] | None = DEFAULT_BACKEND,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Execute text search using DuckDuckGo.
|
||||
@ -26,6 +99,7 @@ def _search_text(
|
||||
max_results: Maximum number of results
|
||||
region: Search region
|
||||
safesearch: Safe search level
|
||||
backend: DDGS backend(s), e.g. "auto", "duckduckgo", or "duckduckgo,brave"
|
||||
|
||||
Returns:
|
||||
List of search results
|
||||
@ -39,11 +113,15 @@ def _search_text(
|
||||
ddgs = DDGS(timeout=30)
|
||||
|
||||
try:
|
||||
backend = _normalize_backend(backend)
|
||||
safesearch = _normalize_setting(safesearch, DEFAULT_SAFESEARCH)
|
||||
effective_region = _resolve_ddgs_region(query, region, backend)
|
||||
results = ddgs.text(
|
||||
query,
|
||||
region=region,
|
||||
region=effective_region,
|
||||
safesearch=safesearch,
|
||||
max_results=max_results,
|
||||
backend=backend,
|
||||
)
|
||||
return list(results) if results else []
|
||||
|
||||
@ -64,14 +142,23 @@ def web_search_tool(
|
||||
max_results: Maximum number of results to return. Default is 5.
|
||||
"""
|
||||
config = get_app_config().get_tool_config("web_search")
|
||||
region = DEFAULT_REGION
|
||||
safesearch = DEFAULT_SAFESEARCH
|
||||
backend = DEFAULT_BACKEND
|
||||
|
||||
# Override max_results from config if set
|
||||
if config is not None and "max_results" in config.model_extra:
|
||||
if config is not None:
|
||||
# Override tool call defaults from config if set.
|
||||
max_results = config.model_extra.get("max_results", max_results)
|
||||
region = config.model_extra.get("region", region)
|
||||
safesearch = config.model_extra.get("safesearch", safesearch)
|
||||
backend = config.model_extra.get("backend", backend)
|
||||
|
||||
results = _search_text(
|
||||
query=query,
|
||||
max_results=max_results,
|
||||
region=region,
|
||||
safesearch=safesearch,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
if not results:
|
||||
|
||||
75
backend/tests/test_ddg_search_tools.py
Normal file
75
backend/tests/test_ddg_search_tools.py
Normal file
@ -0,0 +1,75 @@
|
||||
"""Unit tests for the DDGS community web search tool."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from deerflow.community.ddg_search import tools
|
||||
|
||||
|
||||
def test_resolve_ddgs_region_maps_worldwide_chinese_query_for_wikipedia() -> None:
    """A Chinese query with the worldwide region resolves to cn-zh on the auto backend."""
    resolved = tools._resolve_ddgs_region("\u4e16\u754c\u676f\u65b0\u95fb 2026", "wt-wt", "auto")
    assert resolved == "cn-zh"
|
||||
|
||||
|
||||
def test_resolve_ddgs_region_uses_english_fallback_for_worldwide_query() -> None:
    """An ASCII-only query with the worldwide region falls back to us-en."""
    resolved = tools._resolve_ddgs_region("latest world cup news", "wt-wt", "auto")
    assert resolved == "us-en"
|
||||
|
||||
|
||||
def test_resolve_ddgs_region_preserves_worldwide_for_non_wikipedia_backend() -> None:
    """The worldwide region is left untouched when the backend is duckduckgo only."""
    resolved = tools._resolve_ddgs_region("latest world cup news", "wt-wt", "duckduckgo")
    assert resolved == "wt-wt"
|
||||
|
||||
|
||||
def test_resolve_ddgs_region_maps_common_ddg_locale_aliases() -> None:
    """DDG locale codes whose language half is not a Wikipedia subdomain are rewritten."""
    cases = (
        ("\u65e5\u672c \u30cb\u30e5\u30fc\u30b9", "jp-jp", "jp-ja"),
        ("\ud55c\uad6d \ub274\uc2a4", "kr-kr", "kr-ko"),
        ("\u53f0\u7063\u65b0\u805e", "tw-tzh", "tw-zh"),
    )
    for query, configured_region, expected_region in cases:
        assert tools._resolve_ddgs_region(query, configured_region, "auto") == expected_region
|
||||
|
||||
|
||||
def test_search_text_passes_wikipedia_safe_region_to_ddgs(monkeypatch) -> None:
    """_search_text hands DDGS the resolved region rather than the worldwide default."""
    calls = {}
    fake_hit = {"title": "Result", "href": "https://example.com", "body": "Snippet"}

    class FakeDDGS:
        """Stand-in for ddgs.DDGS that records constructor and text() arguments."""

        def __init__(self, timeout: int) -> None:
            calls["timeout"] = timeout

        def text(self, query: str, **kwargs):
            calls["query"] = query
            calls.update(kwargs)
            return [{"title": "Result", "href": "https://example.com", "body": "Snippet"}]

    # Replace the whole ddgs module so no real search client is used.
    monkeypatch.setitem(sys.modules, "ddgs", SimpleNamespace(DDGS=FakeDDGS))

    results = tools._search_text("\u4e16\u754c\u676f\u65b0\u95fb 2026", backend="auto")

    assert results == [fake_hit]
    assert calls["timeout"] == 30
    assert calls["region"] == "cn-zh"
    assert calls["backend"] == "auto"
|
||||
|
||||
|
||||
def test_web_search_tool_reads_ddgs_options_from_config() -> None:
    """Values in the tool config's model_extra override the tool-call arguments."""
    tool_config = MagicMock()
    tool_config.model_extra = {
        "max_results": 3,
        "region": "us-en",
        "safesearch": "off",
        "backend": "auto",
    }
    fake_hits = [{"title": "Result", "href": "https://example.com", "body": "Snippet"}]

    with patch("deerflow.community.ddg_search.tools.get_app_config") as mock_config, patch("deerflow.community.ddg_search.tools._search_text") as mock_search:
        mock_config.return_value.get_tool_config.return_value = tool_config
        mock_search.return_value = fake_hits

        # The caller asks for 8 results; the config's 3 is expected to win.
        result = tools.web_search_tool.invoke({"query": "latest news", "max_results": 8})
        parsed = json.loads(result)

    assert parsed["total_results"] == 1
    mock_search.assert_called_once_with(
        query="latest news",
        max_results=3,
        region="us-en",
        safesearch="off",
        backend="auto",
    )
|
||||
@ -436,6 +436,9 @@ tools:
|
||||
group: web
|
||||
use: deerflow.community.ddg_search.tools:web_search_tool
|
||||
max_results: 5
|
||||
# backend: auto # DDGS backend(s): auto, duckduckgo, brave, wikipedia, etc.
|
||||
# region: wt-wt # When backend includes auto/all/wikipedia, wt-wt is replaced by a region inferred from the query's script (us-en if none matches).
|
||||
# safesearch: moderate # on, moderate, off
|
||||
|
||||
# Web search tool (uses Serper - Google Search API, requires SERPER_API_KEY)
|
||||
# Serper provides real-time Google Search results. Sign up at https://serper.dev
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user