From 5350b2fb24b3bdb98729cc20b4544658fb8dfaa9 Mon Sep 17 00:00:00 2001 From: hung_ng__ <51025722+hung-ngm@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:13:39 +1000 Subject: [PATCH] feat(community): add Exa search as community tool provider (#1357) * feat(community): add Exa search as community tool provider Add Exa (exa.ai) as a new community search provider alongside Tavily, Firecrawl, InfoQuest, and Jina AI. Exa is an AI-native search engine with neural, keyword, and auto search types. New files: - community/exa/tools.py: web_search_tool and web_fetch_tool - tests/test_exa_tools.py: 10 unit tests with mocked Exa client Changes: - pyproject.toml: add exa-py dependency - config.example.yaml: add commented-out Exa configuration examples Usage: set `use: deerflow.community.exa.tools:web_search_tool` in config.yaml and provide EXA_API_KEY. Co-Authored-By: Claude Opus 4.6 (1M context) * fix(community): address PR review comments for Exa tools - Make _get_exa_client() accept tool_name param so web_fetch reads its own config - Remove __init__.py to match namespace package pattern of other providers - Add duplicate tool name warning in config.example.yaml - Add regression tests for web_fetch config resolution Co-Authored-By: Claude Opus 4.6 (1M context) * Update revision in uv.lock to 3 --------- Co-authored-by: Claude Opus 4.6 (1M context) Co-authored-by: Willem Jiang --- .../harness/deerflow/community/exa/tools.py | 79 ++++++ backend/packages/harness/pyproject.toml | 1 + backend/tests/test_exa_tools.py | 260 ++++++++++++++++++ backend/uv.lock | 20 ++ config.example.yaml | 17 ++ 5 files changed, 377 insertions(+) create mode 100644 backend/packages/harness/deerflow/community/exa/tools.py create mode 100644 backend/tests/test_exa_tools.py diff --git a/backend/packages/harness/deerflow/community/exa/tools.py b/backend/packages/harness/deerflow/community/exa/tools.py new file mode 100644 index 000000000..974280402 --- /dev/null +++ 
b/backend/packages/harness/deerflow/community/exa/tools.py @@ -0,0 +1,79 @@ +import json + +from exa_py import Exa +from langchain.tools import tool + +from deerflow.config import get_app_config + + +def _get_exa_client(tool_name: str = "web_search") -> Exa: + config = get_app_config().get_tool_config(tool_name) + api_key = None + if config is not None and "api_key" in config.model_extra: + api_key = config.model_extra.get("api_key") + return Exa(api_key=api_key) + + +@tool("web_search", parse_docstring=True) +def web_search_tool(query: str) -> str: + """Search the web. + + Args: + query: The query to search for. + """ + try: + config = get_app_config().get_tool_config("web_search") + max_results = 5 + search_type = "auto" + contents_max_characters = 1000 + if config is not None: + max_results = config.model_extra.get("max_results", max_results) + search_type = config.model_extra.get("search_type", search_type) + contents_max_characters = config.model_extra.get("contents_max_characters", contents_max_characters) + + client = _get_exa_client() + res = client.search( + query, + type=search_type, + num_results=max_results, + contents={"highlights": {"max_characters": contents_max_characters}}, + ) + + normalized_results = [ + { + "title": result.title or "", + "url": result.url or "", + "snippet": "\n".join(result.highlights) if result.highlights else "", + } + for result in res.results + ] + json_results = json.dumps(normalized_results, indent=2, ensure_ascii=False) + return json_results + except Exception as e: + return f"Error: {str(e)}" + + +@tool("web_fetch", parse_docstring=True) +def web_fetch_tool(url: str) -> str: + """Fetch the contents of a web page at a given URL. + Only fetch EXACT URLs that have been provided directly by the user or have been returned in results from the web_search and web_fetch tools. + This tool can NOT access content that requires authentication, such as private Google Docs or pages behind login walls. + Do NOT add www. 
to URLs that do NOT have them. + URLs must include the schema: https://example.com is a valid URL while example.com is an invalid URL. + + Args: + url: The URL to fetch the contents of. + """ + try: + client = _get_exa_client("web_fetch") + res = client.get_contents([url], text={"max_characters": 4096}) + + if res.results: + result = res.results[0] + title = result.title or "Untitled" + text = result.text or "" + return f"# {title}\n\n{text[:4096]}" + else: + return "Error: No results found" + except Exception as e: + return f"Error: {str(e)}" diff --git a/backend/packages/harness/pyproject.toml b/backend/packages/harness/pyproject.toml index cf8b15839..6d48caeae 100644 --- a/backend/packages/harness/pyproject.toml +++ b/backend/packages/harness/pyproject.toml @@ -7,6 +7,7 @@ dependencies = [ "agent-client-protocol>=0.4.0", "agent-sandbox>=0.0.19", "dotenv>=0.9.9", + "exa-py>=1.0.0", "httpx>=0.28.0", "kubernetes>=30.0.0", "langchain>=1.2.3", diff --git a/backend/tests/test_exa_tools.py b/backend/tests/test_exa_tools.py new file mode 100644 index 000000000..b7196918e --- /dev/null +++ b/backend/tests/test_exa_tools.py @@ -0,0 +1,260 @@ +"""Unit tests for the Exa community tools.""" + +import json +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture +def mock_app_config(): + """Mock the app config to return tool configurations.""" + with patch("deerflow.community.exa.tools.get_app_config") as mock_config: + tool_config = MagicMock() + tool_config.model_extra = { + "max_results": 5, + "search_type": "auto", + "contents_max_characters": 1000, + "api_key": "test-api-key", + } + mock_config.return_value.get_tool_config.return_value = tool_config + yield mock_config + + +@pytest.fixture +def mock_exa_client(): + """Mock the Exa client.""" + with patch("deerflow.community.exa.tools.Exa") as mock_exa_cls: + mock_client = MagicMock() + mock_exa_cls.return_value = mock_client + yield mock_client + + +class TestWebSearchTool: + def 
test_basic_search(self, mock_app_config, mock_exa_client): + """Test basic web search returns normalized results.""" + mock_result_1 = MagicMock() + mock_result_1.title = "Test Title 1" + mock_result_1.url = "https://example.com/1" + mock_result_1.highlights = ["This is a highlight about the topic."] + + mock_result_2 = MagicMock() + mock_result_2.title = "Test Title 2" + mock_result_2.url = "https://example.com/2" + mock_result_2.highlights = ["First highlight.", "Second highlight."] + + mock_response = MagicMock() + mock_response.results = [mock_result_1, mock_result_2] + mock_exa_client.search.return_value = mock_response + + from deerflow.community.exa.tools import web_search_tool + + result = web_search_tool.invoke({"query": "test query"}) + parsed = json.loads(result) + + assert len(parsed) == 2 + assert parsed[0]["title"] == "Test Title 1" + assert parsed[0]["url"] == "https://example.com/1" + assert parsed[0]["snippet"] == "This is a highlight about the topic." + assert parsed[1]["snippet"] == "First highlight.\nSecond highlight." 
+ + mock_exa_client.search.assert_called_once_with( + "test query", + type="auto", + num_results=5, + contents={"highlights": {"max_characters": 1000}}, + ) + + def test_search_with_custom_config(self, mock_exa_client): + """Test search respects custom configuration values.""" + with patch("deerflow.community.exa.tools.get_app_config") as mock_config: + tool_config = MagicMock() + tool_config.model_extra = { + "max_results": 10, + "search_type": "neural", + "contents_max_characters": 2000, + "api_key": "test-key", + } + mock_config.return_value.get_tool_config.return_value = tool_config + + mock_response = MagicMock() + mock_response.results = [] + mock_exa_client.search.return_value = mock_response + + from deerflow.community.exa.tools import web_search_tool + + web_search_tool.invoke({"query": "neural search"}) + + mock_exa_client.search.assert_called_once_with( + "neural search", + type="neural", + num_results=10, + contents={"highlights": {"max_characters": 2000}}, + ) + + def test_search_with_no_highlights(self, mock_app_config, mock_exa_client): + """Test search handles results with no highlights.""" + mock_result = MagicMock() + mock_result.title = "No Highlights" + mock_result.url = "https://example.com/empty" + mock_result.highlights = None + + mock_response = MagicMock() + mock_response.results = [mock_result] + mock_exa_client.search.return_value = mock_response + + from deerflow.community.exa.tools import web_search_tool + + result = web_search_tool.invoke({"query": "test"}) + parsed = json.loads(result) + + assert parsed[0]["snippet"] == "" + + def test_search_empty_results(self, mock_app_config, mock_exa_client): + """Test search with no results returns empty list.""" + mock_response = MagicMock() + mock_response.results = [] + mock_exa_client.search.return_value = mock_response + + from deerflow.community.exa.tools import web_search_tool + + result = web_search_tool.invoke({"query": "nothing"}) + parsed = json.loads(result) + + assert parsed == [] + 
+ def test_search_error_handling(self, mock_app_config, mock_exa_client): + """Test search returns error string on exception.""" + mock_exa_client.search.side_effect = Exception("API rate limit exceeded") + + from deerflow.community.exa.tools import web_search_tool + + result = web_search_tool.invoke({"query": "error"}) + + assert result == "Error: API rate limit exceeded" + + +class TestWebFetchTool: + def test_basic_fetch(self, mock_app_config, mock_exa_client): + """Test basic web fetch returns formatted content.""" + mock_result = MagicMock() + mock_result.title = "Fetched Page" + mock_result.text = "This is the page content." + + mock_response = MagicMock() + mock_response.results = [mock_result] + mock_exa_client.get_contents.return_value = mock_response + + from deerflow.community.exa.tools import web_fetch_tool + + result = web_fetch_tool.invoke({"url": "https://example.com"}) + + assert result == "# Fetched Page\n\nThis is the page content." + mock_exa_client.get_contents.assert_called_once_with( + ["https://example.com"], + text={"max_characters": 4096}, + ) + + def test_fetch_no_title(self, mock_app_config, mock_exa_client): + """Test fetch with missing title uses 'Untitled'.""" + mock_result = MagicMock() + mock_result.title = None + mock_result.text = "Content without title." 
+ + mock_response = MagicMock() + mock_response.results = [mock_result] + mock_exa_client.get_contents.return_value = mock_response + + from deerflow.community.exa.tools import web_fetch_tool + + result = web_fetch_tool.invoke({"url": "https://example.com"}) + + assert result.startswith("# Untitled\n\n") + + def test_fetch_no_results(self, mock_app_config, mock_exa_client): + """Test fetch with no results returns error.""" + mock_response = MagicMock() + mock_response.results = [] + mock_exa_client.get_contents.return_value = mock_response + + from deerflow.community.exa.tools import web_fetch_tool + + result = web_fetch_tool.invoke({"url": "https://example.com/404"}) + + assert result == "Error: No results found" + + def test_fetch_error_handling(self, mock_app_config, mock_exa_client): + """Test fetch returns error string on exception.""" + mock_exa_client.get_contents.side_effect = Exception("Connection timeout") + + from deerflow.community.exa.tools import web_fetch_tool + + result = web_fetch_tool.invoke({"url": "https://example.com"}) + + assert result == "Error: Connection timeout" + + def test_fetch_reads_web_fetch_config(self, mock_exa_client): + """Test that web_fetch_tool reads 'web_fetch' config, not 'web_search'.""" + with patch("deerflow.community.exa.tools.get_app_config") as mock_config: + tool_config = MagicMock() + tool_config.model_extra = {"api_key": "exa-fetch-key"} + mock_config.return_value.get_tool_config.return_value = tool_config + + mock_result = MagicMock() + mock_result.title = "Page" + mock_result.text = "Content." 
+ mock_response = MagicMock() + mock_response.results = [mock_result] + mock_exa_client.get_contents.return_value = mock_response + + from deerflow.community.exa.tools import web_fetch_tool + + web_fetch_tool.invoke({"url": "https://example.com"}) + + mock_config.return_value.get_tool_config.assert_any_call("web_fetch") + + def test_fetch_uses_independent_api_key(self, mock_exa_client): + """Test mixed-provider config: web_fetch uses its own api_key, not web_search's.""" + with patch("deerflow.community.exa.tools.get_app_config") as mock_config: + with patch("deerflow.community.exa.tools.Exa") as mock_exa_cls: + mock_exa_cls.return_value = mock_exa_client + fetch_config = MagicMock() + fetch_config.model_extra = {"api_key": "exa-fetch-key"} + + def get_tool_config(name): + if name == "web_fetch": + return fetch_config + return None + + mock_config.return_value.get_tool_config.side_effect = get_tool_config + + mock_result = MagicMock() + mock_result.title = "Page" + mock_result.text = "Content." 
+ mock_response = MagicMock() + mock_response.results = [mock_result] + mock_exa_client.get_contents.return_value = mock_response + + from deerflow.community.exa.tools import web_fetch_tool + + web_fetch_tool.invoke({"url": "https://example.com"}) + + mock_exa_cls.assert_called_once_with(api_key="exa-fetch-key") + + def test_fetch_truncates_long_content(self, mock_app_config, mock_exa_client): + """Test fetch truncates content to 4096 characters.""" + mock_result = MagicMock() + mock_result.title = "Long Page" + mock_result.text = "x" * 5000 + + mock_response = MagicMock() + mock_response.results = [mock_result] + mock_exa_client.get_contents.return_value = mock_response + + from deerflow.community.exa.tools import web_fetch_tool + + result = web_fetch_tool.invoke({"url": "https://example.com"}) + + # "# Long Page\n\n" is 13 chars, content truncated to 4096 + content_after_header = result.split("\n\n", 1)[1] + assert len(content_after_header) == 4096 diff --git a/backend/uv.lock b/backend/uv.lock index 45731fb04..92a20393e 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -722,6 +722,7 @@ dependencies = [ { name = "ddgs" }, { name = "dotenv" }, { name = "duckdb" }, + { name = "exa-py" }, { name = "firecrawl-py" }, { name = "httpx" }, { name = "kubernetes" }, @@ -759,6 +760,7 @@ requires-dist = [ { name = "ddgs", specifier = ">=9.10.0" }, { name = "dotenv", specifier = ">=0.9.9" }, { name = "duckdb", specifier = ">=1.4.4" }, + { name = "exa-py", specifier = ">=1.0.0" }, { name = "firecrawl-py", specifier = ">=1.15.0" }, { name = "httpx", specifier = ">=0.28.0" }, { name = "kubernetes", specifier = ">=30.0.0" }, @@ -871,6 +873,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, ] +[[package]] +name = "exa-py" 
+version = "2.10.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpcore" }, + { name = "httpx" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fb/bb/23c9f78edbf0e0d656839be7346a2f77b9caaae8cc3cb301012c46fd7dc5/exa_py-2.10.1.tar.gz", hash = "sha256:731958c2befc5fc82f031c93cfe7b3d55dc3b0e1bf32f83ec34d32a65ee31ba1", size = 53826, upload-time = "2026-03-25T00:50:49.286Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/8d/0665263aa8d51ef8e2a3955e2b56496add4879730451961b09610bbc7036/exa_py-2.10.1-py3-none-any.whl", hash = "sha256:e2174c932764fff747e84e9e6d0637eaa4a6503556014df73a3427f42cc9d6a7", size = 72270, upload-time = "2026-03-25T00:50:47.721Z" }, +] + [[package]] name = "fake-useragent" version = "2.2.0" diff --git a/config.example.yaml b/config.example.yaml index 96dc7b4a0..7edfe60ae 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -304,6 +304,23 @@ tools: # # Used to limit the scope of search results, only returns content within the specified time range. Set to -1 to disable time filtering # search_time_range: 10 + # Web search tool (uses Exa, requires EXA_API_KEY) + # - name: web_search + # group: web + # use: deerflow.community.exa.tools:web_search_tool + # max_results: 5 + # search_type: auto # Options: auto, neural, keyword + # contents_max_characters: 1000 + # # api_key: $EXA_API_KEY + + # Web fetch tool (uses Exa) + # NOTE: Only one web_fetch provider can be active at a time. + # Comment out the Jina AI web_fetch entry below before enabling this one. + # - name: web_fetch + # group: web + # use: deerflow.community.exa.tools:web_fetch_tool + # # api_key: $EXA_API_KEY + # Web fetch tool (uses Jina AI reader) - name: web_fetch group: web