diff --git a/conf.yaml.example b/conf.yaml.example index eb9319c72..92499617a 100644 --- a/conf.yaml.example +++ b/conf.yaml.example @@ -20,3 +20,18 @@ BASIC_MODEL: # base_url: https://ark-cn-beijing.bytedance.net/api/v3 # model: "doubao-1-5-thinking-pro-m-250428" # api_key: xxxx + +# OTHER SETTINGS: +# Search engine configuration (Only supports Tavily currently) +# SEARCH_ENGINE: +# engine: tavily +# # Only include results from these domains +# include_domains: +# - example.com +# - trusted-news.com +# - reliable-source.org +# - gov.cn +# - edu.cn +# # Exclude results from these domains +# exclude_domains: +# - unreliable-site.com diff --git a/docs/configuration_guide.md b/docs/configuration_guide.md index 1b16ed6ae..610e44261 100644 --- a/docs/configuration_guide.md +++ b/docs/configuration_guide.md @@ -115,3 +115,26 @@ BASIC_MODEL: api_version: $AZURE_API_VERSION api_key: $AZURE_API_KEY ``` +## About Search Engine + +### How to control search domains for Tavily? + +DeerFlow allows you to control which domains are included or excluded in Tavily search results through the configuration file. This helps improve search result quality and reduce hallucinations by focusing on trusted sources. + +**Note:** Domain filtering is currently supported only for the Tavily search engine. 
+ +You can configure domain filtering in your `conf.yaml` file as follows: + +```yaml +SEARCH_ENGINE: + engine: tavily + # Only include results from these domains (whitelist) + include_domains: + - trusted-news.com + - gov.org + - reliable-source.edu + # Exclude results from these domains (blacklist) + exclude_domains: + - unreliable-site.com + - spam-domain.net +``` \ No newline at end of file diff --git a/src/tools/search.py b/src/tools/search.py index bbe4fa8bd..3f9f2bff0 100644 --- a/src/tools/search.py +++ b/src/tools/search.py @@ -4,12 +4,14 @@ import json import logging import os +from typing import List, Optional from langchain_community.tools import BraveSearch, DuckDuckGoSearchResults from langchain_community.tools.arxiv import ArxivQueryRun from langchain_community.utilities import ArxivAPIWrapper, BraveSearchWrapper from src.config import SearchEngine, SELECTED_SEARCH_ENGINE +from src.config import load_yaml_config from src.tools.tavily_search.tavily_search_results_with_images import ( TavilySearchResultsWithImages, ) @@ -25,15 +27,33 @@ LoggedBraveSearch = create_logged_tool(BraveSearch) LoggedArxivSearch = create_logged_tool(ArxivQueryRun) +def get_search_config(): + config = load_yaml_config("conf.yaml") + search_config = config.get("SEARCH_ENGINE", {}) + return search_config + + # Get the selected search tool def get_web_search_tool(max_search_results: int): + search_config = get_search_config() + if SELECTED_SEARCH_ENGINE == SearchEngine.TAVILY.value: + # Only get and apply include/exclude domains for Tavily + include_domains: Optional[List[str]] = search_config.get("include_domains", []) + exclude_domains: Optional[List[str]] = search_config.get("exclude_domains", []) + + logger.info( + f"Tavily search configuration loaded: include_domains={include_domains}, exclude_domains={exclude_domains}" + ) + return LoggedTavilySearch( name="web_search", max_results=max_search_results, include_raw_content=True, include_images=True, 
include_image_descriptions=True, + include_domains=include_domains, + exclude_domains=exclude_domains, ) elif SELECTED_SEARCH_ENGINE == SearchEngine.DUCKDUCKGO.value: return LoggedDuckDuckGoSearch(