diff --git a/.env.example b/.env.example index c4eb63c91..5b11c82db 100644 --- a/.env.example +++ b/.env.example @@ -17,6 +17,7 @@ INFOQUEST_API_KEY=your-infoquest-api-key # DEEPSEEK_API_KEY=your-deepseek-api-key # NOVITA_API_KEY=your-novita-api-key # OpenAI-compatible, see https://novita.ai # MINIMAX_API_KEY=your-minimax-api-key # OpenAI-compatible, see https://platform.minimax.io +# VLLM_API_KEY=your-vllm-api-key # OpenAI-compatible # FEISHU_APP_ID=your-feishu-app-id # FEISHU_APP_SECRET=your-feishu-app-secret diff --git a/README.md b/README.md index 14aec9fc6..0d84a0861 100644 --- a/README.md +++ b/README.md @@ -141,12 +141,26 @@ That prompt is intended for coding agents. It tells the agent to clone the repo api_key: $OPENAI_API_KEY use_responses_api: true output_version: responses/v1 + + - name: qwen3-32b-vllm + display_name: Qwen3 32B (vLLM) + use: deerflow.models.vllm_provider:VllmChatModel + model: Qwen/Qwen3-32B + api_key: $VLLM_API_KEY + base_url: http://localhost:8000/v1 + supports_thinking: true + when_thinking_enabled: + extra_body: + chat_template_kwargs: + enable_thinking: true ``` OpenRouter and similar OpenAI-compatible gateways should be configured with `langchain_openai:ChatOpenAI` plus `base_url`. If you prefer a provider-specific environment variable name, point `api_key` at that variable explicitly (for example `api_key: $OPENROUTER_API_KEY`). To route OpenAI models through `/v1/responses`, keep using `langchain_openai:ChatOpenAI` and set `use_responses_api: true` with `output_version: responses/v1`. + For vLLM 0.19.0, use `deerflow.models.vllm_provider:VllmChatModel`. For Qwen-style reasoning models, DeerFlow toggles reasoning with `extra_body.chat_template_kwargs.enable_thinking` and preserves vLLM's non-standard `reasoning` field across multi-turn tool-call conversations. Legacy `thinking` configs are normalized automatically for backward compatibility. Reasoning models may also require the server to be started with `--reasoning-parser ...`. 
If your local vLLM deployment accepts any non-empty API key, you can still set `VLLM_API_KEY` to a placeholder value. + CLI-backed provider examples: ```yaml diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 846429e40..c8f62c21b 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -293,10 +293,17 @@ Proxied through nginx: `/api/langgraph/*` → LangGraph, all other `/api/*` → - `create_chat_model(name, thinking_enabled)` instantiates LLM from config via reflection - Supports `thinking_enabled` flag with per-model `when_thinking_enabled` overrides +- Supports vLLM-style thinking toggles via `when_thinking_enabled.extra_body.chat_template_kwargs.enable_thinking` for Qwen reasoning models, while normalizing legacy `thinking` configs for backward compatibility - Supports `supports_vision` flag for image understanding models - Config values starting with `$` resolved as environment variables - Missing provider modules surface actionable install hints from reflection resolvers (for example `uv add langchain-google-genai`) +### vLLM Provider (`packages/harness/deerflow/models/vllm_provider.py`) + +- `VllmChatModel` subclasses `langchain_openai:ChatOpenAI` for vLLM 0.19.0 OpenAI-compatible endpoints +- Preserves vLLM's non-standard assistant `reasoning` field on full responses, streaming deltas, and follow-up tool-call turns +- Designed for configs that enable thinking through `extra_body.chat_template_kwargs.enable_thinking` on vLLM 0.19.0 Qwen reasoning models, while accepting the older `thinking` alias + ### IM Channels System (`app/channels/`) Bridges external messaging platforms (Feishu, Slack, Telegram) to the DeerFlow agent via the LangGraph Server. 
@@ -365,6 +372,7 @@ Focused regression coverage for the updater lives in `backend/tests/test_memory_ **`config.yaml`** key sections: - `models[]` - LLM configs with `use` class path, `supports_thinking`, `supports_vision`, provider-specific fields +- vLLM reasoning models should use `deerflow.models.vllm_provider:VllmChatModel`; for Qwen-style parsers prefer `when_thinking_enabled.extra_body.chat_template_kwargs.enable_thinking`, and DeerFlow will also normalize the older `thinking` alias - `tools[]` - Tool configs with `use` variable path and `group` - `tool_groups[]` - Logical groupings for tools - `sandbox.use` - Sandbox provider class path diff --git a/backend/packages/harness/deerflow/models/factory.py b/backend/packages/harness/deerflow/models/factory.py index 51332c5e5..b05b8625f 100644 --- a/backend/packages/harness/deerflow/models/factory.py +++ b/backend/packages/harness/deerflow/models/factory.py @@ -9,6 +9,27 @@ from deerflow.tracing import build_tracing_callbacks logger = logging.getLogger(__name__) +def _deep_merge_dicts(base: dict | None, override: dict) -> dict: + """Recursively merge two dictionaries without mutating the inputs.""" + merged = dict(base or {}) + for key, value in override.items(): + if isinstance(value, dict) and isinstance(merged.get(key), dict): + merged[key] = _deep_merge_dicts(merged[key], value) + else: + merged[key] = value + return merged + + +def _vllm_disable_chat_template_kwargs(chat_template_kwargs: dict) -> dict: + """Build the disable payload for vLLM/Qwen chat template kwargs.""" + disable_kwargs: dict[str, bool] = {} + if "thinking" in chat_template_kwargs: + disable_kwargs["thinking"] = False + if "enable_thinking" in chat_template_kwargs: + disable_kwargs["enable_thinking"] = False + return disable_kwargs + + def create_chat_model(name: str | None = None, thinking_enabled: bool = False, **kwargs) -> BaseChatModel: """Create a chat model instance from the config. 
@@ -54,13 +75,23 @@ def create_chat_model(name: str | None = None, thinking_enabled: bool = False, * if not thinking_enabled and has_thinking_settings: if effective_wte.get("extra_body", {}).get("thinking", {}).get("type"): # OpenAI-compatible gateway: thinking is nested under extra_body - kwargs.update({"extra_body": {"thinking": {"type": "disabled"}}}) - kwargs.update({"reasoning_effort": "minimal"}) + model_settings_from_config["extra_body"] = _deep_merge_dicts( + model_settings_from_config.get("extra_body"), + {"thinking": {"type": "disabled"}}, + ) + model_settings_from_config["reasoning_effort"] = "minimal" + elif disable_chat_template_kwargs := _vllm_disable_chat_template_kwargs(effective_wte.get("extra_body", {}).get("chat_template_kwargs") or {}): + # vLLM uses chat template kwargs to switch thinking on/off. + model_settings_from_config["extra_body"] = _deep_merge_dicts( + model_settings_from_config.get("extra_body"), + {"chat_template_kwargs": disable_chat_template_kwargs}, + ) elif effective_wte.get("thinking", {}).get("type"): # Native langchain_anthropic: thinking is a direct constructor parameter - kwargs.update({"thinking": {"type": "disabled"}}) - if not model_config.supports_reasoning_effort and "reasoning_effort" in kwargs: - del kwargs["reasoning_effort"] + model_settings_from_config["thinking"] = {"type": "disabled"} + if not model_config.supports_reasoning_effort: + kwargs.pop("reasoning_effort", None) + model_settings_from_config.pop("reasoning_effort", None) # For Codex Responses API models: map thinking mode to reasoning_effort from deerflow.models.openai_codex_provider import CodexChatModel diff --git a/backend/packages/harness/deerflow/models/vllm_provider.py b/backend/packages/harness/deerflow/models/vllm_provider.py new file mode 100644 index 000000000..d947e1c26 --- /dev/null +++ b/backend/packages/harness/deerflow/models/vllm_provider.py @@ -0,0 +1,258 @@ +"""Custom vLLM provider built on top of LangChain ChatOpenAI. 
+ +vLLM 0.19.0 exposes reasoning models through an OpenAI-compatible API, but +LangChain's default OpenAI adapter drops the non-standard ``reasoning`` field +from assistant messages and streaming deltas. That breaks interleaved +thinking/tool-call flows because vLLM expects the assistant's prior reasoning to +be echoed back on subsequent turns. + +This provider preserves ``reasoning`` on: +- non-streaming responses +- streaming deltas +- multi-turn request payloads +""" + +from __future__ import annotations + +import json +from collections.abc import Mapping +from typing import Any, cast + +import openai +from langchain_core.language_models import LanguageModelInput +from langchain_core.messages import ( + AIMessage, + AIMessageChunk, + BaseMessageChunk, + ChatMessageChunk, + FunctionMessageChunk, + HumanMessageChunk, + SystemMessageChunk, + ToolMessageChunk, +) +from langchain_core.messages.tool import tool_call_chunk +from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult +from langchain_openai import ChatOpenAI +from langchain_openai.chat_models.base import _create_usage_metadata + + +def _normalize_vllm_chat_template_kwargs(payload: dict[str, Any]) -> None: + """Map DeerFlow's legacy ``thinking`` toggle to vLLM/Qwen's ``enable_thinking``. + + DeerFlow originally documented ``extra_body.chat_template_kwargs.thinking`` + for vLLM, but vLLM 0.19.0's Qwen reasoning parser reads + ``chat_template_kwargs.enable_thinking``. Normalize the payload just before + it is sent so existing configs keep working and flash mode can truly + disable reasoning. 
+ """ + extra_body = payload.get("extra_body") + if not isinstance(extra_body, dict): + return + + chat_template_kwargs = extra_body.get("chat_template_kwargs") + if not isinstance(chat_template_kwargs, dict): + return + + if "thinking" not in chat_template_kwargs: + return + + normalized_chat_template_kwargs = dict(chat_template_kwargs) + normalized_chat_template_kwargs.setdefault("enable_thinking", normalized_chat_template_kwargs["thinking"]) + normalized_chat_template_kwargs.pop("thinking", None) + extra_body["chat_template_kwargs"] = normalized_chat_template_kwargs + + +def _reasoning_to_text(reasoning: Any) -> str: + """Best-effort extraction of readable reasoning text from vLLM payloads.""" + if isinstance(reasoning, str): + return reasoning + + if isinstance(reasoning, list): + parts = [_reasoning_to_text(item) for item in reasoning] + return "".join(part for part in parts if part) + + if isinstance(reasoning, dict): + for key in ("text", "content", "reasoning"): + value = reasoning.get(key) + if isinstance(value, str): + return value + if value is not None: + text = _reasoning_to_text(value) + if text: + return text + try: + return json.dumps(reasoning, ensure_ascii=False) + except TypeError: + return str(reasoning) + + try: + return json.dumps(reasoning, ensure_ascii=False) + except TypeError: + return str(reasoning) + + +def _convert_delta_to_message_chunk_with_reasoning(_dict: Mapping[str, Any], default_class: type[BaseMessageChunk]) -> BaseMessageChunk: + """Convert a streaming delta to a LangChain message chunk while preserving reasoning.""" + id_ = _dict.get("id") + role = cast(str, _dict.get("role")) + content = cast(str, _dict.get("content") or "") + additional_kwargs: dict[str, Any] = {} + + if _dict.get("function_call"): + function_call = dict(_dict["function_call"]) + if "name" in function_call and function_call["name"] is None: + function_call["name"] = "" + additional_kwargs["function_call"] = function_call + + reasoning = 
_dict.get("reasoning") + if reasoning is not None: + additional_kwargs["reasoning"] = reasoning + reasoning_text = _reasoning_to_text(reasoning) + if reasoning_text: + additional_kwargs["reasoning_content"] = reasoning_text + + tool_call_chunks = [] + if raw_tool_calls := _dict.get("tool_calls"): + try: + tool_call_chunks = [ + tool_call_chunk( + name=rtc["function"].get("name"), + args=rtc["function"].get("arguments"), + id=rtc.get("id"), + index=rtc["index"], + ) + for rtc in raw_tool_calls + ] + except KeyError: + pass + + if role == "user" or default_class == HumanMessageChunk: + return HumanMessageChunk(content=content, id=id_) + if role == "assistant" or default_class == AIMessageChunk: + return AIMessageChunk( + content=content, + additional_kwargs=additional_kwargs, + id=id_, + tool_call_chunks=tool_call_chunks, # type: ignore[arg-type] + ) + if role in ("system", "developer") or default_class == SystemMessageChunk: + role_kwargs = {"__openai_role__": "developer"} if role == "developer" else {} + return SystemMessageChunk(content=content, id=id_, additional_kwargs=role_kwargs) + if role == "function" or default_class == FunctionMessageChunk: + return FunctionMessageChunk(content=content, name=_dict["name"], id=id_) + if role == "tool" or default_class == ToolMessageChunk: + return ToolMessageChunk(content=content, tool_call_id=_dict["tool_call_id"], id=id_) + if role or default_class == ChatMessageChunk: + return ChatMessageChunk(content=content, role=role, id=id_) # type: ignore[arg-type] + return default_class(content=content, id=id_) # type: ignore[call-arg] + + +def _restore_reasoning_field(payload_msg: dict[str, Any], orig_msg: AIMessage) -> None: + """Re-inject vLLM reasoning onto outgoing assistant messages.""" + reasoning = orig_msg.additional_kwargs.get("reasoning") + if reasoning is None: + reasoning = orig_msg.additional_kwargs.get("reasoning_content") + if reasoning is not None: + payload_msg["reasoning"] = reasoning + + +class 
VllmChatModel(ChatOpenAI): + """ChatOpenAI variant that preserves vLLM reasoning fields across turns.""" + + model_config = {"arbitrary_types_allowed": True} + + @property + def _llm_type(self) -> str: + return "vllm-openai-compatible" + + def _get_request_payload( + self, + input_: LanguageModelInput, + *, + stop: list[str] | None = None, + **kwargs: Any, + ) -> dict[str, Any]: + """Restore assistant reasoning in request payloads for interleaved thinking.""" + original_messages = self._convert_input(input_).to_messages() + payload = super()._get_request_payload(input_, stop=stop, **kwargs) + _normalize_vllm_chat_template_kwargs(payload) + payload_messages = payload.get("messages", []) + + if len(payload_messages) == len(original_messages): + for payload_msg, orig_msg in zip(payload_messages, original_messages): + if payload_msg.get("role") == "assistant" and isinstance(orig_msg, AIMessage): + _restore_reasoning_field(payload_msg, orig_msg) + else: + ai_messages = [message for message in original_messages if isinstance(message, AIMessage)] + assistant_payloads = [message for message in payload_messages if message.get("role") == "assistant"] + for payload_msg, ai_msg in zip(assistant_payloads, ai_messages): + _restore_reasoning_field(payload_msg, ai_msg) + + return payload + + def _create_chat_result(self, response: dict | openai.BaseModel, generation_info: dict | None = None) -> ChatResult: + """Preserve vLLM reasoning on non-streaming responses.""" + result = super()._create_chat_result(response, generation_info=generation_info) + response_dict = response if isinstance(response, dict) else response.model_dump() + + for generation, choice in zip(result.generations, response_dict.get("choices", [])): + if not isinstance(generation, ChatGeneration): + continue + message = generation.message + if not isinstance(message, AIMessage): + continue + reasoning = choice.get("message", {}).get("reasoning") + if reasoning is None: + continue + 
message.additional_kwargs["reasoning"] = reasoning + reasoning_text = _reasoning_to_text(reasoning) + if reasoning_text: + message.additional_kwargs["reasoning_content"] = reasoning_text + + return result + + def _convert_chunk_to_generation_chunk( + self, + chunk: dict, + default_chunk_class: type, + base_generation_info: dict | None, + ) -> ChatGenerationChunk | None: + """Preserve vLLM reasoning on streaming deltas.""" + if chunk.get("type") == "content.delta": + return None + + token_usage = chunk.get("usage") + choices = chunk.get("choices", []) or chunk.get("chunk", {}).get("choices", []) + usage_metadata = _create_usage_metadata(token_usage, chunk.get("service_tier")) if token_usage else None + + if len(choices) == 0: + generation_chunk = ChatGenerationChunk(message=default_chunk_class(content="", usage_metadata=usage_metadata), generation_info=base_generation_info) + if self.output_version == "v1": + generation_chunk.message.content = [] + generation_chunk.message.response_metadata["output_version"] = "v1" + return generation_chunk + + choice = choices[0] + if choice["delta"] is None: + return None + + message_chunk = _convert_delta_to_message_chunk_with_reasoning(choice["delta"], default_chunk_class) + generation_info = {**base_generation_info} if base_generation_info else {} + + if finish_reason := choice.get("finish_reason"): + generation_info["finish_reason"] = finish_reason + if model_name := chunk.get("model"): + generation_info["model_name"] = model_name + if system_fingerprint := chunk.get("system_fingerprint"): + generation_info["system_fingerprint"] = system_fingerprint + if service_tier := chunk.get("service_tier"): + generation_info["service_tier"] = service_tier + + if logprobs := choice.get("logprobs"): + generation_info["logprobs"] = logprobs + + if usage_metadata and isinstance(message_chunk, AIMessageChunk): + message_chunk.usage_metadata = usage_metadata + + message_chunk.response_metadata["model_provider"] = "openai" + return 
ChatGenerationChunk(message=message_chunk, generation_info=generation_info or None) diff --git a/backend/tests/test_model_factory.py b/backend/tests/test_model_factory.py index 9ae2c726a..5e980bd1b 100644 --- a/backend/tests/test_model_factory.py +++ b/backend/tests/test_model_factory.py @@ -604,6 +604,63 @@ def test_codex_provider_strips_unsupported_max_tokens(monkeypatch): assert "max_tokens" not in FakeChatModel.captured_kwargs +def test_thinking_disabled_vllm_chat_template_format(monkeypatch): + wte = {"extra_body": {"chat_template_kwargs": {"thinking": True}}} + model = _make_model( + "vllm-qwen", + use="deerflow.models.vllm_provider:VllmChatModel", + supports_thinking=True, + when_thinking_enabled=wte, + ) + model.extra_body = {"top_k": 20} + cfg = _make_app_config([model]) + _patch_factory(monkeypatch, cfg) + + captured: dict = {} + + class CapturingModel(FakeChatModel): + def __init__(self, **kwargs): + captured.update(kwargs) + BaseChatModel.__init__(self, **kwargs) + + monkeypatch.setattr(factory_module, "resolve_class", lambda path, base: CapturingModel) + + factory_module.create_chat_model(name="vllm-qwen", thinking_enabled=False) + + assert captured.get("extra_body") == {"top_k": 20, "chat_template_kwargs": {"thinking": False}} + assert captured.get("reasoning_effort") is None + + +def test_thinking_disabled_vllm_enable_thinking_format(monkeypatch): + wte = {"extra_body": {"chat_template_kwargs": {"enable_thinking": True}}} + model = _make_model( + "vllm-qwen-enable", + use="deerflow.models.vllm_provider:VllmChatModel", + supports_thinking=True, + when_thinking_enabled=wte, + ) + model.extra_body = {"top_k": 20} + cfg = _make_app_config([model]) + _patch_factory(monkeypatch, cfg) + + captured: dict = {} + + class CapturingModel(FakeChatModel): + def __init__(self, **kwargs): + captured.update(kwargs) + BaseChatModel.__init__(self, **kwargs) + + monkeypatch.setattr(factory_module, "resolve_class", lambda path, base: CapturingModel) + + 
factory_module.create_chat_model(name="vllm-qwen-enable", thinking_enabled=False) + + assert captured.get("extra_body") == { + "top_k": 20, + "chat_template_kwargs": {"enable_thinking": False}, + } + assert captured.get("reasoning_effort") is None + + def test_openai_responses_api_settings_are_passed_to_chatopenai(monkeypatch): model = ModelConfig( name="gpt-5-responses", diff --git a/backend/tests/test_vllm_provider.py b/backend/tests/test_vllm_provider.py new file mode 100644 index 000000000..9e60d446f --- /dev/null +++ b/backend/tests/test_vllm_provider.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +from langchain_core.messages import AIMessage, AIMessageChunk, HumanMessage + +from deerflow.models.vllm_provider import VllmChatModel + + +def _make_model() -> VllmChatModel: + return VllmChatModel( + model="Qwen/QwQ-32B", + api_key="dummy", + base_url="http://localhost:8000/v1", + ) + + +def test_vllm_provider_restores_reasoning_in_request_payload(): + model = _make_model() + payload = model._get_request_payload( + [ + AIMessage( + content="", + tool_calls=[{"name": "bash", "args": {"cmd": "pwd"}, "id": "tool-1", "type": "tool_call"}], + additional_kwargs={"reasoning": "Need to inspect the workspace first."}, + ), + HumanMessage(content="Continue"), + ] + ) + + assistant_message = payload["messages"][0] + assert assistant_message["role"] == "assistant" + assert assistant_message["reasoning"] == "Need to inspect the workspace first." 
+ assert assistant_message["tool_calls"][0]["function"]["name"] == "bash" + + +def test_vllm_provider_normalizes_legacy_thinking_kwarg_to_enable_thinking(): + model = VllmChatModel( + model="qwen3", + api_key="dummy", + base_url="http://localhost:8000/v1", + extra_body={"chat_template_kwargs": {"thinking": True}}, + ) + + payload = model._get_request_payload([HumanMessage(content="Hello")]) + + assert payload["extra_body"]["chat_template_kwargs"] == {"enable_thinking": True} + + +def test_vllm_provider_preserves_explicit_enable_thinking_kwarg(): + model = VllmChatModel( + model="qwen3", + api_key="dummy", + base_url="http://localhost:8000/v1", + extra_body={"chat_template_kwargs": {"enable_thinking": False, "foo": "bar"}}, + ) + + payload = model._get_request_payload([HumanMessage(content="Hello")]) + + assert payload["extra_body"]["chat_template_kwargs"] == { + "enable_thinking": False, + "foo": "bar", + } + + +def test_vllm_provider_preserves_reasoning_in_chat_result(): + model = _make_model() + result = model._create_chat_result( + { + "model": "Qwen/QwQ-32B", + "choices": [ + { + "message": { + "role": "assistant", + "content": "42", + "reasoning": "I compared the two numbers directly.", + }, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2}, + } + ) + + message = result.generations[0].message + assert message.additional_kwargs["reasoning"] == "I compared the two numbers directly." + assert message.additional_kwargs["reasoning_content"] == "I compared the two numbers directly." 
+ + +def test_vllm_provider_preserves_reasoning_in_streaming_chunks(): + model = _make_model() + chunk = model._convert_chunk_to_generation_chunk( + { + "model": "Qwen/QwQ-32B", + "choices": [ + { + "delta": { + "role": "assistant", + "reasoning": "First, call the weather tool.", + "content": "Calling tool...", + }, + "finish_reason": None, + } + ], + }, + AIMessageChunk, + {}, + ) + + assert chunk is not None + assert chunk.message.additional_kwargs["reasoning"] == "First, call the weather tool." + assert chunk.message.additional_kwargs["reasoning_content"] == "First, call the weather tool." + assert chunk.message.content == "Calling tool..." + + +def test_vllm_provider_preserves_empty_reasoning_values_in_streaming_chunks(): + model = _make_model() + chunk = model._convert_chunk_to_generation_chunk( + { + "model": "Qwen/QwQ-32B", + "choices": [ + { + "delta": { + "role": "assistant", + "reasoning": "", + "content": "Still replying...", + }, + "finish_reason": None, + } + ], + }, + AIMessageChunk, + {}, + ) + + assert chunk is not None + assert "reasoning" in chunk.message.additional_kwargs + assert chunk.message.additional_kwargs["reasoning"] == "" + assert "reasoning_content" not in chunk.message.additional_kwargs + assert chunk.message.content == "Still replying..." diff --git a/config.example.yaml b/config.example.yaml index 380527f42..7406649eb 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -245,6 +245,28 @@ models: # max_tokens: 8192 # temperature: 0.7 + # Example: vLLM 0.19.0 (OpenAI-compatible, with reasoning toggle) + # DeerFlow's vLLM provider preserves vLLM reasoning across tool-call turns and + # toggles Qwen-style reasoning by writing + # extra_body.chat_template_kwargs.enable_thinking=true/false. + # Some reasoning models also require the server to be started with + # `vllm serve ... --reasoning-parser <parser-name>`. 
+ # - name: qwen3-32b-vllm + # display_name: Qwen3 32B (vLLM) + # use: deerflow.models.vllm_provider:VllmChatModel + # model: Qwen/Qwen3-32B + # api_key: $VLLM_API_KEY + # base_url: http://localhost:8000/v1 + # request_timeout: 600.0 + # max_retries: 2 + # max_tokens: 8192 + # supports_thinking: true + # supports_vision: false + # when_thinking_enabled: + # extra_body: + # chat_template_kwargs: + # enable_thinking: true + # ============================================================================ # Tool Groups Configuration # ============================================================================