mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 03:08:09 +00:00
feat(models): add vLLM provider support (#1860)
support for vLLM 0.19.0 OpenAI-compatible chat endpoints and fixes the Qwen reasoning toggle so flash mode can actually disable thinking. Co-authored-by: NmanQAQ <normangyao@qq.com> Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
parent
5fd2c581f6
commit
dd30e609f7
@ -17,6 +17,7 @@ INFOQUEST_API_KEY=your-infoquest-api-key
|
||||
# DEEPSEEK_API_KEY=your-deepseek-api-key
|
||||
# NOVITA_API_KEY=your-novita-api-key # OpenAI-compatible, see https://novita.ai
|
||||
# MINIMAX_API_KEY=your-minimax-api-key # OpenAI-compatible, see https://platform.minimax.io
|
||||
# VLLM_API_KEY=your-vllm-api-key # OpenAI-compatible
|
||||
# FEISHU_APP_ID=your-feishu-app-id
|
||||
# FEISHU_APP_SECRET=your-feishu-app-secret
|
||||
|
||||
|
||||
14
README.md
14
README.md
@ -141,12 +141,26 @@ That prompt is intended for coding agents. It tells the agent to clone the repo
|
||||
api_key: $OPENAI_API_KEY
|
||||
use_responses_api: true
|
||||
output_version: responses/v1
|
||||
|
||||
- name: qwen3-32b-vllm
|
||||
display_name: Qwen3 32B (vLLM)
|
||||
use: deerflow.models.vllm_provider:VllmChatModel
|
||||
model: Qwen/Qwen3-32B
|
||||
api_key: $VLLM_API_KEY
|
||||
base_url: http://localhost:8000/v1
|
||||
supports_thinking: true
|
||||
when_thinking_enabled:
|
||||
extra_body:
|
||||
chat_template_kwargs:
|
||||
enable_thinking: true
|
||||
```
|
||||
|
||||
OpenRouter and similar OpenAI-compatible gateways should be configured with `langchain_openai:ChatOpenAI` plus `base_url`. If you prefer a provider-specific environment variable name, point `api_key` at that variable explicitly (for example `api_key: $OPENROUTER_API_KEY`).
|
||||
|
||||
To route OpenAI models through `/v1/responses`, keep using `langchain_openai:ChatOpenAI` and set `use_responses_api: true` with `output_version: responses/v1`.
|
||||
|
||||
For vLLM 0.19.0, use `deerflow.models.vllm_provider:VllmChatModel`. For Qwen-style reasoning models, DeerFlow toggles reasoning with `extra_body.chat_template_kwargs.enable_thinking` and preserves vLLM's non-standard `reasoning` field across multi-turn tool-call conversations. Legacy `thinking` configs are normalized automatically for backward compatibility. Reasoning models may also require the server to be started with `--reasoning-parser ...`. If your local vLLM deployment accepts any non-empty API key, you can still set `VLLM_API_KEY` to a placeholder value.
|
||||
|
||||
CLI-backed provider examples:
|
||||
|
||||
```yaml
|
||||
|
||||
@ -293,10 +293,17 @@ Proxied through nginx: `/api/langgraph/*` → LangGraph, all other `/api/*` →
|
||||
|
||||
- `create_chat_model(name, thinking_enabled)` instantiates LLM from config via reflection
|
||||
- Supports `thinking_enabled` flag with per-model `when_thinking_enabled` overrides
|
||||
- Supports vLLM-style thinking toggles via `when_thinking_enabled.extra_body.chat_template_kwargs.enable_thinking` for Qwen reasoning models, while normalizing legacy `thinking` configs for backward compatibility
|
||||
- Supports `supports_vision` flag for image understanding models
|
||||
- Config values starting with `$` resolved as environment variables
|
||||
- Missing provider modules surface actionable install hints from reflection resolvers (for example `uv add langchain-google-genai`)
|
||||
|
||||
### vLLM Provider (`packages/harness/deerflow/models/vllm_provider.py`)
|
||||
|
||||
- `VllmChatModel` subclasses `langchain_openai:ChatOpenAI` for vLLM 0.19.0 OpenAI-compatible endpoints
|
||||
- Preserves vLLM's non-standard assistant `reasoning` field on full responses, streaming deltas, and follow-up tool-call turns
|
||||
- Designed for configs that enable thinking through `extra_body.chat_template_kwargs.enable_thinking` on vLLM 0.19.0 Qwen reasoning models, while accepting the older `thinking` alias
|
||||
|
||||
### IM Channels System (`app/channels/`)
|
||||
|
||||
Bridges external messaging platforms (Feishu, Slack, Telegram) to the DeerFlow agent via the LangGraph Server.
|
||||
@ -365,6 +372,7 @@ Focused regression coverage for the updater lives in `backend/tests/test_memory_
|
||||
|
||||
**`config.yaml`** key sections:
|
||||
- `models[]` - LLM configs with `use` class path, `supports_thinking`, `supports_vision`, provider-specific fields
|
||||
- vLLM reasoning models should use `deerflow.models.vllm_provider:VllmChatModel`; for Qwen-style parsers prefer `when_thinking_enabled.extra_body.chat_template_kwargs.enable_thinking`, and DeerFlow will also normalize the older `thinking` alias
|
||||
- `tools[]` - Tool configs with `use` variable path and `group`
|
||||
- `tool_groups[]` - Logical groupings for tools
|
||||
- `sandbox.use` - Sandbox provider class path
|
||||
|
||||
@ -9,6 +9,27 @@ from deerflow.tracing import build_tracing_callbacks
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _deep_merge_dicts(base: dict | None, override: dict) -> dict:
|
||||
"""Recursively merge two dictionaries without mutating the inputs."""
|
||||
merged = dict(base or {})
|
||||
for key, value in override.items():
|
||||
if isinstance(value, dict) and isinstance(merged.get(key), dict):
|
||||
merged[key] = _deep_merge_dicts(merged[key], value)
|
||||
else:
|
||||
merged[key] = value
|
||||
return merged
|
||||
|
||||
|
||||
def _vllm_disable_chat_template_kwargs(chat_template_kwargs: dict) -> dict:
|
||||
"""Build the disable payload for vLLM/Qwen chat template kwargs."""
|
||||
disable_kwargs: dict[str, bool] = {}
|
||||
if "thinking" in chat_template_kwargs:
|
||||
disable_kwargs["thinking"] = False
|
||||
if "enable_thinking" in chat_template_kwargs:
|
||||
disable_kwargs["enable_thinking"] = False
|
||||
return disable_kwargs
|
||||
|
||||
|
||||
def create_chat_model(name: str | None = None, thinking_enabled: bool = False, **kwargs) -> BaseChatModel:
|
||||
"""Create a chat model instance from the config.
|
||||
|
||||
@ -54,13 +75,23 @@ def create_chat_model(name: str | None = None, thinking_enabled: bool = False, *
|
||||
if not thinking_enabled and has_thinking_settings:
|
||||
if effective_wte.get("extra_body", {}).get("thinking", {}).get("type"):
|
||||
# OpenAI-compatible gateway: thinking is nested under extra_body
|
||||
kwargs.update({"extra_body": {"thinking": {"type": "disabled"}}})
|
||||
kwargs.update({"reasoning_effort": "minimal"})
|
||||
model_settings_from_config["extra_body"] = _deep_merge_dicts(
|
||||
model_settings_from_config.get("extra_body"),
|
||||
{"thinking": {"type": "disabled"}},
|
||||
)
|
||||
model_settings_from_config["reasoning_effort"] = "minimal"
|
||||
elif disable_chat_template_kwargs := _vllm_disable_chat_template_kwargs(effective_wte.get("extra_body", {}).get("chat_template_kwargs") or {}):
|
||||
# vLLM uses chat template kwargs to switch thinking on/off.
|
||||
model_settings_from_config["extra_body"] = _deep_merge_dicts(
|
||||
model_settings_from_config.get("extra_body"),
|
||||
{"chat_template_kwargs": disable_chat_template_kwargs},
|
||||
)
|
||||
elif effective_wte.get("thinking", {}).get("type"):
|
||||
# Native langchain_anthropic: thinking is a direct constructor parameter
|
||||
kwargs.update({"thinking": {"type": "disabled"}})
|
||||
if not model_config.supports_reasoning_effort and "reasoning_effort" in kwargs:
|
||||
del kwargs["reasoning_effort"]
|
||||
model_settings_from_config["thinking"] = {"type": "disabled"}
|
||||
if not model_config.supports_reasoning_effort:
|
||||
kwargs.pop("reasoning_effort", None)
|
||||
model_settings_from_config.pop("reasoning_effort", None)
|
||||
|
||||
# For Codex Responses API models: map thinking mode to reasoning_effort
|
||||
from deerflow.models.openai_codex_provider import CodexChatModel
|
||||
|
||||
258
backend/packages/harness/deerflow/models/vllm_provider.py
Normal file
258
backend/packages/harness/deerflow/models/vllm_provider.py
Normal file
@ -0,0 +1,258 @@
|
||||
"""Custom vLLM provider built on top of LangChain ChatOpenAI.
|
||||
|
||||
vLLM 0.19.0 exposes reasoning models through an OpenAI-compatible API, but
|
||||
LangChain's default OpenAI adapter drops the non-standard ``reasoning`` field
|
||||
from assistant messages and streaming deltas. That breaks interleaved
|
||||
thinking/tool-call flows because vLLM expects the assistant's prior reasoning to
|
||||
be echoed back on subsequent turns.
|
||||
|
||||
This provider preserves ``reasoning`` on:
|
||||
- non-streaming responses
|
||||
- streaming deltas
|
||||
- multi-turn request payloads
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, cast
|
||||
|
||||
import openai
|
||||
from langchain_core.language_models import LanguageModelInput
|
||||
from langchain_core.messages import (
|
||||
AIMessage,
|
||||
AIMessageChunk,
|
||||
BaseMessageChunk,
|
||||
ChatMessageChunk,
|
||||
FunctionMessageChunk,
|
||||
HumanMessageChunk,
|
||||
SystemMessageChunk,
|
||||
ToolMessageChunk,
|
||||
)
|
||||
from langchain_core.messages.tool import tool_call_chunk
|
||||
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain_openai.chat_models.base import _create_usage_metadata
|
||||
|
||||
|
||||
def _normalize_vllm_chat_template_kwargs(payload: dict[str, Any]) -> None:
|
||||
"""Map DeerFlow's legacy ``thinking`` toggle to vLLM/Qwen's ``enable_thinking``.
|
||||
|
||||
DeerFlow originally documented ``extra_body.chat_template_kwargs.thinking``
|
||||
for vLLM, but vLLM 0.19.0's Qwen reasoning parser reads
|
||||
``chat_template_kwargs.enable_thinking``. Normalize the payload just before
|
||||
it is sent so existing configs keep working and flash mode can truly
|
||||
disable reasoning.
|
||||
"""
|
||||
extra_body = payload.get("extra_body")
|
||||
if not isinstance(extra_body, dict):
|
||||
return
|
||||
|
||||
chat_template_kwargs = extra_body.get("chat_template_kwargs")
|
||||
if not isinstance(chat_template_kwargs, dict):
|
||||
return
|
||||
|
||||
if "thinking" not in chat_template_kwargs:
|
||||
return
|
||||
|
||||
normalized_chat_template_kwargs = dict(chat_template_kwargs)
|
||||
normalized_chat_template_kwargs.setdefault("enable_thinking", normalized_chat_template_kwargs["thinking"])
|
||||
normalized_chat_template_kwargs.pop("thinking", None)
|
||||
extra_body["chat_template_kwargs"] = normalized_chat_template_kwargs
|
||||
|
||||
|
||||
def _reasoning_to_text(reasoning: Any) -> str:
|
||||
"""Best-effort extraction of readable reasoning text from vLLM payloads."""
|
||||
if isinstance(reasoning, str):
|
||||
return reasoning
|
||||
|
||||
if isinstance(reasoning, list):
|
||||
parts = [_reasoning_to_text(item) for item in reasoning]
|
||||
return "".join(part for part in parts if part)
|
||||
|
||||
if isinstance(reasoning, dict):
|
||||
for key in ("text", "content", "reasoning"):
|
||||
value = reasoning.get(key)
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
if value is not None:
|
||||
text = _reasoning_to_text(value)
|
||||
if text:
|
||||
return text
|
||||
try:
|
||||
return json.dumps(reasoning, ensure_ascii=False)
|
||||
except TypeError:
|
||||
return str(reasoning)
|
||||
|
||||
try:
|
||||
return json.dumps(reasoning, ensure_ascii=False)
|
||||
except TypeError:
|
||||
return str(reasoning)
|
||||
|
||||
|
||||
def _convert_delta_to_message_chunk_with_reasoning(_dict: Mapping[str, Any], default_class: type[BaseMessageChunk]) -> BaseMessageChunk:
|
||||
"""Convert a streaming delta to a LangChain message chunk while preserving reasoning."""
|
||||
id_ = _dict.get("id")
|
||||
role = cast(str, _dict.get("role"))
|
||||
content = cast(str, _dict.get("content") or "")
|
||||
additional_kwargs: dict[str, Any] = {}
|
||||
|
||||
if _dict.get("function_call"):
|
||||
function_call = dict(_dict["function_call"])
|
||||
if "name" in function_call and function_call["name"] is None:
|
||||
function_call["name"] = ""
|
||||
additional_kwargs["function_call"] = function_call
|
||||
|
||||
reasoning = _dict.get("reasoning")
|
||||
if reasoning is not None:
|
||||
additional_kwargs["reasoning"] = reasoning
|
||||
reasoning_text = _reasoning_to_text(reasoning)
|
||||
if reasoning_text:
|
||||
additional_kwargs["reasoning_content"] = reasoning_text
|
||||
|
||||
tool_call_chunks = []
|
||||
if raw_tool_calls := _dict.get("tool_calls"):
|
||||
try:
|
||||
tool_call_chunks = [
|
||||
tool_call_chunk(
|
||||
name=rtc["function"].get("name"),
|
||||
args=rtc["function"].get("arguments"),
|
||||
id=rtc.get("id"),
|
||||
index=rtc["index"],
|
||||
)
|
||||
for rtc in raw_tool_calls
|
||||
]
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
if role == "user" or default_class == HumanMessageChunk:
|
||||
return HumanMessageChunk(content=content, id=id_)
|
||||
if role == "assistant" or default_class == AIMessageChunk:
|
||||
return AIMessageChunk(
|
||||
content=content,
|
||||
additional_kwargs=additional_kwargs,
|
||||
id=id_,
|
||||
tool_call_chunks=tool_call_chunks, # type: ignore[arg-type]
|
||||
)
|
||||
if role in ("system", "developer") or default_class == SystemMessageChunk:
|
||||
role_kwargs = {"__openai_role__": "developer"} if role == "developer" else {}
|
||||
return SystemMessageChunk(content=content, id=id_, additional_kwargs=role_kwargs)
|
||||
if role == "function" or default_class == FunctionMessageChunk:
|
||||
return FunctionMessageChunk(content=content, name=_dict["name"], id=id_)
|
||||
if role == "tool" or default_class == ToolMessageChunk:
|
||||
return ToolMessageChunk(content=content, tool_call_id=_dict["tool_call_id"], id=id_)
|
||||
if role or default_class == ChatMessageChunk:
|
||||
return ChatMessageChunk(content=content, role=role, id=id_) # type: ignore[arg-type]
|
||||
return default_class(content=content, id=id_) # type: ignore[call-arg]
|
||||
|
||||
|
||||
def _restore_reasoning_field(payload_msg: dict[str, Any], orig_msg: AIMessage) -> None:
|
||||
"""Re-inject vLLM reasoning onto outgoing assistant messages."""
|
||||
reasoning = orig_msg.additional_kwargs.get("reasoning")
|
||||
if reasoning is None:
|
||||
reasoning = orig_msg.additional_kwargs.get("reasoning_content")
|
||||
if reasoning is not None:
|
||||
payload_msg["reasoning"] = reasoning
|
||||
|
||||
|
||||
class VllmChatModel(ChatOpenAI):
|
||||
"""ChatOpenAI variant that preserves vLLM reasoning fields across turns."""
|
||||
|
||||
model_config = {"arbitrary_types_allowed": True}
|
||||
|
||||
@property
|
||||
def _llm_type(self) -> str:
|
||||
return "vllm-openai-compatible"
|
||||
|
||||
def _get_request_payload(
|
||||
self,
|
||||
input_: LanguageModelInput,
|
||||
*,
|
||||
stop: list[str] | None = None,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Restore assistant reasoning in request payloads for interleaved thinking."""
|
||||
original_messages = self._convert_input(input_).to_messages()
|
||||
payload = super()._get_request_payload(input_, stop=stop, **kwargs)
|
||||
_normalize_vllm_chat_template_kwargs(payload)
|
||||
payload_messages = payload.get("messages", [])
|
||||
|
||||
if len(payload_messages) == len(original_messages):
|
||||
for payload_msg, orig_msg in zip(payload_messages, original_messages):
|
||||
if payload_msg.get("role") == "assistant" and isinstance(orig_msg, AIMessage):
|
||||
_restore_reasoning_field(payload_msg, orig_msg)
|
||||
else:
|
||||
ai_messages = [message for message in original_messages if isinstance(message, AIMessage)]
|
||||
assistant_payloads = [message for message in payload_messages if message.get("role") == "assistant"]
|
||||
for payload_msg, ai_msg in zip(assistant_payloads, ai_messages):
|
||||
_restore_reasoning_field(payload_msg, ai_msg)
|
||||
|
||||
return payload
|
||||
|
||||
def _create_chat_result(self, response: dict | openai.BaseModel, generation_info: dict | None = None) -> ChatResult:
|
||||
"""Preserve vLLM reasoning on non-streaming responses."""
|
||||
result = super()._create_chat_result(response, generation_info=generation_info)
|
||||
response_dict = response if isinstance(response, dict) else response.model_dump()
|
||||
|
||||
for generation, choice in zip(result.generations, response_dict.get("choices", [])):
|
||||
if not isinstance(generation, ChatGeneration):
|
||||
continue
|
||||
message = generation.message
|
||||
if not isinstance(message, AIMessage):
|
||||
continue
|
||||
reasoning = choice.get("message", {}).get("reasoning")
|
||||
if reasoning is None:
|
||||
continue
|
||||
message.additional_kwargs["reasoning"] = reasoning
|
||||
reasoning_text = _reasoning_to_text(reasoning)
|
||||
if reasoning_text:
|
||||
message.additional_kwargs["reasoning_content"] = reasoning_text
|
||||
|
||||
return result
|
||||
|
||||
def _convert_chunk_to_generation_chunk(
|
||||
self,
|
||||
chunk: dict,
|
||||
default_chunk_class: type,
|
||||
base_generation_info: dict | None,
|
||||
) -> ChatGenerationChunk | None:
|
||||
"""Preserve vLLM reasoning on streaming deltas."""
|
||||
if chunk.get("type") == "content.delta":
|
||||
return None
|
||||
|
||||
token_usage = chunk.get("usage")
|
||||
choices = chunk.get("choices", []) or chunk.get("chunk", {}).get("choices", [])
|
||||
usage_metadata = _create_usage_metadata(token_usage, chunk.get("service_tier")) if token_usage else None
|
||||
|
||||
if len(choices) == 0:
|
||||
generation_chunk = ChatGenerationChunk(message=default_chunk_class(content="", usage_metadata=usage_metadata), generation_info=base_generation_info)
|
||||
if self.output_version == "v1":
|
||||
generation_chunk.message.content = []
|
||||
generation_chunk.message.response_metadata["output_version"] = "v1"
|
||||
return generation_chunk
|
||||
|
||||
choice = choices[0]
|
||||
if choice["delta"] is None:
|
||||
return None
|
||||
|
||||
message_chunk = _convert_delta_to_message_chunk_with_reasoning(choice["delta"], default_chunk_class)
|
||||
generation_info = {**base_generation_info} if base_generation_info else {}
|
||||
|
||||
if finish_reason := choice.get("finish_reason"):
|
||||
generation_info["finish_reason"] = finish_reason
|
||||
if model_name := chunk.get("model"):
|
||||
generation_info["model_name"] = model_name
|
||||
if system_fingerprint := chunk.get("system_fingerprint"):
|
||||
generation_info["system_fingerprint"] = system_fingerprint
|
||||
if service_tier := chunk.get("service_tier"):
|
||||
generation_info["service_tier"] = service_tier
|
||||
|
||||
if logprobs := choice.get("logprobs"):
|
||||
generation_info["logprobs"] = logprobs
|
||||
|
||||
if usage_metadata and isinstance(message_chunk, AIMessageChunk):
|
||||
message_chunk.usage_metadata = usage_metadata
|
||||
|
||||
message_chunk.response_metadata["model_provider"] = "openai"
|
||||
return ChatGenerationChunk(message=message_chunk, generation_info=generation_info or None)
|
||||
@ -604,6 +604,63 @@ def test_codex_provider_strips_unsupported_max_tokens(monkeypatch):
|
||||
assert "max_tokens" not in FakeChatModel.captured_kwargs
|
||||
|
||||
|
||||
def test_thinking_disabled_vllm_chat_template_format(monkeypatch):
|
||||
wte = {"extra_body": {"chat_template_kwargs": {"thinking": True}}}
|
||||
model = _make_model(
|
||||
"vllm-qwen",
|
||||
use="deerflow.models.vllm_provider:VllmChatModel",
|
||||
supports_thinking=True,
|
||||
when_thinking_enabled=wte,
|
||||
)
|
||||
model.extra_body = {"top_k": 20}
|
||||
cfg = _make_app_config([model])
|
||||
_patch_factory(monkeypatch, cfg)
|
||||
|
||||
captured: dict = {}
|
||||
|
||||
class CapturingModel(FakeChatModel):
|
||||
def __init__(self, **kwargs):
|
||||
captured.update(kwargs)
|
||||
BaseChatModel.__init__(self, **kwargs)
|
||||
|
||||
monkeypatch.setattr(factory_module, "resolve_class", lambda path, base: CapturingModel)
|
||||
|
||||
factory_module.create_chat_model(name="vllm-qwen", thinking_enabled=False)
|
||||
|
||||
assert captured.get("extra_body") == {"top_k": 20, "chat_template_kwargs": {"thinking": False}}
|
||||
assert captured.get("reasoning_effort") is None
|
||||
|
||||
|
||||
def test_thinking_disabled_vllm_enable_thinking_format(monkeypatch):
|
||||
wte = {"extra_body": {"chat_template_kwargs": {"enable_thinking": True}}}
|
||||
model = _make_model(
|
||||
"vllm-qwen-enable",
|
||||
use="deerflow.models.vllm_provider:VllmChatModel",
|
||||
supports_thinking=True,
|
||||
when_thinking_enabled=wte,
|
||||
)
|
||||
model.extra_body = {"top_k": 20}
|
||||
cfg = _make_app_config([model])
|
||||
_patch_factory(monkeypatch, cfg)
|
||||
|
||||
captured: dict = {}
|
||||
|
||||
class CapturingModel(FakeChatModel):
|
||||
def __init__(self, **kwargs):
|
||||
captured.update(kwargs)
|
||||
BaseChatModel.__init__(self, **kwargs)
|
||||
|
||||
monkeypatch.setattr(factory_module, "resolve_class", lambda path, base: CapturingModel)
|
||||
|
||||
factory_module.create_chat_model(name="vllm-qwen-enable", thinking_enabled=False)
|
||||
|
||||
assert captured.get("extra_body") == {
|
||||
"top_k": 20,
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
}
|
||||
assert captured.get("reasoning_effort") is None
|
||||
|
||||
|
||||
def test_openai_responses_api_settings_are_passed_to_chatopenai(monkeypatch):
|
||||
model = ModelConfig(
|
||||
name="gpt-5-responses",
|
||||
|
||||
138
backend/tests/test_vllm_provider.py
Normal file
138
backend/tests/test_vllm_provider.py
Normal file
@ -0,0 +1,138 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from langchain_core.messages import AIMessage, AIMessageChunk, HumanMessage
|
||||
|
||||
from deerflow.models.vllm_provider import VllmChatModel
|
||||
|
||||
|
||||
def _make_model() -> VllmChatModel:
|
||||
return VllmChatModel(
|
||||
model="Qwen/QwQ-32B",
|
||||
api_key="dummy",
|
||||
base_url="http://localhost:8000/v1",
|
||||
)
|
||||
|
||||
|
||||
def test_vllm_provider_restores_reasoning_in_request_payload():
|
||||
model = _make_model()
|
||||
payload = model._get_request_payload(
|
||||
[
|
||||
AIMessage(
|
||||
content="",
|
||||
tool_calls=[{"name": "bash", "args": {"cmd": "pwd"}, "id": "tool-1", "type": "tool_call"}],
|
||||
additional_kwargs={"reasoning": "Need to inspect the workspace first."},
|
||||
),
|
||||
HumanMessage(content="Continue"),
|
||||
]
|
||||
)
|
||||
|
||||
assistant_message = payload["messages"][0]
|
||||
assert assistant_message["role"] == "assistant"
|
||||
assert assistant_message["reasoning"] == "Need to inspect the workspace first."
|
||||
assert assistant_message["tool_calls"][0]["function"]["name"] == "bash"
|
||||
|
||||
|
||||
def test_vllm_provider_normalizes_legacy_thinking_kwarg_to_enable_thinking():
|
||||
model = VllmChatModel(
|
||||
model="qwen3",
|
||||
api_key="dummy",
|
||||
base_url="http://localhost:8000/v1",
|
||||
extra_body={"chat_template_kwargs": {"thinking": True}},
|
||||
)
|
||||
|
||||
payload = model._get_request_payload([HumanMessage(content="Hello")])
|
||||
|
||||
assert payload["extra_body"]["chat_template_kwargs"] == {"enable_thinking": True}
|
||||
|
||||
|
||||
def test_vllm_provider_preserves_explicit_enable_thinking_kwarg():
|
||||
model = VllmChatModel(
|
||||
model="qwen3",
|
||||
api_key="dummy",
|
||||
base_url="http://localhost:8000/v1",
|
||||
extra_body={"chat_template_kwargs": {"enable_thinking": False, "foo": "bar"}},
|
||||
)
|
||||
|
||||
payload = model._get_request_payload([HumanMessage(content="Hello")])
|
||||
|
||||
assert payload["extra_body"]["chat_template_kwargs"] == {
|
||||
"enable_thinking": False,
|
||||
"foo": "bar",
|
||||
}
|
||||
|
||||
|
||||
def test_vllm_provider_preserves_reasoning_in_chat_result():
|
||||
model = _make_model()
|
||||
result = model._create_chat_result(
|
||||
{
|
||||
"model": "Qwen/QwQ-32B",
|
||||
"choices": [
|
||||
{
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "42",
|
||||
"reasoning": "I compared the two numbers directly.",
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
}
|
||||
],
|
||||
"usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
|
||||
}
|
||||
)
|
||||
|
||||
message = result.generations[0].message
|
||||
assert message.additional_kwargs["reasoning"] == "I compared the two numbers directly."
|
||||
assert message.additional_kwargs["reasoning_content"] == "I compared the two numbers directly."
|
||||
|
||||
|
||||
def test_vllm_provider_preserves_reasoning_in_streaming_chunks():
|
||||
model = _make_model()
|
||||
chunk = model._convert_chunk_to_generation_chunk(
|
||||
{
|
||||
"model": "Qwen/QwQ-32B",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"role": "assistant",
|
||||
"reasoning": "First, call the weather tool.",
|
||||
"content": "Calling tool...",
|
||||
},
|
||||
"finish_reason": None,
|
||||
}
|
||||
],
|
||||
},
|
||||
AIMessageChunk,
|
||||
{},
|
||||
)
|
||||
|
||||
assert chunk is not None
|
||||
assert chunk.message.additional_kwargs["reasoning"] == "First, call the weather tool."
|
||||
assert chunk.message.additional_kwargs["reasoning_content"] == "First, call the weather tool."
|
||||
assert chunk.message.content == "Calling tool..."
|
||||
|
||||
|
||||
def test_vllm_provider_preserves_empty_reasoning_values_in_streaming_chunks():
|
||||
model = _make_model()
|
||||
chunk = model._convert_chunk_to_generation_chunk(
|
||||
{
|
||||
"model": "Qwen/QwQ-32B",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"role": "assistant",
|
||||
"reasoning": "",
|
||||
"content": "Still replying...",
|
||||
},
|
||||
"finish_reason": None,
|
||||
}
|
||||
],
|
||||
},
|
||||
AIMessageChunk,
|
||||
{},
|
||||
)
|
||||
|
||||
assert chunk is not None
|
||||
assert "reasoning" in chunk.message.additional_kwargs
|
||||
assert chunk.message.additional_kwargs["reasoning"] == ""
|
||||
assert "reasoning_content" not in chunk.message.additional_kwargs
|
||||
assert chunk.message.content == "Still replying..."
|
||||
@ -245,6 +245,28 @@ models:
|
||||
# max_tokens: 8192
|
||||
# temperature: 0.7
|
||||
|
||||
# Example: vLLM 0.19.0 (OpenAI-compatible, with reasoning toggle)
|
||||
# DeerFlow's vLLM provider preserves vLLM reasoning across tool-call turns and
|
||||
# toggles Qwen-style reasoning by writing
|
||||
# extra_body.chat_template_kwargs.enable_thinking=true/false.
|
||||
# Some reasoning models also require the server to be started with
|
||||
# `vllm serve ... --reasoning-parser <parser>`.
|
||||
# - name: qwen3-32b-vllm
|
||||
# display_name: Qwen3 32B (vLLM)
|
||||
# use: deerflow.models.vllm_provider:VllmChatModel
|
||||
# model: Qwen/Qwen3-32B
|
||||
# api_key: $VLLM_API_KEY
|
||||
# base_url: http://localhost:8000/v1
|
||||
# request_timeout: 600.0
|
||||
# max_retries: 2
|
||||
# max_tokens: 8192
|
||||
# supports_thinking: true
|
||||
# supports_vision: false
|
||||
# when_thinking_enabled:
|
||||
# extra_body:
|
||||
# chat_template_kwargs:
|
||||
# enable_thinking: true
|
||||
|
||||
# ============================================================================
|
||||
# Tool Groups Configuration
|
||||
# ============================================================================
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user