mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-09 09:02:02 +00:00
fix(suggestions): strip inline <think> reasoning before parsing follow-up questions (#3435)
Reasoning models such as MiniMax-M3 inline their chain-of-thought into the
message content as <think>...</think> (reasoning_split defaults to false)
instead of a separate reasoning_content field. The follow-up-suggestions
endpoint extracted the JSON array via find('[') / rfind(']'), which silently
broke whenever the reasoning text contained '[' or ']' — or when long thinking
hit max_tokens and truncated before the array was emitted — returning empty
suggestions.
- Add _strip_think_blocks() and apply it before JSON extraction; it removes
complete <think>...</think> blocks (case-insensitive) and drops an unclosed
<think> left by max_tokens truncation.
- Document the MiniMax thinking toggle in config.example.yaml
(when_thinking_enabled: adaptive / when_thinking_disabled: disabled) so
thinking_enabled=False actually disables reasoning on M3; note that M2.x
models always think and rely on the defensive strip above.
- Tests cover complete/unclosed think blocks, brackets-inside-think, think +
code-fence, and an end-to-end suggestions case reproducing the empty-result
bug.
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
88759015e4
commit
3b105d1e5f
@ -263,7 +263,7 @@ CORS is same-origin by default when requests enter through nginx on port 2026. S
|
||||
| **Uploads** (`/api/threads/{id}/uploads`) | `POST /` - upload files (auto-converts PDF/PPT/Excel/Word); `GET /list` - list; `DELETE /{filename}` - delete |
|
||||
| **Threads** (`/api/threads/{id}`) | `DELETE /` - remove DeerFlow-managed local thread data after LangGraph thread deletion; unexpected failures are logged server-side and return a generic 500 detail |
|
||||
| **Artifacts** (`/api/threads/{id}/artifacts`) | `GET /{path}` - serve artifacts; active content types (`text/html`, `application/xhtml+xml`, `image/svg+xml`) are always forced as download attachments to reduce XSS risk; `?download=true` still forces download for other file types |
|
||||
| **Suggestions** (`/api/threads/{id}/suggestions`) | `POST /` - generate follow-up questions; rich list/block model content is normalized before JSON parsing |
|
||||
| **Suggestions** (`/api/threads/{id}/suggestions`) | `POST /` - generate follow-up questions; rich list/block model content is normalized and inline reasoning (`<think>...</think>`, including unclosed/truncated blocks from reasoning models like MiniMax-M3) is stripped before JSON parsing |
|
||||
| **Thread Runs** (`/api/threads/{id}/runs`) | `POST /` - create background run; `POST /stream` - create + SSE stream; `POST /wait` - create + block; `GET /` - list runs; `GET /{rid}` - run details; `POST /{rid}/cancel` - cancel; `GET /{rid}/join` - join SSE; `GET /{rid}/messages` - paginated messages `{data, has_more}`; `GET /{rid}/events` - full event stream; `GET /../messages` - thread messages with feedback; `GET /../token-usage` - aggregate tokens |
|
||||
| **Feedback** (`/api/threads/{id}/runs/{rid}/feedback`) | `PUT /` - upsert feedback; `DELETE /` - delete user feedback; `POST /` - create feedback; `GET /` - list feedback; `GET /stats` - aggregate stats; `DELETE /{fid}` - delete specific |
|
||||
| **Runs** (`/api/runs`) | `POST /stream` - stateless run + SSE; `POST /wait` - stateless run + block; `GET /{rid}/messages` - paginated messages by run_id `{data, has_more}` (cursor: `after_seq`/`before_seq`); `GET /{rid}/feedback` - list feedback by run_id |
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
@ -30,6 +31,31 @@ class SuggestionsResponse(BaseModel):
|
||||
suggestions: list[str] = Field(default_factory=list, description="Suggested follow-up questions")
|
||||
|
||||
|
||||
# Matches a complete <think>...</think> block (case-insensitive, spans newlines).
|
||||
_THINK_BLOCK_RE = re.compile(r"<think\b[^>]*>.*?</think\s*>", re.IGNORECASE | re.DOTALL)
|
||||
# Matches a dangling, unclosed <think> (model truncated at max_tokens mid-thought).
|
||||
_OPEN_THINK_RE = re.compile(r"<think\b[^>]*>", re.IGNORECASE)
|
||||
|
||||
|
||||
def _strip_think_blocks(text: str) -> str:
|
||||
"""Remove reasoning-model ``<think>...</think>`` blocks from the response.
|
||||
|
||||
Reasoning models such as MiniMax-M3 inline their chain-of-thought into the
|
||||
message ``content`` wrapped in ``<think>...</think>`` (``reasoning_split``
|
||||
defaults to false), rather than exposing a separate ``reasoning_content``
|
||||
field. The thinking text frequently contains ``[`` / ``]`` characters, which
|
||||
corrupted the downstream ``find('[')`` / ``rfind(']')`` JSON extraction and
|
||||
produced empty suggestions. We strip the reasoning before parsing so only
|
||||
the actual answer remains.
|
||||
"""
|
||||
text = _THINK_BLOCK_RE.sub("", text)
|
||||
# Drop any unclosed <think> (and everything after it) left by truncation.
|
||||
open_match = _OPEN_THINK_RE.search(text)
|
||||
if open_match:
|
||||
text = text[: open_match.start()]
|
||||
return text.strip()
|
||||
|
||||
|
||||
def _strip_markdown_code_fence(text: str) -> str:
|
||||
stripped = text.strip()
|
||||
if not stripped.startswith("```"):
|
||||
@ -41,7 +67,8 @@ def _strip_markdown_code_fence(text: str) -> str:
|
||||
|
||||
|
||||
def _parse_json_string_list(text: str) -> list[str] | None:
|
||||
candidate = _strip_markdown_code_fence(text)
|
||||
candidate = _strip_think_blocks(text)
|
||||
candidate = _strip_markdown_code_fence(candidate)
|
||||
start = candidate.find("[")
|
||||
end = candidate.rfind("]")
|
||||
if start == -1 or end == -1 or end <= start:
|
||||
|
||||
@ -25,6 +25,60 @@ def test_parse_json_string_list_rejects_non_list():
|
||||
assert suggestions._parse_json_string_list(text) is None
|
||||
|
||||
|
||||
def test_strip_think_blocks_removes_complete_block():
    """A fully closed think block is removed, leaving only the answer text."""
    raw = "<think>\nreasoning here\n</think>\nanswer"
    cleaned = suggestions._strip_think_blocks(raw)
    assert cleaned == "answer"
|
||||
|
||||
|
||||
def test_strip_think_blocks_is_case_insensitive():
    """Tag matching ignores letter case on both the opening and closing tag."""
    mixed_case = "<Think>reasoning</THINK>\nanswer"
    cleaned = suggestions._strip_think_blocks(mixed_case)
    assert cleaned == "answer"
|
||||
|
||||
|
||||
def test_strip_think_blocks_drops_unclosed_block():
    """An opening tag with no closing tag yields an empty result."""
    # Reasoning models truncated at max_tokens emit an unclosed <think>.
    truncated = "<think>\nreasoning that never finished because tokens ran out"
    leftover = suggestions._strip_think_blocks(truncated)
    assert leftover == ""
|
||||
|
||||
|
||||
def test_strip_think_blocks_keeps_text_without_think():
    """Input without any think tag passes through unchanged."""
    payload = '["a", "b"]'
    cleaned = suggestions._strip_think_blocks(payload)
    assert cleaned == '["a", "b"]'
|
||||
|
||||
|
||||
def test_parse_json_string_list_ignores_brackets_inside_think_block():
    """Brackets inside the reasoning must not affect which array is parsed."""
    # MiniMax-M3 inlines its chain-of-thought as <think>...</think> in content
    # (reasoning_split=false). When that reasoning contains '[' / ']', the old
    # find('[')/rfind(']') logic grabbed the wrong span and parsing failed.
    raw = '<think>\nMaybe a list like ["x", "y"] could work. Let me craft 3.\n</think>\n["Q1", "Q2", "Q3"]'
    parsed = suggestions._parse_json_string_list(raw)
    assert parsed == ["Q1", "Q2", "Q3"]
|
||||
|
||||
|
||||
def test_parse_json_string_list_strips_think_then_code_fence():
    """A think block followed by a fenced JSON array still parses to the list."""
    raw = '<think>reasoning</think>\n```json\n["Q1", "Q2"]\n```'
    parsed = suggestions._parse_json_string_list(raw)
    assert parsed == ["Q1", "Q2"]
|
||||
|
||||
|
||||
def test_generate_suggestions_strips_inline_think_block(monkeypatch):
    """End-to-end: inline thinking before the JSON array does not hide the suggestions."""
    # Stubbed model reply: thinking inline, followed by the JSON array.
    content = '<think>\nThe user asked about deep learning. Options: maybe [1] frameworks, [2] math basics.\n</think>\n["深度学习和机器学习的区别?", "常用框架有哪些?", "需要什么数学基础?"]'
    fake_model = MagicMock()
    fake_model.ainvoke = AsyncMock(return_value=MagicMock(content=content))

    def _fake_create_chat_model(**kwargs):
        # Stand-in factory: accepts keyword arguments only and returns the stub.
        return fake_model

    monkeypatch.setattr(suggestions, "create_chat_model", _fake_create_chat_model)

    conversation = [
        suggestions.SuggestionMessage(role="user", content="介绍深度学习"),
        suggestions.SuggestionMessage(role="assistant", content="深度学习是机器学习的分支。"),
    ]
    req = suggestions.SuggestionsRequest(messages=conversation, n=3, model_name=None)

    # Call the undecorated coroutine directly via __wrapped__.
    coro = suggestions.generate_suggestions.__wrapped__("t1", req, request=None, config=SimpleNamespace())
    result = asyncio.run(coro)

    assert result.suggestions == ["深度学习和机器学习的区别?", "常用框架有哪些?", "需要什么数学基础?"]
|
||||
|
||||
|
||||
def test_format_conversation_formats_roles():
|
||||
messages = [
|
||||
suggestions.SuggestionMessage(role="User", content="Hi"),
|
||||
|
||||
@ -289,7 +289,23 @@ models:
|
||||
# temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0]
|
||||
# supports_vision: true
|
||||
# supports_thinking: true
|
||||
# # MiniMax inlines its chain-of-thought into `content` as <think>...</think>
|
||||
# # (reasoning_split defaults to false), not in a separate reasoning_content
|
||||
# # field. Declare the thinking toggle so non-thinking paths (flash mode,
|
||||
# # follow-up suggestions, title/memory generation) truly disable reasoning
|
||||
# # instead of wasting tokens on — and parsing around — inline <think> blocks.
|
||||
# when_thinking_enabled:
|
||||
# extra_body:
|
||||
# thinking:
|
||||
# type: adaptive
|
||||
# when_thinking_disabled:
|
||||
# extra_body:
|
||||
# thinking:
|
||||
# type: disabled
|
||||
|
||||
# NOTE: M2.x models always think — passing thinking:{type:disabled} has no
|
||||
# effect (per MiniMax docs), so the toggle above is omitted for M2.7. The
|
||||
# follow-up-suggestions endpoint strips inline <think> defensively regardless.
|
||||
# - name: minimax-m2.7
|
||||
# display_name: MiniMax M2.7
|
||||
# use: langchain_openai:ChatOpenAI
|
||||
@ -331,7 +347,23 @@ models:
|
||||
# temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0]
|
||||
# supports_vision: true
|
||||
# supports_thinking: true
|
||||
# # MiniMax inlines its chain-of-thought into `content` as <think>...</think>
|
||||
# # (reasoning_split defaults to false), not in a separate reasoning_content
|
||||
# # field. Declare the thinking toggle so non-thinking paths (flash mode,
|
||||
# # follow-up suggestions, title/memory generation) truly disable reasoning
|
||||
# # instead of wasting tokens on — and parsing around — inline <think> blocks.
|
||||
# when_thinking_enabled:
|
||||
# extra_body:
|
||||
# thinking:
|
||||
# type: adaptive
|
||||
# when_thinking_disabled:
|
||||
# extra_body:
|
||||
# thinking:
|
||||
# type: disabled
|
||||
|
||||
# NOTE: M2.x models always think — passing thinking:{type:disabled} has no
|
||||
# effect (per MiniMax docs), so the toggle above is omitted for M2.7. The
|
||||
# follow-up-suggestions endpoint strips inline <think> defensively regardless.
|
||||
# - name: minimax-m2.7
|
||||
# display_name: MiniMax M2.7
|
||||
# use: langchain_openai:ChatOpenAI
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user