From 3b105d1e5f493932c6541d5afb7279e8c6517006 Mon Sep 17 00:00:00 2001 From: DanielWalnut <45447813+hetaoBackend@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:48:00 +0800 Subject: [PATCH] fix(suggestions): strip inline reasoning before parsing follow-up questions (#3435) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reasoning models such as MiniMax-M3 inline their chain-of-thought into the message content as <think>...</think> (reasoning_split defaults to false) instead of a separate reasoning_content field. The follow-up-suggestions endpoint extracted the JSON array via find('[') / rfind(']'), which silently broke whenever the reasoning text contained '[' or ']' — or when long thinking hit max_tokens and truncated before the array was emitted — returning empty suggestions. - Add _strip_think_blocks() and apply it before JSON extraction; it removes complete <think>...</think> blocks (case-insensitive) and drops an unclosed <think> left by max_tokens truncation. - Document the MiniMax thinking toggle in config.example.yaml (when_thinking_enabled: adaptive / when_thinking_disabled: disabled) so thinking_enabled=False actually disables reasoning on M3; note that M2.x models always think and rely on the defensive strip above. - Tests cover complete/unclosed think blocks, brackets-inside-think, think + code-fence, and an end-to-end suggestions case reproducing the empty-result bug. Co-authored-by: Claude Opus 4.8 (1M context) --- backend/CLAUDE.md | 2 +- backend/app/gateway/routers/suggestions.py | 29 +++++++++++- backend/tests/test_suggestions_router.py | 54 ++++++++++++++++++++++ config.example.yaml | 32 +++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index caa36f579..8490d8644 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -263,7 +263,7 @@ CORS is same-origin by default when requests enter through nginx on port 2026. 
S | **Uploads** (`/api/threads/{id}/uploads`) | `POST /` - upload files (auto-converts PDF/PPT/Excel/Word); `GET /list` - list; `DELETE /{filename}` - delete | | **Threads** (`/api/threads/{id}`) | `DELETE /` - remove DeerFlow-managed local thread data after LangGraph thread deletion; unexpected failures are logged server-side and return a generic 500 detail | | **Artifacts** (`/api/threads/{id}/artifacts`) | `GET /{path}` - serve artifacts; active content types (`text/html`, `application/xhtml+xml`, `image/svg+xml`) are always forced as download attachments to reduce XSS risk; `?download=true` still forces download for other file types | -| **Suggestions** (`/api/threads/{id}/suggestions`) | `POST /` - generate follow-up questions; rich list/block model content is normalized before JSON parsing | +| **Suggestions** (`/api/threads/{id}/suggestions`) | `POST /` - generate follow-up questions; rich list/block model content is normalized and inline reasoning (`<think>...</think>`, including unclosed/truncated blocks from reasoning models like MiniMax-M3) is stripped before JSON parsing | | **Thread Runs** (`/api/threads/{id}/runs`) | `POST /` - create background run; `POST /stream` - create + SSE stream; `POST /wait` - create + block; `GET /` - list runs; `GET /{rid}` - run details; `POST /{rid}/cancel` - cancel; `GET /{rid}/join` - join SSE; `GET /{rid}/messages` - paginated messages `{data, has_more}`; `GET /{rid}/events` - full event stream; `GET /../messages` - thread messages with feedback; `GET /../token-usage` - aggregate tokens | | **Feedback** (`/api/threads/{id}/runs/{rid}/feedback`) | `PUT /` - upsert feedback; `DELETE /` - delete user feedback; `POST /` - create feedback; `GET /` - list feedback; `GET /stats` - aggregate stats; `DELETE /{fid}` - delete specific | | **Runs** (`/api/runs`) | `POST /stream` - stateless run + SSE; `POST /wait` - stateless run + block; `GET /{rid}/messages` - paginated messages by run_id `{data, has_more}` (cursor: `after_seq`/`before_seq`); 
`GET /{rid}/feedback` - list feedback by run_id | diff --git a/backend/app/gateway/routers/suggestions.py b/backend/app/gateway/routers/suggestions.py index 56d99b9d3..39f7d250a 100644 --- a/backend/app/gateway/routers/suggestions.py +++ b/backend/app/gateway/routers/suggestions.py @@ -1,5 +1,6 @@ import json import logging +import re from fastapi import APIRouter, Depends, Request from langchain_core.messages import HumanMessage, SystemMessage @@ -30,6 +31,31 @@ class SuggestionsResponse(BaseModel): suggestions: list[str] = Field(default_factory=list, description="Suggested follow-up questions") +# Matches a complete ... block (case-insensitive, spans newlines). +_THINK_BLOCK_RE = re.compile(r"]*>.*?", re.IGNORECASE | re.DOTALL) +# Matches a dangling, unclosed (model truncated at max_tokens mid-thought). +_OPEN_THINK_RE = re.compile(r"]*>", re.IGNORECASE) + + +def _strip_think_blocks(text: str) -> str: + """Remove reasoning-model ``...`` blocks from the response. + + Reasoning models such as MiniMax-M3 inline their chain-of-thought into the + message ``content`` wrapped in ``...`` (``reasoning_split`` + defaults to false), rather than exposing a separate ``reasoning_content`` + field. The thinking text frequently contains ``[`` / ``]`` characters, which + corrupted the downstream ``find('[')`` / ``rfind(']')`` JSON extraction and + produced empty suggestions. We strip the reasoning before parsing so only + the actual answer remains. + """ + text = _THINK_BLOCK_RE.sub("", text) + # Drop any unclosed (and everything after it) left by truncation. 
+ open_match = _OPEN_THINK_RE.search(text) + if open_match: + text = text[: open_match.start()] + return text.strip() + + def _strip_markdown_code_fence(text: str) -> str: stripped = text.strip() if not stripped.startswith("```"): @@ -41,7 +67,8 @@ def _strip_markdown_code_fence(text: str) -> str: def _parse_json_string_list(text: str) -> list[str] | None: - candidate = _strip_markdown_code_fence(text) + candidate = _strip_think_blocks(text) + candidate = _strip_markdown_code_fence(candidate) start = candidate.find("[") end = candidate.rfind("]") if start == -1 or end == -1 or end <= start: diff --git a/backend/tests/test_suggestions_router.py b/backend/tests/test_suggestions_router.py index 0058e4588..bd0a998ff 100644 --- a/backend/tests/test_suggestions_router.py +++ b/backend/tests/test_suggestions_router.py @@ -25,6 +25,60 @@ def test_parse_json_string_list_rejects_non_list(): assert suggestions._parse_json_string_list(text) is None +def test_strip_think_blocks_removes_complete_block(): + text = "\nreasoning here\n\nanswer" + assert suggestions._strip_think_blocks(text) == "answer" + + +def test_strip_think_blocks_is_case_insensitive(): + text = "reasoning\nanswer" + assert suggestions._strip_think_blocks(text) == "answer" + + +def test_strip_think_blocks_drops_unclosed_block(): + # Reasoning models truncated at max_tokens emit an unclosed . + text = "\nreasoning that never finished because tokens ran out" + assert suggestions._strip_think_blocks(text) == "" + + +def test_strip_think_blocks_keeps_text_without_think(): + text = '["a", "b"]' + assert suggestions._strip_think_blocks(text) == '["a", "b"]' + + +def test_parse_json_string_list_ignores_brackets_inside_think_block(): + # MiniMax-M3 inlines its chain-of-thought as ... in content + # (reasoning_split=false). When that reasoning contains '[' / ']', the old + # find('[')/rfind(']') logic grabbed the wrong span and parsing failed. + text = '\nMaybe a list like ["x", "y"] could work. 
Let me craft 3.\n\n["Q1", "Q2", "Q3"]' + assert suggestions._parse_json_string_list(text) == ["Q1", "Q2", "Q3"] + + +def test_parse_json_string_list_strips_think_then_code_fence(): + text = 'reasoning\n```json\n["Q1", "Q2"]\n```' + assert suggestions._parse_json_string_list(text) == ["Q1", "Q2"] + + +def test_generate_suggestions_strips_inline_think_block(monkeypatch): + # End-to-end: model returns thinking inline followed by the JSON array. + req = suggestions.SuggestionsRequest( + messages=[ + suggestions.SuggestionMessage(role="user", content="介绍深度学习"), + suggestions.SuggestionMessage(role="assistant", content="深度学习是机器学习的分支。"), + ], + n=3, + model_name=None, + ) + content = '\nThe user asked about deep learning. Options: maybe [1] frameworks, [2] math basics.\n\n["深度学习和机器学习的区别?", "常用框架有哪些?", "需要什么数学基础?"]' + fake_model = MagicMock() + fake_model.ainvoke = AsyncMock(return_value=MagicMock(content=content)) + monkeypatch.setattr(suggestions, "create_chat_model", lambda **kwargs: fake_model) + + result = asyncio.run(suggestions.generate_suggestions.__wrapped__("t1", req, request=None, config=SimpleNamespace())) + + assert result.suggestions == ["深度学习和机器学习的区别?", "常用框架有哪些?", "需要什么数学基础?"] + + def test_format_conversation_formats_roles(): messages = [ suggestions.SuggestionMessage(role="User", content="Hi"), diff --git a/config.example.yaml b/config.example.yaml index 0a0026d87..99752cf5e 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -289,7 +289,23 @@ models: # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] # supports_vision: true # supports_thinking: true + # # MiniMax inlines its chain-of-thought into `content` as ... + # # (reasoning_split defaults to false), not in a separate reasoning_content + # # field. Declare the thinking toggle so non-thinking paths (flash mode, + # # follow-up suggestions, title/memory generation) truly disable reasoning + # # instead of wasting tokens on — and parsing around — inline blocks. 
+ # when_thinking_enabled: + # extra_body: + # thinking: + # type: adaptive + # when_thinking_disabled: + # extra_body: + # thinking: + # type: disabled + # NOTE: M2.x models always think — passing thinking:{type:disabled} has no + # effect (per MiniMax docs), so the toggle above is omitted for M2.7. The + # follow-up-suggestions endpoint strips inline <think> defensively regardless. # - name: minimax-m2.7 # display_name: MiniMax M2.7 # use: langchain_openai:ChatOpenAI @@ -331,7 +347,23 @@ models: # temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0] # supports_vision: true # supports_thinking: true + # # MiniMax inlines its chain-of-thought into `content` as <think>...</think> + # # (reasoning_split defaults to false), not in a separate reasoning_content + # # field. Declare the thinking toggle so non-thinking paths (flash mode, + # # follow-up suggestions, title/memory generation) truly disable reasoning + # # instead of wasting tokens on — and parsing around — inline <think> blocks. + # when_thinking_enabled: + # extra_body: + # thinking: + # type: adaptive + # when_thinking_disabled: + # extra_body: + # thinking: + # type: disabled + # NOTE: M2.x models always think — passing thinking:{type:disabled} has no + # effect (per MiniMax docs), so the toggle above is omitted for M2.7. The + # follow-up-suggestions endpoint strips inline <think> defensively regardless. # - name: minimax-m2.7 # display_name: MiniMax M2.7 # use: langchain_openai:ChatOpenAI