From 5ff230eafd29fd6dad8dd3ece58b3f3aba478ff9 Mon Sep 17 00:00:00 2001 From: SHIYAO ZHANG <834247613@qq.com> Date: Fri, 3 Apr 2026 20:52:47 +0800 Subject: [PATCH] feat(uploads): inject document outline into agent context for converted files (#1738) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(uploads): inject document outline into agent context for converted files Extract headings from converted .md files and inject them into the context block so the agent can navigate large documents by line number before reading. - Add `extract_outline()` to `file_conversion.py`: recognises standard Markdown headings (#/##/###) and SEC-style bold structural headings (**ITEM N. BUSINESS**, **PART II**); caps at 50 entries; excludes cover-page boilerplate (WASHINGTON DC, CURRENT REPORT, SIGNATURES) - Add `_extract_outline_for_file()` helper in `uploads_middleware.py`: looks for a sibling `.md` file produced by the conversion pipeline - Update `UploadsMiddleware._create_files_message()` to render the outline under each file entry with `L{line}: {title}` format and a `read_file` prompt for range-based reading - Tests: 10 new tests for `extract_outline()`, 4 new tests for outline injection in `UploadsMiddleware`; existing test updated for new `outline` field in `uploaded_files` state Partially addresses #1647 (agent ignores uploaded files). * fix(uploads): stream outline file reads and strip inline bold from heading titles - Switch extract_outline() from read_text().splitlines() to open()+line iteration so large converted documents are not loaded into memory on every agent turn; exits as soon as MAX_OUTLINE_ENTRIES is reached (Copilot suggestion) - Strip **...** wrapper from standard Markdown heading titles before appending to outline so agent context stays clean (e.g. "## **Overview**" → "Overview") (Copilot suggestion) - Remove unused pathlib.Path import and fix import sort order in test_file_conversion.py to satisfy ruff CI lint * fix(uploads): show truncation hint when outline exceeds MAX_OUTLINE_ENTRIES When extract_outline() hits the cap it now appends a sentinel entry {"truncated": True} instead of silently dropping the rest of the headings. UploadsMiddleware reads the sentinel and renders a hint line: ... (showing first 50 headings; use `read_file` to explore further) Without this the agent had no way to know the outline was incomplete and would treat the first 50 headings as the full document structure. * fix(uploads): fall back to configurable.thread_id when runtime.context lacks thread_id runtime.context does not always carry thread_id (depends on LangGraph invocation path). ThreadDataMiddleware already falls back to get_config().configurable.thread_id — apply the same pattern so UploadsMiddleware can resolve the uploads directory and attach outlines in all invocation paths. * style: apply ruff format --------- Co-authored-by: Willem Jiang --- .../agents/middlewares/uploads_middleware.py | 56 ++++++-- .../harness/deerflow/utils/file_conversion.py | 88 ++++++++++++ backend/tests/test_file_conversion.py | 130 ++++++++++++++++++ .../test_uploads_middleware_core_logic.py | 90 ++++++++++++ 4 files changed, 354 insertions(+), 10 deletions(-) create mode 100644 backend/tests/test_file_conversion.py diff --git a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py index 58468765a..f1700e77e 100644 --- a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py +++ b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py @@ -10,10 +10,27 @@ from langchain_core.messages import HumanMessage from langgraph.runtime import Runtime from deerflow.config.paths import Paths, get_paths +from deerflow.utils.file_conversion import extract_outline logger = logging.getLogger(__name__) +def _extract_outline_for_file(file_path: Path) -> list[dict]: + """Return the document outline for *file_path* if a converted .md exists. + + Looks for a sibling ``.md`` file produced by the upload conversion + pipeline. Returns an empty list when the file is not a converted document + or when no headings are found. + """ + md_path = file_path.with_suffix(".md") + if not md_path.is_file(): + return [] + outline = extract_outline(md_path) + if outline: + logger.debug("Extracted %d outline entries from %s", len(outline), file_path.name) + return outline + + class UploadsMiddlewareState(AgentState): """State schema for uploads middleware.""" @@ -39,12 +56,31 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): super().__init__() self._paths = Paths(base_dir) if base_dir else get_paths() + def _format_file_entry(self, file: dict, lines: list[str]) -> None: + """Append a single file entry (name, size, path, optional outline) to lines.""" + size_kb = file["size"] / 1024 + size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" + lines.append(f"- {file['filename']} ({size_str})") + lines.append(f" Path: {file['path']}") + outline = file.get("outline") or [] + if outline: + truncated = outline[-1].get("truncated", False) if outline else False + visible = [e for e in outline if not e.get("truncated")] + lines.append(" Document outline (use `read_file` with line ranges to read sections):") + for entry in visible: + lines.append(f" L{entry['line']}: {entry['title']}") + if truncated: + lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)") + lines.append("") + def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str: """Create a formatted message listing uploaded files. Args: new_files: Files uploaded in the current message. historical_files: Files uploaded in previous messages. + Each file dict may contain an optional ``outline`` key — a list of + ``{title, line}`` dicts extracted from the converted Markdown file. Returns: Formatted string inside tags. @@ -55,23 +91,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): lines.append("") if new_files: for file in new_files: - size_kb = file["size"] / 1024 - size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" - lines.append(f"- {file['filename']} ({size_str})") - lines.append(f" Path: {file['path']}") - lines.append("") + self._format_file_entry(file, lines) else: lines.append("(empty)") + lines.append("") if historical_files: lines.append("The following files were uploaded in previous messages and are still available:") lines.append("") for file in historical_files: - size_kb = file["size"] / 1024 - size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" - lines.append(f"- {file['filename']} ({size_str})") - lines.append(f" Path: {file['path']}") - lines.append("") + self._format_file_entry(file, lines) lines.append("You can read these files using the `read_file` tool with the paths shown above.") lines.append("") @@ -172,9 +201,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): "size": stat.st_size, "path": f"/mnt/user-data/uploads/{file_path.name}", "extension": file_path.suffix, + "outline": _extract_outline_for_file(file_path), } ) + # Attach outlines to new files as well + if uploads_dir: + for file in new_files: + phys_path = uploads_dir / file["filename"] + file["outline"] = _extract_outline_for_file(phys_path) + if not new_files and not historical_files: return None diff --git a/backend/packages/harness/deerflow/utils/file_conversion.py b/backend/packages/harness/deerflow/utils/file_conversion.py index 45cdf1210..eae56599e 100644 --- a/backend/packages/harness/deerflow/utils/file_conversion.py +++ b/backend/packages/harness/deerflow/utils/file_conversion.py @@ -5,6 +5,7 @@ No FastAPI or HTTP dependencies — pure utility functions. """ import logging +import re from pathlib import Path logger = logging.getLogger(__name__) @@ -45,3 +46,90 @@ async def convert_file_to_markdown(file_path: Path) -> Path | None: except Exception as e: logger.error(f"Failed to convert {file_path.name} to markdown: {e}") return None + + +# Regex for bold-only lines that look like section headings. +# Targets SEC filing structural headings that pymupdf4llm renders as **bold** +# rather than # Markdown headings (because they use same font size as body text, +# distinguished only by bold+caps formatting). +# +# Pattern requires ALL of: +# 1. Entire line is a single **...** block (no surrounding prose) +# 2. Starts with a recognised structural keyword: +# - ITEM / PART / SECTION (with optional number/letter after) +# - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER +# All-caps addresses, boilerplate ("CURRENT REPORT", "SIGNATURES", +# "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded. +# +# Chinese headings (第三节...) are already captured as standard # headings +# by pymupdf4llm, so they don't need this pattern. +_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$") + +# Maximum number of outline entries injected into the agent context. +# Keeps prompt size bounded even for very long documents. +MAX_OUTLINE_ENTRIES = 50 + + +def extract_outline(md_path: Path) -> list[dict]: + """Extract document outline (headings) from a Markdown file. + + Recognises two heading styles produced by pymupdf4llm: + 1. Standard Markdown headings: lines starting with one or more '#' + 2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc. + (SEC filings use bold+caps for section headings with the same font size + as body text, so pymupdf4llm cannot promote them to # headings) + + Args: + md_path: Path to the .md file. + + Returns: + List of dicts with keys: title (str), line (int, 1-based). + When the outline is truncated at MAX_OUTLINE_ENTRIES, a sentinel entry + ``{"truncated": True}`` is appended as the last element so callers can + render a "showing first N headings" hint without re-scanning the file. + Returns an empty list if the file cannot be read or has no headings. + """ + outline: list[dict] = [] + try: + with md_path.open(encoding="utf-8") as f: + for lineno, line in enumerate(f, 1): + stripped = line.strip() + if not stripped: + continue + + # Style 1: standard Markdown heading + if stripped.startswith("#"): + title = stripped.lstrip("#").strip() + # Strip any inline **...** wrapping (e.g. "## **Overview**" → "Overview") + if title: + if m2 := re.fullmatch(r"\*\*(.+?)\*\*", title): + title = m2.group(1).strip() + outline.append({"title": title, "line": lineno}) + + # Style 2: bold-only line (entire line is **...**) + elif m := _BOLD_HEADING_RE.match(stripped): + title = m.group(1).strip() + if title: + outline.append({"title": title, "line": lineno}) + + if len(outline) >= MAX_OUTLINE_ENTRIES: + outline.append({"truncated": True}) + break + except Exception: + return [] + + return outline + + +def _get_pdf_converter() -> str: + """Read pdf_converter setting from app config, defaulting to 'auto'.""" + try: + from deerflow.config.app_config import get_app_config + + cfg = get_app_config() + uploads_cfg = getattr(cfg, "uploads", None) + if uploads_cfg is not None: + return str(getattr(uploads_cfg, "pdf_converter", "auto")) + except Exception: + pass + return "auto" diff --git a/backend/tests/test_file_conversion.py b/backend/tests/test_file_conversion.py new file mode 100644 index 000000000..72b13edc6 --- /dev/null +++ b/backend/tests/test_file_conversion.py @@ -0,0 +1,130 @@ +"""Tests for extract_outline() in file_conversion utilities (PR2: document outline injection).""" + +from __future__ import annotations + +from deerflow.utils.file_conversion import ( + MAX_OUTLINE_ENTRIES, + extract_outline, +) + +# --------------------------------------------------------------------------- +# extract_outline +# --------------------------------------------------------------------------- + + +class TestExtractOutline: + """Tests for extract_outline().""" + + def test_empty_file_returns_empty(self, tmp_path): + """Empty markdown file yields no outline entries.""" + md = tmp_path / "empty.md" + md.write_text("", encoding="utf-8") + assert extract_outline(md) == [] + + def test_missing_file_returns_empty(self, tmp_path): + """Non-existent path returns [] without raising.""" + assert extract_outline(tmp_path / "nonexistent.md") == [] + + def test_standard_markdown_headings(self, tmp_path): + """# / ## / ### headings are all recognised.""" + md = tmp_path / "doc.md" + md.write_text( + "# Chapter One\n\nSome text.\n\n## Section 1.1\n\nMore text.\n\n### Sub 1.1.1\n", + encoding="utf-8", + ) + outline = extract_outline(md) + assert len(outline) == 3 + assert outline[0] == {"title": "Chapter One", "line": 1} + assert outline[1] == {"title": "Section 1.1", "line": 5} + assert outline[2] == {"title": "Sub 1.1.1", "line": 9} + + def test_bold_sec_item_heading(self, tmp_path): + """**ITEM N. TITLE** lines in SEC filings are recognised.""" + md = tmp_path / "10k.md" + md.write_text( + "Cover page text.\n\n**ITEM 1. BUSINESS**\n\nBody.\n\n**ITEM 1A. RISK FACTORS**\n", + encoding="utf-8", + ) + outline = extract_outline(md) + assert len(outline) == 2 + assert outline[0] == {"title": "ITEM 1. BUSINESS", "line": 3} + assert outline[1] == {"title": "ITEM 1A. RISK FACTORS", "line": 7} + + def test_bold_part_heading(self, tmp_path): + """**PART I** / **PART II** headings are recognised.""" + md = tmp_path / "10k.md" + md.write_text("**PART I**\n\n**PART II**\n\n**PART III**\n", encoding="utf-8") + outline = extract_outline(md) + assert len(outline) == 3 + titles = [e["title"] for e in outline] + assert "PART I" in titles + assert "PART II" in titles + assert "PART III" in titles + + def test_sec_cover_page_boilerplate_excluded(self, tmp_path): + """Address lines and short cover boilerplate must NOT appear in outline.""" + md = tmp_path / "8k.md" + md.write_text( + "## **UNITED STATES SECURITIES AND EXCHANGE COMMISSION**\n\n**WASHINGTON, DC 20549**\n\n**CURRENT REPORT**\n\n**SIGNATURES**\n\n**TESLA, INC.**\n\n**ITEM 2.02. RESULTS OF OPERATIONS**\n", + encoding="utf-8", + ) + outline = extract_outline(md) + titles = [e["title"] for e in outline] + # Cover-page boilerplate should be excluded + assert "WASHINGTON, DC 20549" not in titles + assert "CURRENT REPORT" not in titles + assert "SIGNATURES" not in titles + assert "TESLA, INC." not in titles + # Real SEC heading must be included + assert "ITEM 2.02. RESULTS OF OPERATIONS" in titles + + def test_chinese_headings_via_standard_markdown(self, tmp_path): + """Chinese annual report headings emitted as # by pymupdf4llm are captured.""" + md = tmp_path / "annual.md" + md.write_text( + "# 第一节 公司简介\n\n内容。\n\n## 第三节 管理层讨论与分析\n\n分析内容。\n", + encoding="utf-8", + ) + outline = extract_outline(md) + assert len(outline) == 2 + assert outline[0]["title"] == "第一节 公司简介" + assert outline[1]["title"] == "第三节 管理层讨论与分析" + + def test_outline_capped_at_max_entries(self, tmp_path): + """When truncated, result has MAX_OUTLINE_ENTRIES real entries + 1 sentinel.""" + lines = [f"# Heading {i}" for i in range(MAX_OUTLINE_ENTRIES + 10)] + md = tmp_path / "long.md" + md.write_text("\n".join(lines), encoding="utf-8") + outline = extract_outline(md) + # Last entry is the truncation sentinel + assert outline[-1] == {"truncated": True} + # Visible entries are exactly MAX_OUTLINE_ENTRIES + visible = [e for e in outline if not e.get("truncated")] + assert len(visible) == MAX_OUTLINE_ENTRIES + + def test_no_truncation_sentinel_when_under_limit(self, tmp_path): + """Short documents produce no sentinel entry.""" + lines = [f"# Heading {i}" for i in range(5)] + md = tmp_path / "short.md" + md.write_text("\n".join(lines), encoding="utf-8") + outline = extract_outline(md) + assert len(outline) == 5 + assert not any(e.get("truncated") for e in outline) + + def test_blank_lines_and_whitespace_ignored(self, tmp_path): + """Blank lines between headings do not produce empty entries.""" + md = tmp_path / "spaced.md" + md.write_text("\n\n# Title One\n\n\n\n# Title Two\n\n", encoding="utf-8") + outline = extract_outline(md) + assert len(outline) == 2 + assert all(e["title"] for e in outline) + + def test_inline_bold_not_confused_with_heading(self, tmp_path): + """Mid-sentence bold text must not be mistaken for a heading.""" + md = tmp_path / "prose.md" + md.write_text( + "This sentence has **bold words** inside it.\n\nAnother with **MULTIPLE CAPS** inline.\n", + encoding="utf-8", + ) + outline = extract_outline(md) + assert outline == [] diff --git a/backend/tests/test_uploads_middleware_core_logic.py b/backend/tests/test_uploads_middleware_core_logic.py index e69f80978..ebc9ab071 100644 --- a/backend/tests/test_uploads_middleware_core_logic.py +++ b/backend/tests/test_uploads_middleware_core_logic.py @@ -289,6 +289,7 @@ class TestBeforeAgent: "size": 5, "path": "/mnt/user-data/uploads/notes.txt", "extension": ".txt", + "outline": [], } ] @@ -339,3 +340,92 @@ class TestBeforeAgent: result = mw.before_agent(self._state(msg), _runtime()) assert result["messages"][-1].id == "original-id-42" + + def test_outline_injected_when_md_file_exists(self, tmp_path): + """When a converted .md file exists alongside the upload, its outline is injected.""" + mw = _middleware(tmp_path) + uploads_dir = _uploads_dir(tmp_path) + (uploads_dir / "report.pdf").write_bytes(b"%PDF fake") + # Simulate the .md produced by the conversion pipeline + (uploads_dir / "report.md").write_text( + "# PART I\n\n## ITEM 1. BUSINESS\n\nBody text.\n\n## ITEM 2. RISK\n", + encoding="utf-8", + ) + + msg = _human("summarise", files=[{"filename": "report.pdf", "size": 9, "path": "/mnt/user-data/uploads/report.pdf"}]) + result = mw.before_agent(self._state(msg), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert "Document outline" in content + assert "PART I" in content + assert "ITEM 1. BUSINESS" in content + assert "ITEM 2. RISK" in content + assert "read_file" in content + + def test_no_outline_when_no_md_file(self, tmp_path): + """Files without a sibling .md have no outline section.""" + mw = _middleware(tmp_path) + uploads_dir = _uploads_dir(tmp_path) + (uploads_dir / "data.xlsx").write_bytes(b"fake-xlsx") + + msg = _human("analyse", files=[{"filename": "data.xlsx", "size": 9, "path": "/mnt/user-data/uploads/data.xlsx"}]) + result = mw.before_agent(self._state(msg), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert "Document outline" not in content + + def test_outline_truncation_hint_shown(self, tmp_path): + """When outline is truncated, a hint line is appended after the last visible entry.""" + from deerflow.utils.file_conversion import MAX_OUTLINE_ENTRIES + + mw = _middleware(tmp_path) + uploads_dir = _uploads_dir(tmp_path) + (uploads_dir / "big.pdf").write_bytes(b"%PDF fake") + # Write MAX_OUTLINE_ENTRIES + 5 headings so truncation is triggered + headings = "\n".join(f"# Heading {i}" for i in range(MAX_OUTLINE_ENTRIES + 5)) + (uploads_dir / "big.md").write_text(headings, encoding="utf-8") + + msg = _human("read", files=[{"filename": "big.pdf", "size": 9, "path": "/mnt/user-data/uploads/big.pdf"}]) + result = mw.before_agent(self._state(msg), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert f"showing first {MAX_OUTLINE_ENTRIES} headings" in content + assert "use `read_file` to explore further" in content + + def test_no_truncation_hint_for_short_outline(self, tmp_path): + """Short outlines (under the cap) must not show a truncation hint.""" + mw = _middleware(tmp_path) + uploads_dir = _uploads_dir(tmp_path) + (uploads_dir / "short.pdf").write_bytes(b"%PDF fake") + (uploads_dir / "short.md").write_text("# Intro\n\n# Conclusion\n", encoding="utf-8") + + msg = _human("read", files=[{"filename": "short.pdf", "size": 9, "path": "/mnt/user-data/uploads/short.pdf"}]) + result = mw.before_agent(self._state(msg), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert "showing first" not in content + + def test_historical_file_outline_injected(self, tmp_path): + """Outline is also shown for historical (previously uploaded) files.""" + mw = _middleware(tmp_path) + uploads_dir = _uploads_dir(tmp_path) + # Historical file with .md + (uploads_dir / "old_report.pdf").write_bytes(b"%PDF old") + (uploads_dir / "old_report.md").write_text( + "# Chapter 1\n\n# Chapter 2\n", + encoding="utf-8", + ) + # New file without .md + (uploads_dir / "new.txt").write_bytes(b"new") + + msg = _human("go", files=[{"filename": "new.txt", "size": 3, "path": "/mnt/user-data/uploads/new.txt"}]) + result = mw.before_agent(self._state(msg), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert "Chapter 1" in content + assert "Chapter 2" in content