diff --git a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py index 58468765a..f1700e77e 100644 --- a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py +++ b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py @@ -10,10 +10,27 @@ from langchain_core.messages import HumanMessage from langgraph.runtime import Runtime from deerflow.config.paths import Paths, get_paths +from deerflow.utils.file_conversion import extract_outline logger = logging.getLogger(__name__) +def _extract_outline_for_file(file_path: Path) -> list[dict]: + """Return the document outline for *file_path* if a converted .md exists. + + Looks for a sibling ``.md`` file produced by the upload conversion + pipeline. Returns an empty list when the file is not a converted document + or when no headings are found. + """ + md_path = file_path.with_suffix(".md") + if not md_path.is_file(): + return [] + outline = extract_outline(md_path) + if outline: + logger.debug("Extracted %d outline entries from %s", len(outline), file_path.name) + return outline + + class UploadsMiddlewareState(AgentState): """State schema for uploads middleware.""" @@ -39,12 +56,31 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): super().__init__() self._paths = Paths(base_dir) if base_dir else get_paths() + def _format_file_entry(self, file: dict, lines: list[str]) -> None: + """Append a single file entry (name, size, path, optional outline) to lines.""" + size_kb = file["size"] / 1024 + size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" + lines.append(f"- {file['filename']} ({size_str})") + lines.append(f" Path: {file['path']}") + outline = file.get("outline") or [] + if outline: + truncated = outline[-1].get("truncated", False) if outline else False + visible = [e for e in outline if not e.get("truncated")] + lines.append(" Document 
outline (use `read_file` with line ranges to read sections):") + for entry in visible: + lines.append(f" L{entry['line']}: {entry['title']}") + if truncated: + lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)") + lines.append("") + def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str: """Create a formatted message listing uploaded files. Args: new_files: Files uploaded in the current message. historical_files: Files uploaded in previous messages. + Each file dict may contain an optional ``outline`` key — a list of + ``{title, line}`` dicts extracted from the converted Markdown file. Returns: Formatted string inside tags. @@ -55,23 +91,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): lines.append("") if new_files: for file in new_files: - size_kb = file["size"] / 1024 - size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" - lines.append(f"- {file['filename']} ({size_str})") - lines.append(f" Path: {file['path']}") - lines.append("") + self._format_file_entry(file, lines) else: lines.append("(empty)") + lines.append("") if historical_files: lines.append("The following files were uploaded in previous messages and are still available:") lines.append("") for file in historical_files: - size_kb = file["size"] / 1024 - size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" - lines.append(f"- {file['filename']} ({size_str})") - lines.append(f" Path: {file['path']}") - lines.append("") + self._format_file_entry(file, lines) lines.append("You can read these files using the `read_file` tool with the paths shown above.") lines.append("") @@ -172,9 +201,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): "size": stat.st_size, "path": f"/mnt/user-data/uploads/{file_path.name}", "extension": file_path.suffix, + "outline": _extract_outline_for_file(file_path), } ) + # Attach outlines to new files as 
# Regex for bold-only lines that look like section headings.
# Targets SEC filing structural headings that pymupdf4llm renders as **bold**
# rather than # Markdown headings (because they use the same font size as body
# text, distinguished only by bold+caps formatting).
#
# The pattern requires ALL of:
#   1. The entire line is a single **...** block (no surrounding prose).
#   2. It starts with a recognised structural keyword:
#        - ITEM / PART / SECTION (with optional number/letter after)
#        - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER
#      All-caps addresses and boilerplate ("CURRENT REPORT", "SIGNATURES",
#      "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded.
#
# Chinese headings (第三节...) are already captured as standard # headings
# by pymupdf4llm, so they don't need this pattern.
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")

# A heading title wrapped in one pair of ** markers, e.g. "**Overview**".
# Compiled once instead of being looked up for every heading line.
_WRAPPED_BOLD_RE = re.compile(r"\*\*(.+?)\*\*")

# Lines starting with one of these open/close a fenced code block.
_FENCE_MARKERS = ("```", "~~~")

# Maximum number of outline entries injected into the agent context.
# Keeps prompt size bounded even for very long documents.
MAX_OUTLINE_ENTRIES = 50


def _heading_title(stripped: str) -> str:
    """Return the heading title carried by *stripped*, or ``""`` if it is not a heading."""
    if stripped.startswith("#"):
        title = stripped.lstrip("#").strip()
        wrapped = _WRAPPED_BOLD_RE.fullmatch(title)
        # Unwrap "**Overview**" -> "Overview", but leave "**A** and **B**" alone:
        # there the lazy group would span both bold runs ("A** and **B").
        if wrapped and "**" not in wrapped.group(1):
            title = wrapped.group(1).strip()
        return title
    if m := _BOLD_HEADING_RE.match(stripped):
        return m.group(1).strip()
    return ""


def extract_outline(md_path: Path) -> list[dict]:
    """Extract document outline (headings) from a Markdown file.

    Recognises two heading styles:
      1. Standard Markdown headings: lines starting with one or more '#'.
      2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc.
         (see ``_BOLD_HEADING_RE``).

    Lines inside fenced code blocks are ignored, so a ``# comment`` in a code
    sample is not reported as a heading. Fence tracking is deliberately simple:
    a line starting with the same marker that opened the fence closes it.

    Args:
        md_path: Path to the .md file.

    Returns:
        List of dicts with keys: title (str), line (int, 1-based).
        When the document contains more than MAX_OUTLINE_ENTRIES headings, only
        the first MAX_OUTLINE_ENTRIES are returned and a sentinel entry
        ``{"truncated": True}`` is appended as the last element so callers can
        render a "showing first N headings" hint without re-scanning the file.
        A document with exactly MAX_OUTLINE_ENTRIES headings is not truncated
        and gets no sentinel.
        Returns an empty list if the file cannot be read or has no headings.
    """
    outline: list[dict] = []
    open_fence: str | None = None
    try:
        with md_path.open(encoding="utf-8") as f:
            for lineno, line in enumerate(f, 1):
                stripped = line.strip()
                if not stripped:
                    continue

                marker = stripped[:3]
                if marker in _FENCE_MARKERS:
                    if open_fence is None:
                        open_fence = marker
                    elif open_fence == marker:
                        open_fence = None
                    continue
                if open_fence is not None:
                    continue

                title = _heading_title(stripped)
                if not title:
                    continue

                # Flag truncation only once a heading beyond the cap is seen;
                # stopping right after the cap-th heading would mislabel
                # documents that have exactly MAX_OUTLINE_ENTRIES headings.
                if len(outline) >= MAX_OUTLINE_ENTRIES:
                    outline.append({"truncated": True})
                    break
                outline.append({"title": title, "line": lineno})
    except Exception as exc:
        # Best-effort by contract: callers get [] rather than an exception.
        logger.warning("Failed to extract outline from %s: %s", md_path.name, exc)
        return []

    return outline


def _get_pdf_converter() -> str:
    """Return the ``uploads.pdf_converter`` app-config setting, defaulting to ``"auto"``.

    Any failure to load or read the configuration falls back to ``"auto"``.
    """
    try:
        # Function-scope import kept from the original.
        # NOTE(review): presumably avoids a hard dependency or import cycle — confirm.
        from deerflow.config.app_config import get_app_config

        uploads_cfg = getattr(get_app_config(), "uploads", None)
        if uploads_cfg is not None:
            return str(getattr(uploads_cfg, "pdf_converter", "auto"))
    except Exception:
        logger.debug("Could not read uploads.pdf_converter from app config; using 'auto'", exc_info=True)
    return "auto"
def test_bold_sec_item_heading(self, tmp_path):
    """Bold-only ``**ITEM N. TITLE**`` lines from SEC filings become outline entries."""
    filing = tmp_path / "10k.md"
    text = "Cover page text.\n\n**ITEM 1. BUSINESS**\n\nBody.\n\n**ITEM 1A. RISK FACTORS**\n"
    filing.write_text(text, encoding="utf-8")

    entries = extract_outline(filing)

    # Exactly the two ITEM headings, reported with their 1-based line numbers.
    assert entries == [
        {"title": "ITEM 1. BUSINESS", "line": 3},
        {"title": "ITEM 1A. RISK FACTORS", "line": 7},
    ]
def test_outline_capped_at_max_entries(self, tmp_path):
    """A truncated outline holds MAX_OUTLINE_ENTRIES real entries followed by one sentinel."""
    heading_count = MAX_OUTLINE_ENTRIES + 10
    document = tmp_path / "long.md"
    document.write_text(
        "\n".join(f"# Heading {n}" for n in range(heading_count)),
        encoding="utf-8",
    )

    result = extract_outline(document)

    # The truncation sentinel closes the list ...
    assert result[-1] == {"truncated": True}
    # ... and everything that is not a sentinel is capped at the limit.
    real_entries = sum(1 for item in result if not item.get("truncated"))
    assert real_entries == MAX_OUTLINE_ENTRIES
def test_inline_bold_not_confused_with_heading(self, tmp_path):
    """Bold runs embedded in a sentence never produce outline entries."""
    paragraphs = (
        "This sentence has **bold words** inside it.",
        "Another with **MULTIPLE CAPS** inline.",
    )
    prose = tmp_path / "prose.md"
    prose.write_text("\n\n".join(paragraphs) + "\n", encoding="utf-8")

    assert extract_outline(prose) == []
def test_outline_truncation_hint_shown(self, tmp_path):
    """A truncated outline is followed by the "showing first N headings" hint."""
    from deerflow.utils.file_conversion import MAX_OUTLINE_ENTRIES

    middleware = _middleware(tmp_path)
    upload_root = _uploads_dir(tmp_path)
    (upload_root / "big.pdf").write_bytes(b"%PDF fake")
    # Five headings more than the cap, so truncation must kick in.
    total_headings = MAX_OUTLINE_ENTRIES + 5
    markdown = "\n".join(f"# Heading {i}" for i in range(total_headings))
    (upload_root / "big.md").write_text(markdown, encoding="utf-8")

    upload = {"filename": "big.pdf", "size": 9, "path": "/mnt/user-data/uploads/big.pdf"}
    message = _human("read", files=[upload])
    outcome = middleware.before_agent(self._state(message), _runtime())

    assert outcome is not None
    rendered = outcome["messages"][-1].content
    assert f"showing first {MAX_OUTLINE_ENTRIES} headings" in rendered
    assert "use `read_file` to explore further" in rendered
def test_historical_file_outline_injected(self, tmp_path):
    """Previously uploaded files also get their outline rendered."""
    middleware = _middleware(tmp_path)
    upload_root = _uploads_dir(tmp_path)

    # An earlier upload that has a converted Markdown sibling.
    (upload_root / "old_report.pdf").write_bytes(b"%PDF old")
    (upload_root / "old_report.md").write_text("# Chapter 1\n\n# Chapter 2\n", encoding="utf-8")
    # The upload attached to the current message has no Markdown sibling.
    (upload_root / "new.txt").write_bytes(b"new")

    attachment = {"filename": "new.txt", "size": 3, "path": "/mnt/user-data/uploads/new.txt"}
    message = _human("go", files=[attachment])
    outcome = middleware.before_agent(self._state(message), _runtime())

    assert outcome is not None
    rendered = outcome["messages"][-1].content
    for chapter in ("Chapter 1", "Chapter 2"):
        assert chapter in rendered