diff --git a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py
index a0c48969c..78c9a7b7b 100644
--- a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py
+++ b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py
@@ -15,20 +15,54 @@ from deerflow.utils.file_conversion import extract_outline
 logger = logging.getLogger(__name__)
 
 
-def _extract_outline_for_file(file_path: Path) -> list[dict]:
-    """Return the document outline for *file_path* if a converted .md exists.
+# Limits for the fallback preview.  Converted PDFs without headings are often
+# a handful of very long paragraphs, so cap the line length as well as the count.
+_OUTLINE_PREVIEW_LINES = 5
+_OUTLINE_PREVIEW_MAX_CHARS = 200
+
+
+def _extract_outline_for_file(file_path: Path) -> tuple[list[dict], list[str]]:
+    """Return the document outline and fallback preview for *file_path*.
 
     Looks for a sibling ``.md`` file produced by the upload conversion
-    pipeline. Returns an empty list when the file is not a converted document
-    or when no headings are found.
+    pipeline.
+
+    Returns:
+        (outline, preview) where:
+        - outline: list of ``{title, line}`` dicts (plus optional sentinel).
+          Empty when no headings are found or no .md exists.
+        - preview: first few non-empty lines of the .md (each capped at
+          ``_OUTLINE_PREVIEW_MAX_CHARS`` characters), used as a content
+          anchor when outline is empty so the agent has some context.
+          Empty when outline is non-empty (no fallback needed).
     """
     md_path = file_path.with_suffix(".md")
     if not md_path.is_file():
-        return []
+        return [], []
+
     outline = extract_outline(md_path)
     if outline:
         logger.debug("Extracted %d outline entries from %s", len(outline), file_path.name)
-    return outline
+        return outline, []
+
+    # outline is empty — read the first few non-empty lines as a content preview
+    preview: list[str] = []
+    try:
+        with md_path.open(encoding="utf-8") as f:
+            for line in f:
+                stripped = line.strip()
+                if not stripped:
+                    continue
+                if len(stripped) > _OUTLINE_PREVIEW_MAX_CHARS:
+                    stripped = stripped[:_OUTLINE_PREVIEW_MAX_CHARS].rstrip() + "…"
+                preview.append(stripped)
+                if len(preview) >= _OUTLINE_PREVIEW_LINES:
+                    break
+    except (OSError, UnicodeDecodeError):
+        # Best effort: the preview is only a hint, so an unreadable .md must not
+        # break the upload message.
+        logger.debug("Failed to read preview lines from %s", md_path, exc_info=True)
+    return [], preview
 
 
 class UploadsMiddlewareState(AgentState):
@@ -64,13 +98,20 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
             lines.append(f" Path: {file['path']}")
             outline = file.get("outline") or []
             if outline:
-                truncated = outline[-1].get("truncated", False) if outline else False
+                truncated = outline[-1].get("truncated", False)
                 visible = [e for e in outline if not e.get("truncated")]
                 lines.append(" Document outline (use `read_file` with line ranges to read sections):")
                 for entry in visible:
                     lines.append(f" L{entry['line']}: {entry['title']}")
                 if truncated:
                     lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)")
+            else:
+                preview = file.get("outline_preview") or []
+                if preview:
+                    lines.append(" No structural headings detected. Document begins with:")
+                    for text in preview:
+                        lines.append(f" > {text}")
+                lines.append(" Use `grep` to search for keywords (e.g. `grep(pattern='keyword', path='/mnt/user-data/uploads/')`).")
             lines.append("")
 
     def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
@@ -201,13 +242,15 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
             for file_path in sorted(uploads_dir.iterdir()):
                 if file_path.is_file() and file_path.name not in new_filenames:
                     stat = file_path.stat()
+                    outline, preview = _extract_outline_for_file(file_path)
                     historical_files.append(
                         {
                             "filename": file_path.name,
                             "size": stat.st_size,
                             "path": f"/mnt/user-data/uploads/{file_path.name}",
                             "extension": file_path.suffix,
-                            "outline": _extract_outline_for_file(file_path),
+                            "outline": outline,
+                            "outline_preview": preview,
                         }
                     )
 
@@ -215,7 +258,9 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
         if uploads_dir:
             for file in new_files:
                 phys_path = uploads_dir / file["filename"]
-                file["outline"] = _extract_outline_for_file(phys_path)
+                outline, preview = _extract_outline_for_file(phys_path)
+                file["outline"] = outline
+                file["outline_preview"] = preview
 
         if not new_files and not historical_files:
             return None
diff --git a/backend/packages/harness/deerflow/utils/file_conversion.py b/backend/packages/harness/deerflow/utils/file_conversion.py
index 9a180883d..68755b675 100644
--- a/backend/packages/harness/deerflow/utils/file_conversion.py
+++ b/backend/packages/harness/deerflow/utils/file_conversion.py
@@ -182,6 +182,19 @@ async def convert_file_to_markdown(file_path: Path) -> Path | None:
 # by pymupdf4llm, so they don't need this pattern.
 _BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")
 
+# Regex for split-bold headings produced by pymupdf4llm when a heading spans
+# multiple text spans in the PDF (e.g. section number and title are separate spans).
+# Matches lines like: **1** **Introduction** or **3.2** **Multi-Head Attention**
+# Requirements:
+#   1. Entire line consists only of **...** blocks separated by whitespace (no prose)
+#   2. First block is a section number: one digit or capital letter, then digits/dots (e.g. "1", "3.2", "A.1")
+#   3. Second block must not be purely numeric/punctuation — excludes financial table
+#      headers like **2023** **2022** **2021** while allowing non-ASCII titles such as
+#      **1** **概述** or accented words (negative lookahead instead of [A-Za-z])
+#   4. At most two additional blocks (four total) with [^*]+ (no * inside) to keep
+#      the regex linear and avoid ReDoS on attacker-controlled content
+_SPLIT_BOLD_HEADING_RE = re.compile(r"^\*\*[\dA-Z][\d\.]*\*\*\s+\*\*(?!\d[\d\s.,\-–—/:()%]*\*\*)[^*]+\*\*(?:\s+\*\*[^*]+\*\*){0,2}\s*$")
+
 # Maximum number of outline entries injected into the agent context.
 # Keeps prompt size bounded even for very long documents.
 MAX_OUTLINE_ENTRIES = 50
@@ -189,14 +202,49 @@ MAX_OUTLINE_ENTRIES = 50
 _ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
 
 
+def _clean_bold_title(raw: str) -> str:
+    """Normalise a title string that may contain pymupdf4llm bold artefacts.
+
+    pymupdf4llm sometimes emits adjacent bold spans as ``**A** **B**`` instead
+    of a single ``**A B**`` block. This helper merges those fragments and then
+    strips the outermost ``**...**`` wrapper so the caller gets plain text.
+    The wrapper is only removed when the whole string is one bold span;
+    separate spans joined by plain text are left balanced.
+
+    Examples::
+
+        "**Overview**"                     → "Overview"
+        "**UNITED STATES** **SECURITIES**" → "UNITED STATES SECURITIES"
+        "**Risk** and **Return**"          → "**Risk** and **Return**"  (unchanged)
+        "plain text"                       → "plain text"  (unchanged)
+    """
+    # Merge adjacent bold spans: "** **" → " "
+    merged = re.sub(r"\*\*\s*\*\*", " ", raw).strip()
+    # Strip outermost **...** if the whole string is wrapped
+    m = re.fullmatch(r"\*\*(.+?)\*\*", merged, re.DOTALL)
+    # The pattern alone also accepts "**A** and **B**"; unwrapping that would
+    # leave the unbalanced "A** and **B", so require a marker-free interior.
+    if m and "**" not in m.group(1):
+        return m.group(1).strip()
+    return merged
+
+
 def extract_outline(md_path: Path) -> list[dict]:
     """Extract document outline (headings) from a Markdown file.
 
-    Recognises two heading styles produced by pymupdf4llm:
-    1. Standard Markdown headings: lines starting with one or more '#'
-    2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc.
-       (SEC filings use bold+caps for section headings with the same font size
-       as body text, so pymupdf4llm cannot promote them to # headings)
+    Recognises three heading styles produced by pymupdf4llm:
+
+    1. Standard Markdown headings: lines starting with one or more '#'.
+       Inline ``**...**`` wrappers and adjacent bold spans (``** **``) are
+       cleaned so the title is plain text.
+
+    2. Bold-only structural headings: ``**ITEM 1. BUSINESS**``, ``**PART II**``,
+       etc. SEC filings use bold+caps for section headings with the same font
+       size as body text, so pymupdf4llm cannot promote them to # headings.
+
+    3. Split-bold headings: ``**1** **Introduction**``, ``**3.2** **Attention**``.
+       pymupdf4llm emits these when the section number and title text are
+       separate spans in the underlying PDF (common in academic papers).
 
     Args:
         md_path: Path to the .md file.
@@ -218,19 +266,23 @@ def extract_outline(md_path: Path) -> list[dict]:
 
                 # Style 1: standard Markdown heading
                 if stripped.startswith("#"):
-                    title = stripped.lstrip("#").strip()
-                    # Strip any inline **...** wrapping (e.g. "## **Overview**" → "Overview")
+                    title = _clean_bold_title(stripped.lstrip("#").strip())
                     if title:
-                        if m2 := re.fullmatch(r"\*\*(.+?)\*\*", title):
-                            title = m2.group(1).strip()
                         outline.append({"title": title, "line": lineno})
 
-                # Style 2: bold-only line (entire line is **...**)
+                # Style 2: single bold block with SEC structural keyword
                 elif m := _BOLD_HEADING_RE.match(stripped):
                     title = m.group(1).strip()
                     if title:
                         outline.append({"title": title, "line": lineno})
 
+                # Style 3: split-bold heading — **<num>** **<title>**
+                # Regex already enforces max 4 blocks and non-numeric second block.
+                elif _SPLIT_BOLD_HEADING_RE.match(stripped):
+                    title = " ".join(re.findall(r"\*\*([^*]+)\*\*", stripped))
+                    if title:
+                        outline.append({"title": title, "line": lineno})
+
                 if len(outline) >= MAX_OUTLINE_ENTRIES:
                     outline.append({"truncated": True})
                     break
diff --git a/backend/tests/test_file_conversion.py b/backend/tests/test_file_conversion.py
index b2ad2d035..42abd3b4a 100644
--- a/backend/tests/test_file_conversion.py
+++ b/backend/tests/test_file_conversion.py
@@ -420,3 +420,52 @@ class TestExtractOutline:
         )
         outline = extract_outline(md)
         assert outline == []
+
+    def test_split_bold_heading_academic_paper(self, tmp_path):
+        """**<num>** **<title>** lines from academic papers are recognised (Style 3)."""
+        md = tmp_path / "paper.md"
+        md.write_text(
+            "## **Attention Is All You Need**\n\n**1** **Introduction**\n\nBody text.\n\n**2** **Background**\n\nMore text.\n\n**3.1** **Encoder and Decoder Stacks**\n",
+            encoding="utf-8",
+        )
+        outline = extract_outline(md)
+        titles = [e["title"] for e in outline]
+        assert "1 Introduction" in titles
+        assert "2 Background" in titles
+        assert "3.1 Encoder and Decoder Stacks" in titles
+
+    def test_split_bold_year_columns_excluded(self, tmp_path):
+        """Financial table headers like **2023** **2022** **2021** are NOT headings."""
+        md = tmp_path / "annual.md"
+        md.write_text(
+            "# Financial Summary\n\n**2023** **2022** **2021**\n\nRevenue 100 90 80\n",
+            encoding="utf-8",
+        )
+        outline = extract_outline(md)
+        titles = [e["title"] for e in outline]
+        # Only the # heading should appear, not the year-column row
+        assert titles == ["Financial Summary"]
+
+    def test_adjacent_bold_spans_merged_in_markdown_heading(self, tmp_path):
+        """** ** artefacts inside a # heading are merged into clean plain text."""
+        md = tmp_path / "sec.md"
+        md.write_text(
+            "## **UNITED STATES** **SECURITIES AND EXCHANGE COMMISSION**\n\nBody text.\n",
+            encoding="utf-8",
+        )
+        outline = extract_outline(md)
+        assert len(outline) == 1
+        # Title must be clean — no ** ** artefacts
+        assert outline[0]["title"] == "UNITED STATES SECURITIES AND EXCHANGE COMMISSION"
+
+    def test_separate_bold_spans_in_heading_stay_balanced(self, tmp_path):
+        """Bold spans joined by plain text must not lose only their outer markers."""
+        md = tmp_path / "mixed.md"
+        md.write_text(
+            "## **Risk** and **Return**\n\nBody text.\n",
+            encoding="utf-8",
+        )
+        outline = extract_outline(md)
+        assert len(outline) == 1
+        # Previously produced the unbalanced "Risk** and **Return"
+        assert outline[0]["title"] == "**Risk** and **Return**"
diff --git a/backend/tests/test_uploads_middleware_core_logic.py b/backend/tests/test_uploads_middleware_core_logic.py
index ebc9ab071..72639fb09 100644
--- a/backend/tests/test_uploads_middleware_core_logic.py
+++ b/backend/tests/test_uploads_middleware_core_logic.py
@@ -290,6 +290,7 @@ class TestBeforeAgent:
                 "path": "/mnt/user-data/uploads/notes.txt",
                 "extension": ".txt",
                 "outline": [],
+                "outline_preview": [],
             }
         ]
 
@@ -429,3 +430,56 @@ class TestBeforeAgent:
         content = result["messages"][-1].content
         assert "Chapter 1" in content
         assert "Chapter 2" in content
+
+    def test_fallback_preview_shown_when_outline_empty(self, tmp_path):
+        """When .md exists but has no headings, first lines are shown as a preview."""
+        mw = _middleware(tmp_path)
+        uploads_dir = _uploads_dir(tmp_path)
+        (uploads_dir / "report.pdf").write_bytes(b"%PDF fake")
+        # .md with no # headings — plain prose only
+        (uploads_dir / "report.md").write_text(
+            "Annual Financial Report 2024\n\nThis document summarises key findings.\n\nRevenue grew by 12%.\n",
+            encoding="utf-8",
+        )
+
+        msg = _human("analyse", files=[{"filename": "report.pdf", "size": 9, "path": "/mnt/user-data/uploads/report.pdf"}])
+        result = mw.before_agent(self._state(msg), _runtime())
+
+        assert result is not None
+        content = result["messages"][-1].content
+        # Outline section must NOT appear
+        assert "Document outline" not in content
+        # Preview lines must appear
+        assert "Annual Financial Report 2024" in content
+        assert "No structural headings detected" in content
+        # grep hint must appear
+        assert "grep" in content
+
+    def test_fallback_grep_hint_shown_when_no_md_file(self, tmp_path):
+        """Files with no sibling .md still get the grep hint (outline is empty)."""
+        mw = _middleware(tmp_path)
+        uploads_dir = _uploads_dir(tmp_path)
+        (uploads_dir / "data.csv").write_bytes(b"a,b,c\n1,2,3\n")
+
+        msg = _human("analyse", files=[{"filename": "data.csv", "size": 12, "path": "/mnt/user-data/uploads/data.csv"}])
+        result = mw.before_agent(self._state(msg), _runtime())
+
+        assert result is not None
+        content = result["messages"][-1].content
+        assert "Document outline" not in content
+        assert "grep" in content
+
+    def test_fallback_preview_truncates_long_lines(self, tmp_path):
+        """A heading-less .md made of one huge line must not flood the prompt."""
+        mw = _middleware(tmp_path)
+        uploads_dir = _uploads_dir(tmp_path)
+        (uploads_dir / "scan.pdf").write_bytes(b"%PDF fake")
+        (uploads_dir / "scan.md").write_text("word " * 2000 + "\n", encoding="utf-8")
+
+        msg = _human("analyse", files=[{"filename": "scan.pdf", "size": 9, "path": "/mnt/user-data/uploads/scan.pdf"}])
+        result = mw.before_agent(self._state(msg), _runtime())
+
+        assert result is not None
+        content = result["messages"][-1].content
+        assert "No structural headings detected" in content
+        assert "word " * 100 not in content