mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-05-02 14:58:24 +00:00
* feat(uploads): inject document outline into agent context for converted files
Extract headings from converted .md files and inject them into the
<uploaded_files> context block so the agent can navigate large documents
by line number before reading.
- Add `extract_outline()` to `file_conversion.py`: recognises standard
Markdown headings (#/##/###) and SEC-style bold structural headings
(**ITEM N. BUSINESS**, **PART II**); caps at 50 entries; excludes
cover-page boilerplate (WASHINGTON DC, CURRENT REPORT, SIGNATURES)
- Add `_extract_outline_for_file()` helper in `uploads_middleware.py`:
looks for a sibling `.md` file produced by the conversion pipeline
- Update `UploadsMiddleware._create_files_message()` to render the outline
under each file entry with `L{line}: {title}` format and a `read_file`
prompt for range-based reading
- Tests: 10 new tests for `extract_outline()`, 4 new tests for outline
injection in `UploadsMiddleware`; existing test updated for new `outline`
field in `uploaded_files` state
Partially addresses #1647 (agent ignores uploaded files).
* fix(uploads): stream outline file reads and strip inline bold from heading titles
- Switch extract_outline() from read_text().splitlines() to open()+line iteration
so large converted documents are not loaded into memory on every agent turn;
exits as soon as MAX_OUTLINE_ENTRIES is reached (Copilot suggestion)
- Strip **...** wrapper from standard Markdown heading titles before appending
to outline so agent context stays clean (e.g. "## **Overview**" → "Overview")
(Copilot suggestion)
- Remove unused pathlib.Path import and fix import sort order in test_file_conversion.py
to satisfy ruff CI lint
* fix(uploads): show truncation hint when outline exceeds MAX_OUTLINE_ENTRIES
When extract_outline() hits the cap it now appends a sentinel entry
{"truncated": True} instead of silently dropping the rest of the headings.
UploadsMiddleware reads the sentinel and renders a hint line:
... (showing first 50 headings; use `read_file` to explore further)
Without this the agent had no way to know the outline was incomplete and
would treat the first 50 headings as the full document structure.
* fix(uploads): fall back to configurable.thread_id when runtime.context lacks thread_id
runtime.context does not always carry thread_id (depends on LangGraph
invocation path). ThreadDataMiddleware already falls back to
get_config().configurable.thread_id — apply the same pattern so
UploadsMiddleware can resolve the uploads directory and attach outlines
in all invocation paths.
* style: apply ruff format
---------
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
131 lines
5.6 KiB
Python
131 lines
5.6 KiB
Python
"""Tests for extract_outline() in file_conversion utilities (PR2: document outline injection)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from deerflow.utils.file_conversion import (
|
|
MAX_OUTLINE_ENTRIES,
|
|
extract_outline,
|
|
)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# extract_outline
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestExtractOutline:
    """Behavioural checks for ``extract_outline()`` on small Markdown fixtures."""

    @staticmethod
    def _outline_of(tmp_path, filename, text):
        """Write *text* to ``tmp_path / filename`` as UTF-8 and return its outline."""
        target = tmp_path / filename
        target.write_text(text, encoding="utf-8")
        return extract_outline(target)

    def test_empty_file_returns_empty(self, tmp_path):
        """A zero-length markdown file produces an empty outline."""
        assert self._outline_of(tmp_path, "empty.md", "") == []

    def test_missing_file_returns_empty(self, tmp_path):
        """A path that does not exist yields [] rather than an exception."""
        missing = tmp_path / "nonexistent.md"
        assert extract_outline(missing) == []

    def test_standard_markdown_headings(self, tmp_path):
        """Headings at levels #, ## and ### are each reported with their line number."""
        outline = self._outline_of(
            tmp_path,
            "doc.md",
            "# Chapter One\n\nSome text.\n\n## Section 1.1\n\nMore text.\n\n### Sub 1.1.1\n",
        )
        expected = [
            {"title": "Chapter One", "line": 1},
            {"title": "Section 1.1", "line": 5},
            {"title": "Sub 1.1.1", "line": 9},
        ]
        assert outline == expected

    def test_bold_sec_item_heading(self, tmp_path):
        """Stand-alone **ITEM N. TITLE** lines (SEC filings) count as headings."""
        outline = self._outline_of(
            tmp_path,
            "10k.md",
            "Cover page text.\n\n**ITEM 1. BUSINESS**\n\nBody.\n\n**ITEM 1A. RISK FACTORS**\n",
        )
        expected = [
            {"title": "ITEM 1. BUSINESS", "line": 3},
            {"title": "ITEM 1A. RISK FACTORS", "line": 7},
        ]
        assert outline == expected

    def test_bold_part_heading(self, tmp_path):
        """Stand-alone **PART I** / **PART II** / **PART III** lines count as headings."""
        outline = self._outline_of(
            tmp_path,
            "10k.md",
            "**PART I**\n\n**PART II**\n\n**PART III**\n",
        )
        assert len(outline) == 3
        titles = [entry["title"] for entry in outline]
        for wanted in ("PART I", "PART II", "PART III"):
            assert wanted in titles

    def test_sec_cover_page_boilerplate_excluded(self, tmp_path):
        """Cover-page address lines and short boilerplate stay out of the outline."""
        outline = self._outline_of(
            tmp_path,
            "8k.md",
            "## **UNITED STATES SECURITIES AND EXCHANGE COMMISSION**\n\n"
            "**WASHINGTON, DC 20549**\n\n"
            "**CURRENT REPORT**\n\n"
            "**SIGNATURES**\n\n"
            "**TESLA, INC.**\n\n"
            "**ITEM 2.02. RESULTS OF OPERATIONS**\n",
        )
        titles = [entry["title"] for entry in outline]

        # Bold cover-page lines that must be filtered out.
        boilerplate = (
            "WASHINGTON, DC 20549",
            "CURRENT REPORT",
            "SIGNATURES",
            "TESLA, INC.",
        )
        for unwanted in boilerplate:
            assert unwanted not in titles

        # A genuine SEC item heading must still be reported.
        assert "ITEM 2.02. RESULTS OF OPERATIONS" in titles

    def test_chinese_headings_via_standard_markdown(self, tmp_path):
        """Chinese annual-report headings written as # / ## (as pymupdf4llm emits them) are captured."""
        outline = self._outline_of(
            tmp_path,
            "annual.md",
            "# 第一节 公司简介\n\n内容。\n\n## 第三节 管理层讨论与分析\n\n分析内容。\n",
        )
        assert len(outline) == 2
        found_titles = [entry["title"] for entry in outline]
        assert found_titles == ["第一节 公司简介", "第三节 管理层讨论与分析"]

    def test_outline_capped_at_max_entries(self, tmp_path):
        """A truncated outline holds MAX_OUTLINE_ENTRIES real entries plus one sentinel."""
        body = "\n".join(f"# Heading {i}" for i in range(MAX_OUTLINE_ENTRIES + 10))
        outline = self._outline_of(tmp_path, "long.md", body)

        # The truncation sentinel is always the final element.
        assert outline[-1] == {"truncated": True}

        # Everything that is not a sentinel is a real heading entry.
        visible_count = sum(1 for entry in outline if not entry.get("truncated"))
        assert visible_count == MAX_OUTLINE_ENTRIES

    def test_no_truncation_sentinel_when_under_limit(self, tmp_path):
        """A document with few headings carries no sentinel entry."""
        body = "\n".join(f"# Heading {i}" for i in range(5))
        outline = self._outline_of(tmp_path, "short.md", body)
        assert len(outline) == 5
        assert all(not entry.get("truncated") for entry in outline)

    def test_blank_lines_and_whitespace_ignored(self, tmp_path):
        """Runs of blank lines around headings never create empty entries."""
        outline = self._outline_of(
            tmp_path,
            "spaced.md",
            "\n\n# Title One\n\n\n\n# Title Two\n\n",
        )
        assert len(outline) == 2
        for entry in outline:
            assert entry["title"]

    def test_inline_bold_not_confused_with_heading(self, tmp_path):
        """Bold spans in the middle of a sentence are not treated as headings."""
        outline = self._outline_of(
            tmp_path,
            "prose.md",
            "This sentence has **bold words** inside it.\n\nAnother with **MULTIPLE CAPS** inline.\n",
        )
        assert outline == []
|