mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 11:18:22 +00:00
fix(uploads): handle split-bold headings and ** ** artefacts in extract_outline (#1838)
* feat(uploads): guide agent to use grep/glob/read_file for uploaded documents Add workflow guidance to the <uploaded_files> context block so the agent knows to use grep and glob (added in #1784) alongside read_file when working with uploaded documents, rather than falling back to web search. This is the final piece of the three-PR PDF agentic search pipeline: - PR1 (#1727): pymupdf4llm converter produces structured Markdown with headings - PR2 (#1738): document outline injected into agent context with line numbers - PR3 (this): agent guided to use outline + grep + read_file workflow * feat(uploads): add file-first priority and fallback guidance to uploaded_files context * fix(uploads): handle split-bold headings and ** ** artefacts in extract_outline - Add _clean_bold_title() to merge adjacent bold spans (** **) produced by pymupdf4llm when bold text crosses span boundaries - Add _SPLIT_BOLD_HEADING_RE (Style 3) to recognise **<num>** **<title>** headings common in academic papers; excludes pure-number table headers and rows with more than 4 bold blocks - When outline is empty, read first 5 non-empty lines of the .md as a content preview and surface a grep hint in the agent context - Update _format_file_entry to render the preview + grep hint instead of silently omitting the outline section - Add 3 new extract_outline tests and 2 new middleware tests (65 total) * fix(uploads): address Copilot review comments on extract_outline regex - Replace ASCII [A-Za-z] guard with negative lookahead to support non-ASCII titles (e.g. **1** **概述**); pure-numeric/punctuation blocks still excluded - Replace .+ with [^*]+ and cap repetition at {0,2} (four blocks total) to keep _SPLIT_BOLD_HEADING_RE linear and avoid ReDoS on malformed input - Remove now-redundant len(blocks) <= 4 code-level check (enforced by regex) - Log debug message with exc_info when preview extraction fails
This commit is contained in:
parent
19809800f1
commit
163121d327
@ -15,20 +15,45 @@ from deerflow.utils.file_conversion import extract_outline
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Number of non-empty lines collected for the fallback preview.
_OUTLINE_PREVIEW_LINES = 5


def _extract_outline_for_file(file_path: Path) -> tuple[list[dict], list[str]]:
    """Build the outline, or failing that a short preview, for an uploaded file.

    The converted Markdown is looked up next to *file_path* by swapping the
    suffix for ``.md``.

    Returns:
        A ``(outline, preview)`` pair:

        - ``([], [])`` when no sibling ``.md`` file exists.
        - ``(outline, [])`` when ``extract_outline`` returns a non-empty list.
        - ``([], preview)`` otherwise, where *preview* holds up to
          ``_OUTLINE_PREVIEW_LINES`` stripped, non-empty lines taken from the
          start of the ``.md`` file. It may be shorter, or empty, if the file
          is short or reading it fails.
    """
    md_path = file_path.with_suffix(".md")
    if not md_path.is_file():
        return [], []

    headings = extract_outline(md_path)
    if headings:
        logger.debug("Extracted %d outline entries from %s", len(headings), file_path.name)
        return headings, []

    # No headings were found: fall back to the opening non-blank lines so the
    # caller still has something to show.
    snippet: list[str] = []
    try:
        with md_path.open(encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text:
                    continue
                snippet.append(text)
                if len(snippet) >= _OUTLINE_PREVIEW_LINES:
                    break
    except Exception:
        # Best-effort only: keep whatever was collected before the failure.
        logger.debug("Failed to read preview lines from %s", md_path, exc_info=True)
    return [], snippet
|
||||||
|
|
||||||
|
|
||||||
class UploadsMiddlewareState(AgentState):
|
class UploadsMiddlewareState(AgentState):
|
||||||
@ -64,13 +89,20 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
|||||||
lines.append(f" Path: {file['path']}")
|
lines.append(f" Path: {file['path']}")
|
||||||
outline = file.get("outline") or []
|
outline = file.get("outline") or []
|
||||||
if outline:
|
if outline:
|
||||||
truncated = outline[-1].get("truncated", False) if outline else False
|
truncated = outline[-1].get("truncated", False)
|
||||||
visible = [e for e in outline if not e.get("truncated")]
|
visible = [e for e in outline if not e.get("truncated")]
|
||||||
lines.append(" Document outline (use `read_file` with line ranges to read sections):")
|
lines.append(" Document outline (use `read_file` with line ranges to read sections):")
|
||||||
for entry in visible:
|
for entry in visible:
|
||||||
lines.append(f" L{entry['line']}: {entry['title']}")
|
lines.append(f" L{entry['line']}: {entry['title']}")
|
||||||
if truncated:
|
if truncated:
|
||||||
lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)")
|
lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)")
|
||||||
|
else:
|
||||||
|
preview = file.get("outline_preview") or []
|
||||||
|
if preview:
|
||||||
|
lines.append(" No structural headings detected. Document begins with:")
|
||||||
|
for text in preview:
|
||||||
|
lines.append(f" > {text}")
|
||||||
|
lines.append(" Use `grep` to search for keywords (e.g. `grep(pattern='keyword', path='/mnt/user-data/uploads/')`).")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
|
def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
|
||||||
@ -201,13 +233,15 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
|||||||
for file_path in sorted(uploads_dir.iterdir()):
|
for file_path in sorted(uploads_dir.iterdir()):
|
||||||
if file_path.is_file() and file_path.name not in new_filenames:
|
if file_path.is_file() and file_path.name not in new_filenames:
|
||||||
stat = file_path.stat()
|
stat = file_path.stat()
|
||||||
|
outline, preview = _extract_outline_for_file(file_path)
|
||||||
historical_files.append(
|
historical_files.append(
|
||||||
{
|
{
|
||||||
"filename": file_path.name,
|
"filename": file_path.name,
|
||||||
"size": stat.st_size,
|
"size": stat.st_size,
|
||||||
"path": f"/mnt/user-data/uploads/{file_path.name}",
|
"path": f"/mnt/user-data/uploads/{file_path.name}",
|
||||||
"extension": file_path.suffix,
|
"extension": file_path.suffix,
|
||||||
"outline": _extract_outline_for_file(file_path),
|
"outline": outline,
|
||||||
|
"outline_preview": preview,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -215,7 +249,9 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
|||||||
if uploads_dir:
|
if uploads_dir:
|
||||||
for file in new_files:
|
for file in new_files:
|
||||||
phys_path = uploads_dir / file["filename"]
|
phys_path = uploads_dir / file["filename"]
|
||||||
file["outline"] = _extract_outline_for_file(phys_path)
|
outline, preview = _extract_outline_for_file(phys_path)
|
||||||
|
file["outline"] = outline
|
||||||
|
file["outline_preview"] = preview
|
||||||
|
|
||||||
if not new_files and not historical_files:
|
if not new_files and not historical_files:
|
||||||
return None
|
return None
|
||||||
|
|||||||
@ -182,6 +182,19 @@ async def convert_file_to_markdown(file_path: Path) -> Path | None:
|
|||||||
# by pymupdf4llm, so they don't need this pattern.
|
# by pymupdf4llm, so they don't need this pattern.
|
||||||
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")
|
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")
|
||||||
|
|
||||||
|
# Regex for split-bold headings produced by pymupdf4llm when a heading spans
|
||||||
|
# multiple text spans in the PDF (e.g. section number and title are separate spans).
|
||||||
|
# Matches lines like: **1** **Introduction** or **3.2** **Multi-Head Attention**
|
||||||
|
# Requirements:
|
||||||
|
# 1. Entire line consists only of **...** blocks separated by whitespace (no prose)
|
||||||
|
# 2. First block is a section number (digits and dots, e.g. "1", "3.2", "A.1")
|
||||||
|
# 3. Second block must not be purely numeric/punctuation — excludes financial table
|
||||||
|
# headers like **2023** **2022** **2021** while allowing non-ASCII titles such as
|
||||||
|
# **1** **概述** or accented words (negative lookahead instead of [A-Za-z])
|
||||||
|
# 4. At most two additional blocks (four total) with [^*]+ (no * inside) to keep
|
||||||
|
# the regex linear and avoid ReDoS on attacker-controlled content
|
||||||
|
_SPLIT_BOLD_HEADING_RE = re.compile(r"^\*\*[\dA-Z][\d\.]*\*\*\s+\*\*(?!\d[\d\s.,\-–—/:()%]*\*\*)[^*]+\*\*(?:\s+\*\*[^*]+\*\*){0,2}\s*$")
|
||||||
|
|
||||||
# Maximum number of outline entries injected into the agent context.
|
# Maximum number of outline entries injected into the agent context.
|
||||||
# Keeps prompt size bounded even for very long documents.
|
# Keeps prompt size bounded even for very long documents.
|
||||||
MAX_OUTLINE_ENTRIES = 50
|
MAX_OUTLINE_ENTRIES = 50
|
||||||
@ -189,14 +202,43 @@ MAX_OUTLINE_ENTRIES = 50
|
|||||||
_ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
|
_ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_bold_title(raw: str) -> str:
|
||||||
|
"""Normalise a title string that may contain pymupdf4llm bold artefacts.
|
||||||
|
|
||||||
|
pymupdf4llm sometimes emits adjacent bold spans as ``**A** **B**`` instead
|
||||||
|
of a single ``**A B**`` block. This helper merges those fragments and then
|
||||||
|
strips the outermost ``**...**`` wrapper so the caller gets plain text.
|
||||||
|
|
||||||
|
Examples::
|
||||||
|
|
||||||
|
"**Overview**" → "Overview"
|
||||||
|
"**UNITED STATES** **SECURITIES**" → "UNITED STATES SECURITIES"
|
||||||
|
"plain text" → "plain text" (unchanged)
|
||||||
|
"""
|
||||||
|
# Merge adjacent bold spans: "** **" → " "
|
||||||
|
merged = re.sub(r"\*\*\s*\*\*", " ", raw).strip()
|
||||||
|
# Strip outermost **...** if the whole string is wrapped
|
||||||
|
if m := re.fullmatch(r"\*\*(.+?)\*\*", merged, re.DOTALL):
|
||||||
|
return m.group(1).strip()
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
def extract_outline(md_path: Path) -> list[dict]:
|
def extract_outline(md_path: Path) -> list[dict]:
|
||||||
"""Extract document outline (headings) from a Markdown file.
|
"""Extract document outline (headings) from a Markdown file.
|
||||||
|
|
||||||
Recognises two heading styles produced by pymupdf4llm:
|
Recognises three heading styles produced by pymupdf4llm:
|
||||||
1. Standard Markdown headings: lines starting with one or more '#'
|
|
||||||
2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc.
|
1. Standard Markdown headings: lines starting with one or more '#'.
|
||||||
(SEC filings use bold+caps for section headings with the same font size
|
Inline ``**...**`` wrappers and adjacent bold spans (``** **``) are
|
||||||
as body text, so pymupdf4llm cannot promote them to # headings)
|
cleaned so the title is plain text.
|
||||||
|
|
||||||
|
2. Bold-only structural headings: ``**ITEM 1. BUSINESS**``, ``**PART II**``,
|
||||||
|
etc. SEC filings use bold+caps for section headings with the same font
|
||||||
|
size as body text, so pymupdf4llm cannot promote them to # headings.
|
||||||
|
|
||||||
|
3. Split-bold headings: ``**1** **Introduction**``, ``**3.2** **Attention**``.
|
||||||
|
pymupdf4llm emits these when the section number and title text are
|
||||||
|
separate spans in the underlying PDF (common in academic papers).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
md_path: Path to the .md file.
|
md_path: Path to the .md file.
|
||||||
@ -218,19 +260,23 @@ def extract_outline(md_path: Path) -> list[dict]:
|
|||||||
|
|
||||||
# Style 1: standard Markdown heading
|
# Style 1: standard Markdown heading
|
||||||
if stripped.startswith("#"):
|
if stripped.startswith("#"):
|
||||||
title = stripped.lstrip("#").strip()
|
title = _clean_bold_title(stripped.lstrip("#").strip())
|
||||||
# Strip any inline **...** wrapping (e.g. "## **Overview**" → "Overview")
|
|
||||||
if title:
|
if title:
|
||||||
if m2 := re.fullmatch(r"\*\*(.+?)\*\*", title):
|
|
||||||
title = m2.group(1).strip()
|
|
||||||
outline.append({"title": title, "line": lineno})
|
outline.append({"title": title, "line": lineno})
|
||||||
|
|
||||||
# Style 2: bold-only line (entire line is **...**)
|
# Style 2: single bold block with SEC structural keyword
|
||||||
elif m := _BOLD_HEADING_RE.match(stripped):
|
elif m := _BOLD_HEADING_RE.match(stripped):
|
||||||
title = m.group(1).strip()
|
title = m.group(1).strip()
|
||||||
if title:
|
if title:
|
||||||
outline.append({"title": title, "line": lineno})
|
outline.append({"title": title, "line": lineno})
|
||||||
|
|
||||||
|
# Style 3: split-bold heading — **<num>** **<title>**
|
||||||
|
# Regex already enforces max 4 blocks and non-numeric second block.
|
||||||
|
elif _SPLIT_BOLD_HEADING_RE.match(stripped):
|
||||||
|
title = " ".join(re.findall(r"\*\*([^*]+)\*\*", stripped))
|
||||||
|
if title:
|
||||||
|
outline.append({"title": title, "line": lineno})
|
||||||
|
|
||||||
if len(outline) >= MAX_OUTLINE_ENTRIES:
|
if len(outline) >= MAX_OUTLINE_ENTRIES:
|
||||||
outline.append({"truncated": True})
|
outline.append({"truncated": True})
|
||||||
break
|
break
|
||||||
|
|||||||
@ -420,3 +420,40 @@ class TestExtractOutline:
|
|||||||
)
|
)
|
||||||
outline = extract_outline(md)
|
outline = extract_outline(md)
|
||||||
assert outline == []
|
assert outline == []
|
||||||
|
|
||||||
|
def test_split_bold_heading_academic_paper(self, tmp_path):
    """Style 3: ``**<num>** **<title>**`` lines show up as outline entries."""
    paper = tmp_path / "paper.md"
    paper.write_text(
        "## **Attention Is All You Need**\n\n**1** **Introduction**\n\nBody text.\n\n**2** **Background**\n\nMore text.\n\n**3.1** **Encoder and Decoder Stacks**\n",
        encoding="utf-8",
    )

    found = [entry["title"] for entry in extract_outline(paper)]

    # Each numbered section must appear with number and title joined by a space.
    for expected in ("1 Introduction", "2 Background", "3.1 Encoder and Decoder Stacks"):
        assert expected in found
|
||||||
|
|
||||||
|
def test_split_bold_year_columns_excluded(self, tmp_path):
    """A bold year-column row (**2023** **2022** **2021**) must not become a heading."""
    report = tmp_path / "annual.md"
    report.write_text(
        "# Financial Summary\n\n**2023** **2022** **2021**\n\nRevenue 100 90 80\n",
        encoding="utf-8",
    )

    entries = extract_outline(report)

    # The '#' heading is the sole entry; the year row is ignored.
    assert [entry["title"] for entry in entries] == ["Financial Summary"]
|
||||||
|
|
||||||
|
def test_adjacent_bold_spans_merged_in_markdown_heading(self, tmp_path):
    """Adjacent bold spans inside a '#' heading collapse into one plain-text title."""
    filing = tmp_path / "sec.md"
    filing.write_text(
        "## **UNITED STATES** **SECURITIES AND EXCHANGE COMMISSION**\n\nBody text.\n",
        encoding="utf-8",
    )

    entries = extract_outline(filing)

    assert len(entries) == 1
    # The title carries no leftover '**' markers.
    heading = entries[0]
    assert heading["title"] == "UNITED STATES SECURITIES AND EXCHANGE COMMISSION"
|
||||||
|
|||||||
@ -290,6 +290,7 @@ class TestBeforeAgent:
|
|||||||
"path": "/mnt/user-data/uploads/notes.txt",
|
"path": "/mnt/user-data/uploads/notes.txt",
|
||||||
"extension": ".txt",
|
"extension": ".txt",
|
||||||
"outline": [],
|
"outline": [],
|
||||||
|
"outline_preview": [],
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -429,3 +430,41 @@ class TestBeforeAgent:
|
|||||||
content = result["messages"][-1].content
|
content = result["messages"][-1].content
|
||||||
assert "Chapter 1" in content
|
assert "Chapter 1" in content
|
||||||
assert "Chapter 2" in content
|
assert "Chapter 2" in content
|
||||||
|
|
||||||
|
def test_fallback_preview_shown_when_outline_empty(self, tmp_path):
    """A heading-free .md yields a preview of its opening lines instead of an outline."""
    middleware = _middleware(tmp_path)
    uploads = _uploads_dir(tmp_path)
    (uploads / "report.pdf").write_bytes(b"%PDF fake")
    # Sibling .md holds prose only, with no '#' headings.
    (uploads / "report.md").write_text(
        "Annual Financial Report 2024\n\nThis document summarises key findings.\n\nRevenue grew by 12%.\n",
        encoding="utf-8",
    )

    upload = {"filename": "report.pdf", "size": 9, "path": "/mnt/user-data/uploads/report.pdf"}
    message = _human("analyse", files=[upload])
    result = middleware.before_agent(self._state(message), _runtime())

    assert result is not None
    rendered = result["messages"][-1].content
    # No outline section is rendered.
    assert "Document outline" not in rendered
    # The preview text, the "no headings" notice and the grep hint are all present.
    for expected in ("Annual Financial Report 2024", "No structural headings detected", "grep"):
        assert expected in rendered
|
||||||
|
|
||||||
|
def test_fallback_grep_hint_shown_when_no_md_file(self, tmp_path):
    """An upload without a sibling .md gets no outline section but still mentions grep."""
    middleware = _middleware(tmp_path)
    uploads = _uploads_dir(tmp_path)
    (uploads / "data.csv").write_bytes(b"a,b,c\n1,2,3\n")

    upload = {"filename": "data.csv", "size": 12, "path": "/mnt/user-data/uploads/data.csv"}
    message = _human("analyse", files=[upload])
    result = middleware.before_agent(self._state(message), _runtime())

    assert result is not None
    rendered = result["messages"][-1].content
    assert "Document outline" not in rendered
    assert "grep" in rendered
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user