mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 11:18:22 +00:00
* feat(uploads): guide agent to use grep/glob/read_file for uploaded documents Add workflow guidance to the <uploaded_files> context block so the agent knows to use grep and glob (added in #1784) alongside read_file when working with uploaded documents, rather than falling back to web search. This is the final piece of the three-PR PDF agentic search pipeline: - PR1 (#1727): pymupdf4llm converter produces structured Markdown with headings - PR2 (#1738): document outline injected into agent context with line numbers - PR3 (this): agent guided to use outline + grep + read_file workflow * feat(uploads): add file-first priority and fallback guidance to uploaded_files context * fix(uploads): handle split-bold headings and ** ** artefacts in extract_outline - Add _clean_bold_title() to merge adjacent bold spans (** **) produced by pymupdf4llm when bold text crosses span boundaries - Add _SPLIT_BOLD_HEADING_RE (Style 3) to recognise **<num>** **<title>** headings common in academic papers; excludes pure-number table headers and rows with more than 4 bold blocks - When outline is empty, read first 5 non-empty lines of the .md as a content preview and surface a grep hint in the agent context - Update _format_file_entry to render the preview + grep hint instead of silently omitting the outline section - Add 3 new extract_outline tests and 2 new middleware tests (65 total) * fix(uploads): address Copilot review comments on extract_outline regex - Replace ASCII [A-Za-z] guard with negative lookahead to support non-ASCII titles (e.g. **1** **概述**); pure-numeric/punctuation blocks still excluded - Replace .+ with [^*]+ and cap repetition at {0,2} (four blocks total) to keep _SPLIT_BOLD_HEADING_RE linear and avoid ReDoS on malformed input - Remove now-redundant len(blocks) <= 4 code-level check (enforced by regex) - Log debug message with exc_info when preview extraction fails
460 lines
18 KiB
Python
460 lines
18 KiB
Python
"""Tests for file_conversion utilities (PR1: pymupdf4llm + asyncio.to_thread; PR2: extract_outline)."""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import sys
|
||
from types import ModuleType
|
||
from unittest.mock import MagicMock, patch
|
||
|
||
from deerflow.utils.file_conversion import (
|
||
_ASYNC_THRESHOLD_BYTES,
|
||
_MIN_CHARS_PER_PAGE,
|
||
MAX_OUTLINE_ENTRIES,
|
||
_do_convert,
|
||
_pymupdf_output_too_sparse,
|
||
convert_file_to_markdown,
|
||
extract_outline,
|
||
)
|
||
|
||
|
||
def _make_pymupdf_mock(page_count: int) -> ModuleType:
    """Return a fake *pymupdf* module whose ``open()`` reports *page_count* pages.

    Args:
        page_count: Value that ``len()`` returns for the fake document.

    Returns:
        A ``ModuleType`` named ``"pymupdf"`` whose ``open`` attribute is a
        ``MagicMock`` that always returns the same fake document.
    """
    mock_doc = MagicMock()
    mock_doc.__len__.return_value = page_count
    # A MagicMock's __enter__ returns an unrelated child mock by default, so
    # ``with pymupdf.open(path) as doc: len(doc)`` would see 0 pages. Hand back
    # the document itself so both call styles observe the same page count.
    mock_doc.__enter__.return_value = mock_doc

    fake_pymupdf = ModuleType("pymupdf")
    fake_pymupdf.open = MagicMock(return_value=mock_doc)  # type: ignore[attr-defined]
    return fake_pymupdf
|
||
|
||
|
||
def _run(coro):
    """Drive *coro* to completion on a private event loop and return its result.

    A fresh loop is created for every call and closed afterwards, whether the
    coroutine returns or raises; exceptions from the coroutine propagate.
    """
    event_loop = asyncio.new_event_loop()
    try:
        outcome = event_loop.run_until_complete(coro)
    finally:
        event_loop.close()
    return outcome
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _pymupdf_output_too_sparse
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestPymupdfOutputTooSparse:
    """Exercise the chars-per-page heuristic of ``_pymupdf_output_too_sparse``."""

    @staticmethod
    def _stub_pdf(directory, filename):
        """Write a placeholder PDF named *filename* into *directory*; return its path."""
        stub = directory / filename
        stub.write_bytes(b"%PDF-1.4 fake")
        return stub

    def test_dense_text_pdf_not_sparse(self, tmp_path):
        """Plenty of characters per page means the output is not sparse."""
        pdf_path = self._stub_pdf(tmp_path, "dense.pdf")
        fake_modules = {"pymupdf": _make_pymupdf_mock(page_count=10)}

        # 10 000 chars over 10 pages is 1000 chars per page, far above the threshold.
        with patch.dict(sys.modules, fake_modules):
            verdict = _pymupdf_output_too_sparse("x" * 10_000, pdf_path)

        assert verdict is False

    def test_image_based_pdf_is_sparse(self, tmp_path):
        """Almost no characters per page (image-only PDF) counts as sparse."""
        pdf_path = self._stub_pdf(tmp_path, "image.pdf")
        fake_modules = {"pymupdf": _make_pymupdf_mock(page_count=31)}

        # 612 chars / 31 pages is roughly 19.7 per page, below _MIN_CHARS_PER_PAGE (50).
        with patch.dict(sys.modules, fake_modules):
            verdict = _pymupdf_output_too_sparse("x" * 612, pdf_path)

        assert verdict is True

    def test_fallback_when_pymupdf_unavailable(self, tmp_path):
        """Without pymupdf the check uses an absolute 200-char threshold instead."""
        pdf_path = self._stub_pdf(tmp_path, "broken.pdf")

        # Mapping "pymupdf" to None in sys.modules makes ``import pymupdf``
        # raise ImportError inside the function, which selects the
        # absolute-threshold fallback.
        with patch.dict(sys.modules, {"pymupdf": None}):
            below_threshold = _pymupdf_output_too_sparse("x" * 100, pdf_path)
            above_threshold = _pymupdf_output_too_sparse("x" * 300, pdf_path)

        assert below_threshold is True
        assert above_threshold is False

    def test_exactly_at_threshold_is_not_sparse(self, tmp_path):
        """Hitting the per-page threshold exactly is still NOT sparse (inclusive bound)."""
        pdf_path = self._stub_pdf(tmp_path, "boundary.pdf")
        # Two pages with _MIN_CHARS_PER_PAGE characters each: exactly on the boundary.
        boundary_text = "x" * (_MIN_CHARS_PER_PAGE * 2)
        fake_modules = {"pymupdf": _make_pymupdf_mock(page_count=2)}

        with patch.dict(sys.modules, fake_modules):
            verdict = _pymupdf_output_too_sparse(boundary_text, pdf_path)

        assert verdict is False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _do_convert — routing logic
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestDoConvert:
    """Check which sub-converter ``_do_convert`` dispatches to for each input and mode."""

    @staticmethod
    def _stub_pdf(directory, filename):
        """Write a placeholder PDF named *filename* into *directory*; return its path."""
        stub = directory / filename
        stub.write_bytes(b"%PDF-1.4 fake")
        return stub

    def test_non_pdf_always_uses_markitdown(self, tmp_path):
        """A non-PDF input (here a DOCX) is routed to MarkItDown."""
        office_file = tmp_path / "report.docx"
        office_file.write_bytes(b"PK fake docx")

        markitdown = patch(
            "deerflow.utils.file_conversion._convert_with_markitdown",
            return_value="# Markdown from MarkItDown",
        )
        with markitdown as markitdown_mock:
            converted = _do_convert(office_file, "auto")

        markitdown_mock.assert_called_once_with(office_file)
        assert converted == "# Markdown from MarkItDown"

    def test_pdf_auto_uses_pymupdf4llm_when_dense(self, tmp_path):
        """In auto mode a dense pymupdf4llm result is returned as-is."""
        source = self._stub_pdf(tmp_path, "report.pdf")
        dense_text = "# Heading\n" + "word " * 2000  # unmistakably dense

        pymupdf4llm = patch(
            "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
            return_value=dense_text,
        )
        sparsity = patch(
            "deerflow.utils.file_conversion._pymupdf_output_too_sparse",
            return_value=False,
        )
        markitdown = patch("deerflow.utils.file_conversion._convert_with_markitdown")
        with pymupdf4llm, sparsity, markitdown as markitdown_mock:
            converted = _do_convert(source, "auto")

        markitdown_mock.assert_not_called()
        assert converted == dense_text

    def test_pdf_auto_falls_back_when_sparse(self, tmp_path):
        """In auto mode a sparse pymupdf4llm result triggers the MarkItDown fallback."""
        source = self._stub_pdf(tmp_path, "scanned.pdf")

        pymupdf4llm = patch(
            "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
            return_value="x" * 612,  # 19.7 chars/page for a 31-page document
        )
        sparsity = patch(
            "deerflow.utils.file_conversion._pymupdf_output_too_sparse",
            return_value=True,
        )
        markitdown = patch(
            "deerflow.utils.file_conversion._convert_with_markitdown",
            return_value="OCR result via MarkItDown",
        )
        with pymupdf4llm, sparsity, markitdown as markitdown_mock:
            converted = _do_convert(source, "auto")

        markitdown_mock.assert_called_once_with(source)
        assert converted == "OCR result via MarkItDown"

    def test_pdf_explicit_pymupdf4llm_skips_sparsity_check(self, tmp_path):
        """With 'pymupdf4llm' forced, even a very short result is returned unchanged."""
        source = self._stub_pdf(tmp_path, "explicit.pdf")
        sparse_text = "x" * 10  # far too short to pass any density check

        pymupdf4llm = patch(
            "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
            return_value=sparse_text,
        )
        markitdown = patch("deerflow.utils.file_conversion._convert_with_markitdown")
        with pymupdf4llm, markitdown as markitdown_mock:
            converted = _do_convert(source, "pymupdf4llm")

        markitdown_mock.assert_not_called()
        assert converted == sparse_text

    def test_pdf_explicit_markitdown_skips_pymupdf4llm(self, tmp_path):
        """With 'markitdown' forced, the pymupdf4llm converter is never invoked."""
        source = self._stub_pdf(tmp_path, "force_md.pdf")

        pymupdf4llm = patch("deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm")
        markitdown = patch(
            "deerflow.utils.file_conversion._convert_with_markitdown",
            return_value="MarkItDown result",
        )
        with pymupdf4llm as pymupdf4llm_mock, markitdown:
            converted = _do_convert(source, "markitdown")

        pymupdf4llm_mock.assert_not_called()
        assert converted == "MarkItDown result"

    def test_pdf_auto_falls_back_when_pymupdf4llm_not_installed(self, tmp_path):
        """In auto mode a ``None`` pymupdf4llm result sends the file to MarkItDown."""
        source = self._stub_pdf(tmp_path, "no_pymupdf.pdf")

        pymupdf4llm = patch(
            "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
            return_value=None,  # None is the "not installed" signal
        )
        markitdown = patch(
            "deerflow.utils.file_conversion._convert_with_markitdown",
            return_value="MarkItDown fallback",
        )
        with pymupdf4llm, markitdown as markitdown_mock:
            converted = _do_convert(source, "auto")

        markitdown_mock.assert_called_once_with(source)
        assert converted == "MarkItDown fallback"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# convert_file_to_markdown — async + file writing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestConvertFileToMarkdown:
    """Cover ``convert_file_to_markdown``: thread offloading, error handling, output file."""

    def test_small_file_runs_synchronously(self, tmp_path):
        """Small files (< 1 MB) are converted in the event loop thread."""
        pdf = tmp_path / "small.pdf"
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * 100)  # well under 1 MB

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Small PDF",
            ) as mock_convert,
            patch("asyncio.to_thread") as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        # asyncio.to_thread must NOT have been called
        mock_thread.assert_not_called()
        mock_convert.assert_called_once()
        assert md_path == pdf.with_suffix(".md")
        # Read with an explicit encoding so the check does not depend on the
        # platform's locale default.
        assert md_path.read_text(encoding="utf-8") == "# Small PDF"

    def test_large_file_offloaded_to_thread(self, tmp_path):
        """Large files (> 1 MB) are offloaded via asyncio.to_thread."""
        pdf = tmp_path / "large.pdf"
        # Write slightly more than the threshold
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * (_ASYNC_THRESHOLD_BYTES + 1))

        async def fake_to_thread(fn, *args, **kwargs):
            # Run the callable inline so the test stays single-threaded while
            # still recording that the offloading path was taken.
            return fn(*args, **kwargs)

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Large PDF",
            ),
            patch("asyncio.to_thread", side_effect=fake_to_thread) as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        mock_thread.assert_called_once()
        assert md_path == pdf.with_suffix(".md")
        # Explicit encoding, matching test_writes_utf8_markdown_file.
        assert md_path.read_text(encoding="utf-8") == "# Large PDF"

    def test_returns_none_on_conversion_error(self, tmp_path):
        """If conversion raises, return None without propagating the exception."""
        pdf = tmp_path / "broken.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                side_effect=RuntimeError("conversion failed"),
            ),
        ):
            result = _run(convert_file_to_markdown(pdf))

        assert result is None

    def test_writes_utf8_markdown_file(self, tmp_path):
        """Generated .md file is written with UTF-8 encoding."""
        pdf = tmp_path / "report.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        chinese_content = "# 中文报告\n\n这是测试内容。"

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value=chinese_content,
            ),
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        assert md_path is not None
        assert md_path.read_text(encoding="utf-8") == chinese_content
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# extract_outline
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestExtractOutline:
    """Behavioural checks for ``extract_outline()``."""

    @staticmethod
    def _markdown_file(directory, filename, content):
        """Write *content* as UTF-8 to *filename* inside *directory*; return the path."""
        target = directory / filename
        target.write_text(content, encoding="utf-8")
        return target

    def test_empty_file_returns_empty(self, tmp_path):
        """A zero-length markdown file produces an empty outline."""
        source = self._markdown_file(tmp_path, "empty.md", "")
        assert extract_outline(source) == []

    def test_missing_file_returns_empty(self, tmp_path):
        """A path that does not exist gives [] instead of raising."""
        absent = tmp_path / "nonexistent.md"
        assert extract_outline(absent) == []

    def test_standard_markdown_headings(self, tmp_path):
        """Headings written with #, ## and ### are each picked up with their line number."""
        source = self._markdown_file(
            tmp_path,
            "doc.md",
            "# Chapter One\n\nSome text.\n\n## Section 1.1\n\nMore text.\n\n### Sub 1.1.1\n",
        )
        expected = (
            {"title": "Chapter One", "line": 1},
            {"title": "Section 1.1", "line": 5},
            {"title": "Sub 1.1.1", "line": 9},
        )

        outline = extract_outline(source)

        assert len(outline) == 3
        for position, wanted in enumerate(expected):
            assert outline[position] == wanted

    def test_bold_sec_item_heading(self, tmp_path):
        """SEC-filing style **ITEM N. TITLE** lines count as headings."""
        source = self._markdown_file(
            tmp_path,
            "10k.md",
            "Cover page text.\n\n**ITEM 1. BUSINESS**\n\nBody.\n\n**ITEM 1A. RISK FACTORS**\n",
        )

        outline = extract_outline(source)

        assert len(outline) == 2
        first, second = outline[0], outline[1]
        assert first == {"title": "ITEM 1. BUSINESS", "line": 3}
        assert second == {"title": "ITEM 1A. RISK FACTORS", "line": 7}

    def test_bold_part_heading(self, tmp_path):
        """Bold **PART I** / **PART II** / **PART III** lines count as headings."""
        source = self._markdown_file(
            tmp_path,
            "10k.md",
            "**PART I**\n\n**PART II**\n\n**PART III**\n",
        )

        outline = extract_outline(source)

        assert len(outline) == 3
        found_titles = [entry["title"] for entry in outline]
        for wanted in ("PART I", "PART II", "PART III"):
            assert wanted in found_titles

    def test_sec_cover_page_boilerplate_excluded(self, tmp_path):
        """Cover-page address lines and short boilerplate stay OUT of the outline."""
        source = self._markdown_file(
            tmp_path,
            "8k.md",
            "## **UNITED STATES SECURITIES AND EXCHANGE COMMISSION**\n\n**WASHINGTON, DC 20549**\n\n**CURRENT REPORT**\n\n**SIGNATURES**\n\n**TESLA, INC.**\n\n**ITEM 2.02. RESULTS OF OPERATIONS**\n",
        )

        found_titles = [entry["title"] for entry in extract_outline(source)]

        # None of the cover-page boilerplate may show up
        for boilerplate in (
            "WASHINGTON, DC 20549",
            "CURRENT REPORT",
            "SIGNATURES",
            "TESLA, INC.",
        ):
            assert boilerplate not in found_titles
        # The genuine SEC item heading has to be present
        assert "ITEM 2.02. RESULTS OF OPERATIONS" in found_titles

    def test_chinese_headings_via_standard_markdown(self, tmp_path):
        """Chinese annual-report headings written as # / ## headings are captured."""
        source = self._markdown_file(
            tmp_path,
            "annual.md",
            "# 第一节 公司简介\n\n内容。\n\n## 第三节 管理层讨论与分析\n\n分析内容。\n",
        )

        outline = extract_outline(source)

        assert len(outline) == 2
        assert outline[0]["title"] == "第一节 公司简介"
        assert outline[1]["title"] == "第三节 管理层讨论与分析"

    def test_outline_capped_at_max_entries(self, tmp_path):
        """A truncated outline holds MAX_OUTLINE_ENTRIES real entries plus one sentinel."""
        heading_total = MAX_OUTLINE_ENTRIES + 10
        body = "\n".join(f"# Heading {i}" for i in range(heading_total))
        source = self._markdown_file(tmp_path, "long.md", body)

        outline = extract_outline(source)

        # The truncation sentinel comes last
        assert outline[-1] == {"truncated": True}
        # Everything that is not a sentinel adds up to exactly MAX_OUTLINE_ENTRIES
        real_entries = [entry for entry in outline if not entry.get("truncated")]
        assert len(real_entries) == MAX_OUTLINE_ENTRIES

    def test_no_truncation_sentinel_when_under_limit(self, tmp_path):
        """A short document yields its headings only, with no sentinel entry."""
        body = "\n".join(f"# Heading {i}" for i in range(5))
        source = self._markdown_file(tmp_path, "short.md", body)

        outline = extract_outline(source)

        assert len(outline) == 5
        assert all(not entry.get("truncated") for entry in outline)

    def test_blank_lines_and_whitespace_ignored(self, tmp_path):
        """Blank lines around headings never turn into empty outline entries."""
        source = self._markdown_file(
            tmp_path,
            "spaced.md",
            "\n\n# Title One\n\n\n\n# Title Two\n\n",
        )

        outline = extract_outline(source)

        assert len(outline) == 2
        assert all(entry["title"] for entry in outline)

    def test_inline_bold_not_confused_with_heading(self, tmp_path):
        """Bold text in the middle of a sentence is not treated as a heading."""
        source = self._markdown_file(
            tmp_path,
            "prose.md",
            "This sentence has **bold words** inside it.\n\nAnother with **MULTIPLE CAPS** inline.\n",
        )

        assert extract_outline(source) == []

    def test_split_bold_heading_academic_paper(self, tmp_path):
        """Academic-paper **<num>** **<title>** lines are recognised (Style 3)."""
        source = self._markdown_file(
            tmp_path,
            "paper.md",
            "## **Attention Is All You Need**\n\n**1** **Introduction**\n\nBody text.\n\n**2** **Background**\n\nMore text.\n\n**3.1** **Encoder and Decoder Stacks**\n",
        )

        found_titles = [entry["title"] for entry in extract_outline(source)]

        for wanted in (
            "1 Introduction",
            "2 Background",
            "3.1 Encoder and Decoder Stacks",
        ):
            assert wanted in found_titles

    def test_split_bold_year_columns_excluded(self, tmp_path):
        """A table header row like **2023** **2022** **2021** is NOT a heading."""
        source = self._markdown_file(
            tmp_path,
            "annual.md",
            "# Financial Summary\n\n**2023** **2022** **2021**\n\nRevenue 100 90 80\n",
        )

        found_titles = [entry["title"] for entry in extract_outline(source)]

        # The # heading is the only entry; the year-column row is left out
        assert found_titles == ["Financial Summary"]

    def test_adjacent_bold_spans_merged_in_markdown_heading(self, tmp_path):
        """Adjacent bold spans (** **) inside a # heading collapse into plain text."""
        source = self._markdown_file(
            tmp_path,
            "sec.md",
            "## **UNITED STATES** **SECURITIES AND EXCHANGE COMMISSION**\n\nBody text.\n",
        )

        outline = extract_outline(source)

        assert len(outline) == 1
        # No ** ** artefacts may survive in the title
        assert outline[0]["title"] == "UNITED STATES SECURITIES AND EXCHANGE COMMISSION"
|