# Help text for UploadsConfig.pdf_converter, kept separate so the field
# declaration stays short. The three accepted values are spelled out here.
_PDF_CONVERTER_DESCRIPTION = (
    "PDF-to-Markdown converter. "
    "'auto': prefer pymupdf4llm when installed, fall back to MarkItDown for image-based PDFs; "
    "'pymupdf4llm': always use pymupdf4llm (must be installed); "
    "'markitdown': always use MarkItDown (original behaviour)."
)


class UploadsConfig(BaseModel):
    """Configuration for file upload handling."""

    # Backend used to turn an uploaded PDF into Markdown; "auto" when omitted.
    # NOTE(review): Literal matching is case-sensitive, so a value such as
    # "AUTO" is presumably rejected at validation time — confirm this is the
    # intended behaviour for values read from config.yaml.
    pdf_converter: Literal["auto", "pymupdf4llm", "markitdown"] = Field(
        default="auto",
        description=_PDF_CONVERTER_DESCRIPTION,
    )
-Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown. +Converts document files (PDF, PPT, Excel, Word) to Markdown. + +PDF conversion strategy (auto mode): + 1. Try pymupdf4llm if installed — better heading detection, faster on most files. + 2. If output is suspiciously short (< _MIN_CHARS_PER_PAGE chars/page, or < 200 chars + total when page count is unavailable), treat as image-based and fall back to MarkItDown. + 3. If pymupdf4llm is not installed, use MarkItDown directly (existing behaviour). + +Large files (> ASYNC_THRESHOLD_BYTES) are converted in a thread pool via +asyncio.to_thread() to avoid blocking the event loop (fixes #1569). + No FastAPI or HTTP dependencies — pure utility functions. """ +import asyncio import logging import re from pathlib import Path @@ -21,30 +32,136 @@ CONVERTIBLE_EXTENSIONS = { ".docx", } +# Files larger than this threshold are converted in a background thread. +# Small files complete in < 1s synchronously; spawning a thread adds unnecessary +# scheduling overhead for them. +_ASYNC_THRESHOLD_BYTES = 1 * 1024 * 1024 # 1 MB + +# If pymupdf4llm produces fewer characters *per page* than this threshold, +# the PDF is likely image-based or encrypted — fall back to MarkItDown. +# Rationale: normal text PDFs yield 200-2000 chars/page; image-based PDFs +# yield close to 0. 50 chars/page gives a wide safety margin. +# Falls back to absolute 200-char check when page count is unavailable. +_MIN_CHARS_PER_PAGE = 50 + + +def _pymupdf_output_too_sparse(text: str, file_path: Path) -> bool: + """Return True if pymupdf4llm output is suspiciously short (image-based PDF). + + Uses chars-per-page rather than an absolute threshold so that both short + documents (few pages, few chars) and long documents (many pages, many chars) + are handled correctly. 
+ """ + chars = len(text.strip()) + doc = None + pages: int | None = None + try: + import pymupdf + + doc = pymupdf.open(str(file_path)) + pages = len(doc) + except Exception: + pass + finally: + if doc is not None: + try: + doc.close() + except Exception: + pass + if pages is not None and pages > 0: + return (chars / pages) < _MIN_CHARS_PER_PAGE + # Fallback: absolute threshold when page count is unavailable + return chars < 200 + + +def _convert_pdf_with_pymupdf4llm(file_path: Path) -> str | None: + """Attempt PDF conversion with pymupdf4llm. + + Returns the markdown text, or None if pymupdf4llm is not installed or + if conversion fails (e.g. encrypted/corrupt PDF). + """ + try: + import pymupdf4llm + except ImportError: + return None + + try: + return pymupdf4llm.to_markdown(str(file_path)) + except Exception: + logger.exception("pymupdf4llm failed to convert %s; falling back to MarkItDown", file_path.name) + return None + + +def _convert_with_markitdown(file_path: Path) -> str: + """Convert any supported file to markdown text using MarkItDown.""" + from markitdown import MarkItDown + + md = MarkItDown() + return md.convert(str(file_path)).text_content + + +def _do_convert(file_path: Path, pdf_converter: str) -> str: + """Synchronous conversion — called directly or via asyncio.to_thread. + + Args: + file_path: Path to the file. + pdf_converter: "auto" | "pymupdf4llm" | "markitdown" + """ + is_pdf = file_path.suffix.lower() == ".pdf" + + if is_pdf and pdf_converter != "markitdown": + # Try pymupdf4llm first (auto or explicit) + pymupdf_text = _convert_pdf_with_pymupdf4llm(file_path) + + if pymupdf_text is not None: + # pymupdf4llm is installed + if pdf_converter == "pymupdf4llm": + # Explicit — use as-is regardless of output length + return pymupdf_text + # auto mode: fall back if output looks like a failed parse. + # Use chars-per-page to distinguish image-based PDFs (near 0) from + # legitimately short documents. 
+ if not _pymupdf_output_too_sparse(pymupdf_text, file_path): + return pymupdf_text + logger.warning( + "pymupdf4llm produced only %d chars for %s (likely image-based PDF); falling back to MarkItDown", + len(pymupdf_text.strip()), + file_path.name, + ) + # pymupdf4llm not installed or fallback triggered → use MarkItDown + + return _convert_with_markitdown(file_path) + async def convert_file_to_markdown(file_path: Path) -> Path | None: - """Convert a file to markdown using markitdown. + """Convert a supported document file to Markdown. + + PDF files are handled with a two-converter strategy (see module docstring). + Large files (> 1 MB) are offloaded to a thread pool to avoid blocking the + event loop. Args: file_path: Path to the file to convert. Returns: - Path to the markdown file if conversion was successful, None otherwise. + Path to the generated .md file, or None if conversion failed. """ try: - from markitdown import MarkItDown + pdf_converter = _get_pdf_converter() + file_size = file_path.stat().st_size - md = MarkItDown() - result = md.convert(str(file_path)) + if file_size > _ASYNC_THRESHOLD_BYTES: + text = await asyncio.to_thread(_do_convert, file_path, pdf_converter) + else: + text = _do_convert(file_path, pdf_converter) - # Save as .md file with same name md_path = file_path.with_suffix(".md") - md_path.write_text(result.text_content, encoding="utf-8") + md_path.write_text(text, encoding="utf-8") - logger.info(f"Converted {file_path.name} to markdown: {md_path.name}") + logger.info("Converted %s to markdown: %s (%d chars)", file_path.name, md_path.name, len(text)) return md_path except Exception as e: - logger.error(f"Failed to convert {file_path.name} to markdown: {e}") + logger.error("Failed to convert %s to markdown: %s", file_path.name, e) return None @@ -69,6 +186,8 @@ _BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPEND # Keeps prompt size bounded even for very long documents. 
MAX_OUTLINE_ENTRIES = 50

# Accepted values for the uploads.pdf_converter setting. Immutable so that the
# validation set cannot be altered at runtime by accident.
_ALLOWED_PDF_CONVERTERS = frozenset({"auto", "pymupdf4llm", "markitdown"})


def _get_pdf_converter() -> str:
    """Read the pdf_converter setting from app config, defaulting to 'auto'.

    The configured value is stripped and lower-cased, then validated against
    ``_ALLOWED_PDF_CONVERTERS``.

    Returns:
        One of "auto", "pymupdf4llm" or "markitdown". "auto" is returned when
        the config module cannot be imported, the config cannot be loaded, the
        config has no ``uploads`` section, or the value is not recognised.
        This function never raises.
    """
    try:
        from deerflow.config.app_config import get_app_config
    except ImportError:
        # Config package unavailable; use the default.
        return "auto"

    try:
        uploads_cfg = getattr(get_app_config(), "uploads", None)
        if uploads_cfg is None:
            return "auto"
        configured = getattr(uploads_cfg, "pdf_converter", "auto")
        normalized = str(configured).strip().lower()
    except Exception:
        # Best effort: a broken config must not prevent conversion, but leave
        # a trace so the cause can be found when debugging.
        logger.debug("Could not read pdf_converter from app config; using 'auto'", exc_info=True)
        return "auto"

    if normalized not in _ALLOWED_PDF_CONVERTERS:
        # Report the value as configured, not the normalized form.
        logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", configured)
        return "auto"
    return normalized
def _make_pymupdf_mock(page_count: int) -> ModuleType:
    """Return a fake *pymupdf* module whose ``open()`` reports *page_count* pages."""
    mock_doc = MagicMock()
    mock_doc.__len__ = MagicMock(return_value=page_count)
    fake_pymupdf = ModuleType("pymupdf")
    fake_pymupdf.open = MagicMock(return_value=mock_doc)  # type: ignore[attr-defined]
    return fake_pymupdf


def _run(coro):
    """Run *coro* to completion on a fresh event loop and return its result."""
    # asyncio.run creates and closes the loop itself.
    return asyncio.run(coro)


# ---------------------------------------------------------------------------
# _pymupdf_output_too_sparse
# ---------------------------------------------------------------------------


class TestPymupdfOutputTooSparse:
    """Check the chars-per-page sparsity heuristic."""

    def test_dense_text_pdf_not_sparse(self, tmp_path):
        """Normal text PDF: many chars per page → not sparse."""
        pdf = tmp_path / "dense.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        # 10 pages, 10 000 chars → 1000/page, well above the threshold
        with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=10)}):
            result = _pymupdf_output_too_sparse("x" * 10_000, pdf)
        assert result is False

    def test_image_based_pdf_is_sparse(self, tmp_path):
        """Image-based PDF: near-zero chars per page → sparse."""
        pdf = tmp_path / "image.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        # 612 chars / 31 pages ≈ 19.7/page < _MIN_CHARS_PER_PAGE (50)
        with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=31)}):
            result = _pymupdf_output_too_sparse("x" * 612, pdf)
        assert result is True

    def test_fallback_when_pymupdf_unavailable(self, tmp_path):
        """When pymupdf is not importable, fall back to absolute 200-char threshold."""
        pdf = tmp_path / "broken.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        # Mapping "pymupdf" to None in sys.modules makes the `import pymupdf`
        # inside the function raise ImportError, triggering the
        # absolute-threshold fallback.
        with patch.dict(sys.modules, {"pymupdf": None}):
            sparse = _pymupdf_output_too_sparse("x" * 100, pdf)
            not_sparse = _pymupdf_output_too_sparse("x" * 300, pdf)

        assert sparse is True
        assert not_sparse is False

    def test_exactly_at_threshold_is_not_sparse(self, tmp_path):
        """Chars-per-page == threshold is treated as NOT sparse (boundary inclusive)."""
        pdf = tmp_path / "boundary.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        # 2 pages × _MIN_CHARS_PER_PAGE chars = exactly at threshold
        with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=2)}):
            result = _pymupdf_output_too_sparse("x" * (_MIN_CHARS_PER_PAGE * 2), pdf)
        assert result is False


# ---------------------------------------------------------------------------
# _do_convert — routing logic
# ---------------------------------------------------------------------------


class TestDoConvert:
    """Verify that _do_convert routes to the right sub-converter."""

    def test_non_pdf_always_uses_markitdown(self, tmp_path):
        """Non-PDF files go through MarkItDown regardless of setting."""
        docx = tmp_path / "report.docx"
        docx.write_bytes(b"PK fake docx")

        with patch(
            "deerflow.utils.file_conversion._convert_with_markitdown",
            return_value="# Markdown from MarkItDown",
        ) as mock_md:
            result = _do_convert(docx, "auto")

        mock_md.assert_called_once_with(docx)
        assert result == "# Markdown from MarkItDown"

    def test_pdf_auto_uses_pymupdf4llm_when_dense(self, tmp_path):
        """auto mode: use pymupdf4llm output when it's dense enough."""
        pdf = tmp_path / "report.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        dense_text = "# Heading\n" + "word " * 2000  # clearly dense

        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=dense_text,
            ),
            patch(
                "deerflow.utils.file_conversion._pymupdf_output_too_sparse",
                return_value=False,
            ),
            patch("deerflow.utils.file_conversion._convert_with_markitdown") as mock_md,
        ):
            result = _do_convert(pdf, "auto")

        mock_md.assert_not_called()
        assert result == dense_text

    def test_pdf_auto_falls_back_when_sparse(self, tmp_path):
        """auto mode: fall back to MarkItDown when pymupdf4llm output is sparse."""
        pdf = tmp_path / "scanned.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value="x" * 612,  # 19.7 chars/page for 31-page doc
            ),
            patch(
                "deerflow.utils.file_conversion._pymupdf_output_too_sparse",
                return_value=True,
            ),
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="OCR result via MarkItDown",
            ) as mock_md,
        ):
            result = _do_convert(pdf, "auto")

        mock_md.assert_called_once_with(pdf)
        assert result == "OCR result via MarkItDown"

    def test_pdf_explicit_pymupdf4llm_skips_sparsity_check(self, tmp_path):
        """'pymupdf4llm' mode: use output as-is even if sparse."""
        pdf = tmp_path / "explicit.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        sparse_text = "x" * 10  # very short

        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=sparse_text,
            ),
            patch("deerflow.utils.file_conversion._convert_with_markitdown") as mock_md,
        ):
            result = _do_convert(pdf, "pymupdf4llm")

        mock_md.assert_not_called()
        assert result == sparse_text

    def test_pdf_explicit_pymupdf4llm_falls_back_when_unavailable(self, tmp_path):
        """'pymupdf4llm' mode: MarkItDown is used when pymupdf4llm yields None."""
        pdf = tmp_path / "explicit_missing.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=None,  # not installed, or conversion failed
            ),
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="MarkItDown fallback",
            ) as mock_md,
        ):
            result = _do_convert(pdf, "pymupdf4llm")

        mock_md.assert_called_once_with(pdf)
        assert result == "MarkItDown fallback"

    def test_pdf_explicit_markitdown_skips_pymupdf4llm(self, tmp_path):
        """'markitdown' mode: never attempt pymupdf4llm."""
        pdf = tmp_path / "force_md.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        with (
            patch("deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm") as mock_pymu,
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="MarkItDown result",
            ),
        ):
            result = _do_convert(pdf, "markitdown")

        mock_pymu.assert_not_called()
        assert result == "MarkItDown result"

    def test_pdf_auto_falls_back_when_pymupdf4llm_not_installed(self, tmp_path):
        """auto mode: if pymupdf4llm is not installed, use MarkItDown directly."""
        pdf = tmp_path / "no_pymupdf.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=None,  # None signals not installed
            ),
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="MarkItDown fallback",
            ) as mock_md,
        ):
            result = _do_convert(pdf, "auto")

        mock_md.assert_called_once_with(pdf)
        assert result == "MarkItDown fallback"


# ---------------------------------------------------------------------------
# convert_file_to_markdown — async + file writing
# ---------------------------------------------------------------------------


class TestConvertFileToMarkdown:
    """Verify thread offloading, output writing and error handling."""

    def test_small_file_runs_synchronously(self, tmp_path):
        """Small files (below the threshold) are converted in the event loop thread."""
        pdf = tmp_path / "small.pdf"
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * 100)  # well under the threshold

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Small PDF",
            ) as mock_convert,
            patch("asyncio.to_thread") as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        # asyncio.to_thread must NOT have been called
        mock_thread.assert_not_called()
        mock_convert.assert_called_once()
        assert md_path == pdf.with_suffix(".md")
        # The file is written as UTF-8, so read it back the same way.
        assert md_path.read_text(encoding="utf-8") == "# Small PDF"

    def test_large_file_offloaded_to_thread(self, tmp_path):
        """Large files (above the threshold) are offloaded via asyncio.to_thread."""
        pdf = tmp_path / "large.pdf"
        # Write slightly more than the threshold
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * (_ASYNC_THRESHOLD_BYTES + 1))

        async def fake_to_thread(fn, *args, **kwargs):
            return fn(*args, **kwargs)

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Large PDF",
            ),
            patch("asyncio.to_thread", side_effect=fake_to_thread) as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        mock_thread.assert_called_once()
        assert md_path == pdf.with_suffix(".md")
        assert md_path.read_text(encoding="utf-8") == "# Large PDF"

    def test_returns_none_on_conversion_error(self, tmp_path):
        """If conversion raises, return None without propagating the exception."""
        pdf = tmp_path / "broken.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                side_effect=RuntimeError("conversion failed"),
            ),
        ):
            result = _run(convert_file_to_markdown(pdf))

        assert result is None

    def test_writes_utf8_markdown_file(self, tmp_path):
        """Generated .md file is written with UTF-8 encoding."""
        pdf = tmp_path / "report.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        chinese_content = "# 中文报告\n\n这是测试内容。"

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value=chinese_content,
            ),
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        assert md_path is not None
        assert md_path.read_text(encoding="utf-8") == chinese_content
@@ tool_search: # Option 1: Local Sandbox (Default) # Executes commands directly on the host machine +uploads: + # PDF-to-Markdown converter used when a PDF is uploaded. + # auto — prefer pymupdf4llm when installed; fall back to MarkItDown for + # image-based or encrypted PDFs (recommended default). + # pymupdf4llm — always use pymupdf4llm (must be installed: uv add pymupdf4llm). + # Better heading/table extraction; faster on most files. + # markitdown — always use MarkItDown (original behaviour, no extra dependency). + pdf_converter: auto + sandbox: use: deerflow.sandbox.local:LocalSandboxProvider # Host bash execution is disabled by default because LocalSandboxProvider is