feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload (#1727)

* feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload - Introduce pymupdf4llm as an optional PDF converter with better heading detection and table preservation than MarkItDown - Auto mode: prefer pymupdf4llm when installed; fall back to MarkItDown when output is suspiciously sparse (image-based / scanned PDFs) - Sparsity check uses chars-per-page (< 50 chars/page) rather than an absolute threshold, correctly handling both short and long documents - Large files (> 1 MB) are offloaded to asyncio.to_thread() to avoid blocking the event loop (related: #1569) - Add UploadsConfig with pdf_converter field (auto/pymupdf4llm/markitdown) - Add pymupdf4llm as optional dependency: pip install deerflow-harness[pymupdf] - Add 14 unit tests covering sparsity heuristic, routing logic, and async path * fix(uploads): address Copilot review comments on PDF converter - Fix docstring: MIN_CHARS_PYMUPDF -> _MIN_CHARS_PER_PAGE (typo) - Fix file handle leak: wrap pymupdf.open in try/finally to ensure doc.close() - Fix silent fallback gap: _convert_pdf_with_pymupdf4llm now catches all conversion exceptions (not just ImportError), so encrypted/corrupt PDFs fall back to MarkItDown instead of propagating - Tighten type: pdf_converter field changed from str to Literal[auto|pymupdf4llm|markitdown] - Normalize config value: _get_pdf_converter() strips and lowercases the raw config string, warns and falls back to 'auto' on unknown values
2026-04-25 11:18:22 +00:00 · 2026-04-03 21:59:45 +08:00 · 2026-04-03 21:59:45 +08:00 · ddfc988bef
commit ddfc988bef
parent 5ff230eafd
5 changed files with 461 additions and 14 deletions
--- a/backend/packages/harness/deerflow/config/app_config.py
+++ b/backend/packages/harness/deerflow/config/app_config.py
@ -1,7 +1,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import Any, Self
+from typing import Any, Literal, Self
 import yaml
 from dotenv import load_dotenv
@ -28,11 +28,26 @@ load_dotenv()
 logger = logging.getLogger(__name__)
 class UploadsConfig(BaseModel):
    """Settings that govern how uploaded files are processed."""

    # Selects the PDF-to-Markdown backend. Only the three literal values in the
    # annotation are accepted; "auto" is used when the key is omitted.
    pdf_converter: Literal["auto", "pymupdf4llm", "markitdown"] = Field(
        description=(
            "PDF-to-Markdown converter. "
            "'auto': prefer pymupdf4llm when installed, fall back to MarkItDown for image-based PDFs; "
            "'pymupdf4llm': always use pymupdf4llm (must be installed); "
            "'markitdown': always use MarkItDown (original behaviour)."
        ),
        default="auto",
    )
 class AppConfig(BaseModel):
    """Config for the DeerFlow application"""
    log_level: str = Field(default="info", description="Logging level for deerflow modules (debug/info/warning/error)")
    token_usage: TokenUsageConfig = Field(default_factory=TokenUsageConfig, description="Token usage tracking configuration")
    uploads: UploadsConfig = Field(default_factory=UploadsConfig, description="File upload handling configuration")
    models: list[ModelConfig] = Field(default_factory=list, description="Available models")
    sandbox: SandboxConfig = Field(description="Sandbox configuration")
    tools: list[ToolConfig] = Field(default_factory=list, description="Available tools")
--- a/backend/packages/harness/deerflow/utils/file_conversion.py
+++ b/backend/packages/harness/deerflow/utils/file_conversion.py
@ -1,9 +1,20 @@
 """File conversion utilities.
-Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown.
+Converts document files (PDF, PPT, Excel, Word) to Markdown.
 PDF conversion strategy (auto mode):
  1. Try pymupdf4llm if installed — better heading detection, faster on most files.
  2. If output is suspiciously short (< _MIN_CHARS_PER_PAGE chars/page, or < 200 chars
     total when page count is unavailable), treat as image-based and fall back to MarkItDown.
  3. If pymupdf4llm is not installed, or it raises during conversion (e.g. an
     encrypted/corrupt PDF), use MarkItDown directly (existing behaviour).
 Large files (> _ASYNC_THRESHOLD_BYTES) are converted in a thread pool via
 asyncio.to_thread() to avoid blocking the event loop (fixes #1569).
 No FastAPI or HTTP dependencies — pure utility functions.
 """
 import asyncio
 import logging
 import re
 from pathlib import Path
@ -21,30 +32,136 @@ CONVERTIBLE_EXTENSIONS = {
    ".docx",
 }
 # Files larger than this threshold are converted in a background thread.
 # Small files complete in < 1s synchronously; spawning a thread adds unnecessary
 # scheduling overhead for them.
 # The comparison is strict (>), so a file of exactly this size is still
 # converted inline.
 _ASYNC_THRESHOLD_BYTES = 1 * 1024 * 1024  # 1 MB
 # If pymupdf4llm produces fewer characters *per page* than this threshold,
 # the PDF is likely image-based or encrypted — fall back to MarkItDown.
 # Rationale: normal text PDFs yield 200-2000 chars/page; image-based PDFs
 # yield close to 0. 50 chars/page gives a wide safety margin.
 # The sparsity check falls back to an absolute 200-char threshold when the
 # page count is unavailable (or reported as zero).
 _MIN_CHARS_PER_PAGE = 50
 def _pymupdf_output_too_sparse(text: str, file_path: Path) -> bool:
    """Return True if pymupdf4llm output is suspiciously short (image-based PDF).
    Uses chars-per-page rather than an absolute threshold so that both short
    documents (few pages, few chars) and long documents (many pages, many chars)
    are handled correctly.
    """
    chars = len(text.strip())
    doc = None
    pages: int | None = None
    try:
        import pymupdf
        doc = pymupdf.open(str(file_path))
        pages = len(doc)
    except Exception:
        pass
    finally:
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass
    if pages is not None and pages > 0:
        return (chars / pages) < _MIN_CHARS_PER_PAGE
    # Fallback: absolute threshold when page count is unavailable
    return chars < 200
 def _convert_pdf_with_pymupdf4llm(file_path: Path) -> str | None:
    """Attempt PDF conversion with pymupdf4llm.
    Returns the markdown text, or None if pymupdf4llm is not installed or
    if conversion fails (e.g. encrypted/corrupt PDF).
    """
    try:
        import pymupdf4llm
    except ImportError:
        return None
    try:
        return pymupdf4llm.to_markdown(str(file_path))
    except Exception:
        logger.exception("pymupdf4llm failed to convert %s; falling back to MarkItDown", file_path.name)
        return None
 def _convert_with_markitdown(file_path: Path) -> str:
    """Run MarkItDown on *file_path* and return the resulting Markdown text.

    MarkItDown is imported lazily inside the function. Errors raised by the
    import or by the conversion are not caught here.
    """
    from markitdown import MarkItDown

    conversion = MarkItDown().convert(str(file_path))
    return conversion.text_content
 def _do_convert(file_path: Path, pdf_converter: str) -> str:
    """Synchronous conversion — called directly or via asyncio.to_thread.

    Routing:
      * Non-PDF files, and PDFs when ``pdf_converter`` is "markitdown", go
        straight to MarkItDown.
      * Otherwise pymupdf4llm is tried first. In "pymupdf4llm" mode its output
        is returned as-is; in any other mode (i.e. "auto") it is returned only
        if it passes the chars-per-page sparsity check.
      * If pymupdf4llm is not installed, fails, or (auto mode) yields sparse
        output, MarkItDown is used instead.

    Args:
        file_path: Path to the file.
        pdf_converter: "auto" | "pymupdf4llm" | "markitdown"

    Returns:
        The converted Markdown text.

    Exceptions raised by the MarkItDown path are not caught here.
    """
    is_pdf = file_path.suffix.lower() == ".pdf"
    if not is_pdf or pdf_converter == "markitdown":
        return _convert_with_markitdown(file_path)

    # Try pymupdf4llm first (auto or explicit). None means it is not installed
    # or the conversion raised.
    pymupdf_text = _convert_pdf_with_pymupdf4llm(file_path)
    if pymupdf_text is None:
        if pdf_converter == "pymupdf4llm":
            # The user asked for pymupdf4llm explicitly; make the substitution
            # visible instead of silently returning MarkItDown output.
            logger.warning(
                "pdf_converter is 'pymupdf4llm' but pymupdf4llm is unavailable or failed for %s; falling back to MarkItDown",
                file_path.name,
            )
        return _convert_with_markitdown(file_path)

    if pdf_converter == "pymupdf4llm":
        # Explicit — use as-is regardless of output length
        return pymupdf_text

    # auto mode: fall back if output looks like a failed parse.
    # Use chars-per-page to distinguish image-based PDFs (near 0) from
    # legitimately short documents.
    if not _pymupdf_output_too_sparse(pymupdf_text, file_path):
        return pymupdf_text

    logger.warning(
        "pymupdf4llm produced only %d chars for %s (likely image-based PDF); falling back to MarkItDown",
        len(pymupdf_text.strip()),
        file_path.name,
    )
    return _convert_with_markitdown(file_path)
 async def convert_file_to_markdown(file_path: Path) -> Path | None:
-    """Convert a file to markdown using markitdown.
+    """Convert a supported document file to Markdown.
    PDF files are handled with a two-converter strategy (see module docstring).
    Large files (> 1 MB) are offloaded to a thread pool to avoid blocking the
    event loop.
    Args:
        file_path: Path to the file to convert.
    Returns:
-        Path to the markdown file if conversion was successful, None otherwise.
+        Path to the generated .md file, or None if conversion failed.
    """
    try:
-        from markitdown import MarkItDown
+        pdf_converter = _get_pdf_converter()
        file_size = file_path.stat().st_size
-        md = MarkItDown()
+        if file_size > _ASYNC_THRESHOLD_BYTES:
-        result = md.convert(str(file_path))
+            text = await asyncio.to_thread(_do_convert, file_path, pdf_converter)
        else:
            text = _do_convert(file_path, pdf_converter)
        # Save as .md file with same name
        md_path = file_path.with_suffix(".md")
-        md_path.write_text(result.text_content, encoding="utf-8")
+        md_path.write_text(text, encoding="utf-8")
-        logger.info(f"Converted {file_path.name} to markdown: {md_path.name}")
+        logger.info("Converted %s to markdown: %s (%d chars)", file_path.name, md_path.name, len(text))
        return md_path
    except Exception as e:
-        logger.error(f"Failed to convert {file_path.name} to markdown: {e}")
+        logger.error("Failed to convert %s to markdown: %s", file_path.name, e)
        return None
@ -69,6 +186,8 @@ _BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPEND
 # Keeps prompt size bounded even for very long documents.
 MAX_OUTLINE_ENTRIES = 50
 _ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
 def extract_outline(md_path: Path) -> list[dict]:
    """Extract document outline (headings) from a Markdown file.
@ -122,14 +241,23 @@ def extract_outline(md_path: Path) -> list[dict]:
 def _get_pdf_converter() -> str:
-    """Read pdf_converter setting from app config, defaulting to 'auto'."""
+    """Read pdf_converter setting from app config, defaulting to 'auto'.
    Normalizes the value to lowercase and validates it against the allowed set
    so that values like 'AUTO' or 'MarkItDown' from config.yaml don't silently
    fall through to unexpected behaviour.
    """
    try:
        from deerflow.config.app_config import get_app_config
        cfg = get_app_config()
        uploads_cfg = getattr(cfg, "uploads", None)
        if uploads_cfg is not None:
-            return str(getattr(uploads_cfg, "pdf_converter", "auto"))
+            raw = str(getattr(uploads_cfg, "pdf_converter", "auto")).strip().lower()
            if raw not in _ALLOWED_PDF_CONVERTERS:
                logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", raw)
                return "auto"
            return raw
    except Exception:
        pass
    return "auto"
--- a/backend/packages/harness/pyproject.toml
+++ b/backend/packages/harness/pyproject.toml
@ -34,6 +34,9 @@ dependencies = [
    "langgraph-sdk>=0.1.51",
 ]
 [project.optional-dependencies]
 pymupdf = ["pymupdf4llm>=0.0.17"]
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
--- a/backend/tests/test_file_conversion.py
+++ b/backend/tests/test_file_conversion.py
@ -1,12 +1,304 @@
-"""Tests for extract_outline() in file_conversion utilities (PR2: document outline injection)."""
+"""Tests for file_conversion utilities (PR1: pymupdf4llm + asyncio.to_thread; PR2: extract_outline)."""
 from __future__ import annotations
 import asyncio
 import sys
 from types import ModuleType
 from unittest.mock import MagicMock, patch
 from deerflow.utils.file_conversion import (
    _ASYNC_THRESHOLD_BYTES,
    _MIN_CHARS_PER_PAGE,
    MAX_OUTLINE_ENTRIES,
    _do_convert,
    _pymupdf_output_too_sparse,
    convert_file_to_markdown,
    extract_outline,
 )
 def _make_pymupdf_mock(page_count: int) -> ModuleType:
    """Return a fake *pymupdf* module whose ``open()`` reports *page_count* pages."""
    mock_doc = MagicMock()
    mock_doc.__len__ = MagicMock(return_value=page_count)
    fake_pymupdf = ModuleType("pymupdf")
    fake_pymupdf.open = MagicMock(return_value=mock_doc)  # type: ignore[attr-defined]
    return fake_pymupdf
 def _run(coro):
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()
 # ---------------------------------------------------------------------------
 # _pymupdf_output_too_sparse
 # ---------------------------------------------------------------------------
 class TestPymupdfOutputTooSparse:
    """Check the chars-per-page sparsity heuristic."""
    # The PDF bytes written in these tests are placeholders: pymupdf is replaced
    # through sys.modules, so only the mocked page count matters.
    def test_dense_text_pdf_not_sparse(self, tmp_path):
        """Normal text PDF: many chars per page → not sparse."""
        pdf = tmp_path / "dense.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        # 10 000 chars over 10 pages → 1000/page ≫ threshold
        with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=10)}):
            result = _pymupdf_output_too_sparse("x" * 10_000, pdf)
        assert result is False
    def test_image_based_pdf_is_sparse(self, tmp_path):
        """Image-based PDF: near-zero chars per page → sparse."""
        pdf = tmp_path / "image.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        # 612 chars / 31 pages ≈ 19.7/page < _MIN_CHARS_PER_PAGE (50)
        with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=31)}):
            result = _pymupdf_output_too_sparse("x" * 612, pdf)
        assert result is True
    def test_fallback_when_pymupdf_unavailable(self, tmp_path):
        """When pymupdf is not installed, fall back to absolute 200-char threshold."""
        pdf = tmp_path / "broken.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        # Mapping "pymupdf" to None in sys.modules makes the `import pymupdf`
        # inside the function raise ImportError, triggering the
        # absolute-threshold fallback.
        with patch.dict(sys.modules, {"pymupdf": None}):
            sparse = _pymupdf_output_too_sparse("x" * 100, pdf)
            not_sparse = _pymupdf_output_too_sparse("x" * 300, pdf)
        assert sparse is True
        assert not_sparse is False
    def test_exactly_at_threshold_is_not_sparse(self, tmp_path):
        """Chars-per-page == threshold is treated as NOT sparse (boundary inclusive)."""
        pdf = tmp_path / "boundary.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        # 2 pages × _MIN_CHARS_PER_PAGE chars = exactly at threshold
        with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=2)}):
            result = _pymupdf_output_too_sparse("x" * (_MIN_CHARS_PER_PAGE * 2), pdf)
        assert result is False
 # ---------------------------------------------------------------------------
 # _do_convert — routing logic
 # ---------------------------------------------------------------------------
 class TestDoConvert:
    """Verify that _do_convert routes to the right sub-converter."""
    # Every sub-converter is patched, so these tests exercise routing only and
    # never parse the placeholder files they create.
    def test_non_pdf_always_uses_markitdown(self, tmp_path):
        """DOCX / XLSX / PPTX always go through MarkItDown regardless of setting."""
        # Only a .docx file is exercised here; routing depends solely on the suffix.
        docx = tmp_path / "report.docx"
        docx.write_bytes(b"PK fake docx")
        with patch(
            "deerflow.utils.file_conversion._convert_with_markitdown",
            return_value="# Markdown from MarkItDown",
        ) as mock_md:
            result = _do_convert(docx, "auto")
        mock_md.assert_called_once_with(docx)
        assert result == "# Markdown from MarkItDown"
    def test_pdf_auto_uses_pymupdf4llm_when_dense(self, tmp_path):
        """auto mode: use pymupdf4llm output when it's dense enough."""
        pdf = tmp_path / "report.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        dense_text = "# Heading\n" + "word " * 2000  # clearly dense
        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=dense_text,
            ),
            patch(
                "deerflow.utils.file_conversion._pymupdf_output_too_sparse",
                return_value=False,
            ),
            patch("deerflow.utils.file_conversion._convert_with_markitdown") as mock_md,
        ):
            result = _do_convert(pdf, "auto")
        mock_md.assert_not_called()
        assert result == dense_text
    def test_pdf_auto_falls_back_when_sparse(self, tmp_path):
        """auto mode: fall back to MarkItDown when pymupdf4llm output is sparse."""
        pdf = tmp_path / "scanned.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value="x" * 612,  # 19.7 chars/page for 31-page doc
            ),
            patch(
                "deerflow.utils.file_conversion._pymupdf_output_too_sparse",
                return_value=True,
            ),
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="OCR result via MarkItDown",
            ) as mock_md,
        ):
            result = _do_convert(pdf, "auto")
        mock_md.assert_called_once_with(pdf)
        assert result == "OCR result via MarkItDown"
    def test_pdf_explicit_pymupdf4llm_skips_sparsity_check(self, tmp_path):
        """'pymupdf4llm' mode: use output as-is even if sparse."""
        pdf = tmp_path / "explicit.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        sparse_text = "x" * 10  # very short
        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=sparse_text,
            ),
            patch("deerflow.utils.file_conversion._convert_with_markitdown") as mock_md,
        ):
            result = _do_convert(pdf, "pymupdf4llm")
        mock_md.assert_not_called()
        assert result == sparse_text
    def test_pdf_explicit_markitdown_skips_pymupdf4llm(self, tmp_path):
        """'markitdown' mode: never attempt pymupdf4llm."""
        pdf = tmp_path / "force_md.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        with (
            patch("deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm") as mock_pymu,
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="MarkItDown result",
            ),
        ):
            result = _do_convert(pdf, "markitdown")
        mock_pymu.assert_not_called()
        assert result == "MarkItDown result"
    def test_pdf_auto_falls_back_when_pymupdf4llm_not_installed(self, tmp_path):
        """auto mode: if pymupdf4llm is not installed, use MarkItDown directly."""
        pdf = tmp_path / "no_pymupdf.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=None,  # None signals not installed (or a failed conversion)
            ),
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="MarkItDown fallback",
            ) as mock_md,
        ):
            result = _do_convert(pdf, "auto")
        mock_md.assert_called_once_with(pdf)
        assert result == "MarkItDown fallback"
 # ---------------------------------------------------------------------------
 # convert_file_to_markdown — async + file writing
 # ---------------------------------------------------------------------------
 class TestConvertFileToMarkdown:
    """Exercise the async entry point: inline vs. thread dispatch, error handling, and the written .md file."""
    # _get_pdf_converter and _do_convert are patched throughout, so no real
    # conversion or app config lookup takes place.
    def test_small_file_runs_synchronously(self, tmp_path):
        """Small files (at or below the 1 MB threshold) are converted in the event loop thread."""
        pdf = tmp_path / "small.pdf"
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * 100)  # well under 1 MB
        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Small PDF",
            ) as mock_convert,
            patch("asyncio.to_thread") as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))
        # asyncio.to_thread must NOT have been called
        mock_thread.assert_not_called()
        mock_convert.assert_called_once()
        assert md_path == pdf.with_suffix(".md")
        assert md_path.read_text() == "# Small PDF"
    def test_large_file_offloaded_to_thread(self, tmp_path):
        """Large files (> 1 MB) are offloaded via asyncio.to_thread."""
        pdf = tmp_path / "large.pdf"
        # Write slightly more than the threshold
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * (_ASYNC_THRESHOLD_BYTES + 1))
        # Stand-in for asyncio.to_thread that runs the callable inline, so the
        # test needs no real worker thread while still being awaitable.
        async def fake_to_thread(fn, *args, **kwargs):
            return fn(*args, **kwargs)
        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Large PDF",
            ),
            patch("asyncio.to_thread", side_effect=fake_to_thread) as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))
        mock_thread.assert_called_once()
        assert md_path == pdf.with_suffix(".md")
        assert md_path.read_text() == "# Large PDF"
    def test_returns_none_on_conversion_error(self, tmp_path):
        """If conversion raises, return None without propagating the exception."""
        pdf = tmp_path / "broken.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                side_effect=RuntimeError("conversion failed"),
            ),
        ):
            result = _run(convert_file_to_markdown(pdf))
        assert result is None
    def test_writes_utf8_markdown_file(self, tmp_path):
        """Generated .md file is written with UTF-8 encoding."""
        pdf = tmp_path / "report.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        # Non-ASCII content so that a wrong output encoding would be detected.
        chinese_content = "# 中文报告\n\n这是测试内容。"
        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value=chinese_content,
            ),
        ):
            md_path = _run(convert_file_to_markdown(pdf))
        assert md_path is not None
        assert md_path.read_text(encoding="utf-8") == chinese_content
 # ---------------------------------------------------------------------------
 # extract_outline
 # ---------------------------------------------------------------------------
--- a/config.example.yaml
+++ b/config.example.yaml
@ -369,6 +369,15 @@ tool_search:
 # Option 1: Local Sandbox (Default)
 # Executes commands directly on the host machine
 uploads:
  # PDF-to-Markdown converter used when a PDF is uploaded.
  # auto        — prefer pymupdf4llm when installed; fall back to MarkItDown for
  #               image-based or encrypted PDFs (recommended default).
  # pymupdf4llm — always use pymupdf4llm (must be installed: uv add pymupdf4llm).
  #               Better heading/table extraction; faster on most files.
  # markitdown  — always use MarkItDown (original behaviour, no extra dependency).
  pdf_converter: auto
 sandbox:
  use: deerflow.sandbox.local:LocalSandboxProvider
  # Host bash execution is disabled by default because LocalSandboxProvider is