mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 11:18:22 +00:00
feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload (#1727)
* feat(uploads): add pymupdf4llm PDF converter with auto-fallback and async offload - Introduce pymupdf4llm as an optional PDF converter with better heading detection and table preservation than MarkItDown - Auto mode: prefer pymupdf4llm when installed; fall back to MarkItDown when output is suspiciously sparse (image-based / scanned PDFs) - Sparsity check uses chars-per-page (< 50 chars/page) rather than an absolute threshold, correctly handling both short and long documents - Large files (> 1 MB) are offloaded to asyncio.to_thread() to avoid blocking the event loop (related: #1569) - Add UploadsConfig with pdf_converter field (auto/pymupdf4llm/markitdown) - Add pymupdf4llm as optional dependency: pip install deerflow-harness[pymupdf] - Add 14 unit tests covering sparsity heuristic, routing logic, and async path * fix(uploads): address Copilot review comments on PDF converter - Fix docstring: MIN_CHARS_PYMUPDF -> _MIN_CHARS_PER_PAGE (typo) - Fix file handle leak: wrap pymupdf.open in try/finally to ensure doc.close() - Fix silent fallback gap: _convert_pdf_with_pymupdf4llm now catches all conversion exceptions (not just ImportError), so encrypted/corrupt PDFs fall back to MarkItDown instead of propagating - Tighten type: pdf_converter field changed from str to Literal[auto|pymupdf4llm|markitdown] - Normalize config value: _get_pdf_converter() strips and lowercases the raw config string, warns and falls back to 'auto' on unknown values
This commit is contained in:
parent
5ff230eafd
commit
ddfc988bef
@ -1,7 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Self
|
from typing import Any, Literal, Self
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
@ -28,11 +28,26 @@ load_dotenv()
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class UploadsConfig(BaseModel):
    """Configuration for file upload handling."""

    # Selects the PDF-to-Markdown backend. The Literal annotation restricts the
    # value to the three supported modes; "auto" is used when nothing is set.
    pdf_converter: Literal["auto", "pymupdf4llm", "markitdown"] = Field(
        default="auto",
        description=(
            "PDF-to-Markdown converter. 'auto': prefer pymupdf4llm when installed, "
            "fall back to MarkItDown for image-based PDFs; 'pymupdf4llm': always use "
            "pymupdf4llm (must be installed); 'markitdown': always use MarkItDown "
            "(original behaviour)."
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
class AppConfig(BaseModel):
|
class AppConfig(BaseModel):
|
||||||
"""Config for the DeerFlow application"""
|
"""Config for the DeerFlow application"""
|
||||||
|
|
||||||
log_level: str = Field(default="info", description="Logging level for deerflow modules (debug/info/warning/error)")
|
log_level: str = Field(default="info", description="Logging level for deerflow modules (debug/info/warning/error)")
|
||||||
token_usage: TokenUsageConfig = Field(default_factory=TokenUsageConfig, description="Token usage tracking configuration")
|
token_usage: TokenUsageConfig = Field(default_factory=TokenUsageConfig, description="Token usage tracking configuration")
|
||||||
|
uploads: UploadsConfig = Field(default_factory=UploadsConfig, description="File upload handling configuration")
|
||||||
models: list[ModelConfig] = Field(default_factory=list, description="Available models")
|
models: list[ModelConfig] = Field(default_factory=list, description="Available models")
|
||||||
sandbox: SandboxConfig = Field(description="Sandbox configuration")
|
sandbox: SandboxConfig = Field(description="Sandbox configuration")
|
||||||
tools: list[ToolConfig] = Field(default_factory=list, description="Available tools")
|
tools: list[ToolConfig] = Field(default_factory=list, description="Available tools")
|
||||||
|
|||||||
@ -1,9 +1,20 @@
|
|||||||
"""File conversion utilities.
|
"""File conversion utilities.
|
||||||
|
|
||||||
Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown.
|
Converts document files (PDF, PPT, Excel, Word) to Markdown.
|
||||||
|
|
||||||
|
PDF conversion strategy (auto mode):
|
||||||
|
1. Try pymupdf4llm if installed — better heading detection, faster on most files.
|
||||||
|
2. If output is suspiciously short (< _MIN_CHARS_PER_PAGE chars/page, or < 200 chars
|
||||||
|
total when page count is unavailable), treat as image-based and fall back to MarkItDown.
|
||||||
|
3. If pymupdf4llm is not installed, use MarkItDown directly (existing behaviour).
|
||||||
|
|
||||||
|
Large files (> _ASYNC_THRESHOLD_BYTES) are converted in a thread pool via
|
||||||
|
asyncio.to_thread() to avoid blocking the event loop (fixes #1569).
|
||||||
|
|
||||||
No FastAPI or HTTP dependencies — pure utility functions.
|
No FastAPI or HTTP dependencies — pure utility functions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -21,30 +32,136 @@ CONVERTIBLE_EXTENSIONS = {
|
|||||||
".docx",
|
".docx",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Files larger than this threshold are converted in a background thread.
|
||||||
|
# Small files complete in < 1s synchronously; spawning a thread adds unnecessary
|
||||||
|
# scheduling overhead for them.
|
||||||
|
_ASYNC_THRESHOLD_BYTES = 1 * 1024 * 1024 # 1 MB
|
||||||
|
|
||||||
|
# If pymupdf4llm produces fewer characters *per page* than this threshold,
|
||||||
|
# the PDF is likely image-based or encrypted — fall back to MarkItDown.
|
||||||
|
# Rationale: normal text PDFs yield 200-2000 chars/page; image-based PDFs
|
||||||
|
# yield close to 0. 50 chars/page gives a wide safety margin.
|
||||||
|
# Falls back to absolute 200-char check when page count is unavailable.
|
||||||
|
_MIN_CHARS_PER_PAGE = 50
|
||||||
|
|
||||||
|
|
||||||
|
def _pymupdf_output_too_sparse(text: str, file_path: Path) -> bool:
|
||||||
|
"""Return True if pymupdf4llm output is suspiciously short (image-based PDF).
|
||||||
|
|
||||||
|
Uses chars-per-page rather than an absolute threshold so that both short
|
||||||
|
documents (few pages, few chars) and long documents (many pages, many chars)
|
||||||
|
are handled correctly.
|
||||||
|
"""
|
||||||
|
chars = len(text.strip())
|
||||||
|
doc = None
|
||||||
|
pages: int | None = None
|
||||||
|
try:
|
||||||
|
import pymupdf
|
||||||
|
|
||||||
|
doc = pymupdf.open(str(file_path))
|
||||||
|
pages = len(doc)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
if doc is not None:
|
||||||
|
try:
|
||||||
|
doc.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if pages is not None and pages > 0:
|
||||||
|
return (chars / pages) < _MIN_CHARS_PER_PAGE
|
||||||
|
# Fallback: absolute threshold when page count is unavailable
|
||||||
|
return chars < 200
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_pdf_with_pymupdf4llm(file_path: Path) -> str | None:
|
||||||
|
"""Attempt PDF conversion with pymupdf4llm.
|
||||||
|
|
||||||
|
Returns the markdown text, or None if pymupdf4llm is not installed or
|
||||||
|
if conversion fails (e.g. encrypted/corrupt PDF).
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
import pymupdf4llm
|
||||||
|
except ImportError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
return pymupdf4llm.to_markdown(str(file_path))
|
||||||
|
except Exception:
|
||||||
|
logger.exception("pymupdf4llm failed to convert %s; falling back to MarkItDown", file_path.name)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_with_markitdown(file_path: Path) -> str:
    """Run MarkItDown on *file_path* and return the resulting Markdown text."""
    # Imported lazily so the dependency is only loaded when a conversion runs.
    from markitdown import MarkItDown

    conversion = MarkItDown().convert(str(file_path))
    return conversion.text_content
|
||||||
|
|
||||||
|
|
||||||
|
def _do_convert(file_path: Path, pdf_converter: str) -> str:
    """Synchronous conversion — called directly or via asyncio.to_thread.

    Routing:
      * Non-PDF files, and PDFs in "markitdown" mode, go straight to MarkItDown.
      * Otherwise pymupdf4llm is tried first.  In "pymupdf4llm" mode its output
        is returned as-is; in "auto" mode it is returned only when it is not
        suspiciously sparse.
      * Whenever pymupdf4llm yields nothing usable, MarkItDown is used.

    Args:
        file_path: Path to the file.
        pdf_converter: "auto" | "pymupdf4llm" | "markitdown"

    Returns:
        The converted Markdown text.
    """
    is_pdf = file_path.suffix.lower() == ".pdf"
    if not is_pdf or pdf_converter == "markitdown":
        return _convert_with_markitdown(file_path)

    # Try pymupdf4llm first (auto or explicit)
    pymupdf_text = _convert_pdf_with_pymupdf4llm(file_path)

    if pymupdf_text is None:
        if pdf_converter == "pymupdf4llm":
            # The user explicitly asked for pymupdf4llm; make the substitution
            # visible instead of silently using another converter.
            logger.warning(
                "pdf_converter is set to 'pymupdf4llm' but it is unavailable or failed for %s; falling back to MarkItDown",
                file_path.name,
            )
        return _convert_with_markitdown(file_path)

    if pdf_converter == "pymupdf4llm":
        # Explicit — use as-is regardless of output length
        return pymupdf_text

    # auto mode: fall back if output looks like a failed parse.
    # Chars-per-page distinguishes image-based PDFs (near 0) from
    # legitimately short documents.
    if not _pymupdf_output_too_sparse(pymupdf_text, file_path):
        return pymupdf_text

    logger.warning(
        "pymupdf4llm produced only %d chars for %s (likely image-based PDF); falling back to MarkItDown",
        len(pymupdf_text.strip()),
        file_path.name,
    )
    return _convert_with_markitdown(file_path)
|
||||||
|
|
||||||
|
|
||||||
async def convert_file_to_markdown(file_path: Path) -> Path | None:
    """Convert a supported document file to Markdown and save it next to the source.

    The converter for PDFs is chosen from the application config.  Files whose
    size exceeds ``_ASYNC_THRESHOLD_BYTES`` are converted through
    ``asyncio.to_thread`` so the event loop is not blocked; smaller files are
    converted inline.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path to the generated .md file, or None if conversion failed.
    """
    try:
        converter_name = _get_pdf_converter()
        exceeds_threshold = file_path.stat().st_size > _ASYNC_THRESHOLD_BYTES

        markdown = (
            await asyncio.to_thread(_do_convert, file_path, converter_name)
            if exceeds_threshold
            else _do_convert(file_path, converter_name)
        )

        # The Markdown file sits beside the source, same stem, ".md" suffix.
        target = file_path.with_suffix(".md")
        target.write_text(markdown, encoding="utf-8")

        logger.info("Converted %s to markdown: %s (%d chars)", file_path.name, target.name, len(markdown))
        return target
    except Exception as e:
        # Any failure is reported to the caller as None rather than raised.
        logger.error("Failed to convert %s to markdown: %s", file_path.name, e)
        return None
|
||||||
|
|
||||||
|
|
||||||
@ -69,6 +186,8 @@ _BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPEND
|
|||||||
# Keeps prompt size bounded even for very long documents.
|
# Keeps prompt size bounded even for very long documents.
|
||||||
MAX_OUTLINE_ENTRIES = 50
|
MAX_OUTLINE_ENTRIES = 50
|
||||||
|
|
||||||
|
_ALLOWED_PDF_CONVERTERS = {"auto", "pymupdf4llm", "markitdown"}
|
||||||
|
|
||||||
|
|
||||||
def extract_outline(md_path: Path) -> list[dict]:
|
def extract_outline(md_path: Path) -> list[dict]:
|
||||||
"""Extract document outline (headings) from a Markdown file.
|
"""Extract document outline (headings) from a Markdown file.
|
||||||
@ -122,14 +241,23 @@ def extract_outline(md_path: Path) -> list[dict]:
|
|||||||
|
|
||||||
|
|
||||||
def _get_pdf_converter() -> str:
    """Return the configured ``pdf_converter`` mode, or 'auto' when unavailable.

    The configured value is stripped and lowercased, then checked against
    ``_ALLOWED_PDF_CONVERTERS``; an unknown value is logged and replaced by
    'auto'.  Any error while loading the config also results in 'auto'.
    """
    try:
        from deerflow.config.app_config import get_app_config

        uploads_cfg = getattr(get_app_config(), "uploads", None)
        if uploads_cfg is None:
            return "auto"

        candidate = str(getattr(uploads_cfg, "pdf_converter", "auto")).strip().lower()
        if candidate in _ALLOWED_PDF_CONVERTERS:
            return candidate

        logger.warning("Invalid pdf_converter value %r; falling back to 'auto'", candidate)
        return "auto"
    except Exception:
        # Config could not be imported or read — use the default mode.
        return "auto"
|
||||||
|
|||||||
@ -34,6 +34,9 @@ dependencies = [
|
|||||||
"langgraph-sdk>=0.1.51",
|
"langgraph-sdk>=0.1.51",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
pymupdf = ["pymupdf4llm>=0.0.17"]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["hatchling"]
|
requires = ["hatchling"]
|
||||||
build-backend = "hatchling.build"
|
build-backend = "hatchling.build"
|
||||||
|
|||||||
@ -1,12 +1,304 @@
|
|||||||
"""Tests for extract_outline() in file_conversion utilities (PR2: document outline injection)."""
|
"""Tests for file_conversion utilities (PR1: pymupdf4llm + asyncio.to_thread; PR2: extract_outline)."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from types import ModuleType
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
from deerflow.utils.file_conversion import (
|
from deerflow.utils.file_conversion import (
|
||||||
|
_ASYNC_THRESHOLD_BYTES,
|
||||||
|
_MIN_CHARS_PER_PAGE,
|
||||||
MAX_OUTLINE_ENTRIES,
|
MAX_OUTLINE_ENTRIES,
|
||||||
|
_do_convert,
|
||||||
|
_pymupdf_output_too_sparse,
|
||||||
|
convert_file_to_markdown,
|
||||||
extract_outline,
|
extract_outline,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pymupdf_mock(page_count: int) -> ModuleType:
    """Build a stand-in *pymupdf* module whose ``open()`` yields a *page_count*-page doc."""
    document = MagicMock()
    # len(document) is what the code under test uses as the page count.
    document.__len__.return_value = page_count

    module = ModuleType("pymupdf")
    module.open = MagicMock(return_value=document)  # type: ignore[attr-defined]
    return module
|
||||||
|
|
||||||
|
|
||||||
|
def _run(coro):
    """Drive *coro* to completion on a fresh event loop and return its result."""
    event_loop = asyncio.new_event_loop()
    try:
        outcome = event_loop.run_until_complete(coro)
    finally:
        # Close the loop whether or not the coroutine raised.
        event_loop.close()
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _pymupdf_output_too_sparse
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestPymupdfOutputTooSparse:
    """Exercise the chars-per-page heuristic that flags image-based PDFs."""

    def test_dense_text_pdf_not_sparse(self, tmp_path):
        """A text-rich PDF (1000 chars/page) is not flagged as sparse."""
        pdf_file = tmp_path / "dense.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")
        fake_module = _make_pymupdf_mock(page_count=10)

        # 10 000 chars over 10 pages → 1000/page, far above the threshold.
        with patch.dict(sys.modules, {"pymupdf": fake_module}):
            outcome = _pymupdf_output_too_sparse("x" * 10_000, pdf_file)

        assert outcome is False

    def test_image_based_pdf_is_sparse(self, tmp_path):
        """An image-based PDF (≈ 20 chars/page) is flagged as sparse."""
        pdf_file = tmp_path / "image.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")
        fake_module = _make_pymupdf_mock(page_count=31)

        # 612 chars / 31 pages ≈ 19.7/page < _MIN_CHARS_PER_PAGE (50)
        with patch.dict(sys.modules, {"pymupdf": fake_module}):
            outcome = _pymupdf_output_too_sparse("x" * 612, pdf_file)

        assert outcome is True

    def test_fallback_when_pymupdf_unavailable(self, tmp_path):
        """Without pymupdf, the absolute 200-char threshold decides."""
        pdf_file = tmp_path / "broken.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        # A None entry in sys.modules makes `import pymupdf` inside the function
        # raise ImportError, which triggers the absolute-threshold fallback.
        with patch.dict(sys.modules, {"pymupdf": None}):
            below_limit = _pymupdf_output_too_sparse("x" * 100, pdf_file)
            above_limit = _pymupdf_output_too_sparse("x" * 300, pdf_file)

        assert below_limit is True
        assert above_limit is False

    def test_exactly_at_threshold_is_not_sparse(self, tmp_path):
        """Chars-per-page equal to the threshold counts as NOT sparse."""
        pdf_file = tmp_path / "boundary.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")
        boundary_text = "x" * (_MIN_CHARS_PER_PAGE * 2)

        # 2 pages × _MIN_CHARS_PER_PAGE chars = exactly at threshold
        with patch.dict(sys.modules, {"pymupdf": _make_pymupdf_mock(page_count=2)}):
            outcome = _pymupdf_output_too_sparse(boundary_text, pdf_file)

        assert outcome is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _do_convert — routing logic
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestDoConvert:
    """Routing tests: which sub-converter does _do_convert pick for each mode?"""

    def test_non_pdf_always_uses_markitdown(self, tmp_path):
        """A non-PDF document (here a DOCX) is sent to MarkItDown."""
        docx_file = tmp_path / "report.docx"
        docx_file.write_bytes(b"PK fake docx")

        with patch(
            "deerflow.utils.file_conversion._convert_with_markitdown",
            return_value="# Markdown from MarkItDown",
        ) as markitdown_mock:
            converted = _do_convert(docx_file, "auto")

        assert converted == "# Markdown from MarkItDown"
        markitdown_mock.assert_called_once_with(docx_file)

    def test_pdf_auto_uses_pymupdf4llm_when_dense(self, tmp_path):
        """auto mode: dense pymupdf4llm output is returned without touching MarkItDown."""
        pdf_file = tmp_path / "report.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        rich_markdown = "# Heading\n" + "word " * 2000  # clearly dense

        with (
            patch("deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm", return_value=rich_markdown),
            patch("deerflow.utils.file_conversion._pymupdf_output_too_sparse", return_value=False),
            patch("deerflow.utils.file_conversion._convert_with_markitdown") as markitdown_mock,
        ):
            converted = _do_convert(pdf_file, "auto")

        assert converted == rich_markdown
        markitdown_mock.assert_not_called()

    def test_pdf_auto_falls_back_when_sparse(self, tmp_path):
        """auto mode: sparse pymupdf4llm output triggers the MarkItDown fallback."""
        pdf_file = tmp_path / "scanned.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value="x" * 612,  # 19.7 chars/page for 31-page doc
            ),
            patch("deerflow.utils.file_conversion._pymupdf_output_too_sparse", return_value=True),
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="OCR result via MarkItDown",
            ) as markitdown_mock,
        ):
            converted = _do_convert(pdf_file, "auto")

        assert converted == "OCR result via MarkItDown"
        markitdown_mock.assert_called_once_with(pdf_file)

    def test_pdf_explicit_pymupdf4llm_skips_sparsity_check(self, tmp_path):
        """'pymupdf4llm' mode: the output is used as-is even when very short."""
        pdf_file = tmp_path / "explicit.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        tiny_markdown = "x" * 10  # very short

        with (
            patch("deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm", return_value=tiny_markdown),
            patch("deerflow.utils.file_conversion._convert_with_markitdown") as markitdown_mock,
        ):
            converted = _do_convert(pdf_file, "pymupdf4llm")

        assert converted == tiny_markdown
        markitdown_mock.assert_not_called()

    def test_pdf_explicit_markitdown_skips_pymupdf4llm(self, tmp_path):
        """'markitdown' mode: pymupdf4llm is never attempted."""
        pdf_file = tmp_path / "force_md.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        with (
            patch("deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm") as pymupdf_mock,
            patch("deerflow.utils.file_conversion._convert_with_markitdown", return_value="MarkItDown result"),
        ):
            converted = _do_convert(pdf_file, "markitdown")

        assert converted == "MarkItDown result"
        pymupdf_mock.assert_not_called()

    def test_pdf_auto_falls_back_when_pymupdf4llm_not_installed(self, tmp_path):
        """auto mode: a None result from pymupdf4llm routes straight to MarkItDown."""
        pdf_file = tmp_path / "no_pymupdf.pdf"
        pdf_file.write_bytes(b"%PDF-1.4 fake")

        with (
            patch(
                "deerflow.utils.file_conversion._convert_pdf_with_pymupdf4llm",
                return_value=None,  # None signals not installed
            ),
            patch(
                "deerflow.utils.file_conversion._convert_with_markitdown",
                return_value="MarkItDown fallback",
            ) as markitdown_mock,
        ):
            converted = _do_convert(pdf_file, "auto")

        assert converted == "MarkItDown fallback"
        markitdown_mock.assert_called_once_with(pdf_file)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# convert_file_to_markdown — async + file writing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestConvertFileToMarkdown:
    """Tests for the async entry point: thread offloading, file writing, error handling."""

    def test_small_file_runs_synchronously(self, tmp_path):
        """Files not larger than the threshold are converted in the event loop thread."""
        pdf = tmp_path / "small.pdf"
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * 100)  # well under 1 MB

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Small PDF",
            ) as mock_convert,
            patch("asyncio.to_thread") as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        # asyncio.to_thread must NOT have been called
        mock_thread.assert_not_called()
        # The converter is invoked directly with the file and the configured mode.
        mock_convert.assert_called_once_with(pdf, "auto")
        assert md_path == pdf.with_suffix(".md")
        # The converter writes UTF-8, so read back with the same encoding
        # instead of the platform default.
        assert md_path.read_text(encoding="utf-8") == "# Small PDF"

    def test_large_file_offloaded_to_thread(self, tmp_path):
        """Files larger than the threshold are offloaded via asyncio.to_thread."""
        pdf = tmp_path / "large.pdf"
        # Write slightly more than the threshold
        pdf.write_bytes(b"%PDF-1.4 " + b"x" * (_ASYNC_THRESHOLD_BYTES + 1))

        async def fake_to_thread(fn, *args, **kwargs):
            # Run inline so the test stays deterministic while still recording the call.
            return fn(*args, **kwargs)

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value="# Large PDF",
            ),
            patch("asyncio.to_thread", side_effect=fake_to_thread) as mock_thread,
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        mock_thread.assert_called_once()
        assert md_path == pdf.with_suffix(".md")
        assert md_path.read_text(encoding="utf-8") == "# Large PDF"

    def test_returns_none_on_conversion_error(self, tmp_path):
        """If conversion raises, return None without propagating the exception."""
        pdf = tmp_path / "broken.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                side_effect=RuntimeError("conversion failed"),
            ),
        ):
            result = _run(convert_file_to_markdown(pdf))

        assert result is None
        # The failure happens before anything is written, so no .md file may exist.
        assert not pdf.with_suffix(".md").exists()

    def test_writes_utf8_markdown_file(self, tmp_path):
        """Generated .md file is written with UTF-8 encoding."""
        pdf = tmp_path / "report.pdf"
        pdf.write_bytes(b"%PDF-1.4 fake")
        chinese_content = "# 中文报告\n\n这是测试内容。"

        with (
            patch("deerflow.utils.file_conversion._get_pdf_converter", return_value="auto"),
            patch(
                "deerflow.utils.file_conversion._do_convert",
                return_value=chinese_content,
            ),
        ):
            md_path = _run(convert_file_to_markdown(pdf))

        assert md_path is not None
        assert md_path.read_text(encoding="utf-8") == chinese_content
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# extract_outline
|
# extract_outline
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
@ -369,6 +369,15 @@ tool_search:
|
|||||||
|
|
||||||
# Option 1: Local Sandbox (Default)
|
# Option 1: Local Sandbox (Default)
|
||||||
# Executes commands directly on the host machine
|
# Executes commands directly on the host machine
|
||||||
|
uploads:
|
||||||
|
# PDF-to-Markdown converter used when a PDF is uploaded.
|
||||||
|
# auto — prefer pymupdf4llm when installed; fall back to MarkItDown for
|
||||||
|
# image-based or encrypted PDFs (recommended default).
|
||||||
|
# pymupdf4llm — always use pymupdf4llm (must be installed: uv add pymupdf4llm).
|
||||||
|
# Better heading/table extraction; faster on most files.
|
||||||
|
# markitdown — always use MarkItDown (original behaviour, no extra dependency).
|
||||||
|
pdf_converter: auto
|
||||||
|
|
||||||
sandbox:
|
sandbox:
|
||||||
use: deerflow.sandbox.local:LocalSandboxProvider
|
use: deerflow.sandbox.local:LocalSandboxProvider
|
||||||
# Host bash execution is disabled by default because LocalSandboxProvider is
|
# Host bash execution is disabled by default because LocalSandboxProvider is
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user