# deer-flow/src/tools/crawl.py

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

import json
import logging
from typing import Annotated, Optional
from urllib.parse import urlparse

from langchain_core.tools import tool

from src.crawler import Crawler

from .decorators import log_io

logger = logging.getLogger(__name__)

def is_pdf_url(url: Optional[str]) -> bool:
    """Return True when the path component of *url* ends with ``.pdf``.

    Only the parsed path is inspected, so a query string or fragment that
    follows the file name does not affect the result. The comparison is
    case-insensitive.

    Args:
        url: The URL to inspect; ``None`` and the empty string are accepted.

    Returns:
        ``True`` if the URL path ends with ``.pdf``; ``False`` otherwise,
        including when the URL is missing or cannot be parsed.
    """
    if not url:
        return False
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed URLs (for example an unbalanced
        # IPv6 bracket in the host). Such a URL is simply "not a PDF link";
        # raising here would escape the caller's own error handling.
        return False
    return path.lower().endswith(".pdf")


@tool
@log_io
def crawl_tool(
    url: Annotated[str, "The url to crawl."],
) -> str:
    """Use this to crawl a url and get a readable content in markdown format."""
    # NOTE(review): the docstring and the Annotated description above are kept
    # verbatim on purpose -- presumably the `tool` decorator exposes them as
    # the tool's runtime description; confirm before rewording.
    #
    # Return value is always a string:
    #   * PDF URL  -> JSON object with "url", "error", "crawled_content"
    #                 (null) and "is_pdf" (true); the crawler is not invoked.
    #   * success  -> JSON object with "url" and "crawled_content", the
    #                 article's markdown cut to the first 1000 characters.
    #   * failure  -> plain (non-JSON) message "Failed to crawl. Error: ...".

    # Upper bound on how much markdown is handed back to the caller.
    max_content_chars = 1000

    # Special handling for PDF URLs
    if is_pdf_url(url):
        logger.info("PDF URL detected, skipping crawling: %s", url)
        return json.dumps(
            {
                "url": url,
                "error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
                "crawled_content": None,
                "is_pdf": True,
            }
        )

    try:
        article = Crawler().crawl(url)
        markdown = article.to_markdown()
        return json.dumps(
            {"url": url, "crawled_content": markdown[:max_content_chars]}
        )
    except Exception as e:
        # Catch Exception rather than BaseException so KeyboardInterrupt,
        # SystemExit and task cancellation still propagate instead of being
        # turned into an error string.
        error_msg = f"Failed to crawl. Error: {repr(e)}"
        logger.error(error_msg)
        return error_msg