deer-flow/src/tools/crawl.py
Willem Jiang bec97f02ae
fix: the crawling error when encountering PDF URLs (#707)
* fix: the crawling error when encountering PDF URLs

* Added the unit test for the new feature of crawl tool

* fix: address the code review problems

* fix: address the code review problems
2025-11-25 09:24:52 +08:00

51 lines
1.4 KiB
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import json
import logging
from typing import Annotated, Optional
from urllib.parse import urlparse
from langchain_core.tools import tool
from src.crawler import Crawler
from .decorators import log_io
logger = logging.getLogger(__name__)
def is_pdf_url(url: Optional[str]) -> bool:
"""Check if the URL points to a PDF file."""
if not url:
return False
parsed_url = urlparse(url)
# Check if the path ends with .pdf (case insensitive)
return parsed_url.path.lower().endswith('.pdf')
@tool
@log_io
def crawl_tool(
url: Annotated[str, "The url to crawl."],
) -> str:
"""Use this to crawl a url and get a readable content in markdown format."""
# Special handling for PDF URLs
if is_pdf_url(url):
logger.info(f"PDF URL detected, skipping crawling: {url}")
pdf_message = json.dumps({
"url": url,
"error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
"crawled_content": None,
"is_pdf": True
})
return pdf_message
try:
crawler = Crawler()
article = crawler.crawl(url)
return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]})
except BaseException as e:
error_msg = f"Failed to crawl. Error: {repr(e)}"
logger.error(error_msg)
return error_msg