mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-30 05:48:38 +00:00
* fix: the crawling error when encountering PDF URLs * Added the unit test for the new feature of crawl tool * fix: address the code review problems * fix: address the code review problems
51 lines
1.4 KiB
Python
51 lines
1.4 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
import json
|
|
import logging
|
|
from typing import Annotated, Optional
|
|
from urllib.parse import urlparse
|
|
|
|
from langchain_core.tools import tool
|
|
|
|
from src.crawler import Crawler
|
|
|
|
from .decorators import log_io
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
def is_pdf_url(url: Optional[str]) -> bool:
    """Check if the URL points to a PDF file.

    Only the path component of the URL is inspected, so a query string or
    fragment (e.g. ``report.pdf?download=1``) does not affect the result.

    Args:
        url: The URL to inspect. May be ``None`` or empty.

    Returns:
        ``True`` if the URL path ends with ``.pdf`` (case insensitive);
        ``False`` otherwise, including for missing or unparseable URLs.
    """
    if not url:
        return False
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unbalanced IPv6
        # bracket in the host). Such a URL cannot be classified as a PDF,
        # so report False instead of letting the exception escape.
        return False
    # Check if the path ends with .pdf (case insensitive)
    return parsed_url.path.lower().endswith('.pdf')
|
|
|
|
|
|
@tool
@log_io
def crawl_tool(
    url: Annotated[str, "The url to crawl."],
) -> str:
    """Use this to crawl a url and get a readable content in markdown format."""
    # NOTE: the docstring above is deliberately left unchanged: the @tool
    # decorator is documented as using it for the tool description, so
    # editing it would change runtime behavior. Extra documentation
    # therefore lives in comments.
    #
    # Return value (always a str):
    #   * PDF URL  -> JSON object with "url", "error", "crawled_content"
    #                 (null) and "is_pdf" (true); no crawl is attempted.
    #   * success  -> JSON object with "url" and "crawled_content", the
    #                 article markdown truncated to its first 1000 characters.
    #   * failure  -> plain (non-JSON) message "Failed to crawl. Error: ...".

    # Special handling for PDF URLs
    if is_pdf_url(url):
        logger.info(f"PDF URL detected, skipping crawling: {url}")
        pdf_message = json.dumps({
            "url": url,
            "error": "PDF files cannot be crawled directly. Please download and view the PDF manually.",
            "crawled_content": None,
            "is_pdf": True
        })
        return pdf_message

    try:
        crawler = Crawler()
        article = crawler.crawl(url)
        return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]})
    except Exception as e:
        # Catch Exception rather than BaseException so that KeyboardInterrupt,
        # SystemExit and GeneratorExit still propagate instead of being
        # converted into an error string.
        error_msg = f"Failed to crawl. Error: {repr(e)}"
        logger.error(error_msg)
        return error_msg
|