mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-04-25 11:18:22 +00:00
* security: add log injection attack prevention with input sanitization - Created src/utils/log_sanitizer.py to sanitize user-controlled input before logging - Prevents log injection attacks using newlines, tabs, carriage returns, etc. - Escapes dangerous characters: \n, \r, \t, \0, \x1b - Provides specialized functions for different input types: - sanitize_log_input: general purpose sanitization - sanitize_thread_id: for user-provided thread IDs - sanitize_user_content: for user messages (more aggressive truncation) - sanitize_agent_name: for agent identifiers - sanitize_tool_name: for tool names - sanitize_feedback: for user interrupt feedback - create_safe_log_message: template-based safe message creation - Updated src/server/app.py to sanitize all user input in logging: - Thread IDs from request parameter - Message content from user - Agent names and node information - Tool names and feedback - Updated src/agents/tool_interceptor.py to sanitize: - Tool names during execution - User feedback during interrupt handling - Tool input data - Added 29 comprehensive unit tests covering: - Classic newline injection attacks - Carriage return injection - Tab and null character injection - HTML/ANSI escape sequence injection - Combined multi-character attacks - Truncation and length limits Fixes potential log forgery vulnerability where malicious users could inject fake log entries via unsanitized input containing control characters.
30 lines
866 B
Python
30 lines
866 B
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
import logging
|
|
|
|
from readabilipy import simple_json_from_html_string
|
|
|
|
from .article import Article
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ReadabilityExtractor:
    """Build an ``Article`` from raw HTML using readabilipy."""

    def extract_article(self, html: str) -> Article:
        """Parse *html* and wrap the extracted title and content in an Article.

        The HTML is handed to ``simple_json_from_html_string`` with
        ``use_readability=True``. The ``"content"`` and ``"title"`` entries of
        its result are then used, each replaced by a placeholder when the
        entry is missing, falsy, or only whitespace once converted to ``str``.

        Args:
            html: The HTML document to extract from.

        Returns:
            An ``Article`` built with ``title`` and ``html_content`` keyword
            arguments. The title falls back to ``"Untitled"`` and the content
            to a short "no content" paragraph.
        """
        parsed = simple_json_from_html_string(html, use_readability=True)

        # Content: substitute a placeholder paragraph (and log it) when the
        # extractor produced nothing usable.
        body = parsed.get("content")
        if not (body and str(body).strip()):
            logger.warning("Readability extraction returned empty content")
            body = "<p>No content could be extracted from this page</p>"

        # Title: same blank test, but the fallback is silent.
        raw_title = parsed.get("title")
        heading = raw_title if raw_title and str(raw_title).strip() else "Untitled"

        return Article(title=heading, html_content=body)
|