mirror of
https://github.com/OpenBMB/ChatDev.git
synced 2026-04-25 11:18:06 +00:00
646 lines
22 KiB
Python
Executable File
646 lines
22 KiB
Python
Executable File
"""Deep research tools for search results and report management."""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Annotated, Any, Dict, List, Optional, Tuple
|
|
|
|
from filelock import FileLock
|
|
|
|
from entity.messages import MessageBlock, MessageBlockType
|
|
from functions.function_calling.file import FileToolContext
|
|
from utils.function_catalog import ParamMeta
|
|
|
|
# Workspace-relative locations of the deep-research artifacts.
# Each data file has a sibling ".lock" file used to serialize access to it.
SEARCH_RESULTS_FILE = "deep_research/search_results.json"
SEARCH_LOCK_FILE = "deep_research/search_results.lock"
REPORT_FILE = "deep_research/report.md"
REPORT_LOCK_FILE = "deep_research/report.lock"
|
|
|
|
|
|
def _get_files(ctx: FileToolContext) -> Tuple[Path, Path]:
    """Resolve the data file paths through ``ctx.resolve_under_workspace``.

    Returns:
        A ``(search_results_path, report_path)`` pair, in that order.
    """
    return (
        ctx.resolve_under_workspace(SEARCH_RESULTS_FILE),
        ctx.resolve_under_workspace(REPORT_FILE),
    )
|
|
|
|
|
|
def _get_locks(ctx: FileToolContext) -> Tuple[Path, Path]:
    """Resolve the lock file paths through ``ctx.resolve_under_workspace``.

    Returns:
        A ``(search_lock_path, report_lock_path)`` pair, in that order.
    """
    return (
        ctx.resolve_under_workspace(SEARCH_LOCK_FILE),
        ctx.resolve_under_workspace(REPORT_LOCK_FILE),
    )
|
|
|
|
|
|
def _load_search_results(file_path: Path) -> Dict[str, Any]:
|
|
if not file_path.exists():
|
|
return {}
|
|
try:
|
|
return json.loads(file_path.read_text(encoding="utf-8"))
|
|
except json.JSONDecodeError:
|
|
return {}
|
|
|
|
|
|
def _save_search_results(file_path: Path, data: Dict[str, Any]) -> None:
|
|
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
file_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
|
|
|
|
|
|
def _format_search_result(url: str, data: Dict[str, Any], concise: bool) -> str:
|
|
keys = data.get("highlight_keys", [])
|
|
highlight_str = f" [IMPORTANT MATCHES: {', '.join(keys)}]" if keys else ""
|
|
|
|
if concise:
|
|
return (
|
|
f"URL: {url}{highlight_str}\n"
|
|
f"Title: {data.get('title', '')}\n"
|
|
f"Abstract: {data.get('abs', '')}\n"
|
|
f"{'-' * 40}"
|
|
)
|
|
else:
|
|
return (
|
|
f"URL: {url}{highlight_str}\n"
|
|
f"Title: {data.get('title', '')}\n"
|
|
f"Abstract: {data.get('abs', '')}\n"
|
|
f"Detail: {data.get('detail', '')}\n"
|
|
f"{'-' * 40}"
|
|
)
|
|
|
|
|
|
def search_save_result(
    url: Annotated[str, ParamMeta(description="URL of the search result (used as key)")],
    title: Annotated[str, ParamMeta(description="Title of the search result")],
    abs: Annotated[str, ParamMeta(description="Abstract/Summary of the content")],
    detail: Annotated[str, ParamMeta(description="Detailed content")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Save or update a search result.

    The record is stored under ``url``. If a record for that URL already
    exists, its title, abstract and detail are replaced while its highlight
    keys are carried over.
    """
    ctx = FileToolContext(_context)
    search_file = _get_files(ctx)[0]
    search_lock = _get_locks(ctx)[0]

    with FileLock(search_lock):
        results = _load_search_results(search_file)
        previous = results.get(url, {})

        results[url] = {
            "title": title,
            "abs": abs,
            "detail": detail,
            # Highlights survive an update of the same URL.
            "highlight_keys": previous.get("highlight_keys", []),
        }

        _save_search_results(search_file, results)
        return f"Saved result for {url}"
|
|
|
|
|
|
def search_load_all(
    # NOTE: a ``concise`` parameter is currently disabled; every result is
    # rendered in concise form (no detail line).
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Load all saved search results.

    Returns the concise rendering of every stored result separated by blank
    lines, or a fixed message when nothing has been saved.
    """
    ctx = FileToolContext(_context)
    search_file = _get_files(ctx)[0]
    search_lock = _get_locks(ctx)[0]

    with FileLock(search_lock):
        stored = _load_search_results(search_file)

        if stored:
            rendered = [
                _format_search_result(link, record, concise=True)
                for link, record in stored.items()
            ]
            return "\n\n".join(rendered)

        return "No search results found."
|
|
|
|
|
|
def search_load_by_url(
    url: Annotated[str, ParamMeta(description="URL to retrieve")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Load a specific search result by URL.

    Returns the full rendering (including the detail line) of the record
    stored under ``url``, or a "not found" message when there is none.
    """
    ctx = FileToolContext(_context)
    search_file = _get_files(ctx)[0]
    search_lock = _get_locks(ctx)[0]

    with FileLock(search_lock):
        stored = _load_search_results(search_file)

        if url in stored:
            return _format_search_result(url, stored[url], concise=False)

        return f"No result found for {url}"
|
|
|
|
|
|
def search_high_light_key(
    url: Annotated[str, ParamMeta(description="URL to highlight keys for")],
    keys: Annotated[List[str], ParamMeta(description="List of keys/terms to highlight")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Save highlighted keys for a specific search result.

    The new keys are merged into the keys already stored for ``url``.
    Duplicates are dropped and the order is stable: existing keys first,
    then new keys in the order given.

    Returns:
        A confirmation listing the merged keys, or an error message when no
        result has been saved for ``url`` yet.
    """
    ctx = FileToolContext(_context)
    search_file, _ = _get_files(ctx)
    search_lock, _ = _get_locks(ctx)

    # A bare string would otherwise be merged character by character.
    if isinstance(keys, str):
        keys = [keys]

    with FileLock(search_lock):
        data = _load_search_results(search_file)

        if url not in data:
            return f"URL {url} not found in results. Please save it first."

        existing = data[url].get("highlight_keys") or []
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # stored list and the returned message are deterministic.
        merged = list(dict.fromkeys([*existing, *keys]))
        data[url]["highlight_keys"] = merged

        _save_search_results(search_file, data)
        return f"Updated highlights for {url}: {merged}"
|
|
|
|
|
|
# Report Helpers
|
|
|
|
def _read_report_lines(file_path: Path) -> List[str]:
|
|
if not file_path.exists():
|
|
return []
|
|
return file_path.read_text(encoding="utf-8").splitlines()
|
|
|
|
|
|
def _save_report(file_path: Path, lines: List[str]) -> None:
|
|
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
# Ensure final newline
|
|
content = "\n".join(lines)
|
|
if content and not content.endswith("\n"):
|
|
content += "\n"
|
|
file_path.write_text(content, encoding="utf-8")
|
|
|
|
|
|
def _parse_header(line: str) -> Tuple[int, str]:
|
|
"""Returns (level, title) if line is a header, else (0, "")."""
|
|
match = re.match(r"^(#+)\s+(.+)$", line)
|
|
if match:
|
|
return len(match.group(1)), match.group(2).strip()
|
|
return 0, ""
|
|
|
|
|
|
def _find_chapter_range(lines: List[str], title_path: str) -> Tuple[int, int]:
    """
    Find the start and end indices (inclusive, exclusive) of a chapter.

    title_path is like "Chapter 1/Section 2". The first segment may match a
    header anywhere in ``lines``; every following segment must match a
    header located inside the section of the segment before it. When
    several headers share a parent's title, each is tried in file order
    until one contains the rest of the path.

    A section starts at its header line and ends just before the next
    header of the same or a shallower level (or at the end of ``lines``).

    Returns:
        ``(start, end)`` for the last path segment, or ``(-1, -1)`` when the
        path cannot be resolved.
    """
    titles = [t.strip() for t in title_path.split("/")]

    # (line index, level, title) of every header line, in file order.
    headers = []
    for idx, line in enumerate(lines):
        level, text = _parse_header(line)
        if level > 0:
            headers.append((idx, level, text))

    def section_end(pos: int) -> int:
        """Line index where the section of ``headers[pos]`` stops."""
        own_level = headers[pos][1]
        for idx, level, _ in headers[pos + 1:]:
            if level <= own_level:
                return idx
        return len(lines)

    def resolve(depth: int, lo: int, hi: int) -> Tuple[int, int]:
        """Resolve ``titles[depth:]`` among the headers on lines [lo, hi)."""
        for pos, (idx, _, text) in enumerate(headers):
            if idx < lo or idx >= hi or text != titles[depth]:
                continue
            end = section_end(pos)
            if depth == len(titles) - 1:
                return idx, end
            # Children are looked up only inside this header's own section;
            # a miss falls through to the next same-named candidate.
            nested = resolve(depth + 1, idx + 1, end)
            if nested[0] != -1:
                return nested
        return -1, -1

    return resolve(0, 0, len(lines))
|
|
|
|
|
|
def report_read(
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Read the current content of the report.

    Returns the full report text, or a fixed message when the report file
    does not exist yet.
    """
    ctx = FileToolContext(_context)
    report_file = _get_files(ctx)[1]
    report_lock = _get_locks(ctx)[1]

    with FileLock(report_lock):
        if report_file.exists():
            return report_file.read_text(encoding="utf-8")
        return "Report is empty."
|
|
|
|
|
|
def report_read_chapter(
    title: Annotated[str, ParamMeta(description="Chapter title to read (supports multi-level index e.g. 'Intro/Background')")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Read the content of a specific chapter.

    Returns:
        The chapter body (everything after its header line up to the end of
        the chapter) joined by newlines, or a "not found" message when the
        title path does not resolve.
    """
    ctx = FileToolContext(_context)
    _, report_file = _get_files(ctx)
    _, report_lock = _get_locks(ctx)

    with FileLock(report_lock):
        lines = _read_report_lines(report_file)

        start, end = _find_chapter_range(lines, title)
        if start == -1:
            return f"Chapter '{title}' not found."

        # ``start`` is the header line itself, so the body begins at start + 1.
        # Join with a real newline; a literal backslash-n would collapse the
        # chapter into a single line littered with "\n" markers.
        return "\n".join(lines[start + 1:end])
|
|
|
|
|
|
def report_outline(
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Get the outline of the report (headers).

    Returns one normalized ``#... Title`` line per header found in the
    report, or a fixed message when the report has no headers.
    """
    ctx = FileToolContext(_context)
    report_file = _get_files(ctx)[1]
    report_lock = _get_locks(ctx)[1]

    with FileLock(report_lock):
        parsed = map(_parse_header, _read_report_lines(report_file))
        entries = [f"{'#' * depth} {heading}" for depth, heading in parsed if depth > 0]

        if entries:
            return "\n".join(entries)
        return "No headers found in report."
|
|
|
|
|
|
def report_create_chapter(
    title: Annotated[str, ParamMeta(description="Chapter title (supports 'Parent/NewChild' to insert into existing). Use '|' to specify insertion point e.g. 'Prev|New' to insert after 'Prev', or '|New' to insert at start.")],
    level: Annotated[int, ParamMeta(description="Header level (1-6)")],
    content: Annotated[str, ParamMeta(description="Content of the chapter")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Create a new chapter in the report.

    Title syntax:
        * ``New`` - append at the end of the report.
        * ``Parent/New`` - append at the end of chapter ``Parent``.
        * ``Prev|New`` - insert right after chapter ``Prev`` (looked up
          inside the parent when a parent path is given).
        * ``|New`` - insert at the start of the report, or inside a parent
          before its first sub-header (at the parent's end if it has none).

    Returns:
        A confirmation message, or an error message when the level is not
        an integer in 1-6, the new title is empty, or the parent / target
        chapter cannot be found. Nothing is written in the error cases.
    """
    ctx = FileToolContext(_context)
    _, report_file = _get_files(ctx)
    _, report_lock = _get_locks(ctx)

    # Levels outside 1-6 would write a line that is not a valid heading
    # (bool is rejected explicitly because it is a subclass of int).
    if isinstance(level, bool) or not isinstance(level, int) or not 1 <= level <= 6:
        return f"Invalid header level {level!r}. Level must be an integer from 1 to 6."

    # Split "Parent/Child" routing; the leaf may still carry a "|" marker.
    parent_path = None
    raw_leaf = title
    if "/" in title:
        parent_path, raw_leaf = title.rsplit("/", 1)

    # None means append, "" means insert at start, any other string means
    # insert after the chapter with that title.
    insert_after_target = None
    display_title = raw_leaf
    if "|" in raw_leaf:
        insert_after_target, display_title = raw_leaf.split("|", 1)
        # Stray whitespace must not turn "insert at start" into a lookup.
        insert_after_target = insert_after_target.strip()
    display_title = display_title.strip()

    if not display_title:
        return "Chapter title must not be empty."

    with FileLock(report_lock):
        lines = _read_report_lines(report_file)

        # Without a parent the insertion context is the whole file.
        p_start, p_end = -1, len(lines)
        if parent_path is not None:
            p_start, p_end = _find_chapter_range(lines, parent_path)
            if p_start == -1:
                return f"Parent chapter '{parent_path}' not found. Cannot create '{raw_leaf}' inside it."

        if insert_after_target is None:
            # Default: append to the parent context or to the end of the file.
            insert_idx = p_end
        elif insert_after_target == "":
            if parent_path:
                # After the parent's header and intro text, before its first
                # sub-header; fall back to the parent's end if it has none.
                insert_idx = next(
                    (
                        idx
                        for idx in range(p_start + 1, p_end)
                        if _parse_header(lines[idx])[0] > 0
                    ),
                    p_end,
                )
            else:
                insert_idx = 0
        else:
            # Inside a parent the target is a sibling, so search by full path.
            search_target = (
                f"{parent_path}/{insert_after_target}" if parent_path else insert_after_target
            )
            a_start, a_end = _find_chapter_range(lines, search_target)
            if a_start == -1:
                return f"Target chapter '{search_target}' not found."
            insert_idx = a_end

        header = f"{'#' * level} {display_title}"
        # The trailing empty line separates the new chapter from what follows.
        lines[insert_idx:insert_idx] = [header, *content.splitlines(), ""]

        _save_report(report_file, lines)

    final_path = f"{parent_path}/{display_title}" if parent_path else display_title
    return f"Created chapter '{final_path}' at level {level}"
|
|
|
|
|
|
def report_rewrite_chapter(
    title: Annotated[str, ParamMeta(description="Chapter title to rewrite (supports multi-level index e.g. 'Intro/Background')")],
    content: Annotated[str, ParamMeta(description="New content")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Rewrite the content of an existing chapter.

    The header line is kept; everything from the line after it up to the
    end of the chapter is replaced by ``content`` plus one empty line.
    """
    ctx = FileToolContext(_context)
    report_file = _get_files(ctx)[1]
    report_lock = _get_locks(ctx)[1]

    with FileLock(report_lock):
        report_lines = _read_report_lines(report_file)

        head, stop = _find_chapter_range(report_lines, title)
        if head == -1:
            return f"Chapter '{title}' not found."

        # Leave the header (at ``head``) untouched and swap out only the body.
        report_lines[head + 1:stop] = [*content.splitlines(), ""]

        _save_report(report_file, report_lines)
        return f"Rewrote chapter '{title}'"
|
|
|
|
|
|
def report_continue_chapter(
    title: Annotated[str, ParamMeta(description="Chapter title to append to (supports multi-level index e.g. 'Intro/Background')")],
    content: Annotated[str, ParamMeta(description="Content to append")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Append content to an existing chapter.

    ``content`` plus one empty line is inserted at the chapter's end index,
    i.e. right before the next header of the same or a shallower level, or
    at the end of the file.
    """
    ctx = FileToolContext(_context)
    report_file = _get_files(ctx)[1]
    report_lock = _get_locks(ctx)[1]

    with FileLock(report_lock):
        report_lines = _read_report_lines(report_file)

        head, stop = _find_chapter_range(report_lines, title)
        if head == -1:
            return f"Chapter '{title}' not found."

        # Zero-width slice assignment inserts without replacing anything.
        report_lines[stop:stop] = [*content.splitlines(), ""]

        _save_report(report_file, report_lines)
        return f"Appended content to chapter '{title}'"
|
|
|
|
|
|
def report_reorder_chapters(
    new_order: Annotated[List[str], ParamMeta(description="List of chapter titles in the new desired order")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Reorder chapters in the report.

    The positions ("slots") occupied by the named chapters stay where they
    are; the chapters are redistributed over those slots so that they appear
    in the order given by ``new_order``. Text outside the named chapters is
    left in place. Every named chapter must exist and no two of them may
    overlap (e.g. a parent together with one of its children).
    """
    ctx = FileToolContext(_context)
    report_file = _get_files(ctx)[1]
    report_lock = _get_locks(ctx)[1]

    with FileLock(report_lock):
        report_lines = _read_report_lines(report_file)

        # (start, end, title) of each requested chapter, in requested order.
        requested = []
        for wanted in new_order:
            begin, stop = _find_chapter_range(report_lines, wanted)
            if begin == -1:
                return f"Chapter '{wanted}' not found."
            requested.append((begin, stop, wanted))

        # The same ranges in file order: these are the slots to fill.
        slots = sorted(requested, key=lambda item: item[0])

        # Neighbouring slots must not intersect.
        for earlier, later in zip(slots, slots[1:]):
            if earlier[1] > later[0]:
                return f"Chapters '{earlier[2]}' and '{later[2]}' overlap. Cannot reorder nested or overlapping chapters."

        rebuilt = []
        cursor = 0
        # The k-th slot in file order receives the k-th requested chapter.
        for (slot_start, slot_end, _), (src_start, src_end, _) in zip(slots, requested):
            rebuilt.extend(report_lines[cursor:slot_start])
            rebuilt.extend(report_lines[src_start:src_end])
            cursor = slot_end

        # Whatever follows the last slot is copied unchanged.
        rebuilt.extend(report_lines[cursor:])

        _save_report(report_file, rebuilt)

        return "Reordered chapters successfully."
|
|
|
|
|
|
def report_del_chapter(
    title: Annotated[str, ParamMeta(description="Chapter title to delete (supports multi-level index e.g. 'Intro/Background')")],
    _context: Dict[str, Any] | None = None,
) -> str:
    """
    Delete a chapter and its content.

    Removes the header line and everything up to the end of the chapter,
    then saves the report. Returns a "not found" message when the title
    path does not resolve.
    """
    ctx = FileToolContext(_context)
    report_file = _get_files(ctx)[1]
    report_lock = _get_locks(ctx)[1]

    with FileLock(report_lock):
        report_lines = _read_report_lines(report_file)

        head, stop = _find_chapter_range(report_lines, title)
        if head == -1:
            return f"Chapter '{title}' not found."

        report_lines[head:stop] = []

        _save_report(report_file, report_lines)
        return f"Deleted chapter '{title}'"
|
|
|
|
def report_export_pdf(
    _context: Dict[str, Any] | None = None,
) -> List[MessageBlock]:
    """
    Export the report to PDF.

    The Markdown report is converted to HTML with ``markdown``, wrapped in a
    styled page and rendered with ``xhtml2pdf`` into a ``.pdf`` file next to
    the report. The PDF is then registered with ``ctx.attachment_store``.

    Returns:
        A single-element list holding the message block of the registered
        PDF attachment.

    Raises:
        FileNotFoundError: The report file does not exist.
        ImportError: ``markdown`` or ``xhtml2pdf`` is not installed.
        RuntimeError: Writing or rendering the PDF failed.
    """
    ctx = FileToolContext(_context)
    _, report_file = _get_files(ctx)
    _, report_lock = _get_locks(ctx)

    with FileLock(report_lock):
        if not report_file.exists():
            raise FileNotFoundError("Report file does not exist.")
        text = report_file.read_text(encoding="utf-8")

    # Insert a blank line before a heading that directly follows a non-empty line.
    text = re.sub(r"([^\n])\n(#{1,6}\s)", r"\1\n\n\2", text)
    # Insert a blank line before a list item that directly follows a non-list line.
    text = re.sub(r"(?m)^(?!\s*(?:[*+-]|\d+\.)\s)(.+)\n(\s*(?:[*+-]|\d+\.)\s)", r"\1\n\n\2", text)

    # Optional dependencies are imported lazily so the module loads without them.
    try:
        import markdown
        from xhtml2pdf import pisa
    except ImportError as exc:
        raise ImportError(
            "Error: strict dependencies 'markdown' and 'xhtml2pdf' are missing."
        ) from exc

    pdf_file = report_file.with_suffix(".pdf")

    # Convert to HTML
    extensions = ["extra", "codehilite", "nl2br", "tables"]
    html_content = markdown.markdown(text, extensions=extensions)

    styled_html = f"""
    <html>
    <head>
    <style>
        @page {{
            size: A4;
            margin: 2cm;
        }}
        body {{
            font-family: sans-serif;
            line-height: 1.6;
            font-size: 10pt;
            word-wrap: break-word;
            word-break: break-all;
        }}
        h1, h2, h3 {{
            color: #2c3e50;
            margin-top: 25px; /* Add spacing above the title */
            margin-bottom: 15px;
            border-bottom: 1px solid #eee; /* Add an underline to the main title for clarity */
            padding-bottom: 5px;
        }}

        /* --- Table style fixes --- */
        table {{
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 20px;
            border: 1px solid #ddd;
        }}
        th, td {{
            border: 1px solid #ddd; /* Explicitly add borders */
            padding: 8px;
            text-align: left;
            vertical-align: top;
        }}
        th {{
            background-color: #f2f2f2;
            font-weight: bold;
            color: #333;
        }}
        /* ------------------ */

        code {{ background-color: #f4f4f4; padding: 2px 5px; border-radius: 3px; font-family: monospace; }}
        pre {{ background-color: #f4f4f4; padding: 10px; border-radius: 5px; overflow-x: auto; white-space: pre-wrap; }}
        ul, ol {{ margin-top: 8px; margin-bottom: 8px; padding-left: 20px; }}
        li {{ margin-bottom: 4px; }}
        blockquote {{ border-left: 4px solid #ccc; padding-left: 10px; color: #666; margin: 10px 0; }}
    </style>
    </head>
    <body>
    {html_content}
    </body>
    </html>
    """

    # Convert to PDF
    try:
        with open(pdf_file, "wb") as f:
            pisa_status = pisa.CreatePDF(styled_html, dest=f)
    except Exception as e:
        raise RuntimeError(f"Failed to generate PDF: {e}") from e

    # Checked outside the try so this error is not caught and re-wrapped
    # into a doubled "Failed to generate PDF: Failed to generate PDF: ..." message.
    if pisa_status.err:
        raise RuntimeError("Failed to generate PDF: xhtml2pdf error")

    record = ctx.attachment_store.register_file(
        pdf_file,
        kind=MessageBlockType.FILE,
        display_name=pdf_file.name,
        mime_type="application/pdf",
        copy_file=False,
        persist=False,
        deduplicate=True,
        extra={
            "source": "generated_report",
            "workspace_path": str(pdf_file),
        },
    )
    return [record.as_message_block()]
|