Merge pull request #134 from linyqh/dev_0.6.0 大版本更新

Dev 0.6.0
This commit is contained in:
viccy 2025-05-08 20:58:12 +08:00 committed by GitHub
commit 9aefe76a8c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
57 changed files with 3368 additions and 1492 deletions

1
.gitignore vendored
View File

@ -32,4 +32,5 @@ resource/fonts/*.ttf
resource/fonts/*.otf
resource/srt/*.srt
app/models/faster-whisper-large-v2/*
app/models/faster-whisper-large-v3/*
app/models/bert/*

View File

@ -4,7 +4,7 @@
<h3 align="center">一站式 AI 影视解说+自动化剪辑工具🎬🎞️ </h3>
<h3>📖 <a href="README-cn.md">English</a> | 简体中文 | <a href="README-ja.md">日本語</a> </h3>
<h3>📖 <a href="README-en.md">English</a> | 简体中文 | <a href="README-ja.md">日本語</a> </h3>
<div align="center">
[//]: # ( <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
@ -83,7 +83,7 @@ _**注意⚠️:近期在 x (推特) 上发现有人冒充作者在 pump.fun
## 配置要求 📦
- 建议最低 CPU 4核或以上内存 8G 或以上,显卡非必须
- Windows 10 或 MacOS 11.0 以上系统
- Windows 10/11 或 MacOS 11.0 以上系统
- [Python 3.10+](https://www.python.org/downloads/)
## 反馈建议 📢

View File

@ -20,7 +20,9 @@ class VideoConcatMode(str, Enum):
class VideoAspect(str, Enum):
landscape = "16:9"
landscape_2 = "4:3"
portrait = "9:16"
portrait_2 = "3:4"
square = "1:1"
def to_resolution(self):
@ -360,13 +362,14 @@ class VideoClipParams(BaseModel):
text_back_color: Optional[str] = None # 文本背景色
stroke_color: str = "black" # 描边颜色
stroke_width: float = 1.5 # 描边宽度
subtitle_position: str = "bottom" # top, bottom, center, custom
subtitle_position: str = "bottom" # top, bottom, center, custom
custom_position: float = 70.0 # 自定义位置
n_threads: Optional[int] = Field(default=16, description="解说语音音量") # 线程<E7BABF><E7A88B><EFBFBD>,有助于提升视频处理速度
n_threads: Optional[int] = Field(default=16, description="线程数") # 线程数,有助于提升视频处理速度
tts_volume: Optional[float] = Field(default=1.0, description="解说语音音量(后处理)")
original_volume: Optional[float] = Field(default=1.0, description="视频原声音量")
bgm_volume: Optional[float] = Field(default=0.6, description="背景音乐音量")
bgm_volume: Optional[float] = Field(default=0.3, description="背景音乐音量")
class VideoTranscriptionRequest(BaseModel):

View File

@ -6,6 +6,7 @@ class GenerateScriptRequest(BaseModel):
video_path: str
video_theme: Optional[str] = ""
custom_prompt: Optional[str] = ""
frame_interval_input: Optional[int] = 5
skip_seconds: Optional[int] = 0
threshold: Optional[int] = 30
vision_batch_size: Optional[int] = 5

Binary file not shown.

Binary file not shown.

View File

@ -18,15 +18,14 @@ def check_ffmpeg():
return False
def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list):
def merge_audio_files(task_id: str, total_duration: float, list_script: list):
"""
合并音频文件根据OST设置处理不同的音频轨道
合并音频文件
Args:
task_id: 任务ID
audio_files: TTS生成的音频文件列表
total_duration: 总时长
list_script: 完整脚本信息包含OST设置
list_script: 完整脚本信息包含duration时长和audio路径
Returns:
str: 合并后的音频文件路径
@ -39,36 +38,38 @@ def merge_audio_files(task_id: str, audio_files: list, total_duration: float, li
# 创建一个空的音频片段
final_audio = AudioSegment.silent(duration=total_duration * 1000) # 总时长以毫秒为单位
# 计算每个片段的开始位置基于duration字段
current_position = 0 # 初始位置(秒)
# 遍历脚本中的每个片段
for segment, audio_file in zip(list_script, audio_files):
for segment in list_script:
try:
# 加载TTS音频文件
tts_audio = AudioSegment.from_file(audio_file)
# 获取片段的开始和结束时间
start_time, end_time = segment['new_timestamp'].split('-')
start_seconds = utils.time_to_seconds(start_time)
end_seconds = utils.time_to_seconds(end_time)
# 根据OST设置处理音频
if segment['OST'] == 0:
# 只使用TTS音频
final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000)
elif segment['OST'] == 1:
# 只使用原声(假设原声已经在视频中)
continue
elif segment['OST'] == 2:
# 混合TTS音频和原声
original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000)
mixed_audio = original_audio.overlay(tts_audio)
final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000)
# 获取片段时长(秒)
duration = segment['duration']
# 检查audio字段是否为空
if segment['audio'] and os.path.exists(segment['audio']):
# 加载TTS音频文件
tts_audio = AudioSegment.from_file(segment['audio'])
# 将TTS音频添加到最终音频
final_audio = final_audio.overlay(tts_audio, position=current_position * 1000)
else:
# audio为空不添加音频仅保留间隔
logger.info(f"片段 {segment.get('timestamp', '')} 没有音频文件,保留 {duration} 秒的间隔")
# 更新下一个片段的开始位置
current_position += duration
except Exception as e:
logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}")
logger.error(f"处理音频片段时出错: {str(e)}")
# 即使处理失败,也要更新位置,确保后续片段位置正确
if 'duration' in segment:
current_position += segment['duration']
continue
# 保存合并后的音频文件
output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3")
output_audio_path = os.path.join(utils.task_dir(task_id), "merger_audio.mp3")
final_audio.export(output_audio_path, format="mp3")
logger.info(f"合并后的音频文件已保存: {output_audio_path}")
@ -93,7 +94,7 @@ def time_to_seconds(time_str):
# 分割时间部分
parts = time_part.split(':')
if len(parts) == 3: # HH:MM:SS
h, m, s = map(int, parts)
seconds = h * 3600 + m * 60 + s
@ -118,11 +119,11 @@ def extract_timestamp(filename):
# 从文件名中提取时间部分
time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06,500-00_24,800" 部分
start_time, end_time = time_part.split('-') # 分割成开始和结束时间
# 将下划线格式转换回冒号格式
start_time = start_time.replace('_', ':')
end_time = end_time.replace('_', ':')
# 将时间戳转换为秒
start_seconds = time_to_seconds(start_time)
end_seconds = time_to_seconds(end_time)
@ -135,17 +136,36 @@ def extract_timestamp(filename):
if __name__ == "__main__":
# 示例用法
audio_files =[
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:06-00:24.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:32-00:38.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:43-00:52.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:52-01:09.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01:13-01:15.mp3",
]
total_duration = 38
video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json"
with open(video_script_path, "r", encoding="utf-8") as f:
video_script = json.load(f)
total_duration = 90
output_file = merge_audio_files("test456", audio_files, total_duration, video_script)
video_script = [
{'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
'timestamp': '00:00:00-00:00:26',
'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
'OST': 0, 'duration': 26,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3'},
{'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!', 'timestamp': '00:01:15-00:01:29',
'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
'OST': 0, 'duration': 14,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3'},
{'picture': '画面切到王启年小心翼翼地向范闲汇报。', 'timestamp': '00:04:41-00:04:58',
'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
'OST': 1, 'duration': 17,
'audio': ''},
{'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
'timestamp': '00:04:58-00:05:20',
'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
'OST': 0, 'duration': 22,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3'},
{'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
'timestamp': '00:05:45-00:05:53',
'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
'OST': 0, 'duration': 8,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'},
{'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。', 'timestamp': '00:06:00-00:06:03',
'narration': '抓刺客',
'OST': 1, 'duration': 3,
'audio': ''}]
output_file = merge_audio_files("test456", total_duration, video_script)
print(output_file)

256
app/services/clip_video.py Normal file
View File

@ -0,0 +1,256 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : clip_video
@Author : 小林同学
@Date : 2025/5/6 下午6:14
'''
import os
import subprocess
import json
import hashlib
from loguru import logger
from typing import Dict, List, Optional
from pathlib import Path
def parse_timestamp(timestamp: str) -> tuple:
    """
    Split a timestamp range string into its start and end parts.

    Args:
        timestamp: range formatted as 'HH:MM:SS-HH:MM:SS' or
            'HH:MM:SS,sss-HH:MM:SS,sss'

    Returns:
        tuple: (start, end), each formatted like the input side
    """
    begin_part, finish_part = timestamp.split('-')
    return begin_part, finish_part
def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str:
    """
    Compute an end timestamp from a start timestamp plus a duration.

    Args:
        start_time: start time, 'HH:MM:SS' or 'HH:MM:SS,sss' (with milliseconds)
        duration: length in seconds
        extra_seconds: safety margin added on top, 1 second by default

    Returns:
        str: the computed end time, in the same format as the input
    """
    # Detect whether the input carries a millisecond component.
    with_ms = ',' in start_time
    if with_ms:
        hms_text, ms_text = start_time.split(',')
        ms_in = int(ms_text)
    else:
        hms_text, ms_in = start_time, 0
    hours, minutes, seconds = (int(piece) for piece in hms_text.split(':'))

    # Work entirely in milliseconds to avoid float drift in the breakdown.
    total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + ms_in
    total_ms += int((duration + extra_seconds) * 1000)

    whole_seconds, ms_out = divmod(total_ms, 1000)
    hours_out, remainder = divmod(whole_seconds, 3600)
    minutes_out, seconds_out = divmod(remainder, 60)

    # Echo the input's format: include milliseconds only if they were given.
    if with_ms:
        return f"{hours_out:02d}:{minutes_out:02d}:{seconds_out:02d},{ms_out:03d}"
    return f"{hours_out:02d}:{minutes_out:02d}:{seconds_out:02d}"
def check_hardware_acceleration() -> Optional[str]:
    """
    Probe which FFmpeg hardware-acceleration backend is available.

    Probes, in preference order, NVIDIA CUDA, macOS VideoToolbox and Intel
    Quick Sync (qsv) by running a tiny FFmpeg command with each backend and
    checking the exit status.

    Returns:
        Optional[str]: the first usable hwaccel name ("cuda", "videotoolbox"
        or "qsv"), or None when none works (or FFmpeg itself is missing).
    """
    # The probing order defines the preference: discrete NVIDIA GPU first,
    # then macOS VideoToolbox, then Intel Quick Sync.
    for accel in ("cuda", "videotoolbox", "qsv"):
        try:
            probe = subprocess.run(
                ["ffmpeg", "-hwaccel", accel, "-i", "/dev/null", "-f", "null", "-"],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
            )
            if probe.returncode == 0:
                return accel
        except Exception:
            # FFmpeg missing or the probe crashed — treat as "not supported"
            # and try the next backend.
            pass

    return None
def clip_video(
        video_origin_path: str,
        tts_result: List[Dict],
        output_dir: Optional[str] = None,
        task_id: Optional[str] = None
) -> Dict[str, str]:
    """
    Cut the source video into sub-clips according to segment timestamps.

    Args:
        video_origin_path: path of the source video
        tts_result: list of segment dicts; each needs a "timestamp"
            ('HH:MM:SS-HH:MM:SS', optionally with ',sss' milliseconds) and a
            "duration" in seconds; an optional "_id" keys the result map
        output_dir: directory clips are written to; auto-generated when None
        task_id: unique id used to build the default output dir; when None it
            is derived from an MD5 of the inputs, so reruns reuse the same dir

    Returns:
        Dict[str, str]: maps each segment's "_id" (falling back to its
        timestamp) to the path of the produced clip

    Raises:
        FileNotFoundError: when the source video does not exist
        RuntimeError: when FFmpeg fails to cut a segment
    """
    if not os.path.exists(video_origin_path):
        raise FileNotFoundError(f"视频文件不存在: {video_origin_path}")

    # Derive a deterministic task id from the inputs when none is given.
    if task_id is None:
        content_for_hash = f"{video_origin_path}_{json.dumps(tts_result)}"
        task_id = hashlib.md5(content_for_hash.encode()).hexdigest()

    # Default output directory: <project-root>/storage/temp/clip_video/<task_id>
    if output_dir is None:
        output_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "storage", "temp", "clip_video", task_id
        )
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Detect hardware acceleration once and reuse it for every segment.
    hwaccel = check_hardware_acceleration()
    hwaccel_args = ["-hwaccel", hwaccel] if hwaccel else []
    if hwaccel:
        logger.info(f"使用硬件加速: {hwaccel}")

    # Maps segment id -> produced clip path.
    result = {}

    for item in tts_result:
        _id = item.get("_id", item.get("timestamp", "unknown"))
        timestamp = item["timestamp"]
        start_time, _ = parse_timestamp(timestamp)

        # Real end time = start + duration + 1s margin (see calculate_end_time);
        # the end from the timestamp itself is deliberately ignored.
        duration = item["duration"]
        calculated_end_time = calculate_end_time(start_time, duration)

        # FFmpeg expects '.' (not ',') as the millisecond separator.
        ffmpeg_start_time = start_time.replace(',', '.')
        ffmpeg_end_time = calculated_end_time.replace(',', '.')

        # Build a filesystem-safe output name (no ':' or ',').
        safe_start_time = start_time.replace(':', '-').replace(',', '-')
        safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
        output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4"
        output_path = os.path.join(output_dir, output_filename)

        # NOTE(review): only VideoToolbox switches the encoder; with CUDA the
        # clip is still encoded by software libx264 (decoding may still be
        # accelerated). Consider h264_nvenc if CUDA encode is desired.
        ffmpeg_cmd = [
            "ffmpeg", "-y", *hwaccel_args,
            "-i", video_origin_path,
            "-ss", ffmpeg_start_time,
            "-to", ffmpeg_end_time,
            "-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264",
            "-c:a", "aac",
            "-strict", "experimental",
            output_path
        ]

        try:
            logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}{ffmpeg_end_time}")
            # check=True raises CalledProcessError on a non-zero exit; the
            # return value of subprocess.run is not needed.
            subprocess.run(
                ffmpeg_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )
            result[_id] = output_path
        except subprocess.CalledProcessError as e:
            logger.error(f"裁剪视频片段失败: {timestamp}")
            logger.error(f"错误信息: {e.stderr}")
            raise RuntimeError(f"视频裁剪失败: {e.stderr}")

    return result
if __name__ == "__main__":
    # Demo: cut a local video according to TTS segment metadata.
    video_origin_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4"
    tts_result = [{'timestamp': '00:00:00-00:01:15',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
                   'duration': 25.55,
                   'text': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!'},
                  {'timestamp': '00:01:15-00:04:40',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
                   'duration': 13.488,
                   'text': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…'},
                  {'timestamp': '00:04:58-00:05:45',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
                   'duration': 21.363,
                   'text': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!'},
                  {'timestamp': '00:05:45-00:06:00',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
                   'duration': 7.675, 'text': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!'}]
    # Sample of the mapping clip_video is expected to produce (timestamp ->
    # clip path). Kept for reference only — it is NOT a valid argument.
    subclip_path_videos = {
        '00:00:00-00:01:15': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-00-00-00-01-15.mp4',
        '00:01:15-00:04:40': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-01-15-00-04-40.mp4',
        '00:04:41-00:04:58': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4',
        '00:04:58-00:05:45': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-58-00-05-45.mp4',
        '00:05:45-00:06:00': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-05-45-00-06-00.mp4',
        '00:06:00-00:06:03': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4',
    }
    try:
        # BUGFIX(review): `subclip_path_videos` used to be passed as the third
        # positional argument, i.e. as `output_dir`, which would crash inside
        # Path(output_dir); let clip_video generate its own output directory.
        result = clip_video(video_origin_path, tts_result)
        print("裁剪结果:")
        print(json.dumps(result, indent=4, ensure_ascii=False))
    except Exception as e:
        print(f"发生错误: {e}")

View File

@ -0,0 +1,264 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : 生成介绍文案
@Author : 小林同学
@Date : 2025/5/8 上午11:33
'''
import json
import os
import traceback
from openai import OpenAI
from loguru import logger
def parse_frame_analysis_to_markdown(json_file_path):
    """
    Convert a video-frame-analysis JSON file into a Markdown report.

    :param json_file_path: path of the JSON file to read
    :return: the report as a Markdown string, or an error-message string when
        the file is missing or cannot be processed
    """
    if not os.path.exists(json_file_path):
        return f"错误: 文件 {json_file_path} 不存在"

    try:
        with open(json_file_path, 'r', encoding='utf-8') as fp:
            payload = json.load(fp)

        summaries = payload.get('overall_activity_summaries', [])
        observations = payload.get('frame_observations', [])

        # Group the per-frame observations by their batch index.
        frames_by_batch = {}
        for obs in observations:
            frames_by_batch.setdefault(obs.get('batch_index'), []).append(obs)

        # Build the Markdown report, one "片段" section per batch summary.
        pieces = []
        for seq, summary in enumerate(summaries, 1):
            batch_key = summary.get('batch_index')
            pieces.append(f"## 片段 {seq}\n")
            pieces.append(f"- 时间范围:{summary.get('time_range', '')}\n")
            pieces.append(f"- 片段描述:{summary.get('summary', '')}\n")
            pieces.append("- 详细描述:\n")
            # Append each frame observation of this batch, verbatim.
            for obs in frames_by_batch.get(batch_key, []):
                pieces.append(f"  - {obs.get('timestamp', '')}: {obs.get('observation', '')}\n")
            pieces.append("\n")

        return "".join(pieces)

    except Exception:
        return f"处理JSON文件时出错: {traceback.format_exc()}"
def generate_narration(markdown_content, api_key, base_url, model):
    """
    Call an OpenAI-compatible chat API to generate narration copy from the
    Markdown frame-analysis content.

    :param markdown_content: Markdown-formatted video frame analysis
    :param api_key: API key
    :param base_url: base URL of the API (for non-official endpoints)
    :param model: model name to use
    :return: the generated narration script (JSON text), or an error-message
        string when the call fails
    """
    try:
        # Build the prompt. The example texts are verbatim reference copy.
        # BUGFIX(review): example 3 was closed with an opening tag
        # `<example_text_3>` instead of `</example_text_3>`, and the
        # `<output>` JSON sample never closed the `items` array — both fixed.
        prompt = """
我是一名荒野建造解说的博主以下是一些同行的对标文案请你深度学习并总结这些文案的风格特点跟内容特点
<example_text_1>
解压助眠的天花板就是荒野建造沉浸丝滑的搭建过程可以说每一帧都是极致享受我保证强迫症来了都找不出一丁点毛病更别说全屋严丝合缝的拼接工艺还能轻松抵御零下二十度气温让你居住的每一天都温暖如春
在家闲不住的西姆今天也打算来一次野外建造行走没多久他就发现许多倒塌的树任由它们自生自灭不如将其利用起来想到这他就开始挥舞铲子要把地基挖掘出来虽然每次只能挖一点点但架不住他体能惊人没多长时间一个 2x3 的深坑就赫然出现这深度住他一人绰绰有余
随后他去附近收集来原木这些都是搭建墙壁的最好材料而在投入使用前自然要把表皮刮掉防止森林中的白蚁蛀虫处理好一大堆后西姆还在两端打孔使用木钉固定在一起这可不是用来做墙壁的而是做庇护所的承重柱只要木头间的缝隙足够紧密那搭建出的木屋就能足够坚固
每向上搭建一层他都会在中间塞入苔藓防寒保证不会泄露一丝热量其他几面也是用相同方法很快西姆就做好了三面墙壁每一根木头都极其工整保证强迫症来了都要点个赞再走
在继续搭建墙壁前西姆决定将壁炉制作出来毕竟森林夜晚的气温会很低保暖措施可是重中之重完成后他找来一块大树皮用来充当庇护所的大门而上面刮掉的木屑还能作为壁炉的引火物可以说再完美不过
测试了排烟没问题后他才开始搭建最后一面墙壁这一面要预留门和窗所以在搭建到一半后还需要在原木中间开出卡口让自己劈砍时能轻松许多此时只需将另外一根如法炮制两端拼接在一起后就是一扇大小适中的窗户而随着随后一层苔藓铺好最后一根原木落位这个庇护所的雏形就算完成
大门的安装他没选择用合页而是在底端雕刻出榫头门框上则雕刻出榫眼只能说西姆的眼就是一把尺这完全就是严丝合缝此时他才开始搭建屋顶这里西姆用的方法不同他先把最外围的原木固定好随后将原木平铺在上面就能得到完美的斜面屋顶等他将四周的围栏也装好后工整的屋顶看起来十分舒服西姆躺上去都不想动
稍作休息后他利用剩余的苔藓对屋顶的缝隙处密封可这样西姆觉得不够保险于是他找来一些黏土再次对原本的缝隙二次加工保管这庇护所冬天也暖和最后只需要平铺上枯叶以及挖掘出的泥土整个屋顶就算完成
考虑到庇护所的美观性自然少不了覆盖上苔藓翠绿的颜色看起来十分舒服就连门口的庭院旁他都移植了许多小树做点缀让这木屋与周边环境融为一体西姆才刚完成好这件事一场大雨就骤然降临好在此时的他已经不用淋雨更别说这屋顶防水十分不错室内没一点雨水渗透进来
等待温度回升的过程西姆利用墙壁本身的凹槽把床框镶嵌在上面只需要铺上苔藓以及自带的床单枕头一张完美的单人床就做好辛苦劳作一整天西姆可不会亏待自己他将自带的牛肉腌制好后直接放到壁炉中烤只需要等待三十分钟就能享受这美味的一顿
在辛苦建造一星期后他终于可以在自己搭建的庇护所中享受最纯正的野外露营后面西姆回家补给了一堆物资再次回来时森林已经大雪纷飞让他原本翠绿的小屋更换上了冬季限定皮肤好在内部设施没受什么影响和他离开时一样整洁
就是房间中已经没多少柴火让西姆今天又得劈柴寒冷干燥的天气让木头劈起来十分轻松没多久他就收集到一大堆这些足够燃烧好几天虽然此时外面大雪纷飞但小屋中却开始逐渐温暖这次他除了带来一些食物外还有几瓶调味料以及一整套被褥让自己的居住舒适度提高一大截
而秋天他有收集干草的缘故只需要塞入枕套中密封起来就能作为靠垫用就这居住条件比一般人在家过的还要奢侈趁着壁炉木头变木炭的过程西姆则开始不紧不慢的处理食物他取出一块牛排改好花刀以后撒上一堆调料腌制起来接着用锡纸包裹好放到壁炉中直接炭烤搭配上自带的红酒是一个非常好的选择
随着时间来到第二天外面的积雪融化了不少西姆简单做顿煎蛋补充体力后决定制作一个室外篝火堆用来晚上驱散周边野兽搭建这玩意没什么技巧只需要找到一大堆木棍利用大树的夹缝将其掰弯然后将其堆积在一起就是一个简易版的篝火堆看这外形有点像帐篷好在西姆没想那么多
等待天色暗淡下来后他才来到室外将其点燃顺便处理下多余的废料只可惜这场景没朋友陪在身边对西姆来说可能是个遗憾而哪怕森林只有他一个人都依旧做了好几个小时等到里面的篝火彻底燃尽后西姆还找来雪球覆盖到上面将火熄灭这防火意识可谓十分好最后在室内二十五度的高温下裹着被子睡觉
</example_text_1>
<example_text_2>
解压助眠的天花板就是荒野建造沉浸丝滑的搭建过程每一帧都是极致享受全屋严丝合缝的拼接工艺能轻松抵御零下二十度气温居住体验温暖如春
在家闲不住的西姆开启野外建造他发现倒塌的树决定加以利用先挖掘出 2x3 的深坑作为地基接着收集原木刮掉表皮防白蚁蛀虫打孔用木钉固定制作承重柱搭建墙壁时每一层都塞入苔藓防寒很快做好三面墙
为应对森林夜晚低温西姆制作壁炉用大树皮当大门刮下的木屑做引火物搭建最后一面墙时预留门窗通过在原木中间开口拼接做出窗户大门采用榫卯结构安装严丝合缝
搭建屋顶时先固定外围原木再平铺原木形成斜面屋顶之后用苔藓黏土密封缝隙铺上枯叶和泥土为美观在木屋覆盖苔藓移植小树点缀完工时遇大雨木屋防水良好
西姆利用墙壁凹槽镶嵌床框铺上苔藓床单枕头做成床劳作一天后他用壁炉烤牛肉享用建造一星期后他开始野外露营
后来西姆回家补给物资回来时森林大雪纷飞他劈柴储备带回食物调味料和被褥提高居住舒适度还用干草做靠垫他用壁炉烤牛排搭配红酒
第二天积雪融化西姆制作室外篝火堆防野兽用大树夹缝掰弯木棍堆积而成晚上点燃处理废料结束后用雪球灭火最后在室内二十五度的环境中裹被入睡
</example_text_2>
<example_text_3>
如果战争到来这个深埋地下十几米的庇护所绝对是 bug 般的存在即使被敌人发现还能通过快速通道一秒逃出里面不仅有竹子地暖地下水井还自制抽水机在解决用水问题的同时甚至自研无土栽培技术过上完全自给自足的生活
阿伟的老婆美如花但阿伟从来不回家来到野外他乐哈哈一言不合就开挖众所周知当战争来临时地下堡垒的安全性是最高的阿伟苦苦研习两载半只为练就一身挖洞本领在这双逆天麒麟臂的加持下如此坚硬的泥土都只能当做炮灰
得到了充足的空间后他便开始对这些边缘进行打磨随后阿伟将细线捆在木棍上以此描绘出圆柱的轮廓接着再一点点铲掉多余的部分虽然是由泥土一体式打造但这样的桌子保准用上千年都不成问题
考虑到十几米的深度进出非常不方便于是阿伟找来两根长达 66.6 米的木头打算为庇护所打造一条快速通道只见他将木桩牢牢地插入地下并顺着洞口的方向延伸出去直到贯穿整个山洞接着在每个木桩的连接处钉入铁钉确保轨道不能有一毫米的偏差完成后再制作一个木质框架从而达到前后滑动的效果
不得不说阿伟这手艺简直就是大钢管子杵青蛙在上面放上一个木制的车斗还能加快搬运泥土的速度没多久庇护所的内部就已经初见雏形为了住起来更加舒适还需要为自己打造一张床虽然深处的泥土同样很坚固但好处就是不用担心垮塌的风险
阿伟不仅设计了更加符合人体工学的拱形并且还在一旁雕刻处壁龛就是这氛围怎么看着有点不太吉利别看阿伟一身腱子肉但这身体里的艺术细菌可不少每个边缘的地方他都做了精雕细琢瞬间让整个卧室的颜值提升一大截
住在地下的好处就是房子面积全靠挖每平方消耗两个半馒头不仅没有了房贷的压力就连买墓地的钱也省了阿伟将中间的墙壁挖空从而得到取暖的壁炉当然最重要的还有排烟问题要想从上往下打通十几米的山体是件极其困难的事好在阿伟年轻时报过忆坤年的古墓派补习班这打洞技术堪比隔壁学校的土拨鼠专业虽然深度长达十几米但排烟效果却一点不受影响一个字专业
随后阿伟继续对壁炉底部雕刻打通了底部放柴火的空间并制作出放锅的灶头完成后阿伟从侧面将壁炉打通并制作出一条导热的通道以此连接到床铺的位置毕竟住在这么一个风湿宝地不注意保暖除湿很容易得老寒腿
阿伟在床面上挖出一条条管道以便于温度能传输到床的每个角落接下来就可以根据这些通道的长度裁切出同样长短的竹子根据竹筒的大小凿出相互连接的孔洞最后再将竹筒内部打通以达到温度传送的效果
而后阿伟将这些管道安装到凹槽内在他严谨的制作工艺下每根竹子刚好都能镶嵌进去在铺设床面之前还需要用木塞把圆孔堵住防止泥土掉落进管道泥土虽然不能隔绝湿气但却是十分优良的导热材料等他把床面都压平后就可以小心的将这些木塞拔出来最后再用黏土把剩余的管道也遮盖起来直到整个墙面恢复原样
接下来还需要测试一下加热效果当他把火点起来后温度很快就传送到了管道内把火力一点点加大直到热气流淌到更远的床面随着小孔里的青烟冒出也预示着阿伟的地暖可以投入使用而后阿伟制作了一些竹条并用细绳将它们喜结连理
千里之行始于足下美好的家园要靠自己双手打造明明可以靠才艺吃饭的阿伟偏偏要用八块腹肌征服大家就问这样的男人哪个野生婆娘不喜欢完成后阿伟还用自己 35 码的大腚感受了一下真烫
随后阿伟来到野区找到一根上好的雷击木他当即就把木头咔嚓成两段并取下两节较为完整的带了回去刚好能和圆桌配套另外一个在里面凿出凹槽并插入木棍连接得到一个夯土的木锤住过农村的小伙伴都知道这样夯出来的地面堪比水泥地不仅坚硬耐磨还不用担心脚底打滑忙碌了一天的阿伟已经饥渴难耐拿出野生小烤肠安安心心住新房光脚爬上大热炕一觉能睡到天亮
第二天阿伟打算将房间扩宽毕竟吃住的地方有了还要解决个人卫生的问题阿伟在另一侧增加了一个房间他打算将这里打造成洗澡的地方为了防止泥土垮塌他将顶部做成圆弧形等挖出足够的空间后旁边的泥土已经堆成了小山
为了方便清理这些泥土阿伟在之前的轨道增加了转弯交接处依然是用铁钉固定一直延伸到房间的最里面有了运输车的帮助这些成吨的泥土也能轻松的运送出去并且还能体验过山车的感觉很快他就完成了清理工作
为了更方便的在里面洗澡他将底部一点点挖空这么大的浴缸看来阿伟并不打算一个人住完成后他将墙面雕刻的凹凸有致让这里看起来更加豪华接着用洛阳铲挖出排水口并用一根相同大小的竹筒作为开关
由于四周都是泥土还不能防水阿伟特意找了一些白蚁巢用来制作可以防水的野生水泥现在就可以将里里外外能接触到水的地方都涂抹一遍细心的阿伟还找来这种 500 克一斤的鹅卵石对池子表面进行装饰
没错水源问题阿伟早已经考虑在内他打算直接在旁边挖个水井毕竟已经挖了这么深再向下挖一挖应该就能到达地下水的深度经过几日的奋战能看得出阿伟已经消瘦了不少但一想到马上就能拥有的豪宅他直接化身为无情的挖土机器很快就挖到了好几米的深度
考虑到自己的弹跳力有限阿伟在一旁定入木桩然后通过绳子爬上爬下随着深度越来越深井底已经开始渗出水来这也预示着打井成功没多久这里面将渗满泉水仅凭一次就能挖到水源看来这里还真是块风湿宝地
随后阿伟在井口四周挖出凹槽以便于井盖的安置这一量才知道井的深度已经达到了足足的 5 阿伟把木板组合在一起再沿着标记切掉多余部分他甚至还给井盖做了把手可是如何从这么深的井里打水还是个问题但从阿伟坚定的眼神来看他应该想到了解决办法
只见他将树桩锯成两半然后用凿子把里面一点点掏空另外一半也是如法炮制接着还要在底部挖出圆孔要想成功将水从 5 米深的地方抽上来那就不得不提到大家熟知的勾股定理没错这跟勾股定理没什么关系
阿伟给竹筒做了一个木塞并在里面打上安装连接轴的孔为了增加密闭性阿伟不得不牺牲了自己的 AJ剪出与木塞相同的大小后再用木钉固定住随后他收集了一些树胶并放到火上加热融化接下来就可以涂在木塞上增加使用寿命
现在将竹筒组装完成就可以利用虹吸原理将水抽上来完成后就可以把井盖盖上去再用泥土在上面覆盖现在就不用担心失足掉下去了
接下来阿伟去采集了一些大漆将它涂抹在木桶接缝处就能将其二合为一完了再接入旁边浴缸的入水口每个连接的地方都要做好密封不然后面很容易漏水随后就可以安装上活塞并用一根木桩作为省力杠杆根据空气压强的原理将井水抽上来
经过半小时的来回拉扯硕大的浴缸终于被灌满阿伟也是忍不住洗了把脸接下来还需要解决排水的问题阿伟在地上挖出沟渠一直贯穿到屋外然后再用竹筒从出水口连接每个接口处都要抹上胶水就连门外的出水口他都做了隐藏
在野外最重要的就是庇护所水源还有食物既然已经完成了前二者那么阿伟还需要拥有可持续发展的食物来源他先是在地上挖了两排地洞然后在每根竹筒的表面都打上无数孔洞这就是他打算用来种植的载体在此之前还需要用大火对竹筒进行杀菌消毒
趁着这时候他去搬了一麻袋的木屑先用芭蕉叶覆盖在上面再铺上厚厚的黏土隔绝温度在火焰的温度下能让里面的木屑达到生长条件
等到第二天所有材料都晾凉后阿伟才将竹筒内部掏空并将木屑一点点地塞入竹筒一切准备就绪就可以将竹筒插入提前挖好的地洞最后再往竹筒里塞入种子依靠房间内的湿度和温度就能达到大棚种植的效果稍加时日这些种子就会慢慢发芽
虽然暂时还吃不上自己培养的食物但好在阿伟从表哥贺强那里学到不少钓鱼本领哪怕只有一根小小的竹竿也能让他钓上两斤半的大鲶鱼新鲜的食材那肯定是少不了高温消毒的过程趁着鱼没熟阿伟直接爬进浴缸冰凉的井水瞬间洗去了身上的疲惫这一刻的阿伟是无比的享受
不久后鱼也烤得差不多了阿伟的生活现在可以说是有滋有味住在十几米的地下不仅能安全感满满哪怕遇到危险还能通过轨道快速逃生
</example_text_3>

<video_frame_description>
%s
</video_frame_description>

我正在尝试做这个内容的解说纪录片视频我需要你以 <video_frame_description> </video_frame_description> 中的内容为解说目标根据我刚才提供给你的对标文案 <example_text> 特点以及你总结的特点帮我生成一段关于荒野建造的解说文案文案需要符合平台受欢迎的解说风格请使用 json 格式进行输出使用 <output> 中的输出格式

<output>
{
    "items": [
        {
            "_id": 1, # 唯一递增id
            "timestamp": "00:00:05,390-00:00:10,430",
            "picture": "画面描述",
            "narration": "解说文案",
        }
    ]
}
</output>

<restriction>
1. 只输出 json 内容不要输出其他任何说明性的文字
2. 解说文案的语言使用 简体中文
3. 严禁虚构画面所有画面只能从 <video_frame_description> 中摘取
</restriction>
""" % (markdown_content)

        # Initialise the client with the OpenAI SDK (works for any
        # OpenAI-compatible endpoint via base_url).
        client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )

        if model not in ["deepseek-reasoner"]:
            # Models that support structured output: request a JSON object.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=1.5,
                response_format={"type": "json_object"},
            )
            # Extract the generated script.
            if response.choices and len(response.choices) > 0:
                narration_script = response.choices[0].message.content
                # Log token usage for cost tracking.
                logger.debug(f"消耗的tokens: {response.usage.total_tokens}")
                return narration_script
            else:
                return "生成解说文案失败: 未获取到有效响应"
        else:
            # deepseek-reasoner does not support JSON output mode: ask for
            # plain text and strip any ```json fences afterwards.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=1.5,
            )
            # Extract the generated script.
            if response.choices and len(response.choices) > 0:
                narration_script = response.choices[0].message.content
                # Log token usage for cost tracking.
                logger.debug(f"文案消耗的tokens: {response.usage.total_tokens}")
                # Remove surrounding ```json ... ``` fences, if any.
                narration_script = narration_script.replace("```json", "").replace("```", "")
                return narration_script
            else:
                return "生成解说文案失败: 未获取到有效响应"

    except Exception:
        return f"调用API生成解说文案时出错: {traceback.format_exc()}"
if __name__ == '__main__':
    # Demo: parse a frame-analysis JSON, then ask the LLM for narration copy.
    # NOTE: unused variables (text_provider, video_frame_description_path)
    # and commented-out prints were removed.
    text_api_key = "sk-xxx"
    text_model = "deepseek-reasoner"
    text_base_url = "https://api.deepseek.com"

    # Convert the frame-analysis JSON into a Markdown report.
    test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1458.json"
    markdown_output = parse_frame_analysis_to_markdown(test_file_path)

    # Save the Markdown so the formatting can be inspected.
    output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/narration_script.md"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_output)

    # Generate the narration script.
    narration = generate_narration(
        markdown_output,
        text_api_key,
        base_url=text_base_url,
        model=text_model
    )

    # Persist the narration script.
    print(narration)
    print(type(narration))
    narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json"
    with open(narration_file, 'w', encoding='utf-8') as f:
        f.write(narration)
    print(f"\n已将解说文案保存到: {narration_file}")

View File

@ -0,0 +1,393 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : generate_video
@Author : 小林同学
@Date : 2025/5/7 上午11:55
'''
import os
import traceback
from typing import Optional, Dict, Any
from loguru import logger
from moviepy import (
VideoFileClip,
AudioFileClip,
CompositeAudioClip,
CompositeVideoClip,
TextClip,
afx
)
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from app.utils import utils
def merge_materials(
video_path: str,
audio_path: str,
output_path: str,
subtitle_path: Optional[str] = None,
bgm_path: Optional[str] = None,
options: Optional[Dict[str, Any]] = None
) -> str:
"""
合并视频音频BGM和字幕素材生成最终视频
参数:
video_path: 视频文件路径
audio_path: 音频文件路径
output_path: 输出文件路径
subtitle_path: 字幕文件路径可选
bgm_path: 背景音乐文件路径可选
options: 其他选项配置可包含以下字段:
- voice_volume: 人声音量默认1.0
- bgm_volume: 背景音乐音量默认0.3
- original_audio_volume: 原始音频音量默认0.0
- keep_original_audio: 是否保留原始音频默认False
- subtitle_font: 字幕字体默认None系统会使用默认字体
- subtitle_font_size: 字幕字体大小默认40
- subtitle_color: 字幕颜色默认白色
- subtitle_bg_color: 字幕背景颜色默认透明
- subtitle_position: 字幕位置可选值'bottom', 'top', 'center'默认'bottom'
- custom_position: 自定义位置
- stroke_color: 描边颜色默认黑色
- stroke_width: 描边宽度默认1
- threads: 处理线程数默认2
- fps: 输出帧率默认30
返回:
输出视频的路径
"""
# 合并选项默认值
if options is None:
options = {}
# 设置默认参数值
voice_volume = options.get('voice_volume', 1.0)
bgm_volume = options.get('bgm_volume', 0.3)
original_audio_volume = options.get('original_audio_volume', 0.0) # 默认为0即不保留原声
keep_original_audio = options.get('keep_original_audio', False) # 是否保留原声
subtitle_font = options.get('subtitle_font', '')
subtitle_font_size = options.get('subtitle_font_size', 40)
subtitle_color = options.get('subtitle_color', '#FFFFFF')
subtitle_bg_color = options.get('subtitle_bg_color', 'transparent')
subtitle_position = options.get('subtitle_position', 'bottom')
custom_position = options.get('custom_position', 70)
stroke_color = options.get('stroke_color', '#000000')
stroke_width = options.get('stroke_width', 1)
threads = options.get('threads', 2)
fps = options.get('fps', 30)
# 处理透明背景色问题 - MoviePy 2.1.1不支持'transparent'值
if subtitle_bg_color == 'transparent':
subtitle_bg_color = None # None在新版MoviePy中表示透明背景
# 创建输出目录(如果不存在)
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
logger.info(f"开始合并素材...")
logger.info(f" ① 视频: {video_path}")
logger.info(f" ② 音频: {audio_path}")
if subtitle_path:
logger.info(f" ③ 字幕: {subtitle_path}")
if bgm_path:
logger.info(f" ④ 背景音乐: {bgm_path}")
logger.info(f" ⑤ 输出: {output_path}")
# 加载视频
try:
video_clip = VideoFileClip(video_path)
logger.info(f"视频尺寸: {video_clip.size[0]}x{video_clip.size[1]}, 时长: {video_clip.duration}")
# 提取视频原声(如果需要)
original_audio = None
if keep_original_audio and original_audio_volume > 0:
try:
original_audio = video_clip.audio
if original_audio:
original_audio = original_audio.with_effects([afx.MultiplyVolume(original_audio_volume)])
logger.info(f"已提取视频原声,音量设置为: {original_audio_volume}")
else:
logger.warning("视频没有音轨,无法提取原声")
except Exception as e:
logger.error(f"提取视频原声失败: {str(e)}")
original_audio = None
# 移除原始音轨,稍后会合并新的音频
video_clip = video_clip.without_audio()
except Exception as e:
logger.error(f"加载视频失败: {str(e)}")
raise
# 处理背景音乐和所有音频轨道合成
audio_tracks = []
# 先添加主音频(配音)
if audio_path and os.path.exists(audio_path):
try:
voice_audio = AudioFileClip(audio_path).with_effects([afx.MultiplyVolume(voice_volume)])
audio_tracks.append(voice_audio)
logger.info(f"已添加配音音频,音量: {voice_volume}")
except Exception as e:
logger.error(f"加载配音音频失败: {str(e)}")
# 添加原声(如果需要)
if original_audio is not None:
audio_tracks.append(original_audio)
logger.info(f"已添加视频原声,音量: {original_audio_volume}")
# 添加背景音乐(如果有)
if bgm_path and os.path.exists(bgm_path):
try:
bgm_clip = AudioFileClip(bgm_path).with_effects([
afx.MultiplyVolume(bgm_volume),
afx.AudioFadeOut(3),
afx.AudioLoop(duration=video_clip.duration),
])
audio_tracks.append(bgm_clip)
logger.info(f"已添加背景音乐,音量: {bgm_volume}")
except Exception as e:
logger.error(f"添加背景音乐失败: \n{traceback.format_exc()}")
# 合成最终的音频轨道
if audio_tracks:
final_audio = CompositeAudioClip(audio_tracks)
video_clip = video_clip.with_audio(final_audio)
logger.info(f"已合成所有音频轨道,共{len(audio_tracks)}")
else:
logger.warning("没有可用的音频轨道,输出视频将没有声音")
# 处理字体路径
font_path = None
if subtitle_path and subtitle_font:
font_path = os.path.join(utils.font_dir(), subtitle_font)
if os.name == "nt":
font_path = font_path.replace("\\", "/")
logger.info(f"使用字体: {font_path}")
# 处理视频尺寸
video_width, video_height = video_clip.size
# 字幕处理函数
def create_text_clip(subtitle_item):
"""创建单个字幕片段"""
phrase = subtitle_item[1]
max_width = video_width * 0.9
# 如果有字体路径,进行文本换行处理
wrapped_txt = phrase
txt_height = 0
if font_path:
wrapped_txt, txt_height = wrap_text(
phrase,
max_width=max_width,
font=font_path,
fontsize=subtitle_font_size
)
# 创建文本片段
try:
_clip = TextClip(
text=wrapped_txt,
font=font_path,
font_size=subtitle_font_size,
color=subtitle_color,
bg_color=subtitle_bg_color, # 这里已经在前面处理过None表示透明
stroke_color=stroke_color,
stroke_width=stroke_width,
)
except Exception as e:
logger.error(f"创建字幕片段失败: {str(e)}, 使用简化参数重试")
# 如果上面的方法失败,尝试使用更简单的参数
_clip = TextClip(
text=wrapped_txt,
font=font_path,
font_size=subtitle_font_size,
color=subtitle_color,
)
# 设置字幕时间
duration = subtitle_item[0][1] - subtitle_item[0][0]
_clip = _clip.with_start(subtitle_item[0][0])
_clip = _clip.with_end(subtitle_item[0][1])
_clip = _clip.with_duration(duration)
# 设置字幕位置
if subtitle_position == "bottom":
_clip = _clip.with_position(("center", video_height * 0.95 - _clip.h))
elif subtitle_position == "top":
_clip = _clip.with_position(("center", video_height * 0.05))
elif subtitle_position == "custom":
margin = 10
max_y = video_height - _clip.h - margin
min_y = margin
custom_y = (video_height - _clip.h) * (custom_position / 100)
custom_y = max(
min_y, min(custom_y, max_y)
)
_clip = _clip.with_position(("center", custom_y))
else: # center
_clip = _clip.with_position(("center", "center"))
return _clip
# 创建TextClip工厂函数
    def make_textclip(text):
        """Factory handed to SubtitlesClip: render plain text with the
        configured font, size and color (no background or stroke)."""
        return TextClip(
            text=text,
            font=font_path,
            font_size=subtitle_font_size,
            color=subtitle_color,
        )
# 处理字幕
if subtitle_path and os.path.exists(subtitle_path):
try:
# 加载字幕文件
sub = SubtitlesClip(
subtitles=subtitle_path,
encoding="utf-8",
make_textclip=make_textclip
)
# 创建每个字幕片段
text_clips = []
for item in sub.subtitles:
clip = create_text_clip(subtitle_item=item)
text_clips.append(clip)
# 合成视频和字幕
video_clip = CompositeVideoClip([video_clip, *text_clips])
logger.info(f"已添加{len(text_clips)}个字幕片段")
except Exception as e:
logger.error(f"处理字幕失败: \n{traceback.format_exc()}")
# 导出最终视频
try:
video_clip.write_videofile(
output_path,
audio_codec="aac",
temp_audiofile_path=output_dir,
threads=threads,
fps=fps,
)
logger.success(f"素材合并完成: {output_path}")
except Exception as e:
logger.error(f"导出视频失败: {str(e)}")
raise
finally:
# 释放资源
video_clip.close()
del video_clip
return output_path
def wrap_text(text, max_width, font="Arial", fontsize=60):
    """
    Wrap text so every rendered line fits within a pixel width.

    Word-based wrapping (splitting on spaces) is tried first; if any single
    word is itself wider than ``max_width`` (typical for CJK text without
    spaces), falls back to character-based wrapping.

    Args:
        text: The text to wrap.
        max_width: Maximum line width in pixels.
        font: Path to a TrueType font file (or a name PIL can resolve).
        fontsize: Font size in points.

    Returns:
        tuple: (wrapped_text, total_height_px) where the height is the
        single-line height multiplied by the number of wrapped lines.
    """
    try:
        font_obj = ImageFont.truetype(font, fontsize)
    except OSError:
        # Fix: was a bare `except:` that also swallowed unrelated errors
        # (e.g. TypeError). Font loading failures raise OSError; fall back
        # to PIL's built-in default font only for those.
        font_obj = ImageFont.load_default()

    def get_text_size(inner_text):
        # Measure the rendered size of the stripped text with this font.
        inner_text = inner_text.strip()
        left, top, right, bottom = font_obj.getbbox(inner_text)
        return right - left, bottom - top

    width, height = get_text_size(text)
    if width <= max_width:
        return text, height

    # --- First pass: wrap on word boundaries ------------------------------
    processed = True
    lines = []
    current = ""
    for word in text.split(" "):
        candidate = current + f"{word} "
        line_width, _ = get_text_size(candidate)
        if line_width <= max_width:
            current = candidate
            continue
        if candidate.strip() == word.strip():
            # A single word alone exceeds the width: word wrapping fails.
            processed = False
            break
        lines.append(current)
        current = f"{word} "
    lines.append(current)

    if processed:
        lines = [line.strip() for line in lines]
        result = "\n".join(lines).strip()
        # Total height = per-line height times the number of lines.
        return result, len(lines) * height

    # --- Fallback: wrap character by character (e.g. CJK text) ------------
    lines = []
    current = ""
    for ch in text:
        current += ch
        line_width, _ = get_text_size(current)
        if line_width > max_width:
            lines.append(current)
            current = ""
    lines.append(current)
    result = "\n".join(lines).strip()
    return result, len(lines) * height
if __name__ == '__main__':
    # Manual smoke test: merge a pre-cut video with its narration audio,
    # merged subtitles and background music. All paths are developer-local.
    merger_mp4 = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger.mp4'
    merger_sub = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merged_subtitle_00_00_00-00_01_30.srt'
    merger_audio = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger_audio.mp3'
    bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
    output_video = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/combined_test.mp4'

    # Example invocation
    options = {
        'voice_volume': 1.0,  # narration audio volume
        'bgm_volume': 0.1,  # background-music volume
        'original_audio_volume': 1.0,  # original video sound volume (0 = drop it)
        'keep_original_audio': True,  # whether to keep the original sound
        'subtitle_font': 'MicrosoftYaHeiNormal.ttc',  # relative font name, resolved under font_dir()
        'subtitle_font_size': 40,
        'subtitle_color': '#FFFFFF',
        'subtitle_bg_color': None,  # None means a transparent subtitle background
        'subtitle_position': 'bottom',
        'threads': 2
    }

    try:
        merge_materials(
            video_path=merger_mp4,
            audio_path=merger_audio,
            subtitle_path=merger_sub,
            bgm_path=bgm_path,
            output_path=output_video,
            options=options
        )
    except Exception as e:
        logger.error(f"合并素材失败: \n{traceback.format_exc()}")

View File

@ -7,7 +7,7 @@ from typing import List
from loguru import logger
from openai import OpenAI
from openai import AzureOpenAI
from moviepy.editor import VideoFileClip
from moviepy import VideoFileClip
from openai.types.chat import ChatCompletion
import google.generativeai as gemini
from googleapiclient.errors import ResumableUploadError

View File

@ -4,9 +4,10 @@ import random
import traceback
from urllib.parse import urlencode
from datetime import datetime
import json
import requests
from typing import List
from typing import List, Optional
from loguru import logger
from moviepy.video.io.VideoFileClip import VideoFileClip
@ -306,7 +307,50 @@ def format_timestamp(seconds: float) -> str:
return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
def _detect_hardware_acceleration() -> Optional[str]:
    """
    Detect which ffmpeg hardware accelerator is usable on this system.

    Probes, in priority order, NVIDIA CUDA, macOS VideoToolbox and Intel
    Quick Sync by asking ffmpeg to decode a dummy input with each
    accelerator enabled.

    Returns:
        Optional[str]: The ffmpeg ``-hwaccel`` value ("cuda",
        "videotoolbox" or "qsv"), or None when no accelerator works.
    """
    # The three probes shared an identical command shape, so loop over the
    # candidate accelerators instead of repeating three try/except blocks.
    for accel in ("cuda", "videotoolbox", "qsv"):
        try:
            probe = subprocess.run(
                ["ffmpeg", "-hwaccel", accel, "-i", "/dev/null", "-f", "null", "-"],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
            )
            if probe.returncode == 0:
                return accel
        except Exception:
            # ffmpeg missing or the probe crashed: try the next accelerator.
            pass
    return None
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> str:
"""
保存剪辑后的视频
@ -328,29 +372,43 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
if not os.path.exists(save_dir):
os.makedirs(save_dir)
# 生成更规范的视频文件名
video_id = f"vid-{timestamp.replace(':', '-').replace(',', '_')}"
video_path = os.path.join(save_dir, f"{video_id}.mp4")
# 解析时间戳
start_str, end_str = timestamp.split('-')
# 格式化输出文件名(使用连字符替代冒号和逗号)
safe_start_time = start_str.replace(':', '-').replace(',', '-')
safe_end_time = end_str.replace(':', '-').replace(',', '-')
output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4"
video_path = os.path.join(save_dir, output_filename)
# 如果视频已存在,直接返回
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
logger.info(f"video already exists: {video_path}")
return {timestamp: video_path}
logger.info(f"视频已存在: {video_path}")
return video_path
try:
# 加载视频获取总时长
video = VideoFileClip(origin_video)
total_duration = video.duration
# 检查视频是否存在
if not os.path.exists(origin_video):
logger.error(f"源视频文件不存在: {origin_video}")
return ''
# 获取视频总时长
try:
probe_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1", origin_video]
total_duration = float(subprocess.check_output(probe_cmd).decode('utf-8').strip())
except subprocess.CalledProcessError as e:
logger.error(f"获取视频时长失败: {str(e)}")
return ''
# 解析时间戳
start_str, end_str = timestamp.split('-')
# 计算时间点
start = time_to_seconds(start_str)
end = time_to_seconds(end_str)
# 验证时间段
if start >= total_duration:
logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
video.close()
return {}
return ''
if end > total_duration:
logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒),将自动调整为视频结尾")
@ -358,55 +416,74 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
if end <= start:
logger.warning(f"结束时间 {format_timestamp(end)} 必须大于起始时间 {format_timestamp(start)}")
video.close()
return {}
return ''
# 剪辑视频
# 计算剪辑时长
duration = end - start
logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}")
# logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}")
# 剪辑视频
subclip = video.subclip(start, end)
# 检测可用的硬件加速选项
hwaccel = _detect_hardware_acceleration()
hwaccel_args = []
if hwaccel:
hwaccel_args = ["-hwaccel", hwaccel]
logger.info(f"使用硬件加速: {hwaccel}")
try:
# 检查视频是否有音频轨道并写入文件
subclip.write_videofile(
video_path,
codec='libx264',
audio_codec='aac',
temp_audiofile='temp-audio.m4a',
remove_temp=True,
audio=(subclip.audio is not None),
logger=None
)
# 验证生成的视频文件
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
with VideoFileClip(video_path) as clip:
if clip.duration > 0 and clip.fps > 0:
return {timestamp: video_path}
raise ValueError("视频文件验证失败")
except Exception as e:
logger.warning(f"视频文件处理失败: {video_path} => {str(e)}")
# 转换为FFmpeg兼容的时间格式逗号替换为点
ffmpeg_start_time = start_str.replace(',', '.')
ffmpeg_end_time = end_str.replace(',', '.')
# 构建FFmpeg命令
ffmpeg_cmd = [
"ffmpeg", "-y", *hwaccel_args,
"-i", origin_video,
"-ss", ffmpeg_start_time,
"-to", ffmpeg_end_time,
"-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264",
"-c:a", "aac",
"-strict", "experimental",
video_path
]
# 执行FFmpeg命令
# logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}")
# logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}")
process = subprocess.run(
ffmpeg_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=False # 不抛出异常,我们会检查返回码
)
# 检查是否成功
if process.returncode != 0:
logger.error(f"视频剪辑失败: {process.stderr}")
if os.path.exists(video_path):
os.remove(video_path)
return ''
# 验证生成的视频文件
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
# 检查视频是否可播放
probe_cmd = ["ffprobe", "-v", "error", video_path]
validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if validate_result.returncode == 0:
logger.info(f"视频剪辑成功: {video_path}")
return video_path
except Exception as e:
logger.warning(f"视频剪辑失败: \n{str(traceback.format_exc())}")
logger.error("视频文件验证失败")
if os.path.exists(video_path):
os.remove(video_path)
finally:
# 确保视频对象被正确关闭
try:
video.close()
if 'subclip' in locals():
subclip.close()
except:
pass
return {}
return ''
except Exception as e:
logger.error(f"视频剪辑过程中发生错误: \n{str(traceback.format_exc())}")
if os.path.exists(video_path):
os.remove(video_path)
return ''
def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict:
@ -428,8 +505,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro
try:
saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory)
if saved_video_path:
logger.info(f"video saved: {saved_video_path}")
video_paths.update(saved_video_path)
video_paths.update({index+1:saved_video_path})
# 更新进度
if progress_callback:
@ -439,6 +515,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro
return {}
logger.success(f"裁剪 {len(video_paths)} videos")
# logger.debug(json.dumps(video_paths, indent=4, ensure_ascii=False))
return video_paths

View File

@ -0,0 +1,555 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : merger_video
@Author : 小林同学
@Date : 2025/5/6 下午7:38
'''
import os
import shutil
import subprocess
from enum import Enum
from typing import List, Optional, Tuple
from loguru import logger
class VideoAspect(Enum):
    """Supported video aspect ratios and their standard resolutions."""
    landscape = "16:9"    # horizontal 16:9
    landscape_2 = "4:3"   # horizontal 4:3
    portrait = "9:16"     # vertical 9:16
    portrait_2 = "3:4"    # vertical 3:4
    square = "1:1"        # square 1:1

    def to_resolution(self) -> Tuple[int, int]:
        """Return the standard (width, height) for this aspect ratio."""
        resolutions = {
            VideoAspect.portrait: (1080, 1920),
            VideoAspect.portrait_2: (720, 1280),
            VideoAspect.landscape: (1920, 1080),
            VideoAspect.landscape_2: (1280, 720),
            VideoAspect.square: (1080, 1080),
        }
        # Anything unrecognized falls back to the default portrait size.
        return resolutions.get(self, (1080, 1920))
def check_ffmpeg_installation() -> bool:
    """
    Check whether ffmpeg is installed and reachable on PATH.

    Returns:
        bool: True when ``ffmpeg -version`` runs successfully, else False.
    """
    try:
        subprocess.run(
            ['ffmpeg', '-version'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except (subprocess.SubprocessError, FileNotFoundError):
        # Binary missing from PATH or the invocation itself failed.
        logger.error("ffmpeg未安装或不在系统PATH中请安装ffmpeg")
        return False
    return True
def get_hardware_acceleration_option() -> Optional[str]:
    """
    Pick a hardware acceleration backend suited to the current system.

    Queries ffmpeg's supported hwaccel list and returns the first match in
    priority order: cuda, nvenc, qsv (Intel Quick Sync), videotoolbox
    (macOS), vaapi (Linux).

    Returns:
        Optional[str]: The accelerator name, or None for software encoding.
    """
    try:
        hwaccel_query = subprocess.run(
            ['ffmpeg', '-hide_banner', '-hwaccels'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        available = hwaccel_query.stdout.lower()
        # Return the highest-priority accelerator that ffmpeg reports.
        for candidate in ('cuda', 'nvenc', 'qsv', 'videotoolbox', 'vaapi'):
            if candidate in available:
                return candidate
        logger.info("没有找到支持的硬件加速器,将使用软件编码")
        return None
    except Exception as e:
        logger.warning(f"检测硬件加速器时出错: {str(e)},将使用软件编码")
        return None
def check_video_has_audio(video_path: str) -> bool:
    """
    Determine whether a video file contains an audio stream.

    Args:
        video_path: Path to the video file.

    Returns:
        bool: True when the first audio stream exists; False when the file
        is missing, has no audio, or probing fails.
    """
    if not os.path.exists(video_path):
        logger.warning(f"视频文件不存在: {video_path}")
        return False

    # Ask ffprobe for the codec type of the first audio stream only.
    probe_cmd = [
        'ffprobe', '-v', 'error',
        '-select_streams', 'a:0',
        '-show_entries', 'stream=codec_type',
        '-of', 'csv=p=0',
        video_path
    ]
    try:
        probe = subprocess.run(
            probe_cmd,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            text=True, check=False
        )
    except Exception as e:
        logger.warning(f"检测视频音频流时出错: {str(e)}")
        return False
    # ffprobe prints "audio" exactly when an audio stream is present.
    return probe.stdout.strip() == 'audio'
def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> str:
    """
    Write the list file required by ffmpeg's concat demuxer.

    Args:
        video_paths: Paths of the clips to concatenate, in playback order.
        concat_file_path: Where to write the generated list file.

    Returns:
        str: The path of the written concat file.
    """
    with open(concat_file_path, 'w', encoding='utf-8') as list_file:
        for clip_path in video_paths:
            # Always write absolute paths so ffmpeg resolves them correctly.
            entry = os.path.abspath(clip_path)
            if os.name == 'nt':
                # Windows: ffmpeg prefers forward slashes.
                entry = entry.replace('\\', '/')
            else:
                # Unix/macOS: escape backslashes and colons.
                entry = entry.replace('\\', '\\\\').replace(':', '\\:')
            # Escape any single quotes contained in the path.
            entry = entry.replace("'", "\\'")
            list_file.write(f"file '{entry}'\n")
    return concat_file_path
def process_single_video(
    input_path: str,
    output_path: str,
    target_width: int,
    target_height: int,
    keep_audio: bool = True,
    hwaccel: Optional[str] = None
) -> str:
    """
    Re-encode a single video to the target resolution and frame rate.

    The frame is scaled to fit within (target_width, target_height) while
    keeping its aspect ratio, padded (letter/pillar-boxed) to the exact
    target size, forced to 30 fps, and encoded with the encoder matching
    the chosen acceleration backend.

    Args:
        input_path: Source video path.
        output_path: Destination video path.
        target_width: Output frame width in pixels.
        target_height: Output frame height in pixels.
        keep_audio: Whether to keep the audio track (automatically dropped
            when the source has no audio stream).
        hwaccel: Hardware acceleration backend name, or None for software.

    Returns:
        str: The output video path.

    Raises:
        FileNotFoundError: If the input file does not exist.
        RuntimeError: If the ffmpeg invocation fails.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"找不到视频文件: {input_path}")

    # Base command; -y overwrites any existing output file.
    command = ['ffmpeg', '-y']

    # Hardware-decode flags (must precede the -i input argument).
    if hwaccel:
        if hwaccel == 'cuda' or hwaccel == 'nvenc':
            command.extend(['-hwaccel', 'cuda'])
        elif hwaccel == 'qsv':
            command.extend(['-hwaccel', 'qsv'])
        elif hwaccel == 'videotoolbox':
            command.extend(['-hwaccel', 'videotoolbox'])
        elif hwaccel == 'vaapi':
            command.extend(['-hwaccel', 'vaapi', '-vaapi_device', '/dev/dri/renderD128'])

    # Input file.
    command.extend(['-i', input_path])

    # Audio handling.
    if not keep_audio:
        command.extend(['-an'])  # strip audio
    else:
        # Only encode audio when the source actually has an audio stream.
        has_audio = check_video_has_audio(input_path)
        if has_audio:
            command.extend(['-c:a', 'aac', '-b:a', '128k'])  # encode audio as AAC
        else:
            logger.warning(f"视频 {input_path} 没有音频流,将会忽略音频设置")
            command.extend(['-an'])  # source has no audio: drop audio settings

    # Video filters: scale to fit, then pad to the exact target size.
    scale_filter = f"scale={target_width}:{target_height}:force_original_aspect_ratio=decrease"
    pad_filter = f"pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2"

    command.extend([
        '-vf', f"{scale_filter},{pad_filter}",
        '-r', '30',  # normalize frame rate to 30 fps
    ])

    # Pick the encoder that matches the detected acceleration backend.
    if hwaccel == 'cuda' or hwaccel == 'nvenc':
        command.extend(['-c:v', 'h264_nvenc', '-preset', 'p4', '-profile:v', 'high'])
    elif hwaccel == 'qsv':
        command.extend(['-c:v', 'h264_qsv', '-preset', 'medium'])
    elif hwaccel == 'videotoolbox':
        command.extend(['-c:v', 'h264_videotoolbox', '-profile:v', 'high'])
    elif hwaccel == 'vaapi':
        command.extend(['-c:v', 'h264_vaapi', '-profile', '100'])
    else:
        command.extend(['-c:v', 'libx264', '-preset', 'medium', '-profile:v', 'high'])

    # Bitrate and pixel-format settings.
    command.extend([
        '-b:v', '5M',
        '-maxrate', '8M',
        '-bufsize', '10M',
        '-pix_fmt', 'yuv420p',  # widely compatible pixel format
    ])

    # Output file.
    command.append(output_path)

    # Run ffmpeg.
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"处理视频失败: {e.stderr.decode() if e.stderr else str(e)}")
        raise RuntimeError(f"处理视频失败: {str(e)}")
def combine_clip_videos(
    output_video_path: str,
    video_paths: List[str],
    video_ost_list: List[int],
    video_aspect: VideoAspect = VideoAspect.portrait,
    threads: int = 4,
) -> str:
    """
    Merge clipped sub-videos into a single video.

    Pipeline: (1) normalize every clip to the target resolution/fps into a
    temp dir; (2) concat the video streams without audio; (3) extract the
    audio of clips flagged to keep original sound, delay each to its
    timeline position, mix them over a silent base track; (4) mux the
    concatenated video with the mixed audio. On a subprocess failure a
    simpler audio-less concat is attempted as a fallback.

    Args:
        output_video_path: Where to store the merged video.
        video_paths: Paths of the sub-videos, in playback order.
        video_ost_list: Original-sound flags per clip
            (0: drop original audio, 1: keep only original audio,
            2: keep original audio plus narration).
        video_aspect: Target aspect ratio.
        threads: Number of ffmpeg threads for the video concat step.

    Returns:
        str: Path of the merged video.

    Raises:
        RuntimeError: When ffmpeg is missing or all merge attempts fail.
        ValueError: When no valid clip can be merged.
    """
    # ffmpeg must be available before anything else.
    if not check_ffmpeg_installation():
        raise RuntimeError("未找到ffmpeg请先安装")

    # Prepare the output directory.
    output_dir = os.path.dirname(output_video_path)
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the target resolution from the aspect ratio.
    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()

    # Detect usable hardware acceleration.
    hwaccel = get_hardware_acceleration_option()
    if hwaccel:
        logger.info(f"将使用 {hwaccel} 硬件加速")

    # Re-group clip paths and sound flags into one list of segment configs.
    video_segments = []

    # The two input lists must line up; trim both to the shorter length.
    if len(video_paths) != len(video_ost_list):
        logger.warning(f"视频路径列表({len(video_paths)})和原声设置列表({len(video_ost_list)})长度不匹配")
        min_length = min(len(video_paths), len(video_ost_list))
        video_paths = video_paths[:min_length]
        video_ost_list = video_ost_list[:min_length]

    # Build one processing config per clip.
    for i, (video_path, video_ost) in enumerate(zip(video_paths, video_ost_list)):
        if not os.path.exists(video_path):
            logger.warning(f"视频不存在,跳过: {video_path}")
            continue

        # Whether the clip actually contains an audio stream.
        has_audio = check_video_has_audio(video_path)

        segment = {
            "index": i,
            "path": video_path,
            "ost": video_ost,
            "has_audio": has_audio,
            "keep_audio": video_ost > 0 and has_audio  # keep only when requested AND present
        }

        if video_ost > 0 and not has_audio:
            logger.warning(f"视频 {video_path} 设置为保留原声(ost={video_ost}),但该视频没有音频流")

        video_segments.append(segment)

    # Intermediate files live in a temp dir that is removed in `finally`.
    processed_videos = []
    temp_dir = os.path.join(output_dir, "temp_videos")
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Phase 1: normalize every clip into an intermediate file.
        for segment in video_segments:
            temp_output = os.path.join(temp_dir, f"processed_{segment['index']}.mp4")
            try:
                process_single_video(
                    input_path=segment['path'],
                    output_path=temp_output,
                    target_width=video_width,
                    target_height=video_height,
                    keep_audio=segment['keep_audio'],
                    hwaccel=hwaccel
                )
                processed_videos.append({
                    "index": segment["index"],
                    "path": temp_output,
                    "keep_audio": segment["keep_audio"]
                })
                logger.info(f"视频 {segment['index'] + 1}/{len(video_segments)} 处理完成")
            except Exception as e:
                # A single bad clip is skipped, not fatal.
                logger.error(f"处理视频 {segment['path']} 时出错: {str(e)}")
                continue

        if not processed_videos:
            raise ValueError("没有有效的视频片段可以合并")

        # Restore original playback order after the per-clip processing.
        processed_videos.sort(key=lambda x: x["index"])

        # Phase 2: stepwise merge, avoiding one giant filter_complex graph.
        try:
            # 1. Concat all video streams (audio excluded) into one file.
            video_paths_only = [video["path"] for video in processed_videos]
            video_concat_path = os.path.join(temp_dir, "video_concat.mp4")

            # concat-demuxer list file for the video merge.
            concat_file = os.path.join(temp_dir, "concat_list.txt")
            create_ffmpeg_concat_file(video_paths_only, concat_file)

            # Merge all video streams with no audio track.
            concat_cmd = [
                'ffmpeg', '-y',
                '-f', 'concat',
                '-safe', '0',
                '-i', concat_file,
                '-c:v', 'libx264',
                '-preset', 'medium',
                '-profile:v', 'high',
                '-an',  # no audio in this pass
                '-threads', str(threads),
                video_concat_path
            ]

            subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频流合并完成")

            # 2. Extract audio from the clips that keep their original sound.
            audio_segments = [video for video in processed_videos if video["keep_audio"]]

            if not audio_segments:
                # No audio anywhere: the silent concat IS the final result.
                shutil.copy(video_concat_path, output_video_path)
                logger.info("无音频视频合并完成")
                return output_video_path

            # Per-clip audio intermediates.
            audio_files = []
            for i, segment in enumerate(audio_segments):
                audio_file = os.path.join(temp_dir, f"audio_{i}.aac")
                extract_audio_cmd = [
                    'ffmpeg', '-y',
                    '-i', segment["path"],
                    '-vn',  # no video in this pass
                    '-c:a', 'aac',
                    '-b:a', '128k',
                    audio_file
                ]

                subprocess.run(extract_audio_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                audio_files.append({
                    "index": segment["index"],
                    "path": audio_file
                })
                logger.info(f"提取音频 {i+1}/{len(audio_segments)} 完成")

            # 3. Compute each audio segment's start offset on the timeline
            #    by accumulating the durations of all preceding clips.
            audio_timings = []
            current_time = 0.0

            for i, video in enumerate(processed_videos):
                duration_cmd = [
                    'ffprobe', '-v', 'error',
                    '-show_entries', 'format=duration',
                    '-of', 'csv=p=0',
                    video["path"]
                ]
                result = subprocess.run(duration_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                duration = float(result.stdout.strip())

                # If this clip keeps its audio, remember where it starts.
                if video["keep_audio"]:
                    for audio in audio_files:
                        if audio["index"] == video["index"]:
                            audio_timings.append({
                                "file": audio["path"],
                                "start": current_time,
                                "index": video["index"]
                            })
                            break

                current_time += duration

            # 4. A silent track the length of the full video is the mix base.
            silence_audio = os.path.join(temp_dir, "silence.aac")
            create_silence_cmd = [
                'ffmpeg', '-y',
                '-f', 'lavfi',
                '-i', f'anullsrc=r=44100:cl=stereo',
                '-t', str(current_time),  # full timeline duration
                '-c:a', 'aac',
                '-b:a', '128k',
                silence_audio
            ]
            subprocess.run(create_silence_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # 5. Write the filter script that delays each audio segment to
            #    its timeline position and mixes everything together.
            filter_script = os.path.join(temp_dir, "filter_script.txt")
            with open(filter_script, 'w') as f:
                f.write(f"[0:a]volume=0.0[silence];\n")  # mute the base track

                # adelay shifts each extracted segment to its start offset.
                for i, timing in enumerate(audio_timings):
                    f.write(f"[{i+1}:a]adelay={int(timing['start']*1000)}|{int(timing['start']*1000)}[a{i}];\n")

                # amix combines the silent base with all delayed segments.
                mix_str = "[silence]"
                for i in range(len(audio_timings)):
                    mix_str += f"[a{i}]"
                mix_str += f"amix=inputs={len(audio_timings)+1}:duration=longest[aout]"
                f.write(mix_str)

            # 6. Run the audio mix: silence first, then each segment input.
            audio_inputs = ['-i', silence_audio]
            for timing in audio_timings:
                audio_inputs.extend(['-i', timing["file"]])

            mixed_audio = os.path.join(temp_dir, "mixed_audio.aac")
            audio_mix_cmd = [
                'ffmpeg', '-y'
            ] + audio_inputs + [
                '-filter_complex_script', filter_script,
                '-map', '[aout]',
                '-c:a', 'aac',
                '-b:a', '128k',
                mixed_audio
            ]

            subprocess.run(audio_mix_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("音频混合完成")

            # 7. Mux the concatenated video with the mixed audio track.
            final_cmd = [
                'ffmpeg', '-y',
                '-i', video_concat_path,
                '-i', mixed_audio,
                '-c:v', 'copy',
                '-c:a', 'aac',
                '-map', '0:v:0',
                '-map', '1:a:0',
                '-shortest',
                output_video_path
            ]

            subprocess.run(final_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频最终合并完成")
            return output_video_path

        except subprocess.CalledProcessError as e:
            logger.error(f"合并视频过程中出错: {e.stderr.decode() if e.stderr else str(e)}")

            # Fallback: the simplest possible merge, video only, no re-encode.
            logger.info("尝试备用合并方法 - 无音频合并")
            try:
                concat_file = os.path.join(temp_dir, "concat_list.txt")
                video_paths_only = [video["path"] for video in processed_videos]
                create_ffmpeg_concat_file(video_paths_only, concat_file)

                backup_cmd = [
                    'ffmpeg', '-y',
                    '-f', 'concat',
                    '-safe', '0',
                    '-i', concat_file,
                    '-c:v', 'copy',
                    '-an',  # no audio
                    output_video_path
                ]

                subprocess.run(backup_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                logger.warning("使用备用方法(无音频)成功合并视频")
                return output_video_path
            except Exception as backup_error:
                logger.error(f"备用合并方法也失败: {str(backup_error)}")
                raise RuntimeError(f"无法合并视频: {str(backup_error)}")

    except Exception as e:
        logger.error(f"合并视频时出错: {str(e)}")
        raise
    finally:
        # Always remove the intermediate files, even on failure.
        try:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
                logger.info("已清理临时文件")
        except Exception as e:
            logger.warning(f"清理临时文件时出错: {str(e)}")
if __name__ == '__main__':
    # Manual smoke test with developer-local clip paths: merge six clips,
    # keeping original audio for clips 1, 3 and 6 (ost flag = 1).
    video_paths = [
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-00-00-00-00-26.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-01-15-00-01-29.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-04-58-00-05-20.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-05-45-00-05-53.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4'
    ]
    combine_clip_videos(
        output_video_path="/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/merged_123.mp4",
        video_paths=video_paths,
        video_ost_list=[1, 0, 1, 0, 0, 1],
        video_aspect=VideoAspect.portrait
    )

View File

@ -3,10 +3,11 @@ import json
import time
import asyncio
import requests
from app.utils import video_processor
from loguru import logger
from typing import List, Dict, Any, Callable
from app.utils import utils, gemini_analyzer, video_processor, video_processor_v2
from app.utils import utils, gemini_analyzer, video_processor
from app.utils.script_generator import ScriptProcessor
from app.config import config
@ -21,6 +22,7 @@ class ScriptGenerator:
video_path: str,
video_theme: str = "",
custom_prompt: str = "",
frame_interval_input: int = 5,
skip_seconds: int = 0,
threshold: int = 30,
vision_batch_size: int = 5,
@ -105,20 +107,13 @@ class ScriptGenerator:
os.makedirs(video_keyframes_dir, exist_ok=True)
try:
if config.frames.get("version") == "v2":
processor = video_processor_v2.VideoProcessor(video_path)
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds,
threshold=threshold
)
else:
processor = video_processor.VideoProcessor(video_path)
processor.process_video(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds
)
processor = video_processor.VideoProcessor(video_path)
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds,
threshold=threshold
)
for filename in sorted(os.listdir(video_keyframes_dir)):
if filename.endswith('.jpg'):
keyframe_files.append(os.path.join(video_keyframes_dir, filename))

View File

@ -4,11 +4,11 @@ import re
import traceback
from typing import Optional
from faster_whisper import WhisperModel
# from faster_whisper import WhisperModel
from timeit import default_timer as timer
from loguru import logger
import google.generativeai as genai
from moviepy.editor import VideoFileClip
from moviepy import VideoFileClip
import os
from app.config import config
@ -33,7 +33,7 @@ def create(audio_file, subtitle_file: str = ""):
"""
global model, device, compute_type
if not model:
model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2"
model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v3"
model_bin_file = f"{model_path}/model.bin"
if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
logger.error(
@ -45,12 +45,25 @@ def create(audio_file, subtitle_file: str = ""):
)
return None
# 尝试使用 CUDA如果失败则回退到 CPU
# 首先使用CPU模式不触发CUDA检查
use_cuda = False
try:
import torch
if torch.cuda.is_available():
# 在函数中延迟导入torch而不是在全局范围内
# 使用安全的方式检查CUDA可用性
def check_cuda_available():
try:
import torch
return torch.cuda.is_available()
except (ImportError, RuntimeError) as e:
logger.warning(f"检查CUDA可用性时出错: {e}")
return False
# 仅当明确需要时才检查CUDA
use_cuda = check_cuda_available()
if use_cuda:
logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
try:
logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
model = WhisperModel(
model_size_or_path=model_path,
device="cuda",
@ -63,18 +76,18 @@ def create(audio_file, subtitle_file: str = ""):
except Exception as e:
logger.warning(f"CUDA 加载失败,错误信息: {str(e)}")
logger.warning("回退到 CPU 模式")
device = "cpu"
compute_type = "int8"
use_cuda = False
else:
logger.info("未检测到 CUDA使用 CPU 模式")
device = "cpu"
compute_type = "int8"
except ImportError:
logger.warning("未安装 torch使用 CPU 模式")
logger.info("使用 CPU 模式")
except Exception as e:
logger.warning(f"CUDA检查过程出错: {e}")
logger.warning("默认使用CPU模式")
use_cuda = False
# 如果CUDA不可用或加载失败使用CPU
if not use_cuda:
device = "cpu"
compute_type = "int8"
if device == "cpu":
logger.info(f"使用 CPU 加载模型: {model_path}")
model = WhisperModel(
model_size_or_path=model_path,
@ -403,7 +416,7 @@ def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "")
logger.info("音频提取完成,开始生成字幕")
# 使用create函数生成字幕
create(audio_file, subtitle_file)
create("/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav", subtitle_file)
# 删除临时音频文件
if os.path.exists(audio_file):
@ -422,8 +435,8 @@ if __name__ == "__main__":
task_id = "123456"
task_dir = utils.task_dir(task_id)
subtitle_file = f"{task_dir}/subtitle_123456.srt"
audio_file = f"{task_dir}/audio.wav"
video_file = "/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_1702.mp4"
audio_file = "/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav"
video_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/qyn2-2-720p.mp4"
extract_audio_and_create_subtitle(video_file, subtitle_file)

View File

@ -0,0 +1,202 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : subtitle_merger
@Author : viccy
@Date : 2025/5/6 下午4:00
'''
import re
import os
from datetime import datetime, timedelta
def parse_time(time_str):
    """Parse an SRT timestamp ("HH:MM:SS,mmm") into a timedelta."""
    # Split off the millisecond part first, then the clock fields.
    clock_part, ms_part = time_str.split(',')
    hours, minutes, seconds = clock_part.split(':')
    return timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(seconds),
        milliseconds=int(ms_part),
    )
def format_time(td):
    """Format a timedelta as an SRT timestamp ("HH:MM:SS,mmm")."""
    whole_seconds = int(td.total_seconds())
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Milliseconds come from the sub-second component of the timedelta.
    millis = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
def parse_edited_time_range(time_range_str):
    """
    Extract (start, end) timedeltas from an "HH:MM:SS-HH:MM:SS" range.

    Args:
        time_range_str: Range string such as "00:01:15-00:04:40"; may be
            empty or None.

    Returns:
        tuple: (start, end) as timedelta objects, or (None, None) when the
        input is empty or malformed (wrong shape or non-numeric fields).
    """
    if not time_range_str:
        return None, None

    parts = time_range_str.split('-')
    if len(parts) != 2:
        return None, None

    start_time_str, end_time_str = parts
    try:
        # Each side must be exactly HH:MM:SS with integer fields.
        start_h, start_m, start_s = map(int, start_time_str.split(':'))
        end_h, end_m, end_s = map(int, end_time_str.split(':'))
    except ValueError:
        # Fix: non-numeric or mis-shaped fields previously raised ValueError
        # and crashed callers that expect (None, None) for bad input.
        return None, None

    start_time = timedelta(hours=start_h, minutes=start_m, seconds=start_s)
    end_time = timedelta(hours=end_h, minutes=end_m, seconds=end_s)

    return start_time, end_time
def merge_subtitle_files(subtitle_items, output_file=None):
    """
    Merge multiple SRT subtitle files into one, shifting each file's cue
    times by the start of its editedTimeRange on the final timeline.

    Args:
        subtitle_items: List of dicts, each with a 'subtitle' file path and
            an 'editedTimeRange' ("HH:MM:SS-HH:MM:SS") giving the clip's
            position on the merged timeline.
        output_file: Output path; when None a name is derived from the
            first item's start and the last item's end time.

    Returns:
        Path of the merged subtitle file.
    """
    # Sort items by the start of their editedTimeRange; unparseable ranges
    # sort first (timedelta() == 0) and are skipped in the loop below.
    sorted_items = sorted(subtitle_items,
                          key=lambda x: parse_edited_time_range(x.get('editedTimeRange', ''))[0] or timedelta())

    merged_subtitles = []
    subtitle_index = 1  # cues are renumbered sequentially across all files

    for item in sorted_items:
        if not item.get('subtitle') or not os.path.exists(item.get('subtitle')):
            continue

        # Offset every cue in this file by the clip's timeline start.
        offset_time, _ = parse_edited_time_range(item.get('editedTimeRange', ''))
        if offset_time is None:
            print(f"警告: 无法从项目 {item.get('_id')} 的editedTimeRange中提取时间范围跳过该项")
            continue

        with open(item['subtitle'], 'r', encoding='utf-8') as file:
            content = file.read()

        # Split the SRT content into blank-line separated cue blocks.
        subtitle_blocks = re.split(r'\n\s*\n', content.strip())

        for block in subtitle_blocks:
            lines = block.strip().split('\n')
            if len(lines) < 3:  # index line + time line + at least one text line
                continue

            # Parse the "start --> end" timing line.
            time_line = lines[1]
            time_parts = time_line.split(' --> ')
            if len(time_parts) != 2:
                continue

            start_time = parse_time(time_parts[0])
            end_time = parse_time(time_parts[1])

            # Apply the timeline offset.
            adjusted_start_time = start_time + offset_time
            adjusted_end_time = end_time + offset_time

            # Rebuild the cue block with a fresh sequential index.
            adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}"
            text_lines = lines[2:]

            new_block = [
                str(subtitle_index),
                adjusted_time_line,
                *text_lines
            ]
            merged_subtitles.append('\n'.join(new_block))
            subtitle_index += 1

    # Derive the output path from first start / last end when not given.
    if output_file is None:
        dir_path = os.path.dirname(sorted_items[0]['subtitle'])
        first_start = parse_edited_time_range(sorted_items[0]['editedTimeRange'])[0]
        last_end = parse_edited_time_range(sorted_items[-1]['editedTimeRange'])[1]
        first_start_h, first_start_m, first_start_s = int(first_start.seconds // 3600), int((first_start.seconds % 3600) // 60), int(first_start.seconds % 60)
        last_end_h, last_end_m, last_end_s = int(last_end.seconds // 3600), int((last_end.seconds % 3600) // 60), int(last_end.seconds % 60)
        first_start_str = f"{first_start_h:02d}_{first_start_m:02d}_{first_start_s:02d}"
        last_end_str = f"{last_end_h:02d}_{last_end_m:02d}_{last_end_s:02d}"
        output_file = os.path.join(dir_path, f"merged_subtitle_{first_start_str}-{last_end_str}.srt")

    # Join all cue blocks with blank lines and write the merged file.
    merged_content = '\n\n'.join(merged_subtitles)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(merged_content)

    return output_file
if __name__ == '__main__':
    # Sample script data: each entry carries narration text plus the clip's
    # source range and its edited position on the final timeline, with
    # developer-local audio/subtitle paths.
    test_data = [
        {'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
         'timestamp': '00:00:00-00:01:15',
         'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
         'OST': 0,
         '_id': 1,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
         'sourceTimeRange': '00:00:00-00:00:26',
         'duration': 26,
         'editedTimeRange': '00:00:00-00:00:26'
         },
        {'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!',
         'timestamp': '00:01:15-00:04:40',
         'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
         'OST': 0,
         '_id': 2,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
         'sourceTimeRange': '00:01:15-00:01:29',
         'duration': 14,
         'editedTimeRange': '00:00:26-00:00:40'
         },
        {'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
         'timestamp': '00:04:58-00:05:45',
         'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
         'OST': 0,
         '_id': 4,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
         'sourceTimeRange': '00:04:58-00:05:20',
         'duration': 22,
         'editedTimeRange': '00:00:57-00:01:19'
         },
        {'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
         'timestamp': '00:05:45-00:06:00',
         'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
         'OST': 0,
         '_id': 5,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
         'sourceTimeRange': '00:05:45-00:05:53',
         'duration': 8,
         'editedTimeRange': '00:01:19-00:01:27'
         }
    ]
    # Merge the per-clip subtitle files onto the edited timeline.
    output_file = merge_subtitle_files(test_data)
    print(f"字幕文件已合并至: {output_file}")

View File

@ -9,167 +9,177 @@ from loguru import logger
from app.config import config
from app.models import const
from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams
from app.services import llm, material, subtitle, video, voice, audio_merger
from app.services import (llm, material, subtitle, video, voice, audio_merger,
subtitle_merger, clip_video, merger_video, update_script, generate_video)
from app.services import state as sm
from app.utils import utils
def generate_script(task_id, params):
logger.info("\n\n## generating video script")
video_script = params.video_script.strip()
if not video_script:
video_script = llm.generate_script(
video_subject=params.video_subject,
language=params.video_language,
paragraph_number=params.paragraph_number,
)
else:
logger.debug(f"video script: \n{video_script}")
# def generate_script(task_id, params):
# logger.info("\n\n## generating video script")
# video_script = params.video_script.strip()
# if not video_script:
# video_script = llm.generate_script(
# video_subject=params.video_subject,
# language=params.video_language,
# paragraph_number=params.paragraph_number,
# )
# else:
# logger.debug(f"video script: \n{video_script}")
if not video_script:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video script.")
return None
# if not video_script:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error("failed to generate video script.")
# return None
return video_script
# return video_script
def generate_terms(task_id, params, video_script):
logger.info("\n\n## generating video terms")
video_terms = params.video_terms
if not video_terms:
video_terms = llm.generate_terms(
video_subject=params.video_subject, video_script=video_script, amount=5
)
else:
if isinstance(video_terms, str):
video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
elif isinstance(video_terms, list):
video_terms = [term.strip() for term in video_terms]
else:
raise ValueError("video_terms must be a string or a list of strings.")
# def generate_terms(task_id, params, video_script):
# logger.info("\n\n## generating video terms")
# video_terms = params.video_terms
# if not video_terms:
# video_terms = llm.generate_terms(
# video_subject=params.video_subject, video_script=video_script, amount=5
# )
# else:
# if isinstance(video_terms, str):
# video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
# elif isinstance(video_terms, list):
# video_terms = [term.strip() for term in video_terms]
# else:
# raise ValueError("video_terms must be a string or a list of strings.")
logger.debug(f"video terms: {utils.to_json(video_terms)}")
# logger.debug(f"video terms: {utils.to_json(video_terms)}")
if not video_terms:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video terms.")
return None
# if not video_terms:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error("failed to generate video terms.")
# return None
return video_terms
# return video_terms
def save_script_data(task_id, video_script, video_terms, params):
script_file = path.join(utils.task_dir(task_id), "script.json")
script_data = {
"script": video_script,
"search_terms": video_terms,
"params": params,
}
# def save_script_data(task_id, video_script, video_terms, params):
# script_file = path.join(utils.task_dir(task_id), "script.json")
# script_data = {
# "script": video_script,
# "search_terms": video_terms,
# "params": params,
# }
with open(script_file, "w", encoding="utf-8") as f:
f.write(utils.to_json(script_data))
# with open(script_file, "w", encoding="utf-8") as f:
# f.write(utils.to_json(script_data))
def generate_audio(task_id, params, video_script):
logger.info("\n\n## generating audio")
audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
sub_maker = voice.tts(
text=video_script,
voice_name=voice.parse_voice_name(params.voice_name),
voice_rate=params.voice_rate,
voice_file=audio_file,
)
if sub_maker is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"""failed to generate audio:
1. check if the language of the voice matches the language of the video script.
2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
""".strip()
)
return None, None, None
# def generate_audio(task_id, params, video_script):
# logger.info("\n\n## generating audio")
# audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
# sub_maker = voice.tts(
# text=video_script,
# voice_name=voice.parse_voice_name(params.voice_name),
# voice_rate=params.voice_rate,
# voice_file=audio_file,
# )
# if sub_maker is None:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# """failed to generate audio:
# 1. check if the language of the voice matches the language of the video script.
# 2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
# """.strip()
# )
# return None, None, None
audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
return audio_file, audio_duration, sub_maker
# audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
# return audio_file, audio_duration, sub_maker
def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
if not params.subtitle_enabled:
return ""
# def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
# if not params.subtitle_enabled:
# return ""
subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
# subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
# subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
# logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
subtitle_fallback = False
if subtitle_provider == "edge":
voice.create_subtitle(
text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
)
if not os.path.exists(subtitle_path):
subtitle_fallback = True
logger.warning("subtitle file not found, fallback to whisper")
# subtitle_fallback = False
# if subtitle_provider == "edge":
# voice.create_subtitle(
# text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
# )
# if not os.path.exists(subtitle_path):
# subtitle_fallback = True
# logger.warning("subtitle file not found, fallback to whisper")
if subtitle_provider == "whisper" or subtitle_fallback:
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
logger.info("\n\n## correcting subtitle")
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
# if subtitle_provider == "whisper" or subtitle_fallback:
# subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
# logger.info("\n\n## correcting subtitle")
# subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"subtitle file is invalid: {subtitle_path}")
return ""
# subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
# if not subtitle_lines:
# logger.warning(f"subtitle file is invalid: {subtitle_path}")
# return ""
return subtitle_path
# return subtitle_path
def get_video_materials(task_id, params, video_terms, audio_duration):
if params.video_source == "local":
logger.info("\n\n## preprocess local materials")
materials = video.preprocess_video(
materials=params.video_materials, clip_duration=params.video_clip_duration
)
if not materials:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"no valid materials found, please check the materials and try again."
)
return None
return [material_info.url for material_info in materials]
else:
logger.info(f"\n\n## downloading videos from {params.video_source}")
downloaded_videos = material.download_videos(
task_id=task_id,
search_terms=video_terms,
source=params.video_source,
video_aspect=params.video_aspect,
video_contact_mode=params.video_concat_mode,
audio_duration=audio_duration * params.video_count,
max_clip_duration=params.video_clip_duration,
)
if not downloaded_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
)
return None
return downloaded_videos
# def get_video_materials(task_id, params, video_terms, audio_duration):
# if params.video_source == "local":
# logger.info("\n\n## preprocess local materials")
# materials = video.preprocess_video(
# materials=params.video_materials, clip_duration=params.video_clip_duration
# )
# if not materials:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# "no valid materials found, please check the materials and try again."
# )
# return None
# return [material_info.url for material_info in materials]
# else:
# logger.info(f"\n\n## downloading videos from {params.video_source}")
# downloaded_videos = material.download_videos(
# task_id=task_id,
# search_terms=video_terms,
# source=params.video_source,
# video_aspect=params.video_aspect,
# video_contact_mode=params.video_concat_mode,
# audio_duration=audio_duration * params.video_count,
# max_clip_duration=params.video_clip_duration,
# )
# if not downloaded_videos:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# "failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
# )
# return None
# return downloaded_videos
def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
"""后台任务(自动剪辑视频进行剪辑)"""
"""
后台任务自动剪辑视频进行剪辑
Args:
task_id: 任务ID
params: 视频参数
subclip_path_videos: 视频片段路径
"""
global merged_audio_path, merged_subtitle_path
logger.info(f"\n\n## 开始任务: {task_id}")
# 初始化 ImageMagick
if not utils.init_imagemagick():
logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0)
# tts 角色名称
voice_name = voice.parse_voice_name(params.voice_name)
# # 初始化 ImageMagick
# if not utils.init_imagemagick():
# logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示")
# # tts 角色名称
# voice_name = voice.parse_voice_name(params.voice_name)
"""
1. 加载剪辑脚本
"""
logger.info("\n\n## 1. 加载视频脚本")
video_script_path = path.join(params.video_clip_json_path)
@ -185,174 +195,144 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.debug(f"解说完整脚本: \n{video_script}")
logger.debug(f"解说 OST 列表: \n{video_ost}")
logger.debug(f"解说时间戳列表: \n{time_list}")
# 获取视频总时长(单位 s)
last_timestamp = list_script[-1]['new_timestamp']
end_time = last_timestamp.split("-")[1]
total_duration = utils.time_to_seconds(end_time)
except Exception as e:
logger.error(f"无法读取视频json脚本请检查配置是否正确。{e}")
raise ValueError("无法读取视频json脚本请检查配置是否正确")
logger.error(f"无法读取视频json脚本请检查脚本格式是否正确")
raise ValueError("无法读取视频json脚本请检查脚本格式是否正确")
else:
logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
raise ValueError("解说脚本不存在!请检查配置是否正确。")
"""
2. 使用 TTS 生成音频素材
"""
logger.info("\n\n## 2. 根据OST设置生成音频列表")
# 只为OST=0或2的片段生成TTS音频
# 只为OST=0 or 2的片段生成音频 OST=0 仅保留解说 OST=2 保留解说和原声
tts_segments = [
segment for segment in list_script
if segment['OST'] in [0, 2]
]
logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}")
# 初始化音频文件路径
audio_files = []
final_audio = ""
tts_results = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
voice_name=params.voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
# """
# 3. (可选) 使用 whisper 生成字幕
# """
# if merged_subtitle_path is None:
# if audio_files:
# merged_subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
# subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
# logger.info(f"\n\n使用 {subtitle_provider} 生成字幕")
#
# subtitle.create(
# audio_file=merged_audio_path,
# subtitle_file=merged_subtitle_path,
# )
# subtitle_lines = subtitle.file_to_subtitles(merged_subtitle_path)
# if not subtitle_lines:
# logger.warning(f"字幕文件无效: {merged_subtitle_path}")
#
# sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
"""
3. 裁剪视频 - 将超出音频长度的视频进行裁剪
"""
logger.info("\n\n## 3. 裁剪视频")
video_clip_result = clip_video.clip_video(params.video_origin_path, tts_results)
# 更新 list_script 中的时间戳
tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
subclip_clip_result = {
tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
}
new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)
"""
4. 合并音频和字幕
"""
logger.info("\n\n## 4. 合并音频和字幕")
total_duration = sum([script["duration"] for script in new_script_list])
if tts_segments:
audio_files, sub_maker_list = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
force_regenerate=True
)
if audio_files is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
return
if audio_files:
logger.info(f"合并音频文件: {audio_files}")
try:
# 传入OST信息以便正确处理音频
final_audio = audio_merger.merge_audio_files(
task_id=task_id,
audio_files=audio_files,
total_duration=total_duration,
list_script=list_script # 传入完整脚本以便处理OST
)
logger.info("音频文件合并成功")
except Exception as e:
logger.error(f"合并音频文件失败: {str(e)}")
final_audio = ""
else:
# 如果没有需要生成TTS的片段创建一个空白音频文件
# 这样可以确保后续的音频处理能正确进行
logger.info("没有需要生成TTS的片段将保留原声和背景音乐")
final_audio = path.join(utils.task_dir(task_id), "empty.mp3")
try:
from moviepy.editor import AudioClip
# 创建一个与视频等长的空白音频
empty_audio = AudioClip(make_frame=lambda t: 0, duration=total_duration)
empty_audio.write_audiofile(final_audio, fps=44100)
logger.info(f"已创建空白音频文件: {final_audio}")
except Exception as e:
logger.error(f"创建空白音频文件失败: {str(e)}")
final_audio = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
subtitle_path = ""
if params.subtitle_enabled:
if audio_files:
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
subtitle.create(
audio_file=final_audio,
subtitle_file=subtitle_path,
# 合并音频文件
merged_audio_path = audio_merger.merge_audio_files(
task_id=task_id,
total_duration=total_duration,
list_script=new_script_list
)
logger.info(f"音频文件合并成功->{merged_audio_path}")
# 合并字幕文件
merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
logger.info(f"字幕文件合并成功->{merged_subtitle_path}")
except Exception as e:
logger.error(f"合并音频文件失败: {str(e)}")
else:
logger.warning("没有需要合并的音频/字幕")
merged_audio_path = ""
merged_subtitle_path = ""
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"字幕文件无效: {subtitle_path}")
subtitle_path = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
logger.info("\n\n## 4. 裁剪视频")
subclip_videos = [x for x in subclip_path_videos.values()]
# logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}")
if not subclip_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"裁剪视频失败,可能是 ImageMagick 不可用")
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
"""
5. 合并视频
"""
final_video_paths = []
combined_video_paths = []
_progress = 50
index = 1
combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4")
logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}")
# 如果 new_script_list 中没有 video则使用 subclip_path_videos 中的视频
video_clips = [new_script['video'] if new_script.get('video') else subclip_path_videos.get(new_script.get('_id', '')) for new_script in new_script_list]
video.combine_clip_videos(
combined_video_path=combined_video_path,
video_paths=subclip_videos,
merger_video.combine_clip_videos(
output_video_path=combined_video_path,
video_paths=video_clips,
video_ost_list=video_ost,
list_script=list_script,
video_aspect=params.video_aspect,
threads=params.n_threads # 多线程
threads=params.n_threads
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80)
_progress += 50 / 2
sm.state.update_task(task_id, progress=_progress)
"""
6. 合并字幕/BGM/配音/视频
"""
output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}")
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
# bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
bgm_path = utils.get_bgm_file()
logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
# 获取背景音乐
bgm_path = None
if params.bgm_type or params.bgm_file:
try:
bgm_path = utils.get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_path:
logger.info(f"使用背景音乐: {bgm_path}")
except Exception as e:
logger.error(f"获取背景音乐失败: {str(e)}")
# 示例:自定义字幕样式
subtitle_style = {
'fontsize': params.font_size, # 字体大小
'color': params.text_fore_color, # 字体颜色
'stroke_color': params.stroke_color, # 描边颜色
'stroke_width': params.stroke_width, # 描边宽度, 范围0-10
'bg_color': params.text_back_color, # 半透明黑色背景
'position': (params.subtitle_position, 0.2), # 距离顶部60%的位置
'method': 'caption' # 渲染方法
# 调用示例
options = {
'voice_volume': params.tts_volume, # 配音音量
'bgm_volume': params.bgm_volume, # 背景音乐音量
'original_audio_volume': params.original_volume, # 视频原声音量0表示不保留
'keep_original_audio': True, # 是否保留原声
'subtitle_font': params.font_name, # 这里使用相对字体路径,会自动在 font_dir() 目录下查找
'subtitle_font_size': params.font_size,
'subtitle_color': params.text_fore_color,
'subtitle_bg_color': None, # 直接使用None表示透明背景
'subtitle_position': params.subtitle_position,
'custom_position': params.custom_position,
'threads': params.n_threads
}
# 示例:自定义音量配置
volume_config = {
'original': params.original_volume, # 原声音量80%
'bgm': params.bgm_volume, # BGM音量20%
'narration': params.tts_volume or params.voice_volume, # 解说音量100%
}
font_path = utils.font_dir(params.font_name)
video.generate_video_v3(
generate_video.merge_materials(
video_path=combined_video_path,
subtitle_path=subtitle_path,
audio_path=merged_audio_path,
subtitle_path=merged_subtitle_path,
bgm_path=bgm_path,
narration_path=final_audio,
output_path=final_video_path,
volume_config=volume_config, # 添加音量配置
subtitle_style=subtitle_style,
font_path=font_path
output_path=output_video_path,
options=options
)
_progress += 50 / 2
sm.state.update_task(task_id, progress=_progress)
final_video_paths.append(final_video_path)
final_video_paths.append(output_video_path)
combined_video_paths.append(combined_video_path)
logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.")
@ -400,35 +380,19 @@ def validate_params(video_path, audio_path, output_file, params):
if __name__ == "__main__":
# task_id = "test123"
# subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4',
# '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4',
# '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4',
# '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4',
# '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4',
# '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4',
# '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4',
# '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'}
#
# params = VideoClipParams(
# video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json",
# video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4",
# )
# start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
task_id = "demo"
task_id = "test456"
subclip_path_videos = {'01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4',
'01:58-02:04': './storage/cache_videos/vid-01_58-02_04.mp4',
'02:25-02:31': './storage/cache_videos/vid-02_25-02_31.mp4',
'01:28-01:33': './storage/cache_videos/vid-01_28-01_33.mp4',
'03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4',
'00:24-00:28': './storage/cache_videos/vid-00_24-00_28.mp4',
'03:02-03:08': './storage/cache_videos/vid-03_02-03_08.mp4',
'00:41-00:44': './storage/cache_videos/vid-00_41-00_44.mp4',
'02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4'}
# 提前裁剪是为了方便检查视频
subclip_path_videos = {
1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-05-390@00-00-57-980.mp4',
2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-28-900@00-00-43-700.mp4',
3: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-01-17-840@00-01-27-600.mp4',
4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-02-35-460@00-02-52-380.mp4',
5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-06-59-520@00-07-29-500.mp4',
}
params = VideoClipParams(
video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json",
video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4",
video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/2025-0507-223311.json",
video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_4938.mp4",
)
start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
start_subclip(task_id, params, subclip_path_videos)

View File

@ -0,0 +1,266 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : update_script
@Author : 小林同学
@Date : 2025/5/6 下午11:00
'''
import re
import os
from typing import Dict, List, Any, Tuple, Union
def extract_timestamp_from_video_path(video_path: str) -> str:
    """
    Extract a timestamp range from a clipped video's file name.

    Two file-name layouts are recognised:
      * new: ``vid_HH-MM-SS-mmm@HH-MM-SS-mmm.mp4`` -> ``HH:MM:SS,mmm-HH:MM:SS,mmm``
      * old: ``vid-HH-MM-SS-HH-MM-SS.mp4``         -> ``HH:MM:SS-HH:MM:SS``

    Args:
        video_path: path of the clipped video file

    Returns:
        The timestamp range string, or "" when the name matches neither layout.
    """
    name = os.path.basename(video_path)

    # New layout carries millisecond precision: vid_00-00-00-000@00-00-20-250.mp4
    new_style = re.search(
        r'vid_(\d{2})-(\d{2})-(\d{2})-(\d{3})@(\d{2})-(\d{2})-(\d{2})-(\d{3})\.mp4',
        name,
    )
    if new_style:
        h1, m1, s1, ms1, h2, m2, s2, ms2 = new_style.groups()
        return f"{h1}:{m1}:{s1},{ms1}-{h2}:{m2}:{s2},{ms2}"

    # Old layout has whole-second precision: vid-00-00-00-00-00-00.mp4
    old_style = re.search(r'vid-(\d{2}-\d{2}-\d{2})-(\d{2}-\d{2}-\d{2})\.mp4', name)
    if old_style:
        begin, finish = (part.replace('-', ':') for part in old_style.groups())
        return f"{begin}-{finish}"

    return ""
def calculate_duration(timestamp: str) -> float:
    """
    Compute the length in seconds of a timestamp range.

    The original implementation duplicated the "clock + optional milliseconds"
    parsing for the start and end points; that logic is factored into a single
    nested helper here.

    Args:
        timestamp: range formatted as 'HH:MM:SS-HH:MM:SS' or
            'HH:MM:SS,mmm-HH:MM:SS,mmm' (millisecond variant).

    Returns:
        Duration in seconds rounded to two decimals; 0.0 when the string
        cannot be parsed (malformed input is deliberately treated as an
        empty clip rather than raising).
    """

    def _to_seconds(point: str) -> float:
        # Parse one 'HH:MM:SS' time point with an optional ',mmm' suffix.
        if ',' in point:
            clock, _, ms_text = point.partition(',')
            # float('0.' + '390') == 0.39 — matches the 3-digit ms convention
            # used by extract_timestamp_from_video_path.
            fraction = float('0.' + ms_text) if ms_text else 0
        else:
            clock = point
            fraction = 0
        h, m, s = map(int, clock.split(':'))
        return h * 3600 + m * 60 + s + fraction

    try:
        start_text, end_text = timestamp.split('-')
        return round(_to_seconds(end_text) - _to_seconds(start_text), 2)
    except (ValueError, AttributeError):
        # Unpack failure, non-numeric fields, or a non-string argument.
        return 0.0
def _format_hms(seconds: float) -> str:
    """Render a non-negative second count as zero-padded 'HH:MM:SS' (fraction truncated)."""
    whole = int(seconds)
    return f"{whole // 3600:02d}:{(whole % 3600) // 60:02d}:{whole % 60:02d}"


def _lookup_result(result, item_id, timestamp) -> str:
    """Fetch a path from a result dict, preferring the _id key over the original timestamp key.

    Returns "" when the dict is missing/empty or holds neither key.
    """
    if result:
        if item_id and item_id in result:
            return result[item_id]
        if timestamp in result:
            return result[timestamp]
    return ""


def update_script_timestamps(
    script_list: List[Dict[str, Any]],
    video_result: Dict[Union[str, int], str],
    audio_result: Dict[Union[str, int], str] = None,
    subtitle_result: Dict[Union[str, int], str] = None,
    calculate_edited_timerange: bool = True
) -> List[Dict[str, Any]]:
    """
    Update the timestamps in script_list from the clipped videos in
    video_result, add per-segment durations, and attach audio/subtitle/video
    paths from the optional result dicts.

    Args:
        script_list: original script items (each a dict; not mutated).
        video_result: maps original timestamp or _id -> clipped video path.
        audio_result: maps original timestamp or _id -> audio file path.
        subtitle_result: maps original timestamp or _id -> subtitle file path.
        calculate_edited_timerange: when True, also compute each segment's
            time range within the assembled output video ('editedTimeRange').

    Returns:
        A new list of updated script items; items gain 'audio', 'subtitle',
        'video', and (when resolvable) 'sourceTimeRange', 'duration', and
        'editedTimeRange' keys.
    """
    # Map each video_result key to the timestamp parsed from its file name.
    new_timestamps = {}
    for key, video_path in video_result.items():
        parsed = extract_timestamp_from_video_path(video_path)
        if parsed:
            new_timestamps[key] = parsed

    updated_script = []
    # Running total of segment durations, used to place each segment on the
    # output video's timeline.
    accumulated_duration = 0.0

    for item in script_list:
        item_copy = item.copy()  # never mutate the caller's dicts
        item_id = item_copy.get('_id')
        orig_timestamp = item_copy.get('timestamp', '')

        # Attach per-segment asset paths ("" when not available).
        item_copy['audio'] = _lookup_result(audio_result, item_id, orig_timestamp)
        item_copy['subtitle'] = _lookup_result(subtitle_result, item_id, orig_timestamp)
        item_copy['video'] = _lookup_result(video_result, item_id, orig_timestamp)

        # Resolve the source time range: prefer the clipped video's parsed
        # timestamp (by _id, then by original timestamp), else fall back to
        # the script's own timestamp.
        current_duration = 0.0
        if item_id and item_id in new_timestamps:
            item_copy['sourceTimeRange'] = new_timestamps[item_id]
            current_duration = calculate_duration(item_copy['sourceTimeRange'])
            item_copy['duration'] = current_duration
        elif orig_timestamp in new_timestamps:
            item_copy['sourceTimeRange'] = new_timestamps[orig_timestamp]
            current_duration = calculate_duration(item_copy['sourceTimeRange'])
            item_copy['duration'] = current_duration
        elif orig_timestamp:
            item_copy['sourceTimeRange'] = orig_timestamp
            current_duration = calculate_duration(orig_timestamp)
            item_copy['duration'] = current_duration

        # Position the segment on the output timeline.
        if calculate_edited_timerange and current_duration > 0:
            start_seconds = accumulated_duration
            end_seconds = accumulated_duration + current_duration
            item_copy['editedTimeRange'] = f"{_format_hms(start_seconds)}-{_format_hms(end_seconds)}"
            accumulated_duration = end_seconds

        updated_script.append(item_copy)

    return updated_script
if __name__ == '__main__':
    # Manual smoke test for update_script_timestamps.
    # NOTE(review): all paths below are developer-machine absolute paths and
    # will not exist elsewhere — the demo only exercises string/dict handling,
    # so the files are never opened.
    # Demo script: OST == 0 segments are narration-only (re-clipped + TTS),
    # OST == 1 segments keep the original footage audio — presumably; confirm
    # against the OST handling in the task pipeline.
    list_script = [
        {
            'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
            'timestamp': '00:00:00,001-00:01:15,001',
            'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
            'OST': 0,
            '_id': 1
        },
        {
            'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!',
            'timestamp': '00:01:15,001-00:04:40,001',
            'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
            'OST': 0,
            '_id': 2
        },
        {
            'picture': '画面切到王启年小心翼翼地向范闲汇报。',
            'timestamp': '00:04:41,001-00:04:58,001',
            'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
            'OST': 1,
            '_id': 3
        },
        {
            'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
            'timestamp': '00:04:58,001-00:05:45,001',
            'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
            'OST': 0,
            '_id': 4
        },
        {
            'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
            'timestamp': '00:05:45,001-00:06:00,001',
            'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
            'OST': 0,
            '_id': 5
        },
        {
            'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。',
            'timestamp': '00:06:00,001-00:06:03,001',
            'narration': '抓刺客',
            'OST': 1,
            '_id': 6
        }]
    # Clipped-video paths keyed by _id; note ids 3 and 6 (OST == 1) have no
    # clip, so those items keep their original timestamps.
    video_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-00-000@00-00-20-250.mp4',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-30-000@00-00-48-950.mp4',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-00-000@00-01-15-688.mp4',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-30-000@00-01-49-512.mp4'}
    # TTS audio paths keyed by _id.
    audio_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'}
    # Subtitle paths keyed by _id.
    sub_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt'}
    # Update and print the result.
    updated_list_script = update_script_timestamps(list_script, video_res, audio_res, sub_res)
    for item in updated_list_script:
        print(
            f"ID: {item['_id']} | Picture: {item['picture'][:20]}... | Timestamp: {item['timestamp']} | " +
            f"SourceTimeRange: {item['sourceTimeRange']} | EditedTimeRange: {item.get('editedTimeRange', '')} | " +
            f"Duration: {item['duration']} 秒 | Audio: {item['audio']} | Video: {item['video']} | Subtitle: {item['subtitle']}")

View File

@ -1,13 +1,13 @@
import traceback
import pysrt
# import pysrt
from typing import Optional
from typing import List
from loguru import logger
from moviepy.editor import *
from moviepy import *
from PIL import ImageFont
from contextlib import contextmanager
from moviepy.editor import (
from moviepy import (
VideoFileClip,
AudioFileClip,
TextClip,
@ -105,86 +105,6 @@ def manage_clip(clip):
del clip
def combine_clip_videos(combined_video_path: str,
                        video_paths: List[str],
                        video_ost_list: List[int],
                        list_script: list,
                        video_aspect: VideoAspect = VideoAspect.portrait,
                        threads: int = 2,
                        ) -> str:
    """
    Merge sub-clips into a single video.

    Args:
        combined_video_path: output path for the merged video
        video_paths: paths of the sub-clips to merge
        video_ost_list: original-sound flags per clip
            (0: drop original audio, 1: keep original audio only,
             2: keep original audio plus narration)
        list_script: editing script, used only to log the total audio duration
        video_aspect: target aspect ratio
        threads: number of encoder threads

    Returns:
        str: path of the merged video (same as combined_video_path)

    Raises:
        ValueError: when none of the input clips could be loaded
    """
    from app.utils.utils import calculate_total_duration
    audio_duration = calculate_total_duration(list_script)
    logger.info(f"音频的最大持续时间: {audio_duration} s")
    output_dir = os.path.dirname(combined_video_path)

    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()

    clips = []
    for video_path, video_ost in zip(video_paths, video_ost_list):
        try:
            clip = VideoFileClip(video_path)
            if video_ost == 0:  # drop the original audio track
                clip = clip.without_audio()
            # video_ost 1 or 2 keeps the original audio; nothing special to do
            clip = clip.set_fps(30)

            # Pad/resize any clip that does not match the target resolution
            clip_w, clip_h = clip.size
            if clip_w != video_width or clip_h != video_height:
                clip = resize_video_with_padding(
                    clip,
                    target_width=video_width,
                    target_height=video_height
                )
                logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")

            clips.append(clip)
        except Exception as e:
            logger.error(f"处理视频 {video_path} 时出错: {str(e)}")
            continue

    if not clips:
        raise ValueError("没有有效的视频片段可以合并")

    video_clip = None
    try:
        video_clip = concatenate_videoclips(clips)
        video_clip = video_clip.set_fps(30)
        logger.info("开始合并视频... (过程中出现 UserWarning: 不必理会)")
        video_clip.write_videofile(
            filename=combined_video_path,
            threads=threads,
            audio_codec="aac",
            fps=30,
            temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
        )
    finally:
        # BUGFIX: the original closed video_clip unconditionally here; if
        # concatenate_videoclips() raised, the name was unbound and the real
        # exception was masked by a NameError. Guard with a sentinel instead.
        if video_clip is not None:
            video_clip.close()
        for clip in clips:
            clip.close()

    logger.success("视频合并完成")
    return combined_video_path
def resize_video_with_padding(clip, target_width: int, target_height: int):
"""
调整视频尺寸并添加黑边
@ -443,4 +363,3 @@ def generate_video_v3(
bgm.close()
if narration_path:
narration.close()

View File

@ -4,8 +4,6 @@ from loguru import logger
from typing import Dict, List, Optional, Tuple
from app.services import material
from app.models.schema import VideoClipParams
from app.utils import utils
class VideoService:

View File

@ -5,10 +5,11 @@ import traceback
import edge_tts
import asyncio
from loguru import logger
from typing import List
from typing import List, Union
from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts import submaker, SubMaker
from edge_tts.submaker import mktimestamp
from moviepy.video.tools import subtitles
import time
@ -1036,7 +1037,7 @@ def is_azure_v2_voice(voice_name: str):
def tts(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
) -> Union[SubMaker, None]:
if is_azure_v2_voice(voice_name):
return azure_tts_v2(text, voice_name, voice_file)
return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
@ -1064,7 +1065,7 @@ def convert_pitch_to_percent(rate: float) -> str:
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
) -> Union[SubMaker, None]:
voice_name = parse_voice_name(voice_name)
text = text.strip()
rate_str = convert_rate_to_percent(voice_rate)
@ -1087,11 +1088,6 @@ def azure_tts_v1(
)
return sub_maker, audio_data
# 判断音频文件是否已存在
if os.path.exists(voice_file):
logger.info(f"voice file exists, skip tts: {voice_file}")
continue
# 获取音频数据和字幕信息
sub_maker, audio_data = asyncio.run(_do())
@ -1105,8 +1101,6 @@ def azure_tts_v1(
# 数据有效,写入文件
with open(voice_file, "wb") as file:
file.write(audio_data)
logger.info(f"completed, output file: {voice_file}")
return sub_maker
except Exception as e:
logger.error(f"生成音频文件时出错: {str(e)}")
@ -1115,7 +1109,7 @@ def azure_tts_v1(
return None
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
voice_name = is_azure_v2_voice(voice_name)
if not voice_name:
logger.error(f"invalid voice name: {voice_name}")
@ -1203,11 +1197,14 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
def _format_text(text: str) -> str:
# text = text.replace("\n", " ")
text = text.replace("\n", " ")
text = text.replace("\"", " ")
text = text.replace("[", " ")
text = text.replace("]", " ")
text = text.replace("(", " ")
text = text.replace(")", " ")
text = text.replace("", " ")
text = text.replace("", " ")
text = text.replace("{", " ")
text = text.replace("}", " ")
text = text.strip()
@ -1240,7 +1237,7 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis
if script_item['OST']:
continue
start_time, end_time = script_item['new_timestamp'].split('-')
start_time, end_time = script_item['timestamp'].split('-')
if sub_maker_index >= len(sub_maker_list):
logger.error(f"Sub maker list index out of range: {sub_maker_index}")
break
@ -1317,6 +1314,99 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis
traceback.print_exc()
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
    """
    Build an optimised SRT subtitle file from TTS word timings.

    1. Split the narration text into lines at punctuation marks.
    2. Accumulate SubMaker word fragments and match them line by line
       against the split script.
    3. Write a new subtitle file with one entry per matched line.

    Returns (subtitle_file, duration) on success; implicitly returns None when
    the matched entries do not cover every script line or on error.
    NOTE(review): callers that unpack the result (e.g. ``_, duration = ...``)
    will raise TypeError on the None path — confirm against call sites.
    """
    text = _format_text(text)

    def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
        """
        Render one SRT entry, e.g.:
        1
        00:00:00,000 --> 00:00:02,360
        跑步是一项简单易行的运动
        """
        # mktimestamp emits "HH:MM:SS.mmm"; SRT requires a comma separator
        start_t = mktimestamp(start_time).replace(".", ",")
        end_t = mktimestamp(end_time).replace(".", ",")
        return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n"

    start_time = -1.0  # -1 marks "no entry in progress"
    sub_items = []
    sub_index = 0

    script_lines = utils.split_string_by_punctuations(text)

    def match_line(_sub_line: str, _sub_index: int):
        # Compare the accumulated TTS fragment against the current script
        # line, progressively relaxing the comparison: exact match, then
        # punctuation stripped, then all non-word characters stripped.
        if len(script_lines) <= _sub_index:
            return ""
        _line = script_lines[_sub_index]
        if _sub_line == _line:
            return script_lines[_sub_index].strip()
        _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line)
        _line_ = re.sub(r"[^\w\s]", "", _line)
        if _sub_line_ == _line_:
            return _line_.strip()
        _sub_line_ = re.sub(r"\W+", "", _sub_line)
        _line_ = re.sub(r"\W+", "", _line)
        if _sub_line_ == _line_:
            return _line.strip()
        return ""

    sub_line = ""

    try:
        # sub_maker.offset holds (start, end) tick pairs aligned with the
        # word fragments in sub_maker.subs
        for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)):
            _start_time, end_time = offset
            if start_time < 0:
                start_time = _start_time

            sub = unescape(sub)
            sub_line += sub
            sub_text = match_line(sub_line, sub_index)
            if sub_text:
                # A full script line has been matched: emit one SRT entry
                # and reset the accumulators for the next line.
                sub_index += 1
                line = formatter(
                    idx=sub_index,
                    start_time=start_time,
                    end_time=end_time,
                    sub_text=sub_text,
                )
                sub_items.append(line)
                start_time = -1.0
                sub_line = ""

        if len(sub_items) == len(script_lines):
            with open(subtitle_file, "w", encoding="utf-8") as file:
                file.write("\n".join(sub_items) + "\n")
            try:
                # Re-parse the file to validate it and compute its duration
                sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8")
                duration = max([tb for ((ta, tb), txt) in sbs])
                logger.info(
                    f"已创建字幕文件: {subtitle_file}, duration: {duration}"
                )
                return subtitle_file, duration
            except Exception as e:
                # Invalid output: log and remove the broken subtitle file
                logger.error(f"failed, error: {str(e)}")
                os.remove(subtitle_file)
        else:
            # Matched entries do not cover every script line: dump both
            # lists for debugging and write nothing.
            logger.error(
                f"字幕创建失败, 字幕长度: {len(sub_items)}, script_lines len: {len(script_lines)}"
                f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}"
                f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}"
            )
    except Exception as e:
        logger.error(f"failed, error: {str(e)}")
def get_audio_duration(sub_maker: submaker.SubMaker):
    """
    Return the audio duration in seconds.

    SubMaker offsets are in 100-nanosecond ticks, so the end of the last
    fragment divided by 10_000_000 gives seconds.

    Args:
        sub_maker: SubMaker produced by edge-tts; only its ``offset`` list is read.

    Returns:
        float: duration in seconds, or 0.0 when no fragments were produced.
    """
    # BUGFIX: guard the empty case — indexing offset[-1] raised IndexError
    # when TTS produced no fragments at all.
    if not sub_maker.offset:
        return 0.0
    return sub_maker.offset[-1][1] / 10000000
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float, force_regenerate: bool = True):
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float):
"""
根据JSON文件中的多段文本进行TTS转换
@ -1334,25 +1424,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
:param list_script: 脚本列表
:param voice_name: 语音名称
:param voice_rate: 语音速率
:param force_regenerate: 是否强制重新生成已存在的音频文件
:return: 生成的音频文件列表
"""
voice_name = parse_voice_name(voice_name)
output_dir = utils.task_dir(task_id)
audio_files = []
sub_maker_list = []
tts_results = []
for item in list_script:
if item['OST'] != 1:
# 将时间戳中的冒号替换为下划线
timestamp = item['new_timestamp'].replace(':', '_')
timestamp = item['timestamp'].replace(':', '_')
audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")
# 检查文件是否已存在,如存在且不强制重新生成,则跳过
if os.path.exists(audio_file) and not force_regenerate:
logger.info(f"音频文件已存在,跳过生成: {audio_file}")
audio_files.append(audio_file)
continue
subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt")
text = item['narration']
@ -1369,9 +1452,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
f"如果您在中国请使用VPN; "
f"或者使用其他 tts 引擎")
continue
else:
# 为当前片段生成字幕文件
_, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
audio_files.append(audio_file)
sub_maker_list.append(sub_maker)
tts_results.append({
"_id": item['_id'],
"timestamp": item['timestamp'],
"audio_file": audio_file,
"subtitle_file": subtitle_file,
"duration": duration,
"text": text,
})
logger.info(f"已生成音频文件: {audio_file}")
return audio_files, sub_maker_list
return tts_results

View File

@ -61,7 +61,6 @@ class VisionAnalyzer:
try:
# 加载图片
if isinstance(images[0], str):
logger.info("正在加载图片...")
images = self.load_images(images)
# 验证图片列表
@ -81,11 +80,14 @@ class VisionAnalyzer:
images = valid_images
results = []
total_batches = (len(images) + batch_size - 1) // batch_size
# 视频帧总数除以批量处理大小,如果有小数则+1
batches_needed = len(images) // batch_size
if len(images) % batch_size > 0:
batches_needed += 1
logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed}")
logger.debug(f"{total_batches} 个批次,每批次 {batch_size} 张图片")
with tqdm(total=total_batches, desc="分析进度") as pbar:
with tqdm(total=batches_needed, desc="分析进度") as pbar:
for i in range(0, len(images), batch_size):
batch = images[i:i + batch_size]
retry_count = 0
@ -93,8 +95,8 @@ class VisionAnalyzer:
while retry_count < 3:
try:
# 在每个批次处理前添加小延迟
if i > 0:
await asyncio.sleep(2)
# if i > 0:
# await asyncio.sleep(2)
# 确保每个批次的图片都是有效的
valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]

View File

@ -30,7 +30,7 @@ class QwenAnalyzer:
self.model_name = model_name
self.api_key = api_key
self.base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
self.base_url = base_url
# 配置API客户端
self._configure_client()
@ -80,7 +80,7 @@ class QwenAnalyzer:
# 添加文本提示
content.append({
"type": "text",
"text": prompt
"text": prompt % (len(content), len(content), len(content))
})
# 调用API
@ -102,7 +102,7 @@ class QwenAnalyzer:
async def analyze_images(self,
images: Union[List[str], List[PIL.Image.Image]],
prompt: str,
batch_size: int = 5) -> List[Dict]:
batch_size: int) -> List[Dict]:
"""
批量分析多张图片
Args:
@ -118,7 +118,6 @@ class QwenAnalyzer:
# 加载图片
if isinstance(images[0], str):
logger.info("正在加载图片...")
images = self.load_images(images)
# 验证图片列表
@ -141,9 +140,14 @@ class QwenAnalyzer:
images = valid_images
results = []
total_batches = (len(images) + batch_size - 1) // batch_size
# 视频帧总数除以批量处理大小,如果有小数则+1
batches_needed = len(images) // batch_size
if len(images) % batch_size > 0:
batches_needed += 1
logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed}")
with tqdm(total=total_batches, desc="分析进度") as pbar:
with tqdm(total=batches_needed, desc="分析进度") as pbar:
for i in range(0, len(images), batch_size):
batch = images[i:i + batch_size]
batch_paths = valid_paths[i:i + batch_size] if valid_paths else None
@ -151,9 +155,9 @@ class QwenAnalyzer:
while retry_count < 3:
try:
# 在每个批次处理前添加小延迟
if i > 0:
await asyncio.sleep(2)
# 在每个批次处理前加小延迟
# if i > 0:
# await asyncio.sleep(0.5)
# 确保每个批次的图片都是有效的
valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]
@ -209,7 +213,7 @@ class QwenAnalyzer:
for i, result in enumerate(results):
response_text = result['response']
# 如果有图片路径信息,使用它来生成文件名
# 如果有图片路径信息,用它来生成文件名
if result.get('image_paths'):
image_paths = result['image_paths']
img_name_start = Path(image_paths[0]).stem.split('_')[-1]

View File

@ -2,7 +2,7 @@ import os
import json
import traceback
from loguru import logger
import tiktoken
# import tiktoken
from typing import List, Dict
from datetime import datetime
from openai import OpenAI
@ -94,12 +94,12 @@ class OpenAIGenerator(BaseGenerator):
"user": "script_generator"
}
# 初始化token计数器
try:
self.encoding = tiktoken.encoding_for_model(self.model_name)
except KeyError:
logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器")
self.encoding = tiktoken.get_encoding("cl100k_base")
# # 初始化token计数器
# try:
# self.encoding = tiktoken.encoding_for_model(self.model_name)
# except KeyError:
# logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器")
# self.encoding = tiktoken.get_encoding("cl100k_base")
def _generate(self, messages: list, params: dict) -> any:
"""实现OpenAI特定的生成逻辑"""

View File

@ -197,6 +197,28 @@ def time_convert_seconds_to_hmsm(seconds) -> str:
return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, milliseconds)
def format_time(seconds: float) -> str:
    """
    Convert a number of seconds into a formatted time string (HH:MM:SS,mmm).

    Args:
        seconds: seconds to convert (int or float)

    Returns:
        str: formatted time string in the form HH:MM:SS,mmm
    """
    # Robustness fix: negative input previously produced nonsense such as
    # "-1:59:59,000"; clamp to zero so the output is always a valid timestamp.
    if seconds < 0:
        seconds = 0
    # Split into hours / minutes / remaining seconds
    hours, remainder = divmod(seconds, 3600)
    minutes, remainder = divmod(remainder, 60)
    secs = int(remainder)
    milliseconds = int((remainder - secs) * 1000)  # truncate, matching SRT style
    return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hours), int(minutes), secs, milliseconds)
def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str:
start_time = time_convert_seconds_to_hmsm(start_time)
end_time = time_convert_seconds_to_hmsm(end_time)
@ -506,7 +528,7 @@ def cut_video(params, progress_callback=None):
st.session_state['subclip_videos'] = subclip_videos
for i, video_script in enumerate(video_script_list):
try:
video_script['path'] = subclip_videos[video_script['timestamp']]
video_script['path'] = subclip_videos[i+1]
except KeyError as err:
logger.error(f"裁剪视频失败: {err}")

View File

@ -1,237 +1,339 @@
import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans
"""
视频帧提取工具
这个模块提供了简单高效的视频帧提取功能主要特点
1. 使用ffmpeg进行视频处理支持硬件加速
2. 按指定时间间隔提取视频关键帧
3. 支持多种视频格式
4. 支持高清视频帧输出
5. 直接从原视频提取高质量关键帧
不依赖OpenCV和sklearn等库只使用ffmpeg作为外部依赖降低了安装和使用的复杂度
"""
import os
import re
from typing import List, Tuple, Generator
import time
import subprocess
from typing import List, Dict
from loguru import logger
import gc
from tqdm import tqdm
class VideoProcessor:
def __init__(self, video_path: str, batch_size: int = 100):
def __init__(self, video_path: str):
"""
初始化视频处理器
Args:
video_path: 视频文件路径
batch_size: 批处理大小控制内存使用
"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
self.video_path = video_path
self.batch_size = batch_size
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
raise RuntimeError(f"无法打开视频文件: {video_path}")
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
self.video_info = self._get_video_info()
self.fps = float(self.video_info.get('fps', 25))
self.duration = float(self.video_info.get('duration', 0))
self.width = int(self.video_info.get('width', 0))
self.height = int(self.video_info.get('height', 0))
self.total_frames = int(self.fps * self.duration)
def __del__(self):
"""析构函数,确保视频资源被释放"""
if hasattr(self, 'cap'):
self.cap.release()
gc.collect()
def _get_video_info(self) -> Dict[str, str]:
"""
使用ffprobe获取视频信息
def preprocess_video(self) -> Generator[Tuple[int, np.ndarray], None, None]:
"""
使用生成器方式分批读取视频帧
Yields:
Tuple[int, np.ndarray]: (帧索引, 视频帧)
"""
self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
frame_idx = 0
while self.cap.isOpened():
ret, frame = self.cap.read()
if not ret:
break
# 降低分辨率以减少内存使用
frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
yield frame_idx, frame
frame_idx += 1
# 定期进行垃圾回收
if frame_idx % 1000 == 0:
gc.collect()
def detect_shot_boundaries(self, threshold: int = 70) -> List[int]:
"""
使用批处理方式检测镜头边界
Args:
threshold: 差异阈值
Returns:
List[int]: 镜头边界帧的索引列表
Dict[str, str]: 包含视频基本信息的字典
"""
shot_boundaries = []
prev_frame = None
prev_idx = -1
pbar = tqdm(self.preprocess_video(),
total=self.total_frames,
desc="检测镜头边界",
unit="")
for frame_idx, curr_frame in pbar:
if prev_frame is not None:
prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
diff = np.mean(np.abs(curr_gray.astype(float) - prev_gray.astype(float)))
if diff > threshold:
shot_boundaries.append(frame_idx)
pbar.set_postfix({"检测到边界": len(shot_boundaries)})
prev_frame = curr_frame.copy()
prev_idx = frame_idx
del curr_frame
if frame_idx % 100 == 0:
gc.collect()
return shot_boundaries
cmd = [
"ffprobe",
"-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=width,height,r_frame_rate,duration",
"-of", "default=noprint_wrappers=1:nokey=0",
self.video_path
]
def process_shot(self, shot_frames: List[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, int]:
"""
处理单个镜头的帧
Args:
shot_frames: 镜头中的帧列表
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
lines = result.stdout.strip().split('\n')
info = {}
for line in lines:
if '=' in line:
key, value = line.split('=', 1)
info[key] = value
# 处理帧率(可能是分数形式)
if 'r_frame_rate' in info:
try:
num, den = map(int, info['r_frame_rate'].split('/'))
info['fps'] = str(num / den)
except ValueError:
info['fps'] = info.get('r_frame_rate', '25')
return info
except subprocess.CalledProcessError as e:
logger.error(f"获取视频信息失败: {e.stderr}")
return {
'width': '1280',
'height': '720',
'fps': '25',
'duration': '0'
}
def extract_frames_by_interval(self, output_dir: str, interval_seconds: float = 5.0,
use_hw_accel: bool = True) -> List[int]:
"""
按指定时间间隔提取视频帧
Args:
output_dir: 输出目录
interval_seconds: 帧提取间隔
use_hw_accel: 是否使用硬件加速
Returns:
Tuple[np.ndarray, int]: (关键帧, 帧索引)
List[int]: 提取的帧号列表
"""
if not shot_frames:
return None, -1
frame_features = []
frame_indices = []
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for idx, frame in tqdm(shot_frames,
desc="处理镜头帧",
unit="",
leave=False):
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
resized_gray = cv2.resize(gray, (32, 32))
frame_features.append(resized_gray.flatten())
frame_indices.append(idx)
frame_features = np.array(frame_features)
# 计算起始时间和帧提取点
start_time = 0
end_time = self.duration
extraction_times = []
kmeans = MiniBatchKMeans(n_clusters=1, batch_size=min(len(frame_features), 100),
random_state=0).fit(frame_features)
current_time = start_time
while current_time < end_time:
extraction_times.append(current_time)
current_time += interval_seconds
center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1))
return shot_frames[center_idx][1], frame_indices[center_idx]
if not extraction_times:
logger.warning("未找到需要提取的帧")
return []
def extract_keyframes(self, shot_boundaries: List[int]) -> Generator[Tuple[np.ndarray, int], None, None]:
"""
使用生成器方式提取关键帧
# 确定硬件加速器选项
hw_accel = []
if use_hw_accel:
# 尝试检测可用的硬件加速器
hw_accel_options = self._detect_hw_accelerator()
if hw_accel_options:
hw_accel = hw_accel_options
logger.info(f"使用硬件加速: {' '.join(hw_accel)}")
else:
logger.warning("未检测到可用的硬件加速器,使用软件解码")
Args:
shot_boundaries: 镜头边界列表
# 提取帧
frame_numbers = []
for i, timestamp in enumerate(tqdm(extraction_times, desc="提取视频帧")):
frame_number = int(timestamp * self.fps)
frame_numbers.append(frame_number)
Yields:
Tuple[np.ndarray, int]: (关键帧, 帧索引)
"""
shot_frames = []
current_shot_start = 0
# 格式化时间戳字符串 (HHMMSSmmm)
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
milliseconds = int((timestamp % 1) * 1000)
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
output_path = os.path.join(output_dir, f"keyframe_{frame_number:06d}_{time_str}.jpg")
# 使用ffmpeg提取单帧
cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
]
# 添加硬件加速参数
cmd.extend(hw_accel)
cmd.extend([
"-ss", str(timestamp),
"-i", self.video_path,
"-vframes", "1",
"-q:v", "1", # 最高质量
"-y",
output_path
])
try:
subprocess.run(cmd, check=True, capture_output=True)
except subprocess.CalledProcessError as e:
logger.warning(f"提取帧 {frame_number} 失败: {e.stderr}")
for frame_idx, frame in self.preprocess_video():
if frame_idx in shot_boundaries:
if shot_frames:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
# 清理内存
shot_frames.clear()
gc.collect()
logger.info(f"成功提取了 {len(frame_numbers)} 个视频帧")
return frame_numbers
def _detect_hw_accelerator(self) -> List[str]:
"""
检测系统可用的硬件加速器
Returns:
List[str]: 硬件加速器ffmpeg命令参数
"""
# 检测操作系统
import platform
system = platform.system().lower()
# 测试不同的硬件加速器
accelerators = []
if system == 'darwin': # macOS
# 测试 videotoolbox (Apple 硬件加速)
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "videotoolbox",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "videotoolbox"]
except subprocess.CalledProcessError:
pass
current_shot_start = frame_idx
elif system == 'linux':
# 测试 VAAPI
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "vaapi",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "vaapi"]
except subprocess.CalledProcessError:
pass
shot_frames.append((frame_idx, frame))
# 控制单个镜头的最大帧数
if len(shot_frames) > self.batch_size:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
shot_frames.clear()
gc.collect()
# 尝试 CUDA
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "cuda",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "cuda"]
except subprocess.CalledProcessError:
pass
elif system == 'windows':
# 测试 CUDA
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "cuda",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "cuda"]
except subprocess.CalledProcessError:
pass
# 测试 D3D11VA
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "d3d11va",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "d3d11va"]
except subprocess.CalledProcessError:
pass
# 测试 DXVA2
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "dxva2",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "dxva2"]
except subprocess.CalledProcessError:
pass
# 处理最后一个镜头
if shot_frames:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
# 如果没有找到可用的硬件加速器
return []
def process_video(self, output_dir: str, skip_seconds: float = 0) -> None:
def process_video_pipeline(self,
output_dir: str,
interval_seconds: float = 5.0, # 帧提取间隔(秒)
use_hw_accel: bool = True) -> None:
"""
处理视频并提取关键帧使用分批处理方式
执行简化的视频处理流程直接从原视频按固定时间间隔提取帧
Args:
output_dir: 输出目录
skip_seconds: 跳过视频开头的秒数
interval_seconds: 帧提取间隔
use_hw_accel: 是否使用硬件加速
"""
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
try:
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
# 计算要跳过的帧数
skip_frames = int(skip_seconds * self.fps)
self.cap.set(cv2.CAP_PROP_POS_FRAMES, skip_frames)
# 检测镜头边界
logger.info("开始检测镜头边界...")
shot_boundaries = self.detect_shot_boundaries()
# 提取关键帧
logger.info("开始提取关键帧...")
frame_count = 0
pbar = tqdm(self.extract_keyframes(shot_boundaries),
desc="提取关键帧",
unit="")
for keyframe, frame_idx in pbar:
if frame_idx < skip_frames:
continue
# 计算时间戳
timestamp = frame_idx / self.fps
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}"
# 保存关键帧
output_path = os.path.join(output_dir,
f'keyframe_{frame_idx:06d}_{time_str}.jpg')
cv2.imwrite(output_path, keyframe)
frame_count += 1
pbar.set_postfix({"已保存": frame_count})
if frame_count % 10 == 0:
gc.collect()
logger.info(f"关键帧提取完成,共保存 {frame_count} 帧到 {output_dir}")
# 直接从原视频提取关键帧
logger.info(f"从视频间隔 {interval_seconds} 秒提取关键帧...")
self.extract_frames_by_interval(
output_dir,
interval_seconds=interval_seconds,
use_hw_accel=use_hw_accel
)
logger.info(f"处理完成!视频帧已保存在: {output_dir}")
except Exception as e:
logger.error(f"视频处理失败: {str(e)}")
import traceback
logger.error(f"视频处理失败: \n{traceback.format_exc()}")
raise
finally:
# 确保资源被释放
self.cap.release()
gc.collect()
if __name__ == "__main__":
import time
start_time = time.time()
# 使用示例
processor = VideoProcessor("./resource/videos/test.mp4")
# 设置间隔为3秒提取帧
processor.process_video_pipeline(
output_dir="output",
interval_seconds=3.0,
use_hw_accel=True
)
end_time = time.time()
print(f"处理完成!总耗时: {end_time - start_time:.2f}")

View File

@ -1,382 +0,0 @@
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import re
from typing import List, Tuple, Generator
from loguru import logger
import subprocess
from tqdm import tqdm
class VideoProcessor:
def __init__(self, video_path: str):
"""
初始化视频处理器
Args:
video_path: 视频文件路径
"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
self.video_path = video_path
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
raise RuntimeError(f"无法打开视频文件: {video_path}")
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
def __del__(self):
"""析构函数,确保视频资源被释放"""
if hasattr(self, 'cap'):
self.cap.release()
def preprocess_video(self) -> Generator[np.ndarray, None, None]:
"""
使用生成器方式读取视频帧
Yields:
np.ndarray: 视频帧
"""
self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # 重置到视频开始
while self.cap.isOpened():
ret, frame = self.cap.read()
if not ret:
break
yield frame
def detect_shot_boundaries(self, frames: List[np.ndarray], threshold: int = 30) -> List[int]:
    """
    Detect shot boundaries using the frame-difference method.

    Args:
        frames: list of video frames (BGR images)
        threshold: difference threshold (default lowered to 30)

    Returns:
        List[int]: indices of frames where a shot boundary occurs; always
        contains at least one index (the last frame) as a fallback.
    """
    shot_boundaries = []
    if len(frames) < 2:  # guard: too few frames to compute a difference
        logger.warning("视频帧数过少,无法检测场景边界")
        return [len(frames) - 1]  # use the last frame as the only boundary
    for i in range(1, len(frames)):
        prev_frame = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY)
        curr_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
        # Mean absolute grayscale difference between consecutive frames
        diff = np.mean(np.abs(curr_frame.astype(float) - prev_frame.astype(float)))
        if diff > threshold:
            shot_boundaries.append(i)
    # If nothing was detected, treat the whole video as a single scene
    if not shot_boundaries:
        logger.warning("未检测到场景边界,将视频作为单个场景处理")
        shot_boundaries.append(len(frames) - 1)
    return shot_boundaries
def extract_keyframes(self, frames: List[np.ndarray], shot_boundaries: List[int]) -> Tuple[
List[np.ndarray], List[int]]:
"""
从每个镜头中提取关键帧
Args:
frames: 视频帧列表
shot_boundaries: 镜头边界列表
Returns:
Tuple[List[np.ndarray], List[int]]: 关键帧列表和对应的帧索引
"""
keyframes = []
keyframe_indices = []
for i in tqdm(range(len(shot_boundaries)), desc="提取关键帧"):
start = shot_boundaries[i - 1] if i > 0 else 0
end = shot_boundaries[i]
shot_frames = frames[start:end]
if not shot_frames:
continue
# 将每一帧转换为灰度图并展平为一维数组
frame_features = np.array([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).flatten()
for frame in shot_frames])
try:
# 尝试使用 KMeans
kmeans = KMeans(n_clusters=1, random_state=0).fit(frame_features)
center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1))
except Exception as e:
logger.warning(f"KMeans 聚类失败,使用备选方案: {str(e)}")
# 备选方案:选择镜头中间的帧作为关键帧
center_idx = len(shot_frames) // 2
keyframes.append(shot_frames[center_idx])
keyframe_indices.append(start + center_idx)
return keyframes, keyframe_indices
def save_keyframes(self, keyframes: List[np.ndarray], keyframe_indices: List[int],
                   output_dir: str, desc: str = "保存关键帧") -> None:
    """
    Save keyframes to the given directory.

    Filenames follow ``keyframe_<frame_idx>_<timestamp>.jpg`` where the
    timestamp has millisecond precision in the form HHMMSSmmm.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for keyframe, frame_idx in tqdm(zip(keyframes, keyframe_indices),
                                    total=len(keyframes),
                                    desc=desc):
        # Millisecond-precision timestamp derived from frame index and fps
        timestamp = frame_idx / self.fps
        hours = int(timestamp // 3600)
        minutes = int((timestamp % 3600) // 60)
        seconds = int(timestamp % 60)
        milliseconds = int((timestamp % 1) * 1000)  # millisecond part
        time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
        output_path = os.path.join(output_dir,
                                   f'keyframe_{frame_idx:06d}_{time_str}.jpg')
        cv2.imwrite(output_path, keyframe)
def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str) -> None:
"""
根据指定的帧号提取帧如果多个帧在同一毫秒内只保留一个
"""
if not frame_numbers:
raise ValueError("未提供帧号列表")
if any(fn >= self.total_frames or fn < 0 for fn in frame_numbers):
raise ValueError("存在无效的帧号")
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# 用于记录已处理的时间戳(毫秒)
processed_timestamps = set()
for frame_number in tqdm(frame_numbers, desc="提取高清帧"):
# 计算精确到毫秒的时间戳
timestamp = frame_number / self.fps
timestamp_ms = int(timestamp * 1000) # 转换为毫秒
# 如果这一毫秒已经处理过,跳过
if timestamp_ms in processed_timestamps:
continue
self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
ret, frame = self.cap.read()
if ret:
# 记录这一毫秒已经处理
processed_timestamps.add(timestamp_ms)
# 计算时间戳字符串
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
output_path = os.path.join(output_folder,
f"keyframe_{frame_number:06d}_{time_str}.jpg")
cv2.imwrite(output_path, frame)
else:
logger.info(f"无法读取帧 {frame_number}")
logger.info(f"共提取了 {len(processed_timestamps)} 个不同时间戳的帧")
@staticmethod
def extract_numbers_from_folder(folder_path: str) -> List[int]:
    """
    Collect frame numbers from keyframe filenames in a folder.

    Args:
        folder_path: directory containing keyframe images

    Returns:
        List[int]: sorted frame numbers parsed from filenames of the form
        ``keyframe_<frame>_<HHMMSSmmm>.jpg``
    """
    # Matches e.g. keyframe_000123_010534123.jpg
    frame_pattern = re.compile(r'keyframe_(\d+)_\d{9}\.jpg$')
    frame_numbers = []
    for f in os.listdir(folder_path):
        if not f.endswith('.jpg'):
            continue
        matched = frame_pattern.search(f)
        if matched is None:
            logger.warning(f"文件名格式不匹配: {f}")
            continue
        frame_numbers.append(int(matched.group(1)))
    if not frame_numbers:
        logger.error(f"在目录 {folder_path} 中未找到有效的关键帧文件")
    frame_numbers.sort()
    return frame_numbers
def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int = 30) -> None:
"""
处理视频并提取关键帧
Args:
output_dir: 输出目录
skip_seconds: 跳过视频开头的秒数
"""
skip_frames = int(skip_seconds * self.fps)
logger.info("读取视频帧...")
frames = []
for frame in tqdm(self.preprocess_video(),
total=self.total_frames,
desc="读取视频"):
frames.append(frame)
frames = frames[skip_frames:]
if not frames:
raise ValueError(f"跳过 {skip_seconds} 秒后没有剩余帧可以处理")
logger.info("检测场景边界...")
shot_boundaries = self.detect_shot_boundaries(frames, threshold)
logger.info(f"检测到 {len(shot_boundaries)} 个场景边界")
keyframes, keyframe_indices = self.extract_keyframes(frames, shot_boundaries)
adjusted_indices = [idx + skip_frames for idx in keyframe_indices]
self.save_keyframes(keyframes, adjusted_indices, output_dir, desc="保存压缩关键帧")
def process_video_pipeline(self,
                           output_dir: str,
                           skip_seconds: float = 0,
                           threshold: int = 20,  # 降低默认阈值
                           compressed_width: int = 320,
                           keep_temp: bool = False) -> None:
    """
    执行完整的视频处理流程。

    流程:
        1. 用 ffmpeg 将原视频压缩到较小分辨率(加速场景检测);
        2. 在压缩视频上提取关键帧编号;
        3. 按编号回到原视频提取高清关键帧。

    Args:
        output_dir: 高清关键帧输出目录(临时文件写入其下的 temp/ 子目录)。
        skip_seconds: 跳过视频开头的秒数。
        threshold: 场景切换差异阈值降低默认阈值为20使场景检测更敏感。
        compressed_width: 压缩后视频短边/长边目标像素(横版约束宽、竖版约束高)。
        keep_temp: 为 True 时保留临时目录(压缩视频与小图关键帧)。

    Raises:
        subprocess.CalledProcessError: ffmpeg 压缩失败。
        ValueError: 压缩视频中未提取到任何关键帧。
    """
    import time
    import shutil

    os.makedirs(output_dir, exist_ok=True)
    temp_dir = os.path.join(output_dir, 'temp')
    compressed_dir = os.path.join(temp_dir, 'compressed')
    mini_frames_dir = os.path.join(temp_dir, 'mini_frames')
    hd_frames_dir = output_dir
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(compressed_dir, exist_ok=True)
    os.makedirs(mini_frames_dir, exist_ok=True)
    os.makedirs(hd_frames_dir, exist_ok=True)

    mini_processor = None
    compressed_video = None
    try:
        # 1. 压缩视频
        video_name = os.path.splitext(os.path.basename(self.video_path))[0]
        compressed_video = os.path.join(compressed_dir, f"{video_name}_compressed.mp4")
        # 获取原始视频的宽度和高度
        original_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        original_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        logger.info("步骤1: 压缩视频...")
        if original_width > original_height:
            # 横版视频:约束宽度,高度按比例自适应
            scale_filter = f'scale={compressed_width}:-1'
        else:
            # 竖版视频:约束高度,宽度按比例自适应
            scale_filter = f'scale=-1:{compressed_width}'
        ffmpeg_cmd = [
            'ffmpeg', '-i', self.video_path,
            '-vf', scale_filter,
            '-y',
            compressed_video
        ]
        try:
            subprocess.run(ffmpeg_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"FFmpeg 错误输出: {e.stderr}")
            raise

        # 2. 从压缩视频中提取关键帧
        logger.info("\n步骤2: 从压缩视频提取关键帧...")
        mini_processor = VideoProcessor(compressed_video)
        mini_processor.process_video(mini_frames_dir, skip_seconds, threshold)

        # 3. 从原始视频提取高清关键帧
        logger.info("\n步骤3: 提取高清关键帧...")
        frame_numbers = self.extract_numbers_from_folder(mini_frames_dir)
        if not frame_numbers:
            raise ValueError("未能从压缩视频中提取到有效的关键帧")
        self.extract_frames_by_numbers(frame_numbers, hd_frames_dir)
        logger.info(f"处理完成!高清关键帧保存在: {hd_frames_dir}")
    except Exception:
        import traceback
        logger.error(f"视频处理失败: \n{traceback.format_exc()}")
        raise
    finally:
        # 释放资源
        if mini_processor:
            mini_processor.cap.release()
            del mini_processor
        # 确保视频文件句柄被释放
        if hasattr(self, 'cap'):
            self.cap.release()
        # 等待操作系统真正释放文件句柄Windows 下删除被占用文件会失败)
        time.sleep(0.5)
        if not keep_temp:
            try:
                # 先删除压缩视频文件
                if compressed_video and os.path.exists(compressed_video):
                    try:
                        os.remove(compressed_video)
                    except Exception as e:
                        logger.warning(f"删除压缩视频失败: {e}")
                # 再删除临时目录(带重试,应对句柄延迟释放)
                cleaned = True
                if os.path.exists(temp_dir):
                    cleaned = False
                    max_retries = 3
                    for i in range(max_retries):
                        try:
                            shutil.rmtree(temp_dir)
                            cleaned = True
                            break
                        except Exception as e:
                            if i == max_retries - 1:
                                logger.warning(f"清理临时文件失败: {e}")
                            else:
                                time.sleep(1)  # 等待1秒后重试
                # 仅在实际清理成功时报告成功(原实现即使失败也会打印已清理)
                if cleaned:
                    logger.info("临时文件已清理")
            except Exception as e:
                logger.warning(f"清理临时文件时出错: {e}")
if __name__ == "__main__":
    # 简单的命令行入口:对测试视频执行完整关键帧提取流程并计时
    import time

    started_at = time.time()
    video_proc = VideoProcessor("E:\\projects\\NarratoAI\\resource\\videos\\test.mp4")
    video_proc.process_video_pipeline(output_dir="output")
    elapsed = time.time() - started_at
    print(f"处理完成!总耗时: {elapsed:.2f}")

View File

@ -1,10 +1,9 @@
[app]
project_version="0.5.3"
project_version="0.6.0"
# 支持视频理解的大模型提供商
# gemini
# qwenvl
vision_llm_provider="qwenvl"
vision_analysis_prompt = "你是资深视频内容分析专家,擅长分析视频画面信息,分析下面视频画面内容,只输出客观的画面描述不要给任何总结或评价"
########## Vision Gemini API Key
vision_gemini_api_key = ""
@ -173,12 +172,7 @@
speech_region=""
[frames]
skip_seconds = 0
# threshold差异阈值用于判断两个连续帧之间是否发生了场景切换
# 较小的阈值(如 20更敏感能捕捉到细微的场景变化但可能会误判关键帧图片更多
# 较大的阈值(如 40更保守只捕捉明显的场景切换但可能会漏掉渐变场景关键帧图片更少
# 默认值 30在实践中是一个比较平衡的选择
threshold = 30
version = "v2"
# 提取关键帧的间隔时间
frame_interval_input = 3
# 大模型单次处理的关键帧数量
vision_batch_size = 5
vision_batch_size = 10

View File

@ -1,38 +1,46 @@
requests~=2.31.0
moviepy==2.0.0.dev2
faster-whisper~=1.0.1
uvicorn~=0.27.1
fastapi~=0.115.4
tomli~=2.0.1
streamlit~=1.40.0
loguru~=0.7.2
aiohttp~=3.10.10
urllib3~=2.2.1
pydantic~=2.6.3
g4f~=0.3.0.4
dashscope~=1.15.0
google.generativeai>=0.8.3
python-multipart~=0.0.9
redis==5.0.3
opencv-python~=4.10.0.84
# for azure speech
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0
git-changelog~=2.5.2
watchdog==5.0.2
pydub==0.25.1
psutil>=5.9.0
opencv-python~=4.10.0.84
scikit-learn~=1.5.2
google-generativeai~=0.8.3
pillow==10.3.0
python-dotenv~=1.0.1
openai~=1.53.0
tqdm>=4.66.6
tenacity>=9.0.0
tiktoken==0.8.0
yt-dlp==2024.11.18
pysrt==1.1.2
httpx==0.27.2
transformers==4.47.0
# 必须项
requests~=2.32.0
moviepy==2.1.1
edge-tts==6.1.19
streamlit~=1.45.0
watchdog==6.0.0
loguru~=0.7.3
tomli~=2.2.1
pydub==0.25.1
openai~=1.77.0
google-generativeai>=0.8.5
# 待优化项
# opencv-python==4.11.0.86
# scikit-learn==1.6.1
# fastapi~=0.115.4
# uvicorn~=0.27.1
# pydantic~=2.11.4
# faster-whisper~=1.0.1
# tomli~=2.0.1
# aiohttp~=3.10.10
# httpx==0.27.2
# urllib3~=2.2.1
# python-multipart~=0.0.9
# redis==5.0.3
# opencv-python~=4.10.0.84
# azure-cognitiveservices-speech~=1.37.0
# git-changelog~=2.5.2
# watchdog==5.0.2
# pydub==0.25.1
# psutil>=5.9.0
# scikit-learn~=1.5.2
# pillow==10.3.0
# python-dotenv~=1.0.1
# tqdm>=4.66.6
# tenacity>=9.0.0
# tiktoken==0.8.0
# pysrt==1.1.2
# transformers==4.50.0
# yt-dlp==2025.4.30

232
webui.py
View File

@ -1,13 +1,14 @@
import streamlit as st
import os
import sys
from uuid import uuid4
from loguru import logger
from app.config import config
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings, merge_settings, system_settings
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
review_settings, merge_settings, system_settings
from webui.utils import cache, file_utils
from app.utils import utils
from app.models.schema import VideoClipParams, VideoAspect
from webui.utils.performance import PerformanceMonitor
# 初始化配置 - 必须是第一个 Streamlit 命令
st.set_page_config(
@ -17,7 +18,7 @@ st.set_page_config(
initial_sidebar_state="auto",
menu_items={
"Report a bug": "https://github.com/linyqh/NarratoAI/issues",
'About': f"# NarratoAI:sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
'About': f"# Narrato:blue[AI] :sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
f"自动化影视解说视频详情请移步https://github.com/linyqh/NarratoAI"
},
)
@ -28,6 +29,7 @@ hide_streamlit_style = """
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
def init_log():
"""初始化日志配置"""
from loguru import logger
@ -35,17 +37,7 @@ def init_log():
_lvl = "DEBUG"
def format_record(record):
# 增加更多需要过滤的警告消息
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
for msg in ignore_messages:
if msg in record["message"]:
return ""
# 简化日志格式化处理不尝试按特定字符串过滤torch相关内容
file_path = record["file"].path
relative_path = os.path.relpath(file_path, config.root_dir)
record["file"].path = f"./{relative_path}"
@ -57,23 +49,54 @@ def init_log():
'- <level>{message}</>' + "\n"
return _format
# 优化日志过滤器
def log_filter(record):
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
return not any(msg in record["message"] for msg in ignore_messages)
# 替换为更简单的过滤方式避免在过滤时访问message内容
# 此处先不设置复杂的过滤器,等应用启动后再动态添加
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
filter=log_filter
colorize=True
)
# 应用启动后,可以再添加更复杂的过滤器
def setup_advanced_filters():
"""在应用完全启动后设置高级过滤器"""
try:
for handler_id in logger._core.handlers:
logger.remove(handler_id)
# 重新添加带有高级过滤的处理器
def advanced_filter(record):
"""更复杂的过滤器,在应用启动后安全使用"""
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
return not any(msg in record["message"] for msg in ignore_messages)
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
filter=advanced_filter
)
except Exception as e:
# 如果过滤器设置失败,确保日志仍然可用
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True
)
logger.error(f"设置高级日志过滤器失败: {e}")
# 将高级过滤器设置放到启动主逻辑后
import threading
threading.Timer(5.0, setup_advanced_filters).start()
def init_global_state():
"""初始化全局状态"""
if 'video_clip_json' not in st.session_state:
@ -85,6 +108,7 @@ def init_global_state():
if 'subclip_videos' not in st.session_state:
st.session_state['subclip_videos'] = {}
def tr(key):
"""翻译函数"""
i18n_dir = os.path.join(os.path.dirname(__file__), "webui", "i18n")
@ -92,90 +116,94 @@ def tr(key):
loc = locales.get(st.session_state['ui_language'], {})
return loc.get("Translation", {}).get(key, key)
def render_generate_button():
"""渲染生成按钮和处理逻辑"""
if st.button(tr("Generate Video"), use_container_width=True, type="primary"):
from app.services import task as tm
# 重置日志容器和记录
log_container = st.empty()
log_records = []
def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))
from loguru import logger
logger.add(log_received)
config.save_config()
task_id = st.session_state.get('task_id')
if not task_id:
st.error(tr("请先裁剪视频"))
return
if not st.session_state.get('video_clip_json_path'):
st.error(tr("脚本文件不能为空"))
return
if not st.session_state.get('video_origin_path'):
st.error(tr("视频文件不能为空"))
return
st.toast(tr("生成视频"))
logger.info(tr("开始生成视频"))
# 获取所有参数
script_params = script_settings.get_script_params()
video_params = video_settings.get_video_params()
audio_params = audio_settings.get_audio_params()
subtitle_params = subtitle_settings.get_subtitle_params()
# 合并所有参数
all_params = {
**script_params,
**video_params,
**audio_params,
**subtitle_params
}
# 创建参数对象
params = VideoClipParams(**all_params)
result = tm.start_subclip(
task_id=task_id,
params=params,
subclip_path_videos=st.session_state['subclip_videos']
)
video_files = result.get("videos", [])
st.success(tr("视生成完成"))
try:
from app.services import task as tm
import torch
# 重置日志容器和记录
log_container = st.empty()
log_records = []
if video_files:
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))
file_utils.open_task_folder(config.root_dir, task_id)
logger.info(tr("视频生成完成"))
from loguru import logger
logger.add(log_received)
config.save_config()
task_id = st.session_state.get('task_id')
if not task_id:
st.error(tr("请先裁剪视频"))
return
if not st.session_state.get('video_clip_json_path'):
st.error(tr("脚本文件不能为空"))
return
if not st.session_state.get('video_origin_path'):
st.error(tr("视频文件不能为空"))
return
st.toast(tr("生成视频"))
logger.info(tr("开始生成视频"))
# 获取所有参数
script_params = script_settings.get_script_params()
video_params = video_settings.get_video_params()
audio_params = audio_settings.get_audio_params()
subtitle_params = subtitle_settings.get_subtitle_params()
# 合并所有参数
all_params = {
**script_params,
**video_params,
**audio_params,
**subtitle_params
}
# 创建参数对象
params = VideoClipParams(**all_params)
result = tm.start_subclip(
task_id=task_id,
params=params,
subclip_path_videos=st.session_state['subclip_videos']
)
video_files = result.get("videos", [])
st.success(tr("视生成完成"))
try:
if video_files:
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
file_utils.open_task_folder(config.root_dir, task_id)
logger.info(tr("视频生成完成"))
finally:
PerformanceMonitor.cleanup_resources()
def main():
"""主函数"""
init_log()
init_global_state()
utils.init_resources()
st.title(f"NarratoAI :sunglasses:📽️")
# 仅初始化基本资源避免过早地加载依赖PyTorch的资源
# 检查是否能分解utils.init_resources()为基本资源和高级资源(如依赖PyTorch的资源)
try:
utils.init_resources()
except Exception as e:
logger.warning(f"资源初始化时出现警告: {e}")
st.title(f"Narrato:blue[AI]:sunglasses: 📽️")
st.write(tr("Get Help"))
# 首先渲染不依赖PyTorch的UI部分
# 渲染基础设置面板
basic_settings.render_basic_settings(tr)
# 渲染合并设置
@ -190,14 +218,18 @@ def main():
audio_settings.render_audio_panel(tr)
with panel[2]:
subtitle_settings.render_subtitle_panel(tr)
# 渲染系统设置面板
system_settings.render_system_panel(tr)
# 渲染视频审查面板
review_settings.render_review_panel(tr)
# 渲染生成按钮和处理逻辑
# 放到最后渲染可能使用PyTorch的部分
# 渲染系统设置面板
with panel[2]:
system_settings.render_system_panel(tr)
# 放到最后渲染生成按钮和处理逻辑
render_generate_button()
if __name__ == "__main__":
main()

View File

@ -8,7 +8,7 @@ from webui.components import (
audio_settings,
subtitle_settings
)
from webui.utils import cache, file_utils, performance
from webui.utils import cache, file_utils
__all__ = [
'config',
@ -17,6 +17,5 @@ __all__ = [
'audio_settings',
'subtitle_settings',
'cache',
'file_utils',
'performance'
'file_utils'
]

View File

@ -1,7 +1,10 @@
import traceback
import streamlit as st
import os
from app.config import config
from app.utils import utils
from loguru import logger
def render_basic_settings(tr):
@ -266,7 +269,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
elif provider.lower() == 'moonshot':
base_url = "https://api.moonshot.cn/v1"
elif provider.lower() == 'deepseek':
base_url = "https://api.deepseek.com/v1"
base_url = "https://api.deepseek.com"
# 构建测试URL
test_url = f"{base_url.rstrip('/')}/chat/completions"
@ -288,7 +291,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
"messages": [
{"role": "user", "content": "直接回复我文本'当前网络可用'"}
],
"max_tokens": 10
"stream": False
}
# 发送测试请求
@ -296,7 +299,6 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
test_url,
headers=headers,
json=test_data,
timeout=10
)
if response.status_code == 200:
@ -313,7 +315,7 @@ def render_text_llm_settings(tr):
st.subheader(tr("Text Generation Model Settings"))
# 文案生成模型提供商选择
text_providers = ['DeepSeek', 'OpenAI', 'Qwen', 'Moonshot', 'Gemini']
text_providers = ['DeepSeek', 'OpenAI', 'Siliconflow', 'Qwen', 'Moonshot', 'Gemini']
saved_text_provider = config.app.get("text_llm_provider", "DeepSeek").lower()
saved_provider_index = 0
@ -331,9 +333,9 @@ def render_text_llm_settings(tr):
config.app["text_llm_provider"] = text_provider
# 获取已保存的文本模型配置
text_api_key = config.app.get(f"text_{text_provider}_api_key", "")
text_base_url = config.app.get(f"text_{text_provider}_base_url", "")
text_model_name = config.app.get(f"text_{text_provider}_model_name", "")
text_api_key = config.app.get(f"text_{text_provider}_api_key")
text_base_url = config.app.get(f"text_{text_provider}_base_url")
text_model_name = config.app.get(f"text_{text_provider}_model_name")
# 渲染文本模型配置输入框
st_text_api_key = st.text_input(tr("Text API Key"), value=text_api_key, type="password")
@ -342,6 +344,8 @@ def render_text_llm_settings(tr):
# 添加测试按钮
if st.button(tr("Test Connection"), key="test_text_connection"):
logger.debug(st_text_base_url)
logger.debug(st_text_model_name)
with st.spinner(tr("Testing connection...")):
success, message = test_text_model_connection(
api_key=st_text_api_key,
@ -364,11 +368,11 @@ def render_text_llm_settings(tr):
if st_text_model_name:
config.app[f"text_{text_provider}_model_name"] = st_text_model_name
# Cloudflare 特殊配置
if text_provider == 'cloudflare':
st_account_id = st.text_input(
tr("Account ID"),
value=config.app.get(f"text_{text_provider}_account_id", "")
)
if st_account_id:
config.app[f"text_{text_provider}_account_id"] = st_account_id
# # Cloudflare 特殊配置
# if text_provider == 'cloudflare':
# st_account_id = st.text_input(
# tr("Account ID"),
# value=config.app.get(f"text_{text_provider}_account_id", "")
# )
# if st_account_id:
# config.app[f"text_{text_provider}_account_id"] = st_account_id

View File

@ -285,8 +285,8 @@ def render_merge_settings(tr):
error_message = str(e)
if "moviepy" in error_message.lower():
st.error(tr("Error processing video files. Please check if the videos are valid MP4 files."))
elif "pysrt" in error_message.lower():
st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
# elif "pysrt" in error_message.lower():
# st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
else:
st.error(f"{tr('Error during merge')}: {error_message}")

View File

@ -33,7 +33,7 @@ def render_video_item(tr, video_list, subclip_videos, index):
video_script = video_list[index]
# 显示时间戳
timestamp = video_script.get('timestamp', '')
timestamp = video_script.get('_id', '')
st.text_area(
tr("Timestamp"),
value=timestamp,

View File

@ -47,7 +47,7 @@ def render_script_file(tr, params):
(tr("None"), ""),
(tr("Auto Generate"), "auto"),
(tr("Short Generate"), "short"),
(tr("Upload Script"), "upload_script") # 新增上传脚本选项
(tr("Upload Script"), "upload_script")
]
# 获取已有脚本文件
@ -214,38 +214,25 @@ def render_script_buttons(tr, params):
# 根据脚本类型显示不同的设置
if script_path != "short":
# 非短视频模式下显示原有的三个输入框
input_cols = st.columns(3)
input_cols = st.columns(2)
with input_cols[0]:
skip_seconds = st.number_input(
"skip_seconds",
st.number_input(
tr("Frame Interval (seconds)"),
min_value=0,
value=st.session_state.get('skip_seconds', config.frames.get('skip_seconds', 0)),
help=tr("Skip the first few seconds"),
key="skip_seconds_input"
value=st.session_state.get('frame_interval_input', config.frames.get('frame_interval_input', 3)),
help=tr("Frame Interval (seconds) (More keyframes consume more tokens)"),
key="frame_interval_input"
)
st.session_state['skip_seconds'] = skip_seconds
with input_cols[1]:
threshold = st.number_input(
"threshold",
st.number_input(
tr("Batch Size"),
min_value=0,
value=st.session_state.get('threshold', config.frames.get('threshold', 30)),
help=tr("Difference threshold"),
key="threshold_input"
value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 10)),
help=tr("Batch Size (More keyframes consume more tokens)"),
key="vision_batch_size"
)
st.session_state['threshold'] = threshold
with input_cols[2]:
vision_batch_size = st.number_input(
"vision_batch_size",
min_value=1,
max_value=20,
value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 5)),
help=tr("Vision processing batch size"),
key="vision_batch_size_input"
)
st.session_state['vision_batch_size'] = vision_batch_size
# 生成/加载按钮
if script_path == "auto":
@ -259,7 +246,8 @@ def render_script_buttons(tr, params):
if st.button(button_name, key="script_action", disabled=not script_path):
if script_path == "auto":
generate_script_docu(tr, params)
# 执行纪录片视频脚本生成(视频无字幕无配音)
generate_script_docu(params)
elif script_path == "short":
# 获取自定义片段数量参数
custom_clips = st.session_state.get('custom_clips', 5)
@ -366,12 +354,11 @@ def crop_video(tr, params):
utils.cut_video(params, update_progress)
time.sleep(0.5)
progress_bar.progress(100)
status_text.text("剪完成!")
st.success("视频剪辑成功完成!")
except Exception as e:
st.error(f"剪辑过程中发生错误: {str(e)}")
finally:
time.sleep(2)
time.sleep(1)
progress_bar.empty()
status_text.empty()

View File

@ -127,7 +127,7 @@ def get_subtitle_params():
'font_name': st.session_state.get('font_name', ''),
'font_size': st.session_state.get('font_size', 60),
'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'),
'position': st.session_state.get('subtitle_position', 'bottom'),
'subtitle_position': st.session_state.get('subtitle_position', 'bottom'),
'custom_position': st.session_state.get('custom_position', 70.0),
'stroke_color': st.session_state.get('stroke_color', '#000000'),
'stroke_width': st.session_state.get('stroke_width', 1.5),

View File

@ -85,6 +85,7 @@
"TTS Provider": "TTS Provider",
"Hide Log": "Hide Log",
"Upload Local Files": "Upload Local Files",
"File Uploaded Successfully": "File Uploaded Successfully"
"File Uploaded Successfully": "File Uploaded Successfully",
"Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)"
}
}

View File

@ -115,7 +115,6 @@
"Text Generation Model Settings": "文案生成模型设置",
"LLM Model Name": "大语言模型名称",
"LLM Model API Key": "大语言模型 API 密钥",
"Batch Size": "批处理大小",
"Text Model Provider": "文案生成模型提供商",
"Text API Key": "文案生成 API 密钥",
"Text Base URL": "文案生成接口地址",
@ -192,6 +191,10 @@
"Generate Short Video Script": "AI生成短剧混剪脚本",
"Adjust the volume of the original audio": "调整原始音频的音量",
"Original Volume": "视频音量",
"Auto Generate": "纪录片解说 (画面解说)"
"Auto Generate": "纪录片解说 (画面解说)",
"Frame Interval (seconds)": "帧间隔 (秒)",
"Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)",
"Batch Size": "批处理大小",
"Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多"
}
}
}

View File

@ -5,20 +5,23 @@ import time
import asyncio
import traceback
import requests
from app.utils import video_processor
import streamlit as st
from loguru import logger
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datetime import datetime
from app.config import config
from app.utils.script_generator import ScriptProcessor
from app.utils import utils, video_processor, video_processor_v2, qwenvl_analyzer
from app.utils import utils, video_processor, qwenvl_analyzer
from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config
def generate_script_docu(tr, params):
def generate_script_docu(params):
"""
生成 纪录片 视频脚本
要求: 原视频无字幕无配音
适合场景: 纪录片动物搞笑解说荒野建造等
"""
progress_bar = st.progress(0)
status_text = st.empty()
@ -35,8 +38,9 @@ def generate_script_docu(tr, params):
if not params.video_origin_path:
st.error("请先选择视频文件")
return
# ===================提取键帧===================
"""
1. 提取键帧
"""
update_progress(10, "正在提取关键帧...")
# 创建临时目录用于存储关键帧
@ -64,21 +68,12 @@ def generate_script_docu(tr, params):
os.makedirs(video_keyframes_dir, exist_ok=True)
# 初始化视频处理器
if config.frames.get("version") == "v2":
processor = video_processor_v2.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=st.session_state.get('skip_seconds'),
threshold=st.session_state.get('threshold')
)
else:
processor = video_processor.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video(
output_dir=video_keyframes_dir,
skip_seconds=0
)
processor = video_processor.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
interval_seconds=st.session_state.get('frame_interval_input'),
)
# 获取所有关键文件路径
for filename in sorted(os.listdir(video_keyframes_dir)):
@ -101,9 +96,11 @@ def generate_script_docu(tr, params):
raise Exception(f"关键帧提取失败: {str(e)}")
# 根据不同的 LLM 提供商处理
"""
2. 视觉分析(批量分析每一帧)
"""
vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
logger.debug(f"Vision LLM 提供商: {vision_llm_provider}")
logger.debug(f"VLM 视觉大模型提供商: {vision_llm_provider}")
try:
# ===================初始化视觉分析器===================
@ -137,111 +134,240 @@ def generate_script_docu(tr, params):
# 执行异步分析
vision_batch_size = st.session_state.get('vision_batch_size') or config.frames.get("vision_batch_size")
vision_analysis_prompt = """
我提供了 %s 张视频帧它们按时间顺序排列代表一个连续的视频片段请仔细分析每一帧的内容并关注帧与帧之间的变化以理解整个片段的活动
首先请详细描述每一帧的关键视觉信息包含主要内容人物动作和场景
然后基于所有帧的分析请用**简洁的语言**总结整个视频片段中发生的主要活动或事件流程
请务必使用 JSON 格式输出你的结果JSON 结构应如下
{
"frame_observations": [
{
"frame_number": 1, // 或其他标识帧的方式
"observation": "描述每张视频帧中的主要内容、人物、动作和场景。"
},
// ... 更多帧的观察 ...
],
"overall_activity_summary": "在这里填写你总结的整个片段的主要活动,保持简洁。"
}
请务必不要遗漏视频帧我提供了 %s 张视频帧frame_observations 必须包含 %s 个元素
请只返回 JSON 字符串不要包含任何其他解释性文字
"""
results = loop.run_until_complete(
analyzer.analyze_images(
images=keyframe_files,
prompt=config.app.get('vision_analysis_prompt'),
prompt=vision_analysis_prompt,
batch_size=vision_batch_size
)
)
loop.close()
"""
3. 处理分析结果格式化为 json 数据
"""
# ===================处理分析结果===================
update_progress(60, "正在整理分析结果...")
# 合并所有批次的析结果
# 合并所有批次的析结果
frame_analysis = ""
merged_frame_observations = [] # 合并所有批次的帧观察
overall_activity_summaries = [] # 合并所有批次的整体总结
prev_batch_files = None
frame_counter = 1 # 初始化帧计数器,用于给所有帧分配连续的序号
# logger.debug(json.dumps(results, indent=4, ensure_ascii=False))
# 确保分析目录存在
analysis_dir = os.path.join(utils.storage_dir(), "temp", "analysis")
os.makedirs(analysis_dir, exist_ok=True)
origin_res = os.path.join(analysis_dir, "frame_analysis.json")
with open(origin_res, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# 开始处理
for result in results:
if 'error' in result:
logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
# 获取当前批次的文件列表 keyframe_001136_000045.jpg 将 000045 精度提升到 毫秒
continue
# 获取当前批次的文件列表
batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
logger.debug(f"批次 {result['batch_index']} 处理完成,共 {len(batch_files)} 张图片")
# logger.debug(batch_files)
first_timestamp, last_timestamp, _ = get_batch_timestamps(batch_files, prev_batch_files)
# 获取批次的时间戳范围
first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
logger.debug(f"处理时间戳: {first_timestamp}-{last_timestamp}")
# 添加带时间戳的分析结果
frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
frame_analysis += result['response']
frame_analysis += "\n"
# 解析响应中的JSON数据
response_text = result['response']
try:
# 处理可能包含```json```格式的响应
if "```json" in response_text:
json_content = response_text.split("```json")[1].split("```")[0].strip()
elif "```" in response_text:
json_content = response_text.split("```")[1].split("```")[0].strip()
else:
json_content = response_text.strip()
response_data = json.loads(json_content)
# 提取frame_observations和overall_activity_summary
if "frame_observations" in response_data:
frame_obs = response_data["frame_observations"]
overall_summary = response_data.get("overall_activity_summary", "")
# 添加时间戳信息到每个帧观察
for i, obs in enumerate(frame_obs):
if i < len(batch_files):
# 从文件名中提取时间戳
file_path = batch_files[i]
file_name = os.path.basename(file_path)
# 提取时间戳字符串 (格式如: keyframe_000675_000027000.jpg)
# 格式解析: keyframe_帧序号_毫秒时间戳.jpg
timestamp_parts = file_name.split('_')
if len(timestamp_parts) >= 3:
timestamp_str = timestamp_parts[-1].split('.')[0]
try:
# 修正时间戳解析逻辑
# 格式为000100000表示00:01:00,000即1分钟
# 需要按照对应位数进行解析:
# 前两位是小时,中间两位是分钟,后面是秒和毫秒
if len(timestamp_str) >= 9: # 确保格式正确
hours = int(timestamp_str[0:2])
minutes = int(timestamp_str[2:4])
seconds = int(timestamp_str[4:6])
milliseconds = int(timestamp_str[6:9])
# 计算总秒数
timestamp_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000
formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳
else:
# 兼容旧的解析方式
timestamp_seconds = int(timestamp_str) / 1000 # 转换为秒
formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳
except ValueError:
logger.warning(f"无法解析时间戳: {timestamp_str}")
timestamp_seconds = 0
formatted_time = "00:00:00,000"
else:
logger.warning(f"文件名格式不符合预期: {file_name}")
timestamp_seconds = 0
formatted_time = "00:00:00,000"
# 添加额外信息到帧观察
obs["frame_path"] = file_path
obs["timestamp"] = formatted_time
obs["timestamp_seconds"] = timestamp_seconds
obs["batch_index"] = result['batch_index']
# 使用全局递增的帧计数器替换原始的frame_number
if "frame_number" in obs:
obs["original_frame_number"] = obs["frame_number"] # 保留原始编号作为参考
obs["frame_number"] = frame_counter # 赋值连续的帧编号
frame_counter += 1 # 增加帧计数器
# 添加到合并列表
merged_frame_observations.append(obs)
# 添加批次整体总结信息
if overall_summary:
# 从文件名中提取时间戳数值
first_time_str = first_timestamp.split('_')[-1].split('.')[0]
last_time_str = last_timestamp.split('_')[-1].split('.')[0]
# 转换为毫秒并计算持续时间(秒)
try:
# 修正解析逻辑,与上面相同的方式解析时间戳
if len(first_time_str) >= 9 and len(last_time_str) >= 9:
# 解析第一个时间戳
first_hours = int(first_time_str[0:2])
first_minutes = int(first_time_str[2:4])
first_seconds = int(first_time_str[4:6])
first_ms = int(first_time_str[6:9])
first_time_seconds = first_hours * 3600 + first_minutes * 60 + first_seconds + first_ms / 1000
# 解析第二个时间戳
last_hours = int(last_time_str[0:2])
last_minutes = int(last_time_str[2:4])
last_seconds = int(last_time_str[4:6])
last_ms = int(last_time_str[6:9])
last_time_seconds = last_hours * 3600 + last_minutes * 60 + last_seconds + last_ms / 1000
batch_duration = last_time_seconds - first_time_seconds
else:
# 兼容旧的解析方式
first_time_ms = int(first_time_str)
last_time_ms = int(last_time_str)
batch_duration = (last_time_ms - first_time_ms) / 1000
except ValueError:
# 使用 utils.time_to_seconds 函数处理格式化的时间戳
first_time_seconds = utils.time_to_seconds(first_time_str.replace('_', ':').replace('-', ','))
last_time_seconds = utils.time_to_seconds(last_time_str.replace('_', ':').replace('-', ','))
batch_duration = last_time_seconds - first_time_seconds
overall_activity_summaries.append({
"batch_index": result['batch_index'],
"time_range": f"{first_timestamp}-{last_timestamp}",
"duration_seconds": batch_duration,
"summary": overall_summary
})
except Exception as e:
logger.error(f"解析批次 {result['batch_index']} 的响应数据失败: {str(e)}")
# 添加原始响应作为回退
frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
frame_analysis += response_text
frame_analysis += "\n"
# 更新上一个批次的文件
prev_batch_files = batch_files
# 将合并后的结果转为JSON字符串
merged_results = {
"frame_observations": merged_frame_observations,
"overall_activity_summaries": overall_activity_summaries
}
# 使用当前时间创建文件名
now = datetime.now()
timestamp_str = now.strftime("%Y%m%d_%H%M")
# 保存完整的分析结果为JSON
analysis_filename = f"frame_analysis_{timestamp_str}.json"
analysis_json_path = os.path.join(analysis_dir, analysis_filename)
with open(analysis_json_path, 'w', encoding='utf-8') as f:
json.dump(merged_results, f, ensure_ascii=False, indent=2)
logger.info(f"分析结果已保存到: {analysis_json_path}")
if not frame_analysis.strip():
raise Exception("未能生成有效的帧分析结果")
# 保存分析结果
analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt")
with open(analysis_path, 'w', encoding='utf-8') as f:
f.write(frame_analysis)
update_progress(70, "正在生成脚本...")
"""
4. 生成文案
"""
logger.info("开始准备生成解说文案")
update_progress(80, "正在生成文案...")
from app.services.generate_narration_script import parse_frame_analysis_to_markdown, generate_narration
# 从配置中获取文本生成相关配置
text_provider = config.app.get('text_llm_provider', 'gemini').lower()
text_api_key = config.app.get(f'text_{text_provider}_api_key')
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
# 构建帧内容列表
frame_content_list = []
prev_batch_files = None
# 整理帧分析数据
markdown_output = parse_frame_analysis_to_markdown(analysis_json_path)
for i, result in enumerate(results):
if 'error' in result:
continue
batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
_, _, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
frame_content = {
"timestamp": timestamp_range,
"picture": result['response'],
"narration": "",
"OST": 2
}
frame_content_list.append(frame_content)
logger.debug(f"添加帧内容: 时间范围={timestamp_range}, 分析结果长度={len(result['response'])}")
# 更新上一个批次的文件
prev_batch_files = batch_files
if not frame_content_list:
raise Exception("没有有效的帧内容可以处理")
# ===================开始生成文案===================
update_progress(80, "正在生成文案...")
# 校验配置
api_params = {
"vision_api_key": vision_api_key,
"vision_model_name": vision_model,
"vision_base_url": vision_base_url or "",
"text_api_key": text_api_key,
"text_model_name": text_model,
"text_base_url": text_base_url or ""
}
chekc_video_config(api_params)
custom_prompt = st.session_state.get('custom_prompt', '')
processor = ScriptProcessor(
model_name=text_model,
api_key=text_api_key,
prompt=custom_prompt,
base_url=text_base_url or "",
video_theme=st.session_state.get('video_theme', '')
# 生成文案
# 生成解说文案
narration = generate_narration(
markdown_output,
text_api_key,
base_url=text_base_url,
model=text_model
)
# 处理帧内容生成脚本
script_result = processor.process_frames(frame_content_list)
narration_dict = json.loads(narration)['items']
# 为 narration_dict 中每个 item 新增一个 OST: 2 的字段, 代表保留原声和配音
narration_dict = [{**item, "OST": 2} for item in narration_dict]
logger.debug(f"解说文案创作完成:\n{"\n".join([item['narration'] for item in narration_dict])}")
# 结果转换为JSON字符串
script = json.dumps(script_result, ensure_ascii=False, indent=2)
script = json.dumps(narration_dict, ensure_ascii=False, indent=2)
except Exception as e:
logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}")
@ -250,7 +376,7 @@ def generate_script_docu(tr, params):
if script is None:
st.error("生成脚本失败,请检查日志")
st.stop()
logger.info(f"脚本生成完成")
logger.success(f"剪辑脚本生成完成")
if isinstance(script, list):
st.session_state['video_clip_json'] = script
elif isinstance(script, str):

View File

@ -1,8 +0,0 @@
from .performance import monitor_performance, PerformanceMonitor
from .cache import *
from .file_utils import *
__all__ = [
'monitor_performance',
'PerformanceMonitor'
]

View File

@ -1,8 +1,8 @@
"""
合并视频和字幕文件
"""
from moviepy.editor import VideoFileClip, concatenate_videoclips
import pysrt
from moviepy import VideoFileClip, concatenate_videoclips
# import pysrt
import os

View File

@ -1,37 +0,0 @@
import psutil
import os
from loguru import logger
import torch
class PerformanceMonitor:
@staticmethod
def monitor_memory():
process = psutil.Process(os.getpid())
memory_info = process.memory_info()
logger.debug(f"Memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
if torch.cuda.is_available():
gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024
logger.debug(f"GPU Memory usage: {gpu_memory:.2f} MB")
@staticmethod
def cleanup_resources():
if torch.cuda.is_available():
torch.cuda.empty_cache()
import gc
gc.collect()
PerformanceMonitor.monitor_memory()
def monitor_performance(func):
"""性能监控装饰器"""
def wrapper(*args, **kwargs):
try:
PerformanceMonitor.monitor_memory()
result = func(*args, **kwargs)
return result
finally:
PerformanceMonitor.cleanup_resources()
return wrapper