Merge pull request #71 from linyqh/dev_v2

0.3.9新版本发布
2025-12-30 18:00:16 +00:00 · 2024-12-06 18:46:34 +08:00 · 2024-12-06 18:46:34 +08:00 · 894ba13026
commit 894ba13026
parent 4621a6729a 36e6018c74
45 changed files with 3946 additions and 1256 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -23,8 +23,12 @@ node_modules
 # 模型目录
 /models/
 ./models/*
-resource/scripts/*
-resource/videos/*
-resource/songs/*
-resource/fonts/*
+resource/scripts/*.json
+resource/videos/*.mp4
+resource/songs/*.mp3
+resource/songs/*.flac
+resource/fonts/*.ttc
+resource/fonts/*.ttf
+resource/fonts/*.otf
+resource/srt/*.srt
 app/models/faster-whisper-large-v2/*
--- a/README.md
+++ b/README.md
@@ -43,6 +43,9 @@ NarratoAI 是一个自动化影视解说工具，基于LLM实现文案撰写、
 - [x] 发布 0.3.5 整合包
 - [ ] 支持阿里 Qwen2-VL 大模型理解视频
 - [ ] 支持短剧解说
+  - [x] 合并素材
+  - [ ] 一键转录
+  - [ ] 一键清理缓存
 - [ ] ...

 ## 配置要求 📦
--- a/app/controllers/v1/video.py
+++ b/app/controllers/v1/video.py
@@ -163,109 +163,109 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID
    )


-@router.get(
-    "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
-)
-def get_bgm_list(request: Request):
-    suffix = "*.mp3"
-    song_dir = utils.song_dir()
-    files = glob.glob(os.path.join(song_dir, suffix))
-    bgm_list = []
-    for file in files:
-        bgm_list.append(
-            {
-                "name": os.path.basename(file),
-                "size": os.path.getsize(file),
-                "file": file,
-            }
-        )
-    response = {"files": bgm_list}
-    return utils.get_response(200, response)
+# @router.get(
+#     "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
+# )
+# def get_bgm_list(request: Request):
+#     suffix = "*.mp3"
+#     song_dir = utils.song_dir()
+#     files = glob.glob(os.path.join(song_dir, suffix))
+#     bgm_list = []
+#     for file in files:
+#         bgm_list.append(
+#             {
+#                 "name": os.path.basename(file),
+#                 "size": os.path.getsize(file),
+#                 "file": file,
+#             }
+#         )
+#     response = {"files": bgm_list}
+#     return utils.get_response(200, response)
+#

-
-@router.post(
-    "/musics",
-    response_model=BgmUploadResponse,
-    summary="Upload the BGM file to the songs directory",
-)
-def upload_bgm_file(request: Request, file: UploadFile = File(...)):
-    request_id = base.get_task_id(request)
-    # check file ext
-    if file.filename.endswith("mp3"):
-        song_dir = utils.song_dir()
-        save_path = os.path.join(song_dir, file.filename)
-        # save file
-        with open(save_path, "wb+") as buffer:
-            # If the file already exists, it will be overwritten
-            file.file.seek(0)
-            buffer.write(file.file.read())
-        response = {"file": save_path}
-        return utils.get_response(200, response)
-
-    raise HttpException(
-        "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded"
-    )
-
-
-@router.get("/stream/{file_path:path}")
-async def stream_video(request: Request, file_path: str):
-    tasks_dir = utils.task_dir()
-    video_path = os.path.join(tasks_dir, file_path)
-    range_header = request.headers.get("Range")
-    video_size = os.path.getsize(video_path)
-    start, end = 0, video_size - 1
-
-    length = video_size
-    if range_header:
-        range_ = range_header.split("bytes=")[1]
-        start, end = [int(part) if part else None for part in range_.split("-")]
-        if start is None:
-            start = video_size - end
-            end = video_size - 1
-        if end is None:
-            end = video_size - 1
-        length = end - start + 1
-
-    def file_iterator(file_path, offset=0, bytes_to_read=None):
-        with open(file_path, "rb") as f:
-            f.seek(offset, os.SEEK_SET)
-            remaining = bytes_to_read or video_size
-            while remaining > 0:
-                bytes_to_read = min(4096, remaining)
-                data = f.read(bytes_to_read)
-                if not data:
-                    break
-                remaining -= len(data)
-                yield data
-
-    response = StreamingResponse(
-        file_iterator(video_path, start, length), media_type="video/mp4"
-    )
-    response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}"
-    response.headers["Accept-Ranges"] = "bytes"
-    response.headers["Content-Length"] = str(length)
-    response.status_code = 206  # Partial Content
-
-    return response
-
-
-@router.get("/download/{file_path:path}")
-async def download_video(_: Request, file_path: str):
-    """
-    download video
-    :param _: Request request
-    :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4
-    :return: video file
-    """
-    tasks_dir = utils.task_dir()
-    video_path = os.path.join(tasks_dir, file_path)
-    file_path = pathlib.Path(video_path)
-    filename = file_path.stem
-    extension = file_path.suffix
-    headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"}
-    return FileResponse(
-        path=video_path,
-        headers=headers,
-        filename=f"{filename}{extension}",
-        media_type=f"video/{extension[1:]}",
-    )
+# @router.post(
+#     "/musics",
+#     response_model=BgmUploadResponse,
+#     summary="Upload the BGM file to the songs directory",
+# )
+# def upload_bgm_file(request: Request, file: UploadFile = File(...)):
+#     request_id = base.get_task_id(request)
+#     # check file ext
+#     if file.filename.endswith("mp3"):
+#         song_dir = utils.song_dir()
+#         save_path = os.path.join(song_dir, file.filename)
+#         # save file
+#         with open(save_path, "wb+") as buffer:
+#             # If the file already exists, it will be overwritten
+#             file.file.seek(0)
+#             buffer.write(file.file.read())
+#         response = {"file": save_path}
+#         return utils.get_response(200, response)
+#
+#     raise HttpException(
+#         "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded"
+#     )
+#
+#
+# @router.get("/stream/{file_path:path}")
+# async def stream_video(request: Request, file_path: str):
+#     tasks_dir = utils.task_dir()
+#     video_path = os.path.join(tasks_dir, file_path)
+#     range_header = request.headers.get("Range")
+#     video_size = os.path.getsize(video_path)
+#     start, end = 0, video_size - 1
+#
+#     length = video_size
+#     if range_header:
+#         range_ = range_header.split("bytes=")[1]
+#         start, end = [int(part) if part else None for part in range_.split("-")]
+#         if start is None:
+#             start = video_size - end
+#             end = video_size - 1
+#         if end is None:
+#             end = video_size - 1
+#         length = end - start + 1
+#
+#     def file_iterator(file_path, offset=0, bytes_to_read=None):
+#         with open(file_path, "rb") as f:
+#             f.seek(offset, os.SEEK_SET)
+#             remaining = bytes_to_read or video_size
+#             while remaining > 0:
+#                 bytes_to_read = min(4096, remaining)
+#                 data = f.read(bytes_to_read)
+#                 if not data:
+#                     break
+#                 remaining -= len(data)
+#                 yield data
+#
+#     response = StreamingResponse(
+#         file_iterator(video_path, start, length), media_type="video/mp4"
+#     )
+#     response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}"
+#     response.headers["Accept-Ranges"] = "bytes"
+#     response.headers["Content-Length"] = str(length)
+#     response.status_code = 206  # Partial Content
+#
+#     return response
+#
+#
+# @router.get("/download/{file_path:path}")
+# async def download_video(_: Request, file_path: str):
+#     """
+#     download video
+#     :param _: Request request
+#     :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4
+#     :return: video file
+#     """
+#     tasks_dir = utils.task_dir()
+#     video_path = os.path.join(tasks_dir, file_path)
+#     file_path = pathlib.Path(video_path)
+#     filename = file_path.stem
+#     extension = file_path.suffix
+#     headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"}
+#     return FileResponse(
+#         path=video_path,
+#         headers=headers,
+#         filename=f"{filename}{extension}",
+#         media_type=f"video/{extension[1:]}",
+#     )
--- a/app/controllers/v2/base.py
+++ b/app/controllers/v2/base.py
@@ -0,0 +1,11 @@
+from fastapi import APIRouter, Depends
+
+
+def v2_router(dependencies=None):
+    router = APIRouter()
+    router.tags = ["V2"]
+    router.prefix = "/api/v2"
+    # 将认证依赖项应用于所有路由
+    if dependencies:
+        router.dependencies = dependencies
+    return router
--- a/app/controllers/v2/script.py
+++ b/app/controllers/v2/script.py
@@ -0,0 +1,170 @@
+from fastapi import APIRouter, BackgroundTasks
+from loguru import logger
+import os
+
+from app.models.schema_v2 import (
+    GenerateScriptRequest, 
+    GenerateScriptResponse,
+    CropVideoRequest,
+    CropVideoResponse,
+    DownloadVideoRequest,
+    DownloadVideoResponse,
+    StartSubclipRequest,
+    StartSubclipResponse
+)
+from app.models.schema import VideoClipParams
+from app.services.script_service import ScriptGenerator
+from app.services.video_service import VideoService
+from app.utils import utils
+from app.controllers.v2.base import v2_router
+from app.models.schema import VideoClipParams
+from app.services.youtube_service import YoutubeService
+from app.services import task as task_service
+
+router = v2_router()
+
+
+@router.post(
+    "/scripts/generate",
+    response_model=GenerateScriptResponse,
+    summary="同步请求；生成视频脚本 (V2)"
+)
+async def generate_script(
+    request: GenerateScriptRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    生成视频脚本的V2版本API
+    """
+    task_id = utils.get_uuid()
+    
+    try:
+        generator = ScriptGenerator()
+        script = await generator.generate_script(
+            video_path=request.video_path,
+            video_theme=request.video_theme,
+            custom_prompt=request.custom_prompt,
+            skip_seconds=request.skip_seconds,
+            threshold=request.threshold,
+            vision_batch_size=request.vision_batch_size,
+            vision_llm_provider=request.vision_llm_provider
+        )
+        
+        return {
+            "task_id": task_id,
+            "script": script
+        }
+        
+    except Exception as e:
+        logger.exception(f"Generate script failed: {str(e)}")
+        raise
+
+
+@router.post(
+    "/scripts/crop",
+    response_model=CropVideoResponse,
+    summary="同步请求；裁剪视频 (V2)"
+)
+async def crop_video(
+    request: CropVideoRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    根据脚本裁剪视频的V2版本API
+    """
+    try:
+        # 调用视频裁剪服务
+        video_service = VideoService()
+        task_id, subclip_videos = await video_service.crop_video(
+            video_path=request.video_origin_path,
+            video_script=request.video_script
+        )
+        logger.debug(f"裁剪视频成功，视频片段路径: {subclip_videos}")
+        logger.debug(type(subclip_videos))
+        return {
+            "task_id": task_id,
+            "subclip_videos": subclip_videos
+        }
+        
+    except Exception as e:
+        logger.exception(f"Crop video failed: {str(e)}")
+        raise
+
+
+@router.post(
+    "/youtube/download",
+    response_model=DownloadVideoResponse,
+    summary="同步请求；下载YouTube视频 (V2)"
+)
+async def download_youtube_video(
+    request: DownloadVideoRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    下载指定分辨率的YouTube视频
+    """
+    try:
+        youtube_service = YoutubeService()
+        task_id, output_path, filename = await youtube_service.download_video(
+            url=request.url,
+            resolution=request.resolution,
+            output_format=request.output_format,
+            rename=request.rename
+        )
+        
+        return {
+            "task_id": task_id,
+            "output_path": output_path,
+            "resolution": request.resolution,
+            "format": request.output_format,
+            "filename": filename
+        }
+        
+    except Exception as e:
+        logger.exception(f"Download YouTube video failed: {str(e)}")
+        raise
+
+
+@router.post(
+    "/scripts/start-subclip",
+    response_model=StartSubclipResponse,
+    summary="异步请求；开始视频剪辑任务 (V2)"
+)
+async def start_subclip(
+    request: VideoClipParams,
+    task_id: str,
+    subclip_videos: dict,
+    background_tasks: BackgroundTasks
+):
+    """
+    开始视频剪辑任务的V2版本API
+    """
+    try:
+        # 构建参数对象
+        params = VideoClipParams(
+            video_origin_path=request.video_origin_path,
+            video_clip_json_path=request.video_clip_json_path,
+            voice_name=request.voice_name,
+            voice_rate=request.voice_rate,
+            voice_pitch=request.voice_pitch,
+            subtitle_enabled=request.subtitle_enabled,
+            video_aspect=request.video_aspect,
+            n_threads=request.n_threads
+        )
+        
+        # 在后台任务中执行视频剪辑
+        background_tasks.add_task(
+            task_service.start_subclip,
+            task_id=task_id,
+            params=params,
+            subclip_path_videos=subclip_videos
+        )
+        
+        return {
+            "task_id": task_id,
+            "state": "PROCESSING"  # 初始状态
+        }
+        
+    except Exception as e:
+        logger.exception(f"Start subclip task failed: {str(e)}")
+        raise
--- a/app/models/schema.py
+++ b/app/models/schema.py
@@ -366,6 +366,8 @@ class VideoClipParams(BaseModel):
    custom_position: float = Field(default=70.0, description="自定义位置")

    n_threads: Optional[int] = 8    # 线程数，有助于提升视频处理速度
+    tts_volume: float = 1.0  # TTS音频音量
+    video_volume: float = 0.1  # 视频原声音量

 class VideoTranscriptionRequest(BaseModel):
    video_name: str
--- a/app/models/schema_v2.py
+++ b/app/models/schema_v2.py
@@ -0,0 +1,62 @@
+from typing import Optional, List
+from pydantic import BaseModel
+
+
+class GenerateScriptRequest(BaseModel):
+    video_path: str
+    video_theme: Optional[str] = ""
+    custom_prompt: Optional[str] = ""
+    skip_seconds: Optional[int] = 0
+    threshold: Optional[int] = 30
+    vision_batch_size: Optional[int] = 5
+    vision_llm_provider: Optional[str] = "gemini"
+
+
+class GenerateScriptResponse(BaseModel):
+    task_id: str
+    script: List[dict]
+
+
+class CropVideoRequest(BaseModel):
+    video_origin_path: str
+    video_script: List[dict]
+
+
+class CropVideoResponse(BaseModel):
+    task_id: str
+    subclip_videos: dict
+
+
+class DownloadVideoRequest(BaseModel):
+    url: str
+    resolution: str
+    output_format: Optional[str] = "mp4"
+    rename: Optional[str] = None
+
+
+class DownloadVideoResponse(BaseModel):
+    task_id: str
+    output_path: str
+    resolution: str
+    format: str
+    filename: str
+
+
+class StartSubclipRequest(BaseModel):
+    task_id: str
+    video_origin_path: str
+    video_clip_json_path: str
+    voice_name: Optional[str] = None
+    voice_rate: Optional[int] = 0
+    voice_pitch: Optional[int] = 0
+    subtitle_enabled: Optional[bool] = True
+    video_aspect: Optional[str] = "16:9"
+    n_threads: Optional[int] = 4
+    subclip_videos: list  # 从裁剪视频接口获取的视频片段字典
+
+
+class StartSubclipResponse(BaseModel):
+    task_id: str
+    state: str
+    videos: Optional[List[str]] = None
+    combined_videos: Optional[List[str]] = None
--- a/app/router.py
+++ b/app/router.py
@@ -10,8 +10,12 @@ Resources:
 from fastapi import APIRouter

 from app.controllers.v1 import llm, video
+from app.controllers.v2 import script

 root_api_router = APIRouter()
 # v1
 root_api_router.include_router(video.router)
 root_api_router.include_router(llm.router)
+
+# v2
+root_api_router.include_router(script.router)
--- a/app/services/audio_merger.py
+++ b/app/services/audio_merger.py
@@ -18,95 +18,119 @@ def check_ffmpeg():
        return False


-def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list):
+def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list):
    """
-    合并多个音频文件到一个指定总时长的音频文件中，并生成相应的字幕
-    :param task_id: 任务ID
-    :param audio_file_paths: 音频文件路径列表
-    :param total_duration: 最终音频文件的总时长（秒）
-    :param video_script: JSON格式的视频脚本
+    合并音频文件，根据OST设置处理不同的音频轨道
+    
+    Args:
+        task_id: 任务ID
+        audio_files: TTS生成的音频文件列表
+        total_duration: 总时长
+        list_script: 完整脚本信息，包含OST设置
+    
+    Returns:
+        str: 合并后的音频文件路径
    """
-    output_dir = utils.task_dir(task_id)
-
+    # 检查FFmpeg是否安装
    if not check_ffmpeg():
-        logger.error("错误：FFmpeg未安装。请安装FFmpeg后再运行此脚本。")
-        return None, None
+        logger.error("FFmpeg未安装，无法合并音频文件")
+        return None

-    # 创建一个总时长为total_duration的空白音频
-    blank_audio = AudioSegment.silent(duration=total_duration * 1000)  # pydub使用毫秒
+    # 创建一个空的音频片段
+    final_audio = AudioSegment.silent(duration=total_duration * 1000)  # 总时长以毫秒为单位

-    for audio_path in audio_file_paths:
-        if not os.path.exists(audio_path):
-            logger.info(f"警告：文件 {audio_path} 不存在，已跳过。")
+    # 遍历脚本中的每个片段
+    for segment, audio_file in zip(list_script, audio_files):
+        try:
+            # 加载TTS音频文件
+            tts_audio = AudioSegment.from_file(audio_file)
+
+            # 获取片段的开始和结束时间
+            start_time, end_time = segment['new_timestamp'].split('-')
+            start_seconds = utils.time_to_seconds(start_time)
+            end_seconds = utils.time_to_seconds(end_time)
+
+            # 根据OST设置处理音频
+            if segment['OST'] == 0:
+                # 只使用TTS音频
+                final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000)
+            elif segment['OST'] == 1:
+                # 只使用原声（假设原声已经在视频中）
+                continue
+            elif segment['OST'] == 2:
+                # 混合TTS音频和原声
+                original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000)
+                mixed_audio = original_audio.overlay(tts_audio)
+                final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000)
+
+        except Exception as e:
+            logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}")
            continue

-        # 从文件名中提取时间戳
-        filename = os.path.basename(audio_path)
-        start_time, end_time = extract_timestamp(filename)
+    # 保存合并后的音频文件
+    output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3")
+    final_audio.export(output_audio_path, format="mp3")
+    logger.info(f"合并后的音频文件已保存: {output_audio_path}")

-        # 读取音频文件
-        try:
-            audio = AudioSegment.from_mp3(audio_path)
-        except Exception as e:
-            logger.error(f"错误：无法读取文件 {audio_path}。错误信息：{str(e)}")
-            continue
-        
-        # 将音频插入到空白音频的指定位置
-        blank_audio = blank_audio.overlay(audio, position=start_time * 1000)
-
-    # 尝试导出为WAV格式
-    try:
-        output_file = os.path.join(output_dir, "audio.wav")
-        blank_audio.export(output_file, format="wav")
-        logger.info(f"音频合并完成，已保存为 {output_file}")
-    except Exception as e:
-        logger.info(f"导出为WAV格式失败，尝试使用MP3格式：{str(e)}")
-        try:
-            output_file = os.path.join(output_dir, "audio.mp3")
-            blank_audio.export(output_file, format="mp3", codec="libmp3lame")
-            logger.info(f"音频合并完成，已保存为 {output_file}")
-        except Exception as e:
-            logger.error(f"导出音频失败：{str(e)}")
-            return None, None
-
-    return output_file
-
-def parse_timestamp(timestamp: str):
-    """解析时间戳字符串为秒数"""
-    # 确保使用冒号作为分隔符
-    timestamp = timestamp.replace('_', ':')
-    return time_to_seconds(timestamp)
-
-def extract_timestamp(filename):
-    """从文件名中提取开始和结束时间戳"""
-    # 从 "audio_00_06-00_24.mp3" 这样的格式中提取时间
-    time_part = filename.split('_', 1)[1].split('.')[0]  # 获取 "00_06-00_24" 部分
-    start_time, end_time = time_part.split('-')  # 分割成 "00_06" 和 "00_24"
-    
-    # 将下划线格式转换回冒号格式
-    start_time = start_time.replace('_', ':')
-    end_time = end_time.replace('_', ':')
-    
-    # 将时间戳转换为秒
-    start_seconds = time_to_seconds(start_time)
-    end_seconds = time_to_seconds(end_time)
-
-    return start_seconds, end_seconds
+    return output_audio_path


 def time_to_seconds(time_str):
-    """将 "00:06" 或 "00_06" 格式转换为总秒数"""
-    # 确保使用冒号作为分隔符
-    time_str = time_str.replace('_', ':')
+    """
+    将时间字符串转换为秒数，支持多种格式：
+    1. 'HH:MM:SS,mmm' (时:分:秒,毫秒)
+    2. 'MM:SS,mmm' (分:秒,毫秒)
+    3. 'SS,mmm' (秒,毫秒)
+    """
    try:
-        parts = time_str.split(':')
-        if len(parts) != 2:
-            logger.error(f"Invalid time format: {time_str}")
-            return 0
-        return int(parts[0]) * 60 + int(parts[1])
+        # 处理毫秒部分
+        if ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 分割时间部分
+        parts = time_part.split(':')
+        
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(int, parts)
+            seconds = h * 3600 + m * 60 + s
+        elif len(parts) == 2:  # MM:SS
+            m, s = map(int, parts)
+            seconds = m * 60 + s
+        else:  # SS
+            seconds = int(parts[0])
+
+        return seconds + ms
    except (ValueError, IndexError) as e:
        logger.error(f"Error parsing time {time_str}: {str(e)}")
-        return 0
+        return 0.0
+
+
+def extract_timestamp(filename):
+    """
+    从文件名中提取开始和结束时间戳
+    例如: "audio_00_06,500-00_24,800.mp3" -> (6.5, 24.8)
+    """
+    try:
+        # 从文件名中提取时间部分
+        time_part = filename.split('_', 1)[1].split('.')[0]  # 获取 "00_06,500-00_24,800" 部分
+        start_time, end_time = time_part.split('-')  # 分割成开始和结束时间
+        
+        # 将下划线格式转换回冒号格式
+        start_time = start_time.replace('_', ':')
+        end_time = end_time.replace('_', ':')
+        
+        # 将时间戳转换为秒
+        start_seconds = time_to_seconds(start_time)
+        end_seconds = time_to_seconds(end_time)
+
+        return start_seconds, end_seconds
+    except Exception as e:
+        logger.error(f"Error extracting timestamp from {filename}: {str(e)}")
+        return 0.0, 0.0


 if __name__ == "__main__":
--- a/app/services/material.py
+++ b/app/services/material.py
@@ -3,6 +3,7 @@ import subprocess
 import random
 import traceback
 from urllib.parse import urlencode
+from datetime import datetime

 import requests
 from typing import List
@@ -254,70 +255,105 @@ def download_videos(
 def time_to_seconds(time_str: str) -> float:
    """
    将时间字符串转换为秒数
-    支持格式：
-    1. "MM:SS" (分:秒)
-    2. "SS" (纯秒数)
+    支持格式: 'HH:MM:SS,mmm' (时:分:秒,毫秒)
+    
+    Args:
+        time_str: 时间字符串,如 "00:00:20,100"
+        
+    Returns:
+        float: 转换后的秒数(包含毫秒)
    """
-    parts = time_str.split(':')
-    if len(parts) == 2:
-        minutes, seconds = map(float, parts)
-        return minutes * 60 + seconds
-    return float(time_str)
+    try:
+        # 处理毫秒部分
+        if ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = int(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 处理时分秒
+        parts = time_part.split(':')
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(int, parts)
+            seconds = h * 3600 + m * 60 + s
+        else:
+            raise ValueError("时间格式必须为 HH:MM:SS,mmm")
+
+        return seconds + ms
+        
+    except ValueError as e:
+        logger.error(f"时间格式错误: {time_str}")
+        raise ValueError(f"时间格式错误: 必须为 HH:MM:SS,mmm 格式") from e


 def format_timestamp(seconds: float) -> str:
    """
-    将秒数转换为 "MM:SS" 格式的时间字符串
+    将秒数转换为可读的时间格式 (HH:MM:SS,mmm)
+    
+    Args:
+        seconds: 秒数(可包含毫秒)
+        
+    Returns:
+        str: 格式化的时间字符串,如 "00:00:20,100"
    """
-    minutes = int(seconds) // 60
-    secs = int(seconds) % 60
-    return f"{minutes:02d}:{secs:02d}"
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    seconds_remain = seconds % 60
+    whole_seconds = int(seconds_remain)
+    milliseconds = int((seconds_remain - whole_seconds) * 1000)
+    
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"


 def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
    """
    保存剪辑后的视频
+    
    Args:
-        timestamp: 需要裁剪的单个时间戳，支持两种格式：
-                  1. '00:36-00:40' (分:秒-分:秒)
-                  2. 'SS-SS' (秒-秒)
+        timestamp: 需要裁剪的时间戳,格式为 'HH:MM:SS,mmm-HH:MM:SS,mmm'
+                  例如: '00:00:00,000-00:00:20,100'
        origin_video: 原视频路径
        save_dir: 存储目录

    Returns:
-        裁剪后的视频路径，格式为 {timestamp: video_path}
+        dict: 裁剪后的视频路径,格式为 {timestamp: video_path}
    """
+    # 使用新的路径结构
    if not save_dir:
-        save_dir = utils.storage_dir("cache_videos")
+        base_dir = os.path.join(utils.temp_dir(), "clip_video")
+        video_hash = utils.md5(origin_video)
+        save_dir = os.path.join(base_dir, video_hash)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

-    video_id = f"vid-{timestamp.replace(':', '_')}"
-    video_path = f"{save_dir}/{video_id}.mp4"
+    # 生成更规范的视频文件名
+    video_id = f"vid-{timestamp.replace(':', '-').replace(',', '_')}"
+    video_path = os.path.join(save_dir, f"{video_id}.mp4")

    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
        logger.info(f"video already exists: {video_path}")
        return {timestamp: video_path}

    try:
-        # 先加载视频获取总时长
+        # 加载视频获取总时长
        video = VideoFileClip(origin_video)
        total_duration = video.duration
        
-        # 获取目标时间段
+        # 解析时间戳
        start_str, end_str = timestamp.split('-')
        start = time_to_seconds(start_str)
        end = time_to_seconds(end_str)
        
-        # 验证时间段是否有效
+        # 验证时间段
        if start >= total_duration:
-            logger.warning(f"起始时间 {format_timestamp(start)} ({start:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)")
+            logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
            video.close()
            return {}
            
        if end > total_duration:
-            logger.warning(f"结束时间 {format_timestamp(end)} ({end:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)，将自动调整为视频结尾")
+            logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)，将自动调整为视频结尾")
            end = total_duration
            
        if end <= start:
@@ -328,11 +364,21 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
        # 剪辑视频
        duration = end - start
        logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)}，时长 {format_timestamp(duration)}")
+        
+        # 剪辑视频
        subclip = video.subclip(start, end)
        
        try:
            # 检查视频是否有音频轨道并写入文件
-            subclip.write_videofile(video_path, audio=(subclip.audio is not None), logger=None)
+            subclip.write_videofile(
+                video_path,
+                codec='libx264',
+                audio_codec='aac',
+                temp_audiofile='temp-audio.m4a',
+                remove_temp=True,
+                audio=(subclip.audio is not None),
+                logger=None
+            )
            
            # 验证生成的视频文件
            if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
@@ -363,12 +409,12 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
    return {}


-def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None):
+def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict:
    """
    剪辑视频
    Args:
        task_id: 任务id
-        timestamp_terms: 需要剪辑的时间戳列表，如:['00:00-00:20', '00:36-00:40', '07:07-07:22']
+        timestamp_terms: 需要剪辑的时间戳列表，如:['00:00:00,000-00:00:20,100', '00:00:43,039-00:00:46,959']
        origin_video: 原视频路径
        progress_callback: 进度回调函数

@@ -379,11 +425,6 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro
    total_items = len(timestamp_terms)
    for index, item in enumerate(timestamp_terms):
        material_directory = config.app.get("material_directory", "").strip()
-        if material_directory == "task":
-            material_directory = utils.task_dir(task_id)
-        elif material_directory and not os.path.isdir(material_directory):
-            material_directory = ""
-
        try:
            saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory)
            if saved_video_path:
@@ -396,6 +437,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro
        except Exception as e:
            logger.error(f"视频裁剪失败: {utils.to_json(item)} =>\n{str(traceback.format_exc())}")
            return {}
+            
    logger.success(f"裁剪 {len(video_paths)} videos")
    return video_paths

@@ -455,29 +497,3 @@ def merge_videos(video_paths, ost_list):
                    os.remove(silent_video)

    return output_file
-
-
-# 使用示例
-# if __name__ == "__main__":
-#     video_paths = ['/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_17-01_37.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_00-00_06.mp4',
-#                    '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_06-00_09.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_03-01_10.mp4',
-#                    '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_10-01_17.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_24-00_27.mp4',
-#                    '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_28-01_36.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_32-00_41.mp4',
-#                    '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_36-01_58.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_12-00_15.mp4',
-#                    '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_09-00_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_12-02_25.mp4',
-#                    '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_03-02_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_58-02_03.mp4',
-#                    '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_14-03_18.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_18-03_20.mp4']
-#
-#     ost_list = [True, False, False, False, False, False, False, False, True, False, False, False, False, False, False,
-#                 False]
-#
-#     result = merge_videos(video_paths, ost_list)
-#     if result:
-#         print(f"合并后的视频文件：{result}")
-#     else:
-#         print("视频合并失败")
-#
-
-
-if __name__ == "__main__":
-    save_clip_video('00:50-01:41', 'E:\\projects\\NarratoAI\\resource\\videos\\WeChat_20241110144511.mp4')
--- a/app/services/script_service.py
+++ b/app/services/script_service.py
@ -0,0 +1,405 @@
+import os
+import json
+import time
+import asyncio
+import requests
+from loguru import logger
+from typing import List, Dict, Any, Callable
+
+from app.utils import utils, gemini_analyzer, video_processor, video_processor_v2
+from app.utils.script_generator import ScriptProcessor
+from app.config import config
+
+
+class ScriptGenerator:
+    def __init__(self):
+        self.temp_dir = utils.temp_dir()
+        self.keyframes_dir = os.path.join(self.temp_dir, "keyframes")
+        
+    async def generate_script(
+        self,
+        video_path: str,
+        video_theme: str = "",
+        custom_prompt: str = "",
+        skip_seconds: int = 0,
+        threshold: int = 30,
+        vision_batch_size: int = 5,
+        vision_llm_provider: str = "gemini",
+        progress_callback: Callable[[float, str], None] = None
+    ) -> List[Dict[Any, Any]]:
+        """
+        生成视频脚本的核心逻辑
+        
+        Args:
+            video_path: 视频文件路径
+            video_theme: 视频主题
+            custom_prompt: 自定义提示词
+            skip_seconds: 跳过开始的秒数
+            threshold: 差异阈值
+            vision_batch_size: 视觉处理批次大小
+            vision_llm_provider: 视觉模型提供商
+            progress_callback: 进度回调函数
+            
+        Returns:
+            List[Dict]: 生成的视频脚本
+        """
+        if progress_callback is None:
+            progress_callback = lambda p, m: None
+            
+        try:
+            # 提取关键帧
+            progress_callback(10, "正在提取关键帧...")
+            keyframe_files = await self._extract_keyframes(
+                video_path, 
+                skip_seconds,
+                threshold
+            )
+            
+            if vision_llm_provider == "gemini":
+                script = await self._process_with_gemini(
+                    keyframe_files,
+                    video_theme,
+                    custom_prompt,
+                    vision_batch_size,
+                    progress_callback
+                )
+            elif vision_llm_provider == "narratoapi":
+                script = await self._process_with_narrato(
+                    keyframe_files,
+                    video_theme,
+                    custom_prompt,
+                    vision_batch_size,
+                    progress_callback
+                )
+            else:
+                raise ValueError(f"Unsupported vision provider: {vision_llm_provider}")
+                
+            return json.loads(script) if isinstance(script, str) else script
+            
+        except Exception as e:
+            logger.exception("Generate script failed")
+            raise
+            
+    async def _extract_keyframes(
+        self,
+        video_path: str,
+        skip_seconds: int,
+        threshold: int
+    ) -> List[str]:
+        """提取视频关键帧"""
+        video_hash = utils.md5(video_path + str(os.path.getmtime(video_path)))
+        video_keyframes_dir = os.path.join(self.keyframes_dir, video_hash)
+        
+        # 检查缓存
+        keyframe_files = []
+        if os.path.exists(video_keyframes_dir):
+            for filename in sorted(os.listdir(video_keyframes_dir)):
+                if filename.endswith('.jpg'):
+                    keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+                    
+            if keyframe_files:
+                logger.info(f"Using cached keyframes: {video_keyframes_dir}")
+                return keyframe_files
+                
+        # 提取新的关键帧
+        os.makedirs(video_keyframes_dir, exist_ok=True)
+        
+        try:
+            if config.frames.get("version") == "v2":
+                processor = video_processor_v2.VideoProcessor(video_path)
+                processor.process_video_pipeline(
+                    output_dir=video_keyframes_dir,
+                    skip_seconds=skip_seconds,
+                    threshold=threshold
+                )
+            else:
+                processor = video_processor.VideoProcessor(video_path)
+                processor.process_video(
+                    output_dir=video_keyframes_dir,
+                    skip_seconds=skip_seconds
+                )
+                
+            for filename in sorted(os.listdir(video_keyframes_dir)):
+                if filename.endswith('.jpg'):
+                    keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+                    
+            return keyframe_files
+            
+        except Exception as e:
+            if os.path.exists(video_keyframes_dir):
+                import shutil
+                shutil.rmtree(video_keyframes_dir)
+            raise
+            
+    async def _process_with_gemini(
+        self,
+        keyframe_files: List[str],
+        video_theme: str,
+        custom_prompt: str,
+        vision_batch_size: int,
+        progress_callback: Callable[[float, str], None]
+    ) -> str:
+        """使用Gemini处理视频帧"""
+        progress_callback(30, "正在初始化视觉分析器...")
+        
+        # 获取Gemini配置
+        vision_api_key = config.app.get("vision_gemini_api_key")
+        vision_model = config.app.get("vision_gemini_model_name")
+        
+        if not vision_api_key or not vision_model:
+            raise ValueError("未配置 Gemini API Key 或者模型")
+
+        analyzer = gemini_analyzer.VisionAnalyzer(
+            model_name=vision_model,
+            api_key=vision_api_key,
+        )
+
+        progress_callback(40, "正在分析关键帧...")
+
+        # 执行异步分析
+        results = await analyzer.analyze_images(
+            images=keyframe_files,
+            prompt=config.app.get('vision_analysis_prompt'),
+            batch_size=vision_batch_size
+        )
+
+        progress_callback(60, "正在整理分析结果...")
+        
+        # 合并所有批次的分析结果
+        frame_analysis = ""
+        prev_batch_files = None
+
+        for result in results:
+            if 'error' in result:
+                logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
+                continue
+                
+            batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size)
+            first_timestamp, last_timestamp, _ = self._get_batch_timestamps(batch_files, prev_batch_files)
+            
+            # 添加带时间戳的分析结果
+            frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
+            frame_analysis += result['response']
+            frame_analysis += "\n"
+            
+            prev_batch_files = batch_files
+        
+        if not frame_analysis.strip():
+            raise Exception("未能生成有效的帧分析结果")
+        
+        progress_callback(70, "正在生成脚本...")
+
+        # 构建帧内容列表
+        frame_content_list = []
+        prev_batch_files = None
+
+        for result in results:
+            if 'error' in result:
+                continue
+            
+            batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size)
+            _, _, timestamp_range = self._get_batch_timestamps(batch_files, prev_batch_files)
+            
+            frame_content = {
+                "timestamp": timestamp_range,
+                "picture": result['response'],
+                "narration": "",
+                "OST": 2
+            }
+            frame_content_list.append(frame_content)
+            prev_batch_files = batch_files
+
+        if not frame_content_list:
+            raise Exception("没有有效的帧内容可以处理")
+
+        progress_callback(90, "正在生成文案...")
+        
+        # 获取文本生成配置
+        text_provider = config.app.get('text_llm_provider', 'gemini').lower()
+        text_api_key = config.app.get(f'text_{text_provider}_api_key')
+        text_model = config.app.get(f'text_{text_provider}_model_name')
+
+        processor = ScriptProcessor(
+            model_name=text_model,
+            api_key=text_api_key,
+            prompt=custom_prompt,
+            video_theme=video_theme
+        )
+
+        return processor.process_frames(frame_content_list)
+
+    async def _process_with_narrato(
+        self,
+        keyframe_files: List[str],
+        video_theme: str,
+        custom_prompt: str,
+        vision_batch_size: int,
+        progress_callback: Callable[[float, str], None]
+    ) -> str:
+        """使用NarratoAPI处理视频帧"""
+        # 创建临时目录
+        temp_dir = utils.temp_dir("narrato")
+        
+        # 打包关键帧
+        progress_callback(30, "正在打包关键帧...")
+        zip_path = os.path.join(temp_dir, f"keyframes_{int(time.time())}.zip")
+        
+        try:
+            if not utils.create_zip(keyframe_files, zip_path):
+                raise Exception("打包关键帧失败")
+            
+            # 获取API配置
+            api_url = config.app.get("narrato_api_url")
+            api_key = config.app.get("narrato_api_key")
+            
+            if not api_key:
+                raise ValueError("未配置 Narrato API Key")
+            
+            headers = {
+                'X-API-Key': api_key,
+                'accept': 'application/json'
+            }
+            
+            api_params = {
+                'batch_size': vision_batch_size,
+                'use_ai': False,
+                'start_offset': 0,
+                'vision_model': config.app.get('narrato_vision_model', 'gemini-1.5-flash'),
+                'vision_api_key': config.app.get('narrato_vision_key'),
+                'llm_model': config.app.get('narrato_llm_model', 'qwen-plus'),
+                'llm_api_key': config.app.get('narrato_llm_key'),
+                'custom_prompt': custom_prompt
+            }
+            
+            progress_callback(40, "正在上传文件...")
+            with open(zip_path, 'rb') as f:
+                files = {'file': (os.path.basename(zip_path), f, 'application/x-zip-compressed')}
+                response = requests.post(
+                    f"{api_url}/video/analyze",
+                    headers=headers, 
+                    params=api_params, 
+                    files=files,
+                    timeout=30
+                )
+                response.raise_for_status()
+            
+            task_data = response.json()
+            task_id = task_data["data"].get('task_id')
+            if not task_id:
+                raise Exception(f"无效的API响应: {response.text}")
+            
+            progress_callback(50, "正在等待分析结果...")
+            retry_count = 0
+            max_retries = 60
+            
+            while retry_count < max_retries:
+                try:
+                    status_response = requests.get(
+                        f"{api_url}/video/tasks/{task_id}",
+                        headers=headers,
+                        timeout=10
+                    )
+                    status_response.raise_for_status()
+                    task_status = status_response.json()['data']
+                    
+                    if task_status['status'] == 'SUCCESS':
+                        return task_status['result']['data']
+                    elif task_status['status'] in ['FAILURE', 'RETRY']:
+                        raise Exception(f"任务失败: {task_status.get('error')}")
+                    
+                    retry_count += 1
+                    time.sleep(2)
+                    
+                except requests.RequestException as e:
+                    logger.warning(f"获取任务状态失败，重试中: {str(e)}")
+                    retry_count += 1
+                    time.sleep(2)
+                    continue
+            
+            raise Exception("任务执行超时")
+            
+        finally:
+            # 清理临时文件
+            try:
+                if os.path.exists(zip_path):
+                    os.remove(zip_path)
+            except Exception as e:
+                logger.warning(f"清理临时文件失败: {str(e)}")
+
+    def _get_batch_files(
+        self, 
+        keyframe_files: List[str], 
+        result: Dict[str, Any], 
+        batch_size: int
+    ) -> List[str]:
+        """获取当前批次的图片文件"""
+        batch_start = result['batch_index'] * batch_size
+        batch_end = min(batch_start + batch_size, len(keyframe_files))
+        return keyframe_files[batch_start:batch_end]
+
+    def _get_batch_timestamps(
+        self, 
+        batch_files: List[str], 
+        prev_batch_files: List[str] = None
+    ) -> tuple[str, str, str]:
+        """获取一批文件的时间戳范围，支持毫秒级精度"""
+        if not batch_files:
+            logger.warning("Empty batch files")
+            return "00:00:00,000", "00:00:00,000", "00:00:00,000-00:00:00,000"
+            
+        if len(batch_files) == 1 and prev_batch_files and len(prev_batch_files) > 0:
+            first_frame = os.path.basename(prev_batch_files[-1])
+            last_frame = os.path.basename(batch_files[0])
+        else:
+            first_frame = os.path.basename(batch_files[0])
+            last_frame = os.path.basename(batch_files[-1])
+        
+        first_time = first_frame.split('_')[2].replace('.jpg', '')
+        last_time = last_frame.split('_')[2].replace('.jpg', '')
+        
+        def format_timestamp(time_str: str) -> str:
+            """将时间字符串转换为 HH:MM:SS,mmm 格式"""
+            try:
+                if len(time_str) < 4:
+                    logger.warning(f"Invalid timestamp format: {time_str}")
+                    return "00:00:00,000"
+                
+                # 处理毫秒部分
+                if ',' in time_str:
+                    time_part, ms_part = time_str.split(',')
+                    ms = int(ms_part)
+                else:
+                    time_part = time_str
+                    ms = 0
+                
+                # 处理时分秒
+                parts = time_part.split(':')
+                if len(parts) == 3:  # HH:MM:SS
+                    h, m, s = map(int, parts)
+                elif len(parts) == 2:  # MM:SS
+                    h = 0
+                    m, s = map(int, parts)
+                else:  # SS
+                    h = 0
+                    m = 0
+                    s = int(parts[0])
+                    
+                # 处理进位
+                if s >= 60:
+                    m += s // 60
+                    s = s % 60
+                if m >= 60:
+                    h += m // 60
+                    m = m % 60
+                    
+                return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+                
+            except Exception as e:
+                logger.error(f"时间戳格式转换错误 {time_str}: {str(e)}")
+                return "00:00:00,000"
+        
+        first_timestamp = format_timestamp(first_time)
+        last_timestamp = format_timestamp(last_time)
+        timestamp_range = f"{first_timestamp}-{last_timestamp}"
+        
+        return first_timestamp, last_timestamp, timestamp_range
--- a/app/services/subtitle.py
+++ b/app/services/subtitle.py
@ -8,6 +8,8 @@ from faster_whisper import WhisperModel
 from timeit import default_timer as timer
 from loguru import logger
 import google.generativeai as genai
+from moviepy.editor import VideoFileClip
+import os

 from app.config import config
 from app.utils import utils
@ -362,29 +364,86 @@ def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Option
        return None


+def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "") -> Optional[str]:
+    """
+    从视频文件中提取音频并生成字幕文件。
+
+    参数:
+    - video_file: MP4视频文件的路径
+    - subtitle_file: 输出字幕文件的路径（可选）。如果未提供，将根据视频文件名自动生成。
+
+    返回:
+    - str: 生成的字幕文件路径
+    - None: 如果处理过程中出现错误
+    """
+    try:
+        # 获取视频文件所在目录
+        video_dir = os.path.dirname(video_file)
+        video_name = os.path.splitext(os.path.basename(video_file))[0]
+        
+        # 设置音频文件路径
+        audio_file = os.path.join(video_dir, f"{video_name}_audio.wav")
+        
+        # 如果未指定字幕文件路径，则自动生成
+        if not subtitle_file:
+            subtitle_file = os.path.join(video_dir, f"{video_name}.srt")
+        
+        logger.info(f"开始从视频提取音频: {video_file}")
+        
+        # 加载视频文件
+        video = VideoFileClip(video_file)
+        
+        # 提取音频并保存为WAV格式
+        logger.info(f"正在提取音频到: {audio_file}")
+        video.audio.write_audiofile(audio_file, codec='pcm_s16le')
+        
+        # 关闭视频文件
+        video.close()
+        
+        logger.info("音频提取完成，开始生成字幕")
+        
+        # 使用create函数生成字幕
+        create(audio_file, subtitle_file)
+        
+        # 删除临时音频文件
+        if os.path.exists(audio_file):
+            os.remove(audio_file)
+            logger.info("已清理临时音频文件")
+        
+        return subtitle_file
+        
+    except Exception as e:
+        logger.error(f"处理视频文件时出错: {str(e)}")
+        logger.error(traceback.format_exc())
+        return None
+
+
 if __name__ == "__main__":
-    task_id = "test456"
+    task_id = "123456"
    task_dir = utils.task_dir(task_id)
-    subtitle_file = f"{task_dir}/subtitle.srt"
+    subtitle_file = f"{task_dir}/subtitle_123456.srt"
    audio_file = f"{task_dir}/audio.wav"
+    video_file = "/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_1702.mp4"

-    subtitles = file_to_subtitles(subtitle_file)
-    print(subtitles)
+    extract_audio_and_create_subtitle(video_file, subtitle_file)

-    # script_file = f"{task_dir}/script.json"
-    # with open(script_file, "r") as f:
-    #     script_content = f.read()
-    # s = json.loads(script_content)
-    # script = s.get("script")
-    #
-    # correct(subtitle_file, script)
+    # subtitles = file_to_subtitles(subtitle_file)
+    # print(subtitles)

-    subtitle_file = f"{task_dir}/subtitle111.srt"
-    create(audio_file, subtitle_file)
+    # # script_file = f"{task_dir}/script.json"
+    # # with open(script_file, "r") as f:
+    # #     script_content = f.read()
+    # # s = json.loads(script_content)
+    # # script = s.get("script")
+    # #
+    # # correct(subtitle_file, script)

-    # # 使用Gemini模型处理音频
-    # gemini_api_key = config.app.get("gemini_api_key")  # 请替换为实际的API密钥
-    # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
-    #
-    # if gemini_subtitle_file:
-    #     print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
+    # subtitle_file = f"{task_dir}/subtitle111.srt"
+    # create(audio_file, subtitle_file)
+
+    # # # 使用Gemini模型处理音频
+    # # gemini_api_key = config.app.get("gemini_api_key")  # 请替换为实际的API密钥
+    # # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
+    # #
+    # # if gemini_subtitle_file:
+    # #     print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
--- a/app/services/task.py
+++ b/app/services/task.py
@ -206,134 +206,14 @@ def generate_final_videos(
    return final_video_paths, combined_video_paths


-def start(task_id, params: VideoParams, stop_at: str = "video"):
-    logger.info(f"start task: {task_id}, stop_at: {stop_at}")
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
-
-    if type(params.video_concat_mode) is str:
-        params.video_concat_mode = VideoConcatMode(params.video_concat_mode)
-
-    # 1. Generate script
-    video_script = generate_script(task_id, params)
-    if not video_script:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
-
-    if stop_at == "script":
-        sm.state.update_task(
-            task_id, state=const.TASK_STATE_COMPLETE, progress=100, script=video_script
-        )
-        return {"script": video_script}
-
-    # 2. Generate terms
-    video_terms = ""
-    if params.video_source != "local":
-        video_terms = generate_terms(task_id, params, video_script)
-        if not video_terms:
-            sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-            return
-
-    save_script_data(task_id, video_script, video_terms, params)
-
-    if stop_at == "terms":
-        sm.state.update_task(
-            task_id, state=const.TASK_STATE_COMPLETE, progress=100, terms=video_terms
-        )
-        return {"script": video_script, "terms": video_terms}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
-
-    # 3. Generate audio
-    audio_file, audio_duration, sub_maker = generate_audio(task_id, params, video_script)
-    if not audio_file:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
-
-    if stop_at == "audio":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            audio_file=audio_file,
-        )
-        return {"audio_file": audio_file, "audio_duration": audio_duration}
-
-    # 4. Generate subtitle
-    subtitle_path = generate_subtitle(task_id, params, video_script, sub_maker, audio_file)
-
-    if stop_at == "subtitle":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            subtitle_path=subtitle_path,
-        )
-        return {"subtitle_path": subtitle_path}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
-
-    # 5. Get video materials
-    downloaded_videos = get_video_materials(
-        task_id, params, video_terms, audio_duration
-    )
-    if not downloaded_videos:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    if stop_at == "materials":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            materials=downloaded_videos,
-        )
-        return {"materials": downloaded_videos}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
-
-    # 6. Generate final videos
-    final_video_paths, combined_video_paths = generate_final_videos(
-        task_id, params, downloaded_videos, audio_file, subtitle_path
-    )
-
-    if not final_video_paths:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    logger.success(
-        f"task {task_id} finished, generated {len(final_video_paths)} videos."
-    )
-
-    kwargs = {
-        "videos": final_video_paths,
-        "combined_videos": combined_video_paths,
-        "script": video_script,
-        "terms": video_terms,
-        "audio_file": audio_file,
-        "audio_duration": audio_duration,
-        "subtitle_path": subtitle_path,
-        "materials": downloaded_videos,
-    }
-    sm.state.update_task(
-        task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs
-    )
-    return kwargs
-
-
-def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: list):
-    """
-    后台任务（自动剪辑视频进行剪辑）
-
-        task_id: 任务ID
-        params: 剪辑参数
-        subclip_path_videos: 视频文件路径
-
-    """
+def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
+    """后台任务（自动剪辑视频进行剪辑）"""
    logger.info(f"\n\n## 开始任务: {task_id}")
+    
+    # 初始化 ImageMagick
+    if not utils.init_imagemagick():
+        logger.warning("ImageMagick 初始化失败，字幕可能无法正常显示")
+    
    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)

    # tts 角色名称
@ -341,8 +221,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li

    logger.info("\n\n## 1. 加载视频脚本")
    video_script_path = path.join(params.video_clip_json_path)
-    # video_script_path = video_clip_json_path
-    # 判断json文件是否存在
+    
    if path.exists(video_script_path):
        try:
            with open(video_script_path, "r", encoding="utf-8") as f:
@ -355,10 +234,12 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li
                logger.debug(f"解说完整脚本: \n{video_script}")
                logger.debug(f"解说 OST 列表: \n{video_ost}")
                logger.debug(f"解说时间戳列表: \n{time_list}")
+                
                # 获取视频总时长(单位 s)
-                total_duration = list_script[-1]['new_timestamp']
-                total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int(
-                    total_duration.split("-")[1].split(":")[1])
+                last_timestamp = list_script[-1]['new_timestamp']
+                end_time = last_timestamp.split("-")[1]
+                total_duration = utils.time_to_seconds(end_time)
+                
        except Exception as e:
            logger.error(f"无法读取视频json脚本，请检查配置是否正确。{e}")
            raise ValueError("无法读取视频json脚本，请检查配置是否正确")
@ -366,32 +247,51 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li
        logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
        raise ValueError("解说脚本不存在！请检查配置是否正确。")

-    logger.info("\n\n## 2. 生成音频列表")
-    audio_files, sub_maker_list = voice.tts_multiple(
-        task_id=task_id,
-        list_script=list_script,
-        voice_name=voice_name,
-        voice_rate=params.voice_rate,
-        voice_pitch=params.voice_pitch,
-        force_regenerate=True
+    logger.info("\n\n## 2. 根据OST设置生成音频列表")
+    # 只为OST=0或2的片段生成TTS音频
+    tts_segments = [
+        segment for segment in list_script 
+        if segment['OST'] in [0, 2]
+    ]
+    # logger.debug(f"tts_segments: {tts_segments}")
+    if tts_segments:
+        audio_files, sub_maker_list = voice.tts_multiple(
+            task_id=task_id,
+            list_script=tts_segments,  # 只传入需要TTS的片段
+            voice_name=voice_name,
+            voice_rate=params.voice_rate,
+            voice_pitch=params.voice_pitch,
+            force_regenerate=True
+        )
+        if audio_files is None:
+            sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+            logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
+            return
+    else:
+        audio_files = []
+        
+    logger.info(f"合并音频文件:\n{audio_files}")
+    # 传入OST信息以便正确处理音频
+    final_audio = audio_merger.merge_audio_files(
+        task_id=task_id, 
+        audio_files=audio_files, 
+        total_duration=total_duration, 
+        list_script=list_script  # 传入完整脚本以便处理OST
    )
-    if audio_files is None:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        logger.error(
-            "TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
-        return
-    logger.info(f"合并音频:\n\n {audio_files}")
-    audio_file = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script)

    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)

+    # 只为OST=0或2的片段生成字幕
    subtitle_path = ""
    if params.subtitle_enabled:
        subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
        subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
        logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
-        # 使用 faster-whisper-large-v2 模型生成字幕
-        subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
+         
+        subtitle.create(
+            audio_file=final_audio,
+            subtitle_file=subtitle_path,
+        )

        subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
        if not subtitle_lines:
@ -402,7 +302,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li

    logger.info("\n\n## 4. 裁剪视频")
    subclip_videos = [x for x in subclip_path_videos.values()]
-    logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}")
+    # logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}")

    if not subclip_videos:
        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
@ -434,14 +334,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: li

    final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")

-    logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}")
-    # 把所有东西合到在一起
+    logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
+    # 传入OST信息以便正确处理音频和视频
    video.generate_video_v2(
        video_path=combined_video_path,
-        audio_path=audio_file,
+        audio_path=final_audio,
        subtitle_path=subtitle_path,
        output_file=final_video_path,
        params=params,
+        list_script=list_script  # 传入完整脚本以便处理OST
    )

    _progress += 50 / 2
--- a/app/services/video.py
+++ b/app/services/video.py
@ -18,6 +18,15 @@ from app.utils import utils


 def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
+    """
+    获取背景音乐文件路径
+    Args:
+        bgm_type: 背景音乐类型，可选值: random(随机), ""(无背景音乐)
+        bgm_file: 指定的背景音乐文件路径
+
+    Returns:
+        str: 背景音乐文件路径
+    """
    if not bgm_type:
        return ""

@ -48,21 +57,35 @@ def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):


 def combine_videos(
-    combined_video_path: str,
-    video_paths: List[str],
-    audio_file: str,
-    video_aspect: VideoAspect = VideoAspect.portrait,
-    video_concat_mode: VideoConcatMode = VideoConcatMode.random,
-    max_clip_duration: int = 5,
-    threads: int = 2,
+        combined_video_path: str,
+        video_paths: List[str],
+        audio_file: str,
+        video_aspect: VideoAspect = VideoAspect.portrait,
+        video_concat_mode: VideoConcatMode = VideoConcatMode.random,
+        max_clip_duration: int = 5,
+        threads: int = 2,
 ) -> str:
+    """
+    合并多个视频片段
+    Args:
+        combined_video_path: 合并后的视频保存路径
+        video_paths: 待合并的视频路径列表
+        audio_file: 音频文件路径
+        video_aspect: 视频宽高比
+        video_concat_mode: 视频拼接模式(随机/顺序)
+        max_clip_duration: 每个片段的最大时长(秒)
+        threads: 处理线程数
+
+    Returns:
+        str: 合并后的视频路径
+    """
    audio_clip = AudioFileClip(audio_file)
    audio_duration = audio_clip.duration
-    logger.info(f"max duration of audio: {audio_duration} seconds")
-    # Required duration of each clip
+    logger.info(f"音频时长: {audio_duration} 秒")
+    # 每个片段的所需时长
    req_dur = audio_duration / len(video_paths)
    req_dur = max_clip_duration
-    logger.info(f"each clip will be maximum {req_dur} seconds long")
+    logger.info(f"每个片段最大时长: {req_dur} 秒")
    output_dir = os.path.dirname(combined_video_path)

    aspect = VideoAspect(video_aspect)
@ -81,22 +104,22 @@ def combine_videos(
            end_time = min(start_time + max_clip_duration, clip_duration)
            split_clip = clip.subclip(start_time, end_time)
            raw_clips.append(split_clip)
-            # logger.info(f"splitting from {start_time:.2f} to {end_time:.2f}, clip duration {clip_duration:.2f}, split_clip duration {split_clip.duration:.2f}")
+            # logger.info(f"从 {start_time:.2f} 到 {end_time:.2f}, 片段时长 {clip_duration:.2f}, 分割片段时长 {split_clip.duration:.2f}")
            start_time = end_time
            if video_concat_mode.value == VideoConcatMode.sequential.value:
                break

-    # random video_paths order
+    # 随机视频片段顺序
    if video_concat_mode.value == VideoConcatMode.random.value:
        random.shuffle(raw_clips)

-    # Add downloaded clips over and over until the duration of the audio (max_duration) has been reached
+    # 添加下载的片段，直到音频时长(max_duration)达到
    while video_duration < audio_duration:
        for clip in raw_clips:
-            # Check if clip is longer than the remaining audio
+            # 检查片段是否比剩余音频时长长
            if (audio_duration - video_duration) < clip.duration:
                clip = clip.subclip(0, (audio_duration - video_duration))
-            # Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image
+            # 仅当计算的片段时长(req_dur)小于实际片段时长时，缩短片段
            elif req_dur < clip.duration:
                clip = clip.subclip(0, req_dur)
            clip = clip.set_fps(30)
@ -134,7 +157,7 @@ def combine_videos(
                    )

                logger.info(
-                    f"resizing video to {video_width} x {video_height}, clip size: {clip_w} x {clip_h}"
+                    f"调整视频尺寸为 {video_width} x {video_height}, 片段尺寸: {clip_w} x {clip_h}"
                )

            if clip.duration > max_clip_duration:
@ -146,7 +169,7 @@ def combine_videos(
    video_clip = concatenate_videoclips(clips)
    video_clip = video_clip.set_fps(30)
    logger.info("writing")
-    # https://github.com/harry0703/NarratoAI/issues/111#issuecomment-2032354030
+
    video_clip.write_videofile(
        filename=combined_video_path,
        threads=threads,
@ -161,6 +184,17 @@ def combine_videos(


 def wrap_text(text, max_width, font, fontsize=60):
+    """
+    文本自动换行处理
+    Args:
+        text: 待处理的文本
+        max_width: 最大宽度
+        font: 字体文件路径
+        fontsize: 字体大小
+
+    Returns:
+        tuple: (换行后的文本, 文本高度)
+    """
    # 创建字体对象
    font = ImageFont.truetype(font, fontsize)

@ -220,6 +254,14 @@ def wrap_text(text, max_width, font, fontsize=60):

@contextmanager
 def manage_clip(clip):
+    """
+    视频片段资源管理器
+    Args:
+        clip: 视频片段对象
+
+    Yields:
+        VideoFileClip: 视频片段对象
+    """
    try:
        yield clip
    finally:
@ -232,6 +274,7 @@ def generate_video_v2(
        audio_path: str,
        subtitle_path: str,
        output_file: str,
+        list_script: list,
        params: Union[VideoParams, VideoClipParams],
        progress_callback=None,
 ):
@ -250,7 +293,7 @@ def generate_video_v2(
    """
    total_steps = 4
    current_step = 0
-    
+
    def update_progress(step_name):
        nonlocal current_step
        current_step += 1
@ -260,7 +303,7 @@ def generate_video_v2(

    try:
        validate_params(video_path, audio_path, output_file, params)
-        
+
        with manage_clip(VideoFileClip(video_path)) as video_clip:
            aspect = VideoAspect(params.video_aspect)
            video_width, video_height = aspect.to_resolution()
@ -304,7 +347,7 @@ def generate_video_v2(
                _clip = _clip.set_start(subtitle_item[0][0])
                _clip = _clip.set_end(subtitle_item[0][1])
                _clip = _clip.set_duration(duration)
-                
+
                if params.subtitle_position == "bottom":
                    _clip = _clip.set_position(("center", video_height * 0.95 - _clip.h))
                elif params.subtitle_position == "top":
@ -335,6 +378,7 @@ def generate_video_v2(
            update_progress("字幕处理完成")

            # 合并音频和导出
+            logger.info("开始导出视频 (此步骤耗时较长请耐心等待)")
            video_clip = video_clip.set_audio(final_audio)
            video_clip.write_videofile(
                output_file,
@ -344,7 +388,7 @@ def generate_video_v2(
                logger=None,
                fps=30,
            )
-            
+
    except FileNotFoundError as e:
        logger.error(f"文件不存在: {str(e)}")
        raise
@ -356,15 +400,25 @@ def generate_video_v2(


 def process_audio_tracks(original_audio, new_audio, params, video_duration):
-    """处理所有音轨"""
+    """
+    处理所有音轨(原声、配音、背景音乐)
+    Args:
+        original_audio: 原始音频
+        new_audio: 新音频
+        params: 视频参数
+        video_duration: 视频时长
+
+    Returns:
+        CompositeAudioClip: 合成后的音频
+    """
    audio_tracks = []
-    
+
    if original_audio is not None:
        audio_tracks.append(original_audio)
-    
+
    new_audio = new_audio.volumex(params.voice_volume)
    audio_tracks.append(new_audio)
-    
+
    # 处理背景音乐
    bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
    if bgm_file:
@ -374,35 +428,54 @@ def process_audio_tracks(original_audio, new_audio, params, video_duration):
            audio_tracks.append(bgm_clip)
        except Exception as e:
            logger.error(f"添加背景音乐失败: {str(e)}")
-    
+
    return CompositeAudioClip(audio_tracks) if audio_tracks else new_audio


 def process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip):
-    """处理字幕"""
+    """
+    Overlay the entries of a subtitle file on a video clip.
+
+    Args:
+        subtitle_path: Path of the subtitle file; may be empty or missing.
+        video_clip: Clip that the subtitle text clips are composited over.
+        video_duration: Upper bound applied to subtitle start/end times
+            (presumably seconds, like the clip times -- confirm with caller).
+        create_text_clip: Callback invoked as
+            ``create_text_clip(subtitle_item=item)`` for every subtitle entry;
+            the clip it returns must expose ``start`` and ``end``.
+
+    Returns:
+        ``video_clip`` unchanged when ``subtitle_path`` is empty or does not
+        exist; otherwise a CompositeVideoClip made of ``video_clip`` plus the
+        kept text clips. Entries starting at or after ``video_duration`` are
+        skipped, and the remaining ones are clamped to ``[0, video_duration]``.
+    """
    if not (subtitle_path and os.path.exists(subtitle_path)):
        return video_clip
-        
+
    sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8")
    text_clips = []
-    
+
    for item in sub.subtitles:
        clip = create_text_clip(subtitle_item=item)
-        
+
        # 时间范围调整
        start_time = max(clip.start, 0)
        if start_time >= video_duration:
            continue
-            
+
        end_time = min(clip.end, video_duration)
        clip = clip.set_start(start_time).set_end(end_time)
        text_clips.append(clip)
-    
+
    logger.info(f"处理了 {len(text_clips)} 段字幕")
    return CompositeVideoClip([video_clip, *text_clips])


 def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
+    """
+    预处理视频素材
+    Args:
+        materials: 素材信息列表
+        clip_duration: 片段时长(秒)
+
+    Returns:
+        List[MaterialInfo]: 处理后的素材信息列表
+    """
    for material in materials:
        if not material.url:
            continue
@ -430,12 +503,12 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
            # 使用resize方法来添加缩放效果。这里使用了lambda函数来使得缩放效果随时间变化。
            # 假设我们想要从原始大小逐渐放大到120%的大小。
            # t代表当前时间，clip.duration为视频总时长，这里是3秒。
-            # 注意：1 表示100%的大小，所以1.2表示120%的大小
+            # 注意：1 表示100%的大小，所以1.2表示120%的大小
            zoom_clip = clip.resize(
                lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
            )

-            # 如果需要，可以创建一个包含缩放剪辑的复合视频剪辑
+            # 如果需要，可以创建一个包含缩放剪辑的复合视频剪辑
            # （这在您想要在视频中添加其他元素时非常有用）
            final_clip = CompositeVideoClip([zoom_clip])

@ -472,7 +545,7 @@ def combine_clip_videos(combined_video_path: str,
    from app.utils.utils import calculate_total_duration
    audio_duration = calculate_total_duration(list_script)
    logger.info(f"音频的最大持续时间: {audio_duration} s")
-    
+
    output_dir = os.path.dirname(combined_video_path)
    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()
@ -481,25 +554,25 @@ def combine_clip_videos(combined_video_path: str,
    for video_path, video_ost in zip(video_paths, video_ost_list):
        try:
            clip = VideoFileClip(video_path)
-            
+
            if video_ost == 0:  # 不保留原声
                clip = clip.without_audio()
            # video_ost 为 1 或 2 时都保留原声，不需要特殊处理
-                
+
            clip = clip.set_fps(30)

            # 处理视频尺寸
            clip_w, clip_h = clip.size
            if clip_w != video_width or clip_h != video_height:
                clip = resize_video_with_padding(
-                    clip, 
-                    target_width=video_width, 
+                    clip,
+                    target_width=video_width,
                    target_height=video_height
                )
                logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")

            clips.append(clip)
-            
+
        except Exception as e:
            logger.error(f"处理视频 {video_path} 时出错: {str(e)}")
            continue
@ -510,8 +583,8 @@ def combine_clip_videos(combined_video_path: str,
    try:
        video_clip = concatenate_videoclips(clips)
        video_clip = video_clip.set_fps(30)
-        
-        logger.info("开始合并视频...")
+
+        logger.info("开始合并视频... (过程中出现 UserWarning: 不必理会)")
        video_clip.write_videofile(
            filename=combined_video_path,
            threads=threads,
@ -521,7 +594,7 @@ def combine_clip_videos(combined_video_path: str,
            temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
        )
    finally:
-        # 确保资源被正确<EFBFBD><EFBFBD><EFBFBD>放
+        # 确保资源被正确释放
        video_clip.close()
        for clip in clips:
            clip.close()
@ -531,13 +604,22 @@ def combine_clip_videos(combined_video_path: str,


 def resize_video_with_padding(clip, target_width: int, target_height: int):
-    """辅助函数：调整视频尺寸并添加黑边"""
+    """
+    调整视频尺寸并添加黑边
+    Args:
+        clip: 视频片段
+        target_width: 目标宽度
+        target_height: 目标高度
+
+    Returns:
+        CompositeVideoClip: 调整尺寸后的视频
+    """
    clip_ratio = clip.w / clip.h
    target_ratio = target_width / target_height

    if clip_ratio == target_ratio:
        return clip.resize((target_width, target_height))
-    
+
    if clip_ratio > target_ratio:
        scale_factor = target_width / clip.w
    else:
@ -548,10 +630,10 @@ def resize_video_with_padding(clip, target_width: int, target_height: int):
    clip_resized = clip.resize(newsize=(new_width, new_height))

    background = ColorClip(
-        size=(target_width, target_height), 
+        size=(target_width, target_height),
        color=(0, 0, 0)
    ).set_duration(clip.duration)
-    
+
    return CompositeVideoClip([
        background,
        clip_resized.set_position("center")
@ -559,106 +641,100 @@ def resize_video_with_padding(clip, target_width: int, target_height: int):


 def validate_params(video_path, audio_path, output_file, params):
-    """验证输入参数"""
+    """
+    Check the inputs of a video generation run before any work starts.
+
+    Args:
+        video_path: Path of the source video; must exist.
+        audio_path: Path of the audio track; must exist.
+        output_file: Path of the file to be written; only its parent
+            directory is checked.
+        params: Parameter object; must expose a ``video_aspect`` attribute.
+
+    Raises:
+        FileNotFoundError: If the video file, the audio file or the output
+            directory does not exist.
+        ValueError: If ``params`` has no ``video_aspect`` attribute.
+
+    NOTE(review): for a bare file name, ``os.path.dirname(output_file)`` is
+    an empty string, which fails the existence check -- confirm that callers
+    always pass a path that includes a directory.
+    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"视频文件不存在: {video_path}")
-        
+
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"音频文件不存在: {audio_path}")
-        
+
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        raise FileNotFoundError(f"输出目录不存在: {output_dir}")
-        
+
    if not hasattr(params, 'video_aspect'):
        raise ValueError("params 缺少必要参数 video_aspect")


 if __name__ == "__main__":
-    # combined_video_path = "../../storage/tasks/12312312/com123.mp4"
-    #
-    # video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4',
-    #                '../../storage/cache_videos/vid-00_03-00_07.mp4',
-    #                '../../storage/cache_videos/vid-00_12-00_17.mp4',
-    #                '../../storage/cache_videos/vid-00_26-00_31.mp4']
-    # video_ost_list = [False, True, False, True]
-    # list_script = [
-    #     {
-    #         "picture": "夜晚，一个小孩在树林里奔跑，后面有人拿着火把在追赶",
-    #         "timestamp": "00:00-00:03",
-    #         "narration": "夜黑风高的树林，一个小孩在拼命奔跑，后面的人穷追不舍！",
-    #         "OST": False,
-    #         "new_timestamp": "00:00-00:03"
-    #     },
-    #     {
-    #         "picture": "追赶的人命令抓住小孩",
-    #         "timestamp": "00:03-00:07",
-    #         "narration": "原声播放1",
-    #         "OST": True,
-    #         "new_timestamp": "00:03-00:07"
-    #     },
-    #     {
-    #         "picture": "小孩躲在草丛里，黑衣人用脚踢了踢他",
-    #         "timestamp": "00:12-00:17",
-    #         "narration": "小孩脱下外套，跑进树林, 一路奔跑，直到第二天清晨",
-    #         "OST": False,
-    #         "new_timestamp": "00:07-00:12"
-    #     },
-    #     {
-    #         "picture": "小孩跑到车前，慌慌张张地对女人说有人要杀他",
-    #         "timestamp": "00:26-00:31",
-    #         "narration": "原声播放2",
-    #         "OST": True,
-    #         "new_timestamp": "00:12-00:17"
-    #     }
-    # ]
+    combined_video_path = "../../storage/tasks/123/combined.mp4"
+
+    video_paths = ['../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-00-10_000-00-00-43_039.mp4',
+                   '../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-00-45_439-00-01-01_600.mp4',
+                   '../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-01-07_920-00-01-25_719.mp4',
+                   '../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-01-36_959-00-01-53_719.mp4']
+    video_ost_list = [2, 2, 2, 2]
+    list_script = [
+        {
+            "timestamp": "00:10-00:43",
+            "picture": "好的，以下是视频画面的客观描述：\n\n视频显示一个男人在一个树木繁茂的地区，靠近一个泥土斜坡他穿着一件深色T恤、卡其色长裤和登山靴。他背着一个军绿色背包，里面似乎装有头和其他工具。\n\n第一个镜头显示该男子从远处走近斜坡，背对着镜头。下一个镜头特写显示了的背包，一个镐头从背包中伸出来。下一个镜头显示该男子用镐头敲打斜坡。下一个镜头是该男子脚上的特写镜头，他穿着登山靴，正站在泥土斜坡上。最后一个镜显示该男子在斜坡上，仔细地拨开树根和泥土。周围的环境是树木繁茂的，阳光透过树叶照射下来。土壤是浅棕色的，斜坡上有许多树根和植被。",
+            "narration": "（接上文）好吧，今天我们的男主角，背着一个看似随时要发射军绿色背包，竟然化身“泥土探险家”，在斜坡上挥舞着镐头！他这是准备挖宝还是给树根做个“美容”？阳光洒下来，简直是自然界的聚光灯，仿佛在说：“快来看看，这位勇士要挑战泥土极限！”我只能默默想，如果树根能说话，它们一定会喊：“别打我，我还有家人！”这就是生活，总有些搞笑的瞬间等着我们去发现！",
+            "OST": 2,
+            "new_timestamp": "00:00:00,000-00:00:33,000"
+        },
+        {
+            "timestamp": "00:45-01:01",
+            "picture": "好的以下是视频画面的客观描述：\n\n视频显示了一个人在森林里挖掘。\n\n第一个镜头是地面特写，显示出松<EFBFBD><EFBFBD>的泥土、碎石和落叶。光线照在部分区域。\n\n第二个镜头中，一模糊不清的蹲一个树根旁挖掘，一个橄榄绿色的背包放在地上。树根缠绕着常春藤。\n\n第三个镜头显示该人在一个更开阔的区域挖掘，那里有一些树根，以及部分倒的树干。他起来像是在挖掘一个较大的坑。\n\n第四个镜头是特写镜头，显示该人用工具清理土坑的墙壁。\n\n第五个镜头是土坑内部的特写镜头，可以看到土质的纹理，有一些小树根和它植被的残留物。",
+            "narration": "现在，这位勇敢的挖掘者就像个“现代版的土豆农夫”，在林里开辟新天地。的目标是什么？挖一个宝藏还块“树根披萨”？小心哦，别让树根追着你喊：“不要挖我，我也是有故事的！”",
+            "OST": 2,
+            "new_timestamp": "00:00:33,000-00:00:49,000"
+        },
+        {
+            "timestamp": "01:07-01:25",
+            "picture": "好，以下是视频画面的客观描述：\n\n画面1：特写镜头，显示出一丛带有水珠的深绿色灌木叶片。叶片呈椭圆形，边缘光滑。背景是树根和泥土。\n\n画面2：一个留着胡子的男人正在一个森林中土坑里挖掘。他穿着黑色T恤和卡其色裤子，跪在地，用具挖掘泥土。周围环绕着树木、树根和灌木。一个倒下的树干横跨土坑上方。\n\n画面3：同一个男人坐在他刚才挖的坑的边缘，看着前方。他的表情似乎略带沉思。背景与画面2相同。\n\n画面4：一个广角镜头显示出他挖出的坑。这是一个不规则形状的土坑，在树木繁茂的斜坡上。土壤呈深棕色，可见树根。\n\n画面5：同一个男人跪在地上，用一把小斧头砍一根木头。他穿着与前几个画面相同的衣服。地面上覆盖着落叶。周围是树木和灌木。",
+            "narration": "“哎呀，这片灌木叶子滴水如雨，感觉像是大自然的洗发水广告！但我这位‘挖宝达人’似乎更适合拍个‘森林里的单身狗’真人秀。等会儿，我要给树根唱首歌，听说它们爱音乐！”",
+            "OST": 2,
+            "new_timestamp": "00:00:49,000-00:01:07,000"
+        },
+        {
+            "timestamp": "01:36-01:53",
+            "picture": "好的，以下是视频画面内容的客观描述：\n\n视频包含三个镜头：\n\n**镜头一：**个小型、浅水池塘，位于树林中。池塘的水看起来浑浊，呈绿褐色。池塘周围遍布泥土和落叶。多根树枝和树干横跨池塘，部分浸没在水中。周围的植被茂密主要是深色树木和灌木。\n\n**镜头二：**距拍摄树深处，阳光透过树叶洒落在植被上。镜头中可见粗大的树干、树枝和各种绿叶植物。部分树枝似乎被砍断，切口可见。\n\n**镜头三：**近距离特写镜头，聚焦在树枝和绿叶上。叶片呈圆形，颜色为鲜绿色，有些叶片上有缺损。树枝颜色较深，呈现深褐色。背景是模糊的树林。\n",
+            "narration": "“好吧，看来我们的‘挖宝达人’终于找到了一‘宝藏’——一个色泽如同绿豆汤的池塘！我敢打赌，这里不仅是小鱼儿的游乐场更是树枝们的‘水疗中心’！下次来这里，我得带上浮潜装备！”",
+            "OST": 2,
+            "new_timestamp": "00:01:07,000-00:01:24,000"
+        }
+    ]
+    # 合并子视频
    # combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script)

-    # cfg = VideoClipParams()
-    # cfg.video_aspect = VideoAspect.portrait
-    # cfg.font_name = "STHeitiMedium.ttc"
-    # cfg.font_size = 60
-    # cfg.stroke_color = "#000000"
-    # cfg.stroke_width = 1.5
-    # cfg.text_fore_color = "#FFFFFF"
-    # cfg.text_background_color = "transparent"
-    # cfg.bgm_type = "random"
-    # cfg.bgm_file = ""
-    # cfg.bgm_volume = 1.0
-    # cfg.subtitle_enabled = True
-    # cfg.subtitle_position = "bottom"
-    # cfg.n_threads = 2
-    # cfg.paragraph_number = 1
-    #
-    # cfg.voice_volume = 1.0
+    cfg = VideoClipParams()
+    cfg.video_aspect = VideoAspect.portrait
+    cfg.font_name = "STHeitiMedium.ttc"
+    cfg.font_size = 60
+    cfg.stroke_color = "#000000"
+    cfg.stroke_width = 1.5
+    cfg.text_fore_color = "#FFFFFF"
+    cfg.text_background_color = "transparent"
+    cfg.bgm_type = "random"
+    cfg.bgm_file = ""
+    cfg.bgm_volume = 1.0
+    cfg.subtitle_enabled = True
+    cfg.subtitle_position = "bottom"
+    cfg.n_threads = 2
+    cfg.video_volume = 1

-    # generate_video(video_path=video_file,
-    #                audio_path=audio_file,
-    #                subtitle_path=subtitle_file,
-    #                output_file=output_file,
-    #                params=cfg
-    #                )
-    #
-    # video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4"
-    #
-    # audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3"
-    #
-    # subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt"
-    #
-    # output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4"
-    #
-    # generate_video_v2(video_path=video_path,
-    #                    audio_path=audio_path,
-    #                    subtitle_path=subtitle_path,
-    #                    output_file=output_file,
-    #                    params=cfg
-    #                   )
+    cfg.voice_volume = 1.0

-    # 合并视频
-    video_list = [
-        './storage/cache_videos/vid-01_03-01_50.mp4',
-        './storage/cache_videos/vid-01_55-02_29.mp4',
-        './storage/cache_videos/vid-03_24-04_04.mp4',
-        './storage/cache_videos/vid-04_50-05_28.mp4'
-    ]
+    video_path = "../../storage/tasks/123/combined.mp4"
+    audio_path = "../../storage/tasks/123/final_audio.mp3"
+    subtitle_path = "../../storage/tasks/123/subtitle.srt"
+    output_file = "../../storage/tasks/123/final-123.mp4"

+    generate_video_v2(video_path=video_path,
+                      audio_path=audio_path,
+                      subtitle_path=subtitle_path,
+                      output_file=output_file,
+                      params=cfg,
+                      list_script=list_script,
+                      )
--- a/app/services/video_service.py
+++ b/app/services/video_service.py
@ -0,0 +1,58 @@
+import os
+from uuid import uuid4
+from loguru import logger
+from typing import Dict, List, Optional, Tuple
+
+from app.services import material
+from app.models.schema import VideoClipParams
+from app.utils import utils
+
+
+class VideoService:
+    """Service wrapper around the project's video clipping helper."""
+
+    @staticmethod
+    async def crop_video(
+        video_path: str,
+        video_script: List[dict]
+    ) -> Tuple[str, Dict[str, str]]:
+        """
+        Cut the source video into one sub-clip per script scene.
+
+        A fresh task id is generated, the ``timestamp`` of every scene is
+        handed to ``material.clip_videos``, and each scene dict in
+        ``video_script`` is updated in place with a ``path`` key pointing at
+        its clip.
+
+        NOTE(review): declared ``async`` but nothing is awaited, so the
+        clipping runs synchronously inside the coroutine.
+
+        Args:
+            video_path: Path of the source video file.
+            video_script: Scene dicts; each must contain a ``timestamp`` key.
+
+        Returns:
+            Tuple[str, Dict[str, str]]: ``(task_id, clips)`` where ``clips``
+            is the ``{timestamp: clip path}`` mapping returned by
+            ``material.clip_videos``.
+
+        Raises:
+            ValueError: If ``material.clip_videos`` returns ``None``.
+            Exception: Any other failure is logged and re-raised unchanged.
+        """
+        try:
+            task_id = str(uuid4())
+
+            # Timestamps of all scenes, in script order.
+            time_list = [scene['timestamp'] for scene in video_script]
+
+            subclip_videos = material.clip_videos(
+                task_id=task_id,
+                timestamp_terms=time_list,
+                origin_video=video_path
+            )
+
+            if subclip_videos is None:
+                raise ValueError("裁剪视频失败")
+
+            # Attach the clip path to each scene. A scene whose clip is
+            # missing is logged and left without ``path`` (best effort).
+            for scene in video_script:
+                try:
+                    scene['path'] = subclip_videos[scene['timestamp']]
+                except KeyError as err:
+                    logger.error(f"更新视频路径失败: {err}")
+
+            # Report the clips actually produced, not the timestamps requested.
+            logger.debug(f"裁剪视频成功，共生成 {len(subclip_videos)} 个视频片段")
+            logger.debug(f"视频片段路径: {subclip_videos}")
+
+            return task_id, subclip_videos
+
+        except Exception:
+            logger.exception("裁剪视频失败")
+            raise
--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -11,6 +11,7 @@ from edge_tts.submaker import mktimestamp
 from xml.sax.saxutils import unescape
 from edge_tts import submaker, SubMaker
 from moviepy.video.tools import subtitles
+import time

 from app.config import config
 from app.utils import utils
@ -989,6 +990,9 @@ Gender: Female

 Name: zh-CN-XiaoxiaoMultilingualNeural-V2
 Gender: Female
+
+Name: zh-CN-YunxiNeural-V2
+Gender: Male
    """.strip()
    voices = []
    name = ""
@ -1034,8 +1038,8 @@ def is_azure_v2_voice(voice_name: str):
 def tts(
    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> [SubMaker, None]:
+    """
+    Synthesize ``text`` into ``voice_file`` with the backend matching the voice.
+
+    Voices for which ``is_azure_v2_voice(voice_name)`` is true are routed to
+    ``azure_tts_v2``; that call receives only ``text``, ``voice_name`` and
+    ``voice_file``, so ``voice_rate`` and ``voice_pitch`` are not forwarded
+    on that path. Every other voice goes through ``azure_tts_v1`` with all
+    five arguments.
+
+    Returns:
+        The value returned by the selected backend (annotated as a SubMaker,
+        or None).
+    """
-    # if is_azure_v2_voice(voice_name):
-    #     return azure_tts_v2(text, voice_name, voice_file)
+    if is_azure_v2_voice(voice_name):
+        return azure_tts_v2(text, voice_name, voice_file)
    return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)


@ -1068,33 +1072,47 @@ def azure_tts_v1(
    pitch_str = convert_pitch_to_percent(voice_pitch)
    for i in range(3):
        try:
-            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
+            logger.info(f"第 {i+1} 次使用 edge_tts 生成音频")

-            async def _do() -> SubMaker:
+            async def _do() -> tuple[SubMaker, bytes]:
                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http"))
                sub_maker = edge_tts.SubMaker()
-                with open(voice_file, "wb") as file:
-                    async for chunk in communicate.stream():
-                        if chunk["type"] == "audio":
-                            file.write(chunk["data"])
-                        elif chunk["type"] == "WordBoundary":
-                            sub_maker.create_sub(
-                                (chunk["offset"], chunk["duration"]), chunk["text"]
-                            )
-                return sub_maker
-            # 判断音频文件是否一件存在
+                audio_data = bytes()  # 用于存储音频数据
+                
+                async for chunk in communicate.stream():
+                    if chunk["type"] == "audio":
+                        audio_data += chunk["data"]
+                    elif chunk["type"] == "WordBoundary":
+                        sub_maker.create_sub(
+                            (chunk["offset"], chunk["duration"]), chunk["text"]
+                        )
+                return sub_maker, audio_data
+
+            # 判断音频文件是否已存在
            if os.path.exists(voice_file):
                logger.info(f"voice file exists, skip tts: {voice_file}")
                continue
-            sub_maker = asyncio.run(_do())
-            if not sub_maker or not sub_maker.subs:
-                logger.warning(f"failed, sub_maker is None or sub_maker.subs is None")
+
+            # 获取音频数据和字幕信息
+            sub_maker, audio_data = asyncio.run(_do())
+            
+            # 验证数据是否有效
+            if not sub_maker or not sub_maker.subs or not audio_data:
+                logger.warning(f"failed, invalid data generated")
+                if i < 2:
+                    time.sleep(1)
                continue

+            # 数据有效，写入文件
+            with open(voice_file, "wb") as file:
+                file.write(audio_data)
+
            logger.info(f"completed, output file: {voice_file}")
            return sub_maker
        except Exception as e:
-            logger.error(f"failed, error: {str(e)}")
+            logger.error(f"生成音频文件时出错: {str(e)}")
+            if i < 2:
+                time.sleep(1)
    return None


@ -1130,14 +1148,6 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
            sub_maker = SubMaker()

            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
-                # print('WordBoundary event:')
-                # print('\tBoundaryType: {}'.format(evt.boundary_type))
-                # print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000)))
-                # print('\tDuration: {}'.format(evt.duration))
-                # print('\tText: {}'.format(evt.text))
-                # print('\tTextOffset: {}'.format(evt.text_offset))
-                # print('\tWordLength: {}'.format(evt.word_length))
-
                duration = _format_duration_to_offset(str(evt.duration))
                offset = _format_duration_to_offset(evt.audio_offset)
                sub_maker.subs.append(evt.text)
@ -1183,9 +1193,13 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
                    logger.error(
                        f"azure v2 speech synthesis error: {cancellation_details.error_details}"
                    )
+            if i < 2:  # 如果不是最后一次重试，则等待1秒
+                time.sleep(1)
            logger.info(f"completed, output file: {voice_file}")
        except Exception as e:
            logger.error(f"failed, error: {str(e)}")
+            if i < 2:  # 如果不是最后一次重试，则等待1秒
+                time.sleep(1)
    return None


@ -1443,7 +1457,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f

            if sub_maker is None:
                logger.error(f"无法为时间戳 {timestamp} 生成音频; "
-                             f"如果您在中国，请使用VPN。或者手动选择 zh-CN-YunyangNeural 等角色；"
+                             f"如果您在中国，请使用VPN; "
                             f"或者使用其他 tts 引擎")
                continue

@ -1460,17 +1474,12 @@ if __name__ == "__main__":
    voice_name = parse_voice_name(voice_name)
    print(voice_name)

-    with open("../../resource/scripts/test.json", 'r', encoding='utf-8') as f:
+    with open("../../resource/scripts/2024-1203-205442.json", 'r', encoding='utf-8') as f:
        data = json.load(f)

-    audio_files, sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1)
+    audio_files, sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1, voice_pitch=1)

    full_text = " ".join([item['narration'] for item in data if not item['OST']])
    subtitle_file = os.path.join(utils.task_dir("12312312"), "subtitle_multiple.srt")
    create_subtitle_from_multiple(full_text, sub_maker_list, data, subtitle_file)
    print(f"生成的音频文件列表: {audio_files}")
-    print(f"生成的字幕文件: {subtitle_file}")
-
-    # text = " ".join([item['narration'] for item in data])
-    # sub_marks = tts(text=text, voice_name=voice_name, voice_rate=1, voice_file="../../storage/tasks/12312312/aaa.mp3")
-    # create_subtitle(text=text, sub_maker=sub_marks, subtitle_file="../../storage/tasks/12312312/subtitle_123.srt")
--- a/app/services/youtube_service.py
+++ b/app/services/youtube_service.py
@ -0,0 +1,146 @@
+import yt_dlp
+import os
+from typing import List, Dict, Optional, Tuple
+from loguru import logger
+from uuid import uuid4
+
+from app.utils import utils
+from app.services import video as VideoService
+
+
+class YoutubeService:
+    """Download YouTube videos at a requested resolution through yt-dlp."""
+
+    def __init__(self):
+        # Container formats accepted as ``output_format`` by download_video.
+        self.supported_formats = ['mp4', 'mkv', 'webm', 'flv', 'avi']
+
+    @staticmethod
+    def _base_resolution(label: str) -> str:
+        """Reduce a label such as '1080p60' to its base form '1080p'."""
+        return label.split('p')[0] + 'p'
+
+    def _get_video_formats(self, url: str) -> List[Dict]:
+        """
+        Return the formats yt-dlp reports for ``url`` without downloading.
+
+        Each entry holds ``format_id``, ``ext``, ``resolution`` (taken from
+        yt-dlp's ``format_note``), ``filesize``, ``vcodec`` and ``acodec``;
+        a missing value is reported as ``'N/A'``.
+
+        Raises:
+            Exception: Any failure is logged and re-raised unchanged.
+        """
+        ydl_opts = {
+            'quiet': True,
+            'no_warnings': True
+        }
+
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+
+                return [
+                    {
+                        'format_id': f.get('format_id', 'N/A'),
+                        'ext': f.get('ext', 'N/A'),
+                        # A key that is present but None must also become
+                        # 'N/A'; otherwise the resolution matching in
+                        # download_video would call .split on None.
+                        'resolution': f.get('format_note') or 'N/A',
+                        'filesize': f.get('filesize', 'N/A'),
+                        'vcodec': f.get('vcodec', 'N/A'),
+                        'acodec': f.get('acodec', 'N/A')
+                    }
+                    for f in info.get('formats', [])
+                ]
+        except Exception as e:
+            logger.error(f"获取视频格式失败: {str(e)}")
+            raise
+
+    def _validate_format(self, output_format: str) -> None:
+        """Raise ValueError unless ``output_format`` is a supported container."""
+        if output_format.lower() not in self.supported_formats:
+            raise ValueError(
+                f"不支持的视频格式: {output_format}。"
+                f"支持的格式: {', '.join(self.supported_formats)}"
+            )
+
+    async def download_video(
+            self,
+            url: str,
+            resolution: str,
+            output_format: str = 'mp4',
+            rename: Optional[str] = None
+    ) -> Tuple[str, str, str]:
+        """
+        Download the video at ``url`` in the requested resolution.
+
+        NOTE(review): declared ``async`` but nothing is awaited, so the
+        yt-dlp calls run synchronously inside the coroutine.
+
+        Args:
+            url: URL of the video.
+            resolution: Target resolution ('2160p', '1440p', '1080p', '720p',
+                ...). Input such as '1080p60' is treated as '1080p'.
+            output_format: Container format of the result (case-insensitive);
+                must be one of ``self.supported_formats``.
+            rename: Optional file name (without extension) for the result.
+                When omitted, the name is ``<task_id>_<title>``.
+
+        Returns:
+            Tuple[str, str, str]: ``(task_id, output_path, filename)``.
+
+        Raises:
+            ValueError: If ``output_format`` is unsupported or no video
+                format with the requested resolution is available.
+            Exception: Any other failure is logged and re-raised unchanged.
+        """
+        try:
+            task_id = str(uuid4())
+            self._validate_format(output_format)
+            # One lower-case spelling for the template, the merge target and
+            # the returned file name.
+            output_format = output_format.lower()
+
+            base_resolution = self._base_resolution(resolution)
+            formats = self._get_video_formats(url)
+
+            # Formats that carry a video stream and a resolution label.
+            video_formats = [
+                fmt for fmt in formats
+                if fmt['resolution'] != 'N/A' and fmt['vcodec'] != 'none'
+            ]
+
+            # First format, in yt-dlp's order, with the requested resolution.
+            target_format = next(
+                (
+                    fmt for fmt in video_formats
+                    if self._base_resolution(fmt['resolution']) == base_resolution
+                ),
+                None,
+            )
+
+            if target_format is None:
+                available_resolutions = {
+                    self._base_resolution(fmt['resolution'])
+                    for fmt in video_formats
+                }
+                raise ValueError(
+                    f"未找到 {base_resolution} 分辨率的视频。"
+                    f"可用分辨率: {', '.join(sorted(available_resolutions))}"
+                )
+
+            output_dir = utils.video_dir()
+            os.makedirs(output_dir, exist_ok=True)
+
+            if rename:
+                # Caller-chosen name replaces the title-based template.
+                output_template = os.path.join(output_dir, f"{rename}.{output_format}")
+            else:
+                output_template = os.path.join(output_dir, f'{task_id}_%(title)s.%(ext)s')
+
+            ydl_opts = {
+                'format': f"{target_format['format_id']}+bestaudio[ext=m4a]/best",
+                'outtmpl': output_template,
+                'merge_output_format': output_format,
+                'postprocessors': [{
+                    'key': 'FFmpegVideoConvertor',
+                    'preferedformat': output_format,
+                }]
+            }
+
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=True)
+                # yt-dlp sanitizes the title when it expands the template, so
+                # the on-disk name can differ from the raw title. Ask yt-dlp
+                # for the expanded name instead of rebuilding it by hand.
+                expanded_path = ydl.prepare_filename(info)
+
+            # The converter post-processor leaves the file with the requested
+            # container extension.
+            output_path = f"{os.path.splitext(expanded_path)[0]}.{output_format}"
+            filename = os.path.basename(output_path)
+
+            logger.info(f"视频下载成功: {output_path}")
+            return task_id, output_path, filename
+
+        except Exception:
+            logger.exception("下载视频失败")
+            raise
--- a/app/test/test_moviepy.py
+++ b/app/test/test_moviepy.py
@ -1,21 +1,32 @@
 """
-使用 moviepy 库剪辑指定时间戳视频
+使用 moviepy 库剪辑指定时间戳视频，支持时分秒毫秒精度
 """

 from moviepy.editor import VideoFileClip
 from datetime import datetime
+import os


 def time_str_to_seconds(time_str: str) -> float:
    """
    将时间字符串转换为秒数
    参数:
-        time_str: 格式为"MM:SS"的时间字符串
+        time_str: time string in "HH:MM:SS,mmm" format, e.g. "00:01:23,456"
    返回:
-        转换后的秒数
+        the equivalent number of seconds (float); raises ValueError when
+        the string does not match the expected format
    """
-    time_obj = datetime.strptime(time_str, "%M:%S")
-    return time_obj.minute * 60 + time_obj.second
+    try:
+        # Split the clock part from the millisecond part at the comma.
+        time_part, ms_part = time_str.split(',')
+        # Parse hours, minutes and seconds.
+        time_obj = datetime.strptime(time_part, "%H:%M:%S")
+        # Whole seconds since 00:00:00.
+        total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second
+        # Add the millisecond part as a fraction of a second.
+        total_seconds += int(ms_part) / 1000
+        return total_seconds
+    except ValueError as e:
+        raise ValueError("时间格式错误，请使用 HH:MM:SS,mmm 格式，例如 00:01:23,456") from e


 def format_duration(seconds: float) -> str:
@ -24,40 +35,88 @@ def format_duration(seconds: float) -> str:
    参数:
        seconds: 秒数
    返回:
-        格式化的时间字符串 (MM:SS)
+        格式化的时间字符串 (HH:MM:SS,mmm)
    """
-    minutes = int(seconds // 60)
-    remaining_seconds = int(seconds % 60)
-    return f"{minutes:02d}:{remaining_seconds:02d}"
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    seconds_remain = seconds % 60
+    whole_seconds = int(seconds_remain)
+    milliseconds = int((seconds_remain - whole_seconds) * 1000)
+    
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"


-def cut_video(video_path: str, start_time: str, end_time: str) -> None:
+def cut_video(video_path: str, start_time: str, end_time: str, output_path: str) -> None:
    """
    剪辑视频
    参数:
        video_path: 视频文件路径
-        start_time: 开始时间 (格式: "MM:SS")
-        end_time: 结束时间 (格式: "MM:SS")
+        start_time: 开始时间 (格式: "HH:MM:SS,mmm")
+        end_time: 结束时间 (格式: "HH:MM:SS,mmm")
+        output_path: 输出文件路径
    """
-    # 转换时间字符串为秒数
-    start_seconds = time_str_to_seconds(start_time)
-    end_seconds = time_str_to_seconds(end_time)
-    
-    # 加载视频文件
-    video = VideoFileClip(video_path)
-    
-    # 计算剪辑时长
-    clip_duration = end_seconds - start_seconds
-    print(f"原视频总长度: {format_duration(video.duration)}")
-    print(f"剪辑时长: {format_duration(clip_duration)}")
-    
-    # 剪辑视频
-    video = video.subclip(start_seconds, end_seconds)
-    video.write_videofile("../../resource/videos/cut_video2.mp4")
-    
-    # 释放资源
-    video.close()
+    try:
+        # 确保输出目录存在
+        output_dir = os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+            
+        # 如果输出文件已存在，先尝试删除
+        if os.path.exists(output_path):
+            try:
+                os.remove(output_path)
+            except PermissionError:
+                print(f"无法删除已存在的文件：{output_path}，请确保文件未被其他程序占用")
+                return
+        
+        # 转换时间字符串为秒数
+        start_seconds = time_str_to_seconds(start_time)
+        end_seconds = time_str_to_seconds(end_time)
+        
+        # 加载视频文件
+        video = VideoFileClip(video_path)
+        
+        # 验证时间范围
+        if start_seconds >= video.duration or end_seconds > video.duration:
+            raise ValueError(f"剪辑时间超出视频长度！视频总长度为: {format_duration(video.duration)}")
+        
+        if start_seconds >= end_seconds:
+            raise ValueError("结束时间必须大于开始时间！")
+        
+        # 计算剪辑时长
+        clip_duration = end_seconds - start_seconds
+        print(f"原视频总长度: {format_duration(video.duration)}")
+        print(f"剪辑时长: {format_duration(clip_duration)}")
+        print(f"剪辑区间: {start_time} -> {end_time}")
+        
+        # 剪辑视频
+        video = video.subclip(start_seconds, end_seconds)
+        
+        # 添加错误处理的写入过程
+        try:
+            video.write_videofile(
+                output_path,
+                codec='libx264',
+                audio_codec='aac',
+                temp_audiofile='temp-audio.m4a',
+                remove_temp=True
+            )
+        except IOError as e:
+            print(f"写入视频文件时发生错误：{str(e)}")
+            raise
+        finally:
+            # 确保资源被释放
+            video.close()
+            
+    except Exception as e:
+        print(f"视频剪辑过程中发生错误：{str(e)}")
+        raise


 if __name__ == "__main__":
-    cut_video("../../resource/videos/best.mp4", "00:40", "02:40")
+    cut_video(
+        video_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp.mp4",
+        start_time="00:00:00,789",
+        end_time="00:02:00,123",
+        output_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp_cut3.mp4"
+    )
--- a/app/test/test_qwen.py
+++ b/app/test/test_qwen.py
@ -0,0 +1,105 @@
+import os
+import traceback
+import json
+from openai import OpenAI
+from pydantic import BaseModel
+from typing import List
+from app.utils import utils
+from app.services.subtitle import extract_audio_and_create_subtitle
+
+
+class Step(BaseModel):  # One clip entry; field names mirror the JSON example in the __main__ prompt of this file.
+    timestamp: str  # source time range string, presumably "HH:MM:SS,mmm-HH:MM:SS,mmm" — TODO confirm
+    picture: str  # presumably a description of the scene — TODO confirm
+    narration: str  # narration text for the clip
+    OST: int  # integer flag; the prompt example uses 0 and 2, meaning is not defined in this file — TODO confirm
+    new_timestamp: str  # presumably the time range on the re-cut timeline — TODO confirm
+
+class MathReasoning(BaseModel):  # Wrapper model holding a list of Step entries; not referenced elsewhere in this file.
+    result: List[Step]  # the Step entries
+
+
+def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
+    """
+    Send a subtitle file to the Qwen chat model and return its reply.
+
+    Args:
+        prompt (str): User prompt; the subtitle file content is appended to it.
+        system_message (str): System message that sets the assistant's behaviour (required, no default).
+        subtitle_path (str): Path of the UTF-8 subtitle file to read.
+    Returns:
+        str: Content of the first choice returned by the model.
+
+    Raises:
+        Exception: Any failure (file read or API call), re-raised with the original error chained.
+    """
+    try:
+        client = OpenAI(
+            api_key=os.getenv("DASHSCOPE_API_KEY"),  # never commit the key; read it from the environment
+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+        )
+
+        # Read the subtitle file
+        with open(subtitle_path, "r", encoding="utf-8") as file:
+            subtitle_content = file.read()
+
+        completion = client.chat.completions.create(
+            model="qwen-turbo-2024-11-01",
+            messages=[
+                {'role': 'system', 'content': system_message},
+                {'role': 'user', 'content': prompt + subtitle_content}
+            ]
+        )
+        return completion.choices[0].message.content
+
+    except Exception as e:
+        error_message = f"调用千问API时发生错误：{str(e)}"
+        print(error_message)
+        print("请参考文档：https://help.aliyun.com/zh/model-studio/developer-reference/error-code")
+        raise Exception(error_message) from e
+
+
+# 使用示例
+if __name__ == "__main__":
+    try:
+        video_path = utils.video_dir("duanju_yuansp.mp4")
+        # # Check that the video exists
+        # if not os.path.exists(video_path):
+        #     print(f"视频文件不存在：{video_path}")
+        #     exit(1)
+        # Extract the subtitles
+        subtitle_path = os.path.join(utils.video_dir(""), "duanju_yuan.srt")
+        extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
+        # Analyse the subtitles; the example in the prompt must itself be valid JSON in HH:MM:SS,mmm form
+        system_message = """
+        你是一个视频srt字幕分析剪辑器, 输入视频的srt字幕, 分析其中的精彩且尽可能连续的片段并裁剪出来, 注意确保文字与时间戳的正确匹配。
+        输出需严格按照如下 json 格式:
+        [
+            {
+                "timestamp": "00:00:50,020-00:01:44,000",
+                "picture": "画面1",
+                "narration": "播放原声",
+                "OST": 0,
+                "new_timestamp": "00:00:00,000-00:00:53,980"
+            },
+            {
+                "timestamp": "00:01:49,000-00:02:30,000",
+                "picture": "画面2",
+                "narration": "播放原声",
+                "OST": 2,
+                "new_timestamp": "00:00:53,980-00:01:34,980"
+            }
+        ]
+        """
+        prompt = "字幕如下：\n"
+        response = chat_with_qwen(prompt, system_message, subtitle_path)
+        print(response)
+        # Save the JSON; note the timestamps would need converting to MM:SS (currently "timestamp": "00:00:00,020-00:00:01,660", to become "timestamp": "00:00-01:66")
+        # response = json.loads(response)
+        # for item in response:
+        #     item["timestamp"] = item["timestamp"].replace(":", "-")
+        # with open(os.path.join(utils.video_dir(""), "duanju_yuan.json"), "w", encoding="utf-8") as file:
+        #     json.dump(response, file, ensure_ascii=False)
+
+    except Exception:
+        print(traceback.format_exc())
--- a/app/utils/gemini_analyzer.py
+++ b/app/utils/gemini_analyzer.py
@ -10,6 +10,7 @@ from google.api_core import exceptions
 import google.generativeai as genai
 import PIL.Image
 import traceback
+from app.utils import utils


 class VisionAnalyzer:
@ -146,14 +147,34 @@ class VisionAnalyzer:
            response_text = result['response']
            image_paths = result['image_paths']

-            img_name_start = Path(image_paths[0]).stem.split('_')[-1]
-            img_name_end = Path(image_paths[-1]).stem.split('_')[-1]
-            txt_path = os.path.join(output_dir, f"frame_{img_name_start}_{img_name_end}.txt")
+            # 从文件名中提取时间戳并转换为标准格式
+            def format_timestamp(img_path):
+                # 从文件名中提取时间部分
+                timestamp = Path(img_path).stem.split('_')[-1]
+                try:
+                    # 将时间转换为秒
+                    seconds = utils.time_to_seconds(timestamp.replace('_', ':'))
+                    # 转换为 HH:MM:SS,mmm 格式
+                    hours = int(seconds // 3600)
+                    minutes = int((seconds % 3600) // 60)
+                    seconds_remainder = seconds % 60
+                    whole_seconds = int(seconds_remainder)
+                    milliseconds = int((seconds_remainder - whole_seconds) * 1000)
+                    
+                    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
+                except Exception as e:
+                    logger.error(f"时间戳格式转换错误: {timestamp}, {str(e)}")
+                    return timestamp
+
+            start_timestamp = format_timestamp(image_paths[0])
+            end_timestamp = format_timestamp(image_paths[-1])
+            
+            txt_path = os.path.join(output_dir, f"frame_{start_timestamp}_{end_timestamp}.txt")

            # 保存结果到txt文件
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(response_text.strip())
-            print(f"已保存分析结果到: {txt_path}")
+            logger.info(f"已保存分析结果到: {txt_path}")

    def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]:
        """
--- a/app/utils/qwenvl_analyzer.py
+++ b/app/utils/qwenvl_analyzer.py
@ -0,0 +1,265 @@
+import json
+from typing import List, Union, Dict
+import os
+from pathlib import Path
+from loguru import logger
+from tqdm import tqdm
+import asyncio
+from tenacity import retry, stop_after_attempt, RetryError, wait_exponential
+from openai import OpenAI
+import PIL.Image
+import base64
+import io
+import traceback
+
+
+class QwenAnalyzer:
+    """千问视觉分析器类"""
+
+    def __init__(self, model_name: str = "qwen-vl-max-latest", api_key: str = None, base_url: str = None):
+        """Initialise the Qwen vision analyzer and build its API client.
+
+        Args:
+            model_name: Model name; defaults to qwen-vl-max-latest.
+            api_key: Alibaba Cloud API key; ValueError is raised when it is falsy.
+            base_url: API base URL; a falsy value falls back to the DashScope
+                compatible-mode endpoint.
+        """
+        if not api_key:
+            raise ValueError("必须提供API密钥")
+
+        self.model_name = model_name
+        self.api_key = api_key
+        self.base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+
+        # Build the API client
+        self._configure_client()
+
+    def _configure_client(self):
+        """
+        Create the OpenAI-compatible client and store it on self.client.
+        Only api_key and base_url are passed; construction errors are logged and re-raised.
+        """
+        try:
+            self.client = OpenAI(
+                api_key=self.api_key,
+                base_url=self.base_url
+            )
+        except Exception as e:
+            logger.error(f"初始化OpenAI客户端失败: {str(e)}")
+            raise
+
+    def _image_to_base64(self, image: PIL.Image.Image) -> str:
+        """Encode a PIL image as a base64 string of its JPEG bytes."""
+        with io.BytesIO() as jpeg_buffer:
+            # Serialise to JPEG in memory, then base64-encode the raw bytes
+            image.save(jpeg_buffer, format="JPEG")
+            jpeg_bytes = jpeg_buffer.getvalue()
+        return base64.b64encode(jpeg_bytes).decode("utf-8")
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10)
+    )
+    async def _generate_content_with_retry(self, prompt: str, batch: List[PIL.Image.Image]):
+        """Send one batch of images plus the text prompt to the model and return the reply text.
+
+        Args:
+            prompt: Text instruction appended after the images.
+            batch: PIL images, each sent as a base64 JPEG data URL.
+
+        Raises:
+            Exception: The underlying error is logged and re-raised unchanged so that
+                the tenacity decorator retries it (3 attempts, exponential wait 4-10 s).
+        """
+        try:
+            # One image_url part per image, followed by a single text part
+            content = [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{self._image_to_base64(img)}"
+                    }
+                }
+                for img in batch
+            ]
+            content.append({"type": "text", "text": prompt})
+
+            # The SDK call is blocking, so run it in a worker thread
+            response = await asyncio.to_thread(
+                self.client.chat.completions.create,
+                model=self.model_name,
+                messages=[{"role": "user", "content": content}]
+            )
+
+            return response.choices[0].message.content
+
+        except Exception as e:
+            logger.error(f"API调用错误: {str(e)}")
+            # RetryError expects the failed attempt object, not a message; a bare raise keeps the real cause
+            raise
+
+    async def analyze_images(self,
+                             images: Union[List[str], List[PIL.Image.Image]],
+                             prompt: str,
+                             batch_size: int = 5) -> List[Dict]:
+        """
+        Analyse images in batches; each batch is sent with the same prompt.
+        Args:
+            images: List of image paths or of PIL image objects
+            prompt: Analysis prompt
+            batch_size: Number of images per request
+        Returns:
+            One dict per batch: 'response' on success or 'error' after 3 failed tries
+        """
+        try:
+            # Keep the original paths (only when paths were given). NOTE(review): load_images skips unreadable files, so index i below may no longer line up with these paths — confirm
+            original_paths = images if isinstance(images[0], str) else None
+
+            # Load the images
+            if isinstance(images[0], str):
+                logger.info("正在加载图片...")
+                images = self.load_images(images)
+
+            # Validate the image list
+            if not images:
+                raise ValueError("图片列表为空")
+
+            # Validate every image object
+            valid_images = []
+            valid_paths = []
+            for i, img in enumerate(images):
+                if not isinstance(img, PIL.Image.Image):
+                    logger.error(f"无效的图片对象，索引 {i}: {type(img)}")
+                    continue
+                valid_images.append(img)
+                if original_paths:
+                    valid_paths.append(original_paths[i])
+
+            if not valid_images:
+                raise ValueError("没有有效的图片对象")
+
+            images = valid_images
+            results = []
+            total_batches = (len(images) + batch_size - 1) // batch_size
+
+            with tqdm(total=total_batches, desc="分析进度") as pbar:
+                for i in range(0, len(images), batch_size):
+                    batch = images[i:i + batch_size]
+                    batch_paths = valid_paths[i:i + batch_size] if valid_paths else None
+                    retry_count = 0
+
+                    while retry_count < 3:
+                        try:
+                            # Add a small delay before every batch except the first
+                            if i > 0:
+                                await asyncio.sleep(2)
+
+                            # Make sure every image in the batch is valid
+                            valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]
+                            if not valid_batch:
+                                raise ValueError(f"批次 {i // batch_size} 中没有有效的图片")
+
+                            response = await self._generate_content_with_retry(prompt, valid_batch)
+                            result_dict = {
+                                'batch_index': i // batch_size,
+                                'images_processed': len(valid_batch),
+                                'response': response,
+                                'model_used': self.model_name
+                            }
+
+                            # Attach the image paths when they are available
+                            if batch_paths:
+                                result_dict['image_paths'] = batch_paths
+
+                            results.append(result_dict)
+                            break
+
+                        except Exception as e:
+                            retry_count += 1
+                            error_msg = f"批次 {i // batch_size} 处理出错: {str(e)}"
+                            logger.error(error_msg)
+
+                            if retry_count >= 3:
+                                results.append({
+                                    'batch_index': i // batch_size,
+                                    'images_processed': len(batch),
+                                    'error': error_msg,
+                                    'model_used': self.model_name,
+                                    'image_paths': batch_paths if batch_paths else []
+                                })
+                            else:
+                                logger.info(f"批次 {i // batch_size} 处理失败，等待60秒后重试当前批次...")
+                                await asyncio.sleep(60)
+
+                    pbar.update(1)
+
+            return results
+
+        except Exception as e:
+            error_msg = f"图片分析过程中发生错误: {str(e)}\n{traceback.format_exc()}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
+
+    def save_results_to_txt(self, results: List[Dict], output_dir: str):
+        """Write each successful batch response to its own txt file in output_dir.
+
+        Entries without a 'response' key (failed batches carry 'error' instead) are
+        skipped with a warning instead of raising KeyError.
+        """
+        os.makedirs(output_dir, exist_ok=True)
+        for result in results:
+            response_text = result.get('response')
+            if response_text is None:
+                logger.warning(f"批次 {result.get('batch_index')} 没有分析结果，已跳过: {result.get('error')}")
+                continue
+            # Name the file after the first/last image of the batch when paths are known,
+            # otherwise fall back to the batch index
+            image_paths = result.get('image_paths')
+            if image_paths:
+                img_name_start = Path(image_paths[0]).stem.split('_')[-1]
+                img_name_end = Path(image_paths[-1]).stem.split('_')[-1]
+                file_name = f"frame_{img_name_start}_{img_name_end}.txt"
+            else:
+                file_name = f"batch_{result['batch_index']}.txt"
+            txt_path = os.path.join(output_dir, file_name)
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                f.write(response_text.strip())
+            logger.info(f"已保存分析结果到: {txt_path}")
+
+    def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]:
+        """
+        Open every path as a fully loaded RGB PIL image.
+        Args:
+            image_paths: Paths of the image files to load
+        Returns:
+            The images that loaded; ValueError is raised when none did
+        """
+        loaded, rejected = [], []
+
+        for path in image_paths:
+            try:
+                if os.path.exists(path):
+                    picture = PIL.Image.open(path)
+                    # Make sure the image data is fully loaded
+                    picture.load()
+                    # Normalise every image to RGB mode
+                    loaded.append(picture if picture.mode == 'RGB' else picture.convert('RGB'))
+                else:
+                    # Missing file: record it and move on to the next path
+                    logger.error(f"图片文件不存在: {path}")
+                    rejected.append(path)
+
+            except Exception as exc:
+                logger.error(f"无法加载图片 {path}: {str(exc)}")
+                rejected.append(path)
+
+        # Summarise all failures in a single warning
+        if rejected:
+            failed_listing = json.dumps(rejected, indent=2, ensure_ascii=False)
+            logger.warning(f"以下图片加载失败:\n{failed_listing}")
+
+        # An empty result is treated as a hard error
+        if len(loaded) == 0:
+            raise ValueError("没有成功加载任何图片")
+
+        return loaded
--- a/app/utils/script_generator.py
+++ b/app/utils/script_generator.py
@ -374,22 +374,65 @@ class ScriptProcessor:
 记住：要敢于用"温和的违反"制造笑点，但要把握好尺度，让观众在轻松愉快中感受到乐趣。"""

    def calculate_duration_and_word_count(self, time_range: str) -> int:
+        """
+        计算时间范围的持续时长并估算合适的字数
+        
+        Args:
+            time_range: 时间范围字符串,格式为 "HH:MM:SS,mmm-HH:MM:SS,mmm"
+                       例如: "00:00:50,100-00:01:21,500"
+        
+        Returns:
+            int: 估算的合适字数
+                  基于经验公式: 每0.4秒可以说一个字
+                  例如: 10秒可以说约25个字 (10/0.4=25)
+        """
        try:
            start_str, end_str = time_range.split('-')
-
-            def time_to_seconds(time_str):
-                minutes, seconds = map(int, time_str.split(':'))
-                return minutes * 60 + seconds
-
+            
+            def time_to_seconds(time_str: str) -> float:
+                """
+                将时间字符串转换为秒数(带毫秒精度)
+                
+                Args:
+                    time_str: 时间字符串,格式为 "HH:MM:SS,mmm"
+                             例如: "00:00:50,100" 表示50.1秒
+                
+                Returns:
+                    float: 转换后的秒数(带毫秒)
+                """
+                try:
+                    # 处理毫秒部分
+                    time_part, ms_part = time_str.split(',')
+                    hours, minutes, seconds = map(int, time_part.split(':'))
+                    milliseconds = int(ms_part)
+                    
+                    # 转换为秒
+                    total_seconds = (hours * 3600) + (minutes * 60) + seconds + (milliseconds / 1000)
+                    return total_seconds
+                    
+                except ValueError as e:
+                    logger.warning(f"时间格式解析错误: {time_str}, error: {e}")
+                    return 0.0
+            
+            # 计算开始和结束时间的秒数
            start_seconds = time_to_seconds(start_str)
            end_seconds = time_to_seconds(end_str)
+            
+            # 计算持续时间(秒)
            duration = end_seconds - start_seconds
-            word_count = int(duration / 0.35)
-
+            
+            # 根据经验公式计算字数: 每0.4秒一个字
+            word_count = int(duration / 0.4)
+            
+            # 确保字数在合理范围内
+            word_count = max(10, min(word_count, 500))  # 限制在10-500字之间
+            
+            logger.debug(f"时间范围 {time_range} 的持续时间为 {duration:.3f}秒, 估算字数: {word_count}")
            return word_count
+            
        except Exception as e:
-            logger.info(f"时间格式转换错误: {traceback.format_exc()}")
-            return 100
+            logger.warning(f"字数计算错误: {traceback.format_exc()}")
+            return 100  # 发生错误时返回默认字数

    def process_frames(self, frame_content_list: List[Dict]) -> List[Dict]:
        for frame_content in frame_content_list:
@ -406,22 +449,47 @@ class ScriptProcessor:
    def _save_results(self, frame_content_list: List[Dict]):
        """保存处理结果，并添加新的时间戳"""
        try:
-            # 转换秒数为 MM:SS 格式
-            def seconds_to_time(seconds):
-                minutes = seconds // 60
-                remaining_seconds = seconds % 60
-                return f"{minutes:02d}:{remaining_seconds:02d}"
+            def format_timestamp(seconds: float) -> str:
+                """将秒数转换为 HH:MM:SS,mmm 格式"""
+                hours = int(seconds // 3600)
+                minutes = int((seconds % 3600) // 60)
+                seconds_remainder = seconds % 60
+                whole_seconds = int(seconds_remainder)
+                milliseconds = int((seconds_remainder - whole_seconds) * 1000)
+                
+                return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"

            # 计算新的时间戳
-            current_time = 0  # 当前时间点（秒）
+            current_time = 0.0  # 当前时间点（秒，包含毫秒）

            for frame in frame_content_list:
                # 获取原始时间戳的持续时间
                start_str, end_str = frame['timestamp'].split('-')

-                def time_to_seconds(time_str):
-                    minutes, seconds = map(int, time_str.split(':'))
-                    return minutes * 60 + seconds
+                def time_to_seconds(time_str: str) -> float:
+                    """将时间字符串转换为秒数（包含毫秒）"""
+                    try:
+                        if ',' in time_str:
+                            time_part, ms_part = time_str.split(',')
+                            ms = float(ms_part) / 1000
+                        else:
+                            time_part = time_str
+                            ms = 0
+
+                        parts = time_part.split(':')
+                        if len(parts) == 3:  # HH:MM:SS
+                            h, m, s = map(float, parts)
+                            seconds = h * 3600 + m * 60 + s
+                        elif len(parts) == 2:  # MM:SS
+                            m, s = map(float, parts)
+                            seconds = m * 60 + s
+                        else:  # SS
+                            seconds = float(parts[0])
+
+                        return seconds + ms
+                    except Exception as e:
+                        logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
+                        return 0.0

                # 计算当前片段的持续时间
                start_seconds = time_to_seconds(start_str)
@ -429,8 +497,8 @@ class ScriptProcessor:
                duration = end_seconds - start_seconds

                # 设置新的时间戳
-                new_start = seconds_to_time(current_time)
-                new_end = seconds_to_time(current_time + duration)
+                new_start = format_timestamp(current_time)
+                new_end = format_timestamp(current_time + duration)
                frame['new_timestamp'] = f"{new_start}-{new_end}"

                # 更新当前时间点
@ -443,7 +511,7 @@ class ScriptProcessor:
            with open(file_name, 'w', encoding='utf-8') as file:
                json.dump(frame_content_list, file, ensure_ascii=False, indent=4)

-            logger.info(f"保存脚本成功，总时长: {seconds_to_time(current_time)}")
+            logger.info(f"保存脚本成功，总时长: {format_timestamp(current_time)}")

        except Exception as e:
            logger.error(f"保存结果时发生错误: {str(e)}\n{traceback.format_exc()}")
--- a/app/utils/utils.py
+++ b/app/utils/utils.py
@ -40,7 +40,7 @@ def to_json(obj):
            # 如果对象是二进制数据，转换为base64编码的字符串
            elif isinstance(o, bytes):
                return "*** binary data ***"
-            # 如果对象是字典，递归处理每个键值对
+            # 如果对象是字典，递归处理每个键值对
            elif isinstance(o, dict):
                return {k: serialize(v) for k, v in o.items()}
            # 如果对象是列表或元组，递归处理每个元素
@ -56,7 +56,7 @@ def to_json(obj):
        # 使用serialize函数处理输入对象
        serialized_obj = serialize(obj)

-        # 序列化处理后的对象为JSON<EFBFBD><EFBFBD><EFBFBD>符串
+        # 序列化处理后的对象为JSON字符串
        return json.dumps(serialized_obj, ensure_ascii=False, indent=4)
    except Exception as e:
        return None
@ -126,6 +126,15 @@ def public_dir(sub_dir: str = ""):
    return d


+def srt_dir(sub_dir: str = ""):
+    """Return the `srt` resource directory (or sub_dir inside it), creating it if missing."""
+    srt_root = resource_dir("srt")
+    target = os.path.join(srt_root, sub_dir) if sub_dir else srt_root
+    if not os.path.exists(target):
+        os.makedirs(target)
+    return target
+
+
 def run_in_background(func, *args, **kwargs):
    def run():
        try:
@ -302,15 +311,49 @@ def get_current_country():


 def time_to_seconds(time_str: str) -> float:
-    parts = time_str.split(':')
-    if len(parts) == 2:
-        m, s = map(float, parts)
-        return m * 60 + s
-    elif len(parts) == 3:
-        h, m, s = map(float, parts)
-        return h * 3600 + m * 60 + s
-    else:
-        raise ValueError(f"Invalid time format: {time_str}")
+    """
+    将时间字符串转换为秒数，支持多种格式：
+    - "HH:MM:SS,mmm" -> 小时:分钟:秒,毫秒
+    - "MM:SS,mmm" -> 分钟:秒,毫秒
+    - "SS,mmm" -> 秒,毫秒
+    - "SS-mmm" -> 秒-毫秒
+    
+    Args:
+        time_str: 时间字符串
+        
+    Returns:
+        float: 转换后的秒数(包含毫秒)
+    """
+    try:
+        # 处理带有'-'的毫秒格式
+        if '-' in time_str:
+            time_part, ms_part = time_str.split('-')
+            ms = float(ms_part) / 1000
+        # 处理带有','的毫秒格式
+        elif ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 分割时间部分
+        parts = time_part.split(':')
+        
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(float, parts)
+            seconds = h * 3600 + m * 60 + s
+        elif len(parts) == 2:  # MM:SS
+            m, s = map(float, parts)
+            seconds = m * 60 + s
+        else:  # SS
+            seconds = float(parts[0])
+
+        return seconds + ms
+        
+    except (ValueError, IndexError) as e:
+        logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
+        return 0.0


 def seconds_to_time(seconds: float) -> str:
@ -320,15 +363,25 @@ def seconds_to_time(seconds: float) -> str:


 def calculate_total_duration(scenes):
+    """
+    计算场景列表的总时长
+    
+    Args:
+        scenes: 场景列表，每个场景包含 timestamp 字段，格式如 "00:00:28,350-00:00:41,000"
+        
+    Returns:
+        float: 总时长（秒）
+    """
    total_seconds = 0
    
    for scene in scenes:
        start, end = scene['timestamp'].split('-')
-        start_time = datetime.strptime(start, '%M:%S')
-        end_time = datetime.strptime(end, '%M:%S')
+        # 使用 time_to_seconds 函数处理更精确的时间格式
+        start_seconds = time_to_seconds(start)
+        end_seconds = time_to_seconds(end)
        
-        duration = end_time - start_time
-        total_seconds += duration.total_seconds()
+        duration = end_seconds - start_seconds
+        total_seconds += duration
    
    return total_seconds

@ -451,7 +504,7 @@ def clear_keyframes_cache(video_path: str = None):
            return
            
        if video_path:
-            # <EFBFBD><EFBFBD><EFBFBD>理指定视频的缓存
+            # 清理指定视频的缓存
            video_hash = md5(video_path + str(os.path.getmtime(video_path)))
            video_keyframes_dir = os.path.join(keyframes_dir, video_hash)
            if os.path.exists(video_keyframes_dir):
@ -520,3 +573,21 @@ def download_font(url: str, font_path: str):
    except Exception as e:
        logger.error(f"下载字体文件失败: {e}")
        raise
+
+def init_imagemagick():
+    """Probe `magick -version`; on success export IMAGEMAGICK_BINARY=magick.
+
+    Returns True when configured, False when the probe exits non-zero or
+    any exception is raised (both cases are logged as errors).
+    """
+    try:
+        import subprocess
+        probe = subprocess.run(['magick', '-version'], capture_output=True, text=True)
+        if probe.returncode == 0:
+            # Record the binary name in the process environment
+            os.environ['IMAGEMAGICK_BINARY'] = 'magick'
+            return True
+        logger.error("ImageMagick 未安装或配置不正确")
+    except Exception as e:
+        logger.error(f"初始化 ImageMagick 失败: {str(e)}")
+    return False
--- a/app/utils/video_processor_v2.py
+++ b/app/utils/video_processor_v2.py
@ -51,21 +51,34 @@ class VideoProcessor:
    def detect_shot_boundaries(self, frames: List[np.ndarray], threshold: int = 30) -> List[int]:
        """
        使用帧差法检测镜头边界
-
+        
        Args:
            frames: 视频帧列表
-            threshold: 差异阈值
-
+            threshold: 差异阈值，默认值调低为30
+        
        Returns:
            List[int]: 镜头边界帧的索引列表
        """
        shot_boundaries = []
+        if len(frames) < 2:  # 添加帧数检查
+            logger.warning("视频帧数过少，无法检测场景边界")
+            return [len(frames) - 1]  # 返回最后一帧作为边界
+        
        for i in range(1, len(frames)):
            prev_frame = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY)
            curr_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
-            diff = np.mean(np.abs(curr_frame.astype(int) - prev_frame.astype(int)))
+            
+            # 计算帧差
+            diff = np.mean(np.abs(curr_frame.astype(float) - prev_frame.astype(float)))
+            
            if diff > threshold:
                shot_boundaries.append(i)
+
+        # 如果没有检测到任何边界，至少返回最后一帧
+        if not shot_boundaries:
+            logger.warning("未检测到场景边界，将视频作为单个场景处理")
+            shot_boundaries.append(len(frames) - 1)
+        
        return shot_boundaries

    def extract_keyframes(self, frames: List[np.ndarray], shot_boundaries: List[int]) -> Tuple[
@ -113,12 +126,7 @@ class VideoProcessor:
                       output_dir: str, desc: str = "保存关键帧") -> None:
        """
        保存关键帧到指定目录，文件名格式为：keyframe_帧序号_时间戳.jpg
-
-        Args:
-            keyframes: 关键帧列表
-            keyframe_indices: 关键帧索引列表
-            output_dir: 输出目录
-            desc: 进度条描述
+        时间戳精确到毫秒，格式为：HHMMSSmmm
        """
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
@ -126,11 +134,13 @@ class VideoProcessor:
        for keyframe, frame_idx in tqdm(zip(keyframes, keyframe_indices),
                                        total=len(keyframes),
                                        desc=desc):
+            # 计算精确到毫秒的时间戳
            timestamp = frame_idx / self.fps
            hours = int(timestamp // 3600)
            minutes = int((timestamp % 3600) // 60)
            seconds = int(timestamp % 60)
-            time_str = f"{hours:02d}{minutes:02d}{seconds:02d}"
+            milliseconds = int((timestamp % 1) * 1000)  # 计算毫秒部分
+            time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"

            output_path = os.path.join(output_dir,
                                       f'keyframe_{frame_idx:06d}_{time_str}.jpg')
@ -138,11 +148,7 @@ class VideoProcessor:

    def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str) -> None:
        """
-        根据指定的帧号提取帧，如果多个帧在同一秒内，只保留一个
-
-        Args:
-            frame_numbers: 要提取的帧号列表
-            output_folder: 输出文件夹路径
+        根据指定的帧号提取帧，如果多个帧在同一毫秒内，只保留一个
        """
        if not frame_numbers:
            raise ValueError("未提供帧号列表")
@ -153,29 +159,31 @@ class VideoProcessor:
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

-        # 用于记录已处理的时间戳（秒）
-        processed_seconds = set()
+        # 用于记录已处理的时间戳（毫秒）
+        processed_timestamps = set()

        for frame_number in tqdm(frame_numbers, desc="提取高清帧"):
-            # 计算时间戳（秒）
-            timestamp_seconds = int(frame_number / self.fps)
+            # 计算精确到毫秒的时间戳
+            timestamp = frame_number / self.fps
+            timestamp_ms = int(timestamp * 1000)  # 转换为毫秒

-            # 如果这一秒已经处理过，跳过
-            if timestamp_seconds in processed_seconds:
+            # 如果这一毫秒已经处理过，跳过
+            if timestamp_ms in processed_timestamps:
                continue

            self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, frame = self.cap.read()

            if ret:
-                # 记录这一秒已经处理
-                processed_seconds.add(timestamp_seconds)
+                # 记录这一毫秒已经处理
+                processed_timestamps.add(timestamp_ms)

                # 计算时间戳字符串
-                hours = int(timestamp_seconds // 3600)
-                minutes = int((timestamp_seconds % 3600) // 60)
-                seconds = int(timestamp_seconds % 60)
-                time_str = f"{hours:02d}{minutes:02d}{seconds:02d}"
+                hours = int(timestamp // 3600)
+                minutes = int((timestamp % 3600) // 60)
+                seconds = int(timestamp % 60)
+                milliseconds = int((timestamp % 1) * 1000)  # 计算毫秒部分
+                time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"

                output_path = os.path.join(output_folder,
                                           f"keyframe_{frame_number:06d}_{time_str}.jpg")
@ -183,27 +191,34 @@ class VideoProcessor:
            else:
                logger.info(f"无法读取帧 {frame_number}")

-        logger.info(f"共提取了 {len(processed_seconds)} 个不同时间戳的帧")
+        logger.info(f"共提取了 {len(processed_timestamps)} 个不同时间戳的帧")

    @staticmethod
    def extract_numbers_from_folder(folder_path: str) -> List[int]:
        """
        从文件夹中提取帧号
-
+        
        Args:
            folder_path: 关键帧文件夹路径
-
+        
        Returns:
            List[int]: 排序后的帧号列表
        """
        files = [f for f in os.listdir(folder_path) if f.endswith('.jpg')]
-        # 更新正则表达式以匹配新的文件名格式：keyframe_000123_010534.jpg
-        pattern = re.compile(r'keyframe_(\d+)_\d+\.jpg$')
+        # 更新正则表达式以匹配新的文件名格式：keyframe_000123_010534123.jpg
+        pattern = re.compile(r'keyframe_(\d+)_\d{9}\.jpg$')
        numbers = []
+        
        for f in files:
            match = pattern.search(f)
            if match:
                numbers.append(int(match.group(1)))
+            else:
+                logger.warning(f"文件名格式不匹配: {f}")
+        
+        if not numbers:
+            logger.error(f"在目录 {folder_path} 中未找到有效的关键帧文件")
+        
        return sorted(numbers)

    def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int = 30) -> None:
@ -212,7 +227,7 @@ class VideoProcessor:

        Args:
            output_dir: 输出目录
-            skip_seconds: 跳过视<EFBFBD><EFBFBD><EFBFBD>开头的秒数
+            skip_seconds: 跳过视频开头的秒数
        """
        skip_frames = int(skip_seconds * self.fps)

@ -240,11 +255,14 @@ class VideoProcessor:
    def process_video_pipeline(self,
                               output_dir: str,
                               skip_seconds: float = 0,
-                               threshold: int = 30,
+                               threshold: int = 20,  # 降低默认阈值
                               compressed_width: int = 320,
                               keep_temp: bool = False) -> None:
        """
-        执行完整的视频处理流程：压缩、提取关键帧、导出高清帧
+        执行完整的视频处理流程
+        
+        Args:
+            threshold: 降低默认阈值为20，使场景检测更敏感
        """
        os.makedirs(output_dir, exist_ok=True)
        temp_dir = os.path.join(output_dir, 'temp')
@ -358,7 +376,7 @@ if __name__ == "__main__":
    import time

    start_time = time.time()
-    processor = VideoProcessor("best.mp4")
-    processor.process_video_pipeline(output_dir="output4")
+    processor = VideoProcessor("E:\\projects\\NarratoAI\\resource\\videos\\test.mp4")
+    processor.process_video_pipeline(output_dir="output")
    end_time = time.time()
    print(f"处理完成！总耗时: {end_time - start_time:.2f} 秒")
--- a/config.example.toml
+++ b/config.example.toml
@ -1,5 +1,5 @@
 [app]
-    project_version="0.3.5"
+    project_version="0.3.9"
    # 支持视频理解的大模型提供商
    #   gemini
    #   NarratoAPI
--- a/main.py
+++ b/main.py
@ -1,3 +1,4 @@
+import os
 import uvicorn
 from loguru import logger

@ -7,6 +8,8 @@ if __name__ == "__main__":
    logger.info(
        "start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs"
    )
+    os.environ["HTTP_PROXY"] = config.proxy.get("http")
+    os.environ["HTTPS_PROXY"] = config.proxy.get("https")
    uvicorn.run(
        app="app.asgi:app",
        host=config.listen_host,
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 requests~=2.31.0
-moviepy~=2.0.0.dev2
+moviepy==2.0.0.dev2
 faster-whisper~=1.0.1
 edge_tts~=6.1.15
 uvicorn~=0.27.1
@ -26,9 +26,12 @@ psutil>=5.9.0
 opencv-python~=4.10.0.84
 scikit-learn~=1.5.2
 google-generativeai~=0.8.3
-Pillow>=11.0.0
+pillow==10.3.0
 python-dotenv~=1.0.1
 openai~=1.53.0
 tqdm>=4.66.6
 tenacity>=9.0.0
-tiktoken==0.8.0
+tiktoken==0.8.0
+yt-dlp==2024.11.18
+pysrt==1.1.2
+httpx==0.27.2
--- a/resource/fonts/fonts_in_here.txt
+++ b/resource/fonts/fonts_in_here.txt
@ -0,0 +1 @@
+此处放字体文件
--- a/resource/scripts/script_in_here.txt
+++ b/resource/scripts/script_in_here.txt
--- a/resource/songs/song_in_here.txt
+++ b/resource/songs/song_in_here.txt
--- a/resource/srt/srt_in_here.txt
+++ b/resource/srt/srt_in_here.txt
--- a/resource/videos/video_in_here.txt
+++ b/resource/videos/video_in_here.txt
--- a/video_pipeline.py
+++ b/video_pipeline.py
@ -0,0 +1,178 @@
+import requests
+import json
+import os
+import time
+from typing import Dict, Any
+
+class VideoPipeline:
+    """Client that drives the video workflow through the service's HTTP API.
+
+    Each step method POSTs a JSON payload to an ``/api/v2`` endpoint below
+    ``base_url`` and returns the decoded JSON response; ``run_pipeline``
+    chains the steps together.
+    """
+
+    def __init__(self, base_url: str = "http://127.0.0.1:8080"):
+        """Store the root URL that every endpoint path is appended to."""
+        self.base_url = base_url
+
+    def download_video(self, url: str, resolution: str = "1080p",
+                       output_format: str = "mp4", rename: str = None) -> Dict[str, Any]:
+        """Step 1: ask the service to download a video.
+
+        Args:
+            url: Address of the video to download.
+            resolution: Requested resolution, sent as-is.
+            output_format: Requested output format, sent as-is.
+            rename: Value for the ``rename`` field; when falsy, today's date
+                formatted as ``YYYY-MM-DD`` is sent instead.
+
+        Returns:
+            The decoded JSON body of the response.
+
+        Raises:
+            requests.HTTPError: If the service answers with an error status.
+        """
+        endpoint = f"{self.base_url}/api/v2/youtube/download"
+        payload = {
+            "url": url,
+            "resolution": resolution,
+            "output_format": output_format,
+            "rename": rename or time.strftime("%Y-%m-%d")
+        }
+
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def generate_script(self, video_path: str, skip_seconds: int = 0,
+                        threshold: int = 30, vision_batch_size: int = 10,
+                        vision_llm_provider: str = "gemini") -> Dict[str, Any]:
+        """Step 2: ask the service to generate a script for a video.
+
+        All arguments are forwarded unchanged in the JSON payload under the
+        field of the same name.
+
+        Returns:
+            The decoded JSON body of the response.
+
+        Raises:
+            requests.HTTPError: If the service answers with an error status.
+        """
+        endpoint = f"{self.base_url}/api/v2/scripts/generate"
+        payload = {
+            "video_path": video_path,
+            "skip_seconds": skip_seconds,
+            "threshold": threshold,
+            "vision_batch_size": vision_batch_size,
+            "vision_llm_provider": vision_llm_provider
+        }
+
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def crop_video(self, video_path: str, script: list) -> Dict[str, Any]:
+        """Step 3: ask the service to cut the video according to the script.
+
+        Args:
+            video_path: Sent as ``video_origin_path``.
+            script: Sent as ``video_script``.
+
+        Returns:
+            The decoded JSON body of the response.
+
+        Raises:
+            requests.HTTPError: If the service answers with an error status.
+        """
+        endpoint = f"{self.base_url}/api/v2/scripts/crop"
+        payload = {
+            "video_origin_path": video_path,
+            "video_script": script
+        }
+
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def generate_final_video(self, task_id: str, video_path: str,
+                             script_path: str, script: list,
+                             subclip_videos: Dict[str, str],
+                             voice_name: str) -> Dict[str, Any]:
+        """Step 4: ask the service to start producing the final video.
+
+        Args:
+            task_id: Sent as the ``task_id`` query parameter.
+            video_path: Sent as ``video_origin_path``.
+            script_path: Sent as ``video_clip_json_path``.
+            script: Sent as ``video_clip_json``.
+            subclip_videos: Sent as ``subclip_videos`` next to the request
+                settings (``run_pipeline`` passes the crop step's result).
+            voice_name: Sent as ``voice_name``.
+
+        Returns:
+            The decoded JSON body of the response.
+
+        Raises:
+            requests.HTTPError: If the service answers with an error status.
+        """
+        endpoint = f"{self.base_url}/api/v2/scripts/start-subclip"
+
+        # Every setting except the script, paths and voice is a fixed preset.
+        request_data = {
+            "video_clip_json": script,
+            "video_clip_json_path": script_path,
+            "video_origin_path": video_path,
+            "video_aspect": "16:9",
+            "video_language": "zh-CN",
+            "voice_name": voice_name,
+            "voice_volume": 1,
+            "voice_rate": 1.2,
+            "voice_pitch": 1,
+            "bgm_name": "random",
+            "bgm_type": "random",
+            "bgm_file": "",
+            "bgm_volume": 0.3,
+            "subtitle_enabled": True,
+            "subtitle_position": "bottom",
+            "font_name": "STHeitiMedium.ttc",
+            "text_fore_color": "#FFFFFF",
+            "text_background_color": "transparent",
+            "font_size": 75,
+            "stroke_color": "#000000",
+            "stroke_width": 1.5,
+            "custom_position": 70,
+            "n_threads": 8
+        }
+
+        payload = {
+            "request": request_data,
+            "subclip_videos": subclip_videos
+        }
+
+        params = {"task_id": task_id}
+        response = requests.post(endpoint, params=params, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def save_script_to_json(self, script: list, script_path: str) -> str:
+        """Write the script to ``script_path`` as indented UTF-8 JSON.
+
+        Returns:
+            ``script_path``, once the file has been written.
+
+        Raises:
+            Exception: Any error raised while writing is printed and then
+                re-raised unchanged.
+        """
+        try:
+            with open(script_path, 'w', encoding='utf-8') as f:
+                json.dump(script, f, ensure_ascii=False, indent=2)
+            print(f"脚本已保存到: {script_path}")
+            return script_path
+        except Exception as e:
+            print(f"保存脚本失败: {str(e)}")
+            raise
+
+    def run_pipeline(self, task_id: str, script_name: str, youtube_url: str,
+                     video_name: str = "null", skip_seconds: int = 0,
+                     threshold: int = 30, vision_batch_size: int = 10,
+                     vision_llm_provider: str = "gemini",
+                     voice_name: str = "zh-CN-YunjianNeural") -> Dict[str, Any]:
+        """Run the whole workflow: download, script, crop, final video.
+
+        The video is looked up at ``resource/videos/<video_name>.mp4`` next
+        to this file and only downloaded when that file is missing. The
+        script is loaded from ``resource/scripts/<script_name>`` when it
+        exists and generated through the service otherwise.
+
+        Returns:
+            On success, a dict with ``status`` and the task's storage
+            ``path``. On any exception, a dict with ``status`` set to
+            ``"error"`` and the message under ``error``; nothing is raised.
+        """
+        try:
+            current_path = os.path.dirname(os.path.abspath(__file__))
+            video_path = os.path.join(current_path, "resource", "videos", f"{video_name}.mp4")
+            if not os.path.exists(video_path):
+                # 1. Download the video; the service reports where it saved it.
+                print(f"视频不存在, 开始下载视频: {video_path}")
+                download_result = self.download_video(url=youtube_url, resolution="1080p", output_format="mp4", rename=video_name)
+                video_path = download_result["output_path"]
+            else:
+                print(f"视频已存在: {video_path}")
+
+            # 2. Reuse an existing script file, otherwise generate a new one.
+            script_path = os.path.join(current_path, "resource", "scripts", script_name)
+            if os.path.exists(script_path):
+                # A context manager closes the handle as soon as parsing is
+                # done, instead of leaving it open until garbage collection.
+                with open(script_path, "r", encoding="utf-8") as f:
+                    script = json.load(f)
+            else:
+                print("开始生成脚本...")
+                script_result = self.generate_script(video_path=video_path, skip_seconds=skip_seconds, threshold=threshold, vision_batch_size=vision_batch_size, vision_llm_provider=vision_llm_provider)
+                script = script_result["script"]
+
+            # 2.2 Write the script to disk (also when it was just loaded).
+            print("保存脚本到json文件...")
+            self.save_script_to_json(script=script, script_path=script_path)
+
+            # 3. Cut the video into clips.
+            print("开始剪辑视频...")
+            crop_result = self.crop_video(video_path=video_path, script=script)
+            subclip_videos = crop_result["subclip_videos"]
+
+            # 4. Start generation of the final video.
+            print("开始生成最终视频...")
+            self.generate_final_video(
+                task_id=task_id,
+                video_path=video_path,
+                script_path=script_path,
+                script=script,
+                subclip_videos=subclip_videos,
+                voice_name=voice_name
+            )
+
+            return {
+                "status": "等待异步生成视频",
+                "path": os.path.join(current_path, "storage", "tasks", task_id)
+            }
+
+        except Exception as e:
+            # Top-level boundary: report the failure to the caller as data.
+            return {
+                "status": "error",
+                "error": str(e)
+            }
+
+
+# Usage example: run the whole workflow once with sample arguments.
+if __name__ == "__main__":
+    demo_args = {
+        "task_id": "test_111901",
+        "script_name": "test.json",
+        "youtube_url": "https://www.youtube.com/watch?v=vLJ7Yed6FQ4",
+        "video_name": "2024-11-19-01",
+        "skip_seconds": 50,
+        "threshold": 35,
+        "vision_batch_size": 10,
+        "vision_llm_provider": "gemini",
+        "voice_name": "zh-CN-YunjianNeural",
+    }
+    outcome = VideoPipeline().run_pipeline(**demo_args)
+    print(outcome)
--- a/webui.py
+++ b/webui.py
@ -3,7 +3,7 @@ import os
 import sys
 from uuid import uuid4
 from app.config import config
-from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings
+from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings, merge_settings, system_settings
 from webui.utils import cache, file_utils
 from app.utils import utils
 from app.models.schema import VideoClipParams, VideoAspect
@ -178,7 +178,9 @@ def main():
    
    # 渲染基础设置面板
    basic_settings.render_basic_settings(tr)
-    
+    # 渲染合并设置
+    merge_settings.render_merge_settings(tr)
+
    # 渲染主面板
    panel = st.columns(3)
    with panel[0]:
@ -188,6 +190,8 @@ def main():
        audio_settings.render_audio_panel(tr)
    with panel[2]:
        subtitle_settings.render_subtitle_panel(tr)
+        # 渲染系统设置面板
+        system_settings.render_system_panel(tr)
    
    # 渲染视频审查面板
    review_settings.render_review_panel(tr)
--- a/webui.txt
+++ b/webui.txt
@ -47,3 +47,328 @@ pause

 rem set HF_ENDPOINT=https://hf-mirror.com
 streamlit run webui.py --browser.serverAddress="127.0.0.1" --server.enableCORS=True  --server.maxUploadSize=2048 --browser.gatherUsageStats=False
+
+请求0：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/youtube/download' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}'
+{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}
+
+请求1：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/generate' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}'
+{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}
+
+请求2：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/crop' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}'
+{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}
+
+请求3：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}'
+{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}
+
+
+请在最外层新建一个pipeline 工作流执行逻辑的代码；
+它会按照下面的顺序请求接口
+1.下载视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/youtube/download' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}'
+2.生成脚本
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/generate' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}'
+3. 剪辑视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/crop' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}'
+4.生成视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}'
+
+请求1，返回的参数是：
+{
+  "task_id": "4e9b575f-68c0-4ae1-b218-db42b67993d0",
+  "output_path": "E:\\projects\\NarratoAI\\resource\\videos\\2024-11-19.mp4",
+  "resolution": "1080p",
+  "format": "mp4",
+  "filename": "2024-11-19.mp4"
+}
+output_path需要传递给请求2
+请求2，返回数据为：
+{
+  "task_id": "04497017-953c-44b4-bf1d-9d8ed3ebbbce",
+  "script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是對影片畫面的客觀描述：\n\n影片顯示一名留著鬍鬚的男子在一處樹林茂密的斜坡上挖掘。\n\n畫面一：男子從後方出現，背著一個軍綠色的背包，背包裡似乎裝有工具。他穿著卡其色的長褲和深色的登山鞋。\n\n畫面二：特寫鏡頭顯示男子的背包，一個舊的鎬頭從包裡露出來，包裡還有其他工具，包括一個鏟子。\n\n畫面三：男子用鎬頭在斜坡上挖土，背包放在他旁邊。\n\n畫面四：特寫鏡頭顯示男子的登山鞋在泥土中。\n\n畫面五：男子坐在斜坡上，用手清理樹根和泥土。\n\n畫面六：地上有一些鬆動的泥土和落葉。\n\n畫面七：男子的背包近景鏡頭，他正在挖掘。\n\n畫面八：男子在斜坡上挖掘，揚起一陣塵土。\n\n畫面九：特寫鏡頭顯示男子用手清理泥土。\n\n畫面十：特寫鏡頭顯示挖出的泥土剖面，可以看到土壤的層次。",
+      "narration": "上一个画面是我在绝美的自然中，准备开启我的“土豪”挖掘之旅。现在，你们看到这位留着胡子的“大哥”，他背着个军绿色的包，里面装的可不仅仅是工具，还有我对生活的无限热爱（以及一丝不安）。看！这把旧镐头就像我的前任——用起来费劲，但又舍不得扔掉。\n\n他在斜坡上挖土，泥土飞扬，仿佛在跟大地进行一场“泥巴大战”。每一铲下去，都能听到大地微微的呻吟：哎呀，我这颗小树根可比我当年的情感纠葛还难处理呢！别担心，这些泥土层次分明，简直可以开个“泥土博物馆”。所以，朋友们，跟着我一起享受这场泥泞中的乐趣吧！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是對影片畫面內容的客觀描述：\n\n影片以一系列森林環境的鏡頭開始。第一個鏡頭展示了綠葉植物的特寫鏡頭，葉子上有一些水珠。接下來的鏡頭是一個男人在森林裡挖掘一個小坑，他跪在地上，用鏟子挖土。\n\n接下來的鏡頭是同一個男人坐在他挖的坑旁邊，望著前方。然後，鏡頭顯示該坑的廣角鏡頭，顯示其結構和大小。\n\n之後的鏡頭，同一個男人在樹林裡劈柴。鏡頭最後呈現出一潭渾濁的水，周圍環繞著樹枝。然後鏡頭又回到了森林裡生長茂盛的植物特寫鏡頭。",
+      "narration": "好嘞，朋友们，我们已经在泥土博物馆里捣鼓了一阵子，现在是时候跟大自然亲密接触了！看看这片森林，绿叶上水珠闪闪发光，就像我曾经的爱情，虽然短暂，却美得让人心碎。\n\n现在，我在这里挖个小坑，感觉自己就像是一位新晋“挖土大王”，不过说实话，这手艺真不敢恭维，连铲子都快对我崩溃了。再说劈柴，这动作简直比我前任的情绪波动还要激烈！最后这一潭浑浊的水，别担心，它只是告诉我：生活就像这水，总有些杂质，但也别忘了，要勇敢面对哦！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}
+output_path和script参数需要传递给请求3
+请求3返回参数是
+{
+  "task_id": "b6f5a98a-b2e0-4e3d-89c5-64fb90db2ec1",
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}
+subclip_videos和 output_path和script参数需要传递给请求4
+最后完成工作流
+
+0代表只播放文案音频，禁用视频原声；1代表只播放视频原声，不需要播放文案音频和字幕；2代表既播放文案音频也播放视频原声；
--- a/webui/components/audio_settings.py
+++ b/webui/components/audio_settings.py
@ -20,7 +20,7 @@ def render_audio_panel(tr):
 def render_tts_settings(tr):
    """渲染TTS(文本转语音)设置"""
    # 获取支持的语音列表
-    support_locales = ["zh-CN", "zh-HK", "zh-TW", "en-US"]
+    support_locales = ["zh-CN"]
    voices = voice.get_all_azure_voices(filter_locals=support_locales)
    
    # 创建友好的显示名称
--- a/webui/components/basic_settings.py
+++ b/webui/components/basic_settings.py
@ -52,18 +52,34 @@ def render_language_settings(tr):

 def render_proxy_settings(tr):
    """渲染代理设置"""
-    proxy_url_http = config.proxy.get("http", "") or os.getenv("VPN_PROXY_URL", "")
-    proxy_url_https = config.proxy.get("https", "") or os.getenv("VPN_PROXY_URL", "")
+    # 获取当前代理状态
+    proxy_enabled = config.proxy.get("enabled", True)
+    proxy_url_http = config.proxy.get("http")
+    proxy_url_https = config.proxy.get("https")

-    HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http)
-    HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https)
+    # 添加代理开关
+    proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled)
+    
+    # 保存代理开关状态
+    config.proxy["enabled"] = proxy_enabled

-    if HTTP_PROXY:
-        config.proxy["http"] = HTTP_PROXY
-        os.environ["HTTP_PROXY"] = HTTP_PROXY
-    if HTTPS_PROXY:
-        config.proxy["https"] = HTTPS_PROXY
-        os.environ["HTTPS_PROXY"] = HTTPS_PROXY
+    # 只有在代理启用时才显示代理设置输入框
+    if proxy_enabled:
+        HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http)
+        HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https)
+
+        if HTTP_PROXY:
+            config.proxy["http"] = HTTP_PROXY
+            os.environ["HTTP_PROXY"] = HTTP_PROXY
+        if HTTPS_PROXY:
+            config.proxy["https"] = HTTPS_PROXY
+            os.environ["HTTPS_PROXY"] = HTTPS_PROXY
+    else:
+        # 当代理被禁用时，清除环境变量和配置
+        os.environ.pop("HTTP_PROXY", None)
+        os.environ.pop("HTTPS_PROXY", None)
+        config.proxy["http"] = ""
+        config.proxy["https"] = ""


 def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
@ -90,6 +106,28 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
        except Exception as e:
            return False, f"{tr('gemini model is not available')}: {str(e)}"

+    elif provider.lower() == 'qwenvl':
+        from openai import OpenAI
+        try:
+            client = OpenAI(
+                api_key=api_key,
+                base_url=base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+            )
+            
+            # 发送一个简单的测试请求
+            response = client.chat.completions.create(
+                model=model_name or "qwen-vl-max-latest",
+                messages=[{"role": "user", "content": "直接回复我文本'当前网络可用'"}]
+            )
+            
+            if response and response.choices:
+                return True, tr("QwenVL model is available")
+            else:
+                return False, tr("QwenVL model returned invalid response")
+                
+        except Exception as e:
+            return False, f"{tr('QwenVL model is not available')}: {str(e)}"
+            
    elif provider.lower() == 'narratoapi':
        import requests
        try:
@ -116,7 +154,7 @@ def render_vision_llm_settings(tr):
    st.subheader(tr("Vision Model Settings"))

    # 视频分析模型提供商选择
-    vision_providers = ['Gemini', 'NarratoAPI(待发布)', 'QwenVL(待发布)']
+    vision_providers = ['Gemini', 'QwenVL', 'NarratoAPI(待发布)']
    saved_vision_provider = config.app.get("vision_llm_provider", "Gemini").lower()
    saved_provider_index = 0

@ -142,18 +180,33 @@ def render_vision_llm_settings(tr):
    # 渲染视觉模型配置输入框
    st_vision_api_key = st.text_input(tr("Vision API Key"), value=vision_api_key, type="password")
    
-    # 当选择 Gemini 时禁用 base_url 输入
-    if vision_provider.lower() == 'gemini':
+    # 根据不同提供商设置默认值和帮助信息
+    if vision_provider == 'gemini':
        st_vision_base_url = st.text_input(
            tr("Vision Base URL"), 
            value=vision_base_url,
            disabled=True,
            help=tr("Gemini API does not require a base URL")
        )
+        st_vision_model_name = st.text_input(
+            tr("Vision Model Name"), 
+            value=vision_model_name or "gemini-1.5-flash",
+            help=tr("Default: gemini-1.5-flash")
+        )
+    elif vision_provider == 'qwenvl':
+        st_vision_base_url = st.text_input(
+            tr("Vision Base URL"), 
+            value=vision_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1",
+            help=tr("Default: https://dashscope.aliyuncs.com/compatible-mode/v1")
+        )
+        st_vision_model_name = st.text_input(
+            tr("Vision Model Name"), 
+            value=vision_model_name or "qwen-vl-max-latest",
+            help=tr("Default: qwen-vl-max-latest")
+        )
    else:
        st_vision_base_url = st.text_input(tr("Vision Base URL"), value=vision_base_url)
-        
-    st_vision_model_name = st.text_input(tr("Vision Model Name"), value=vision_model_name)
+        st_vision_model_name = st.text_input(tr("Vision Model Name"), value=vision_model_name)

    # 在配置输入框后添加测试按钮
    if st.button(tr("Test Connection"), key="test_vision_connection"):
@ -174,7 +227,7 @@ def render_vision_llm_settings(tr):
    # 保存视觉模型配置
    if st_vision_api_key:
        config.app[f"vision_{vision_provider}_api_key"] = st_vision_api_key
-        st.session_state[f"vision_{vision_provider}_api_key"] = st_vision_api_key  # 用于script_settings.py
+        st.session_state[f"vision_{vision_provider}_api_key"] = st_vision_api_key
    if st_vision_base_url:
        config.app[f"vision_{vision_provider}_base_url"] = st_vision_base_url
        st.session_state[f"vision_{vision_provider}_base_url"] = st_vision_base_url
@ -182,81 +235,6 @@ def render_vision_llm_settings(tr):
        config.app[f"vision_{vision_provider}_model_name"] = st_vision_model_name
        st.session_state[f"vision_{vision_provider}_model_name"] = st_vision_model_name

-    # # NarratoAPI 特殊配置
-    # if vision_provider == 'narratoapi':
-    #     st.subheader(tr("Narrato Additional Settings"))
-    #
-    #     # Narrato API 基础配置
-    #     narrato_api_key = st.text_input(
-    #         tr("Narrato API Key"),
-    #         value=config.app.get("narrato_api_key", ""),
-    #         type="password",
-    #         help="用于访问 Narrato API 的密钥"
-    #     )
-    #     if narrato_api_key:
-    #         config.app["narrato_api_key"] = narrato_api_key
-    #         st.session_state['narrato_api_key'] = narrato_api_key
-    #
-    #     narrato_api_url = st.text_input(
-    #         tr("Narrato API URL"),
-    #         value=config.app.get("narrato_api_url", "http://127.0.0.1:8000/api/v1/video/analyze")
-    #     )
-    #     if narrato_api_url:
-    #         config.app["narrato_api_url"] = narrato_api_url
-    #         st.session_state['narrato_api_url'] = narrato_api_url
-    #
-    #     # 视频分析模型配置
-    #     st.markdown("##### " + tr("Vision Model Settings"))
-    #     narrato_vision_model = st.text_input(
-    #         tr("Vision Model Name"),
-    #         value=config.app.get("narrato_vision_model", "gemini-1.5-flash")
-    #     )
-    #     narrato_vision_key = st.text_input(
-    #         tr("Vision Model API Key"),
-    #         value=config.app.get("narrato_vision_key", ""),
-    #         type="password",
-    #         help="用于视频分析的模 API Key"
-    #     )
-    #
-    #     if narrato_vision_model:
-    #         config.app["narrato_vision_model"] = narrato_vision_model
-    #         st.session_state['narrato_vision_model'] = narrato_vision_model
-    #     if narrato_vision_key:
-    #         config.app["narrato_vision_key"] = narrato_vision_key
-    #         st.session_state['narrato_vision_key'] = narrato_vision_key
-    #
-    #     # 文案生成模型配置
-    #     st.markdown("##### " + tr("Text Generation Model Settings"))
-    #     narrato_llm_model = st.text_input(
-    #         tr("LLM Model Name"),
-    #         value=config.app.get("narrato_llm_model", "qwen-plus")
-    #     )
-    #     narrato_llm_key = st.text_input(
-    #         tr("LLM Model API Key"),
-    #         value=config.app.get("narrato_llm_key", ""),
-    #         type="password",
-    #         help="用于文案生成的模型 API Key"
-    #     )
-    #
-    #     if narrato_llm_model:
-    #         config.app["narrato_llm_model"] = narrato_llm_model
-    #         st.session_state['narrato_llm_model'] = narrato_llm_model
-    #     if narrato_llm_key:
-    #         config.app["narrato_llm_key"] = narrato_llm_key
-    #         st.session_state['narrato_llm_key'] = narrato_llm_key
-    #
-    #     # 批处理配置
-    #     narrato_batch_size = st.number_input(
-    #         tr("Batch Size"),
-    #         min_value=1,
-    #         max_value=50,
-    #         value=config.app.get("narrato_batch_size", 10),
-    #         help="每批处理的图片数量"
-    #     )
-    #     if narrato_batch_size:
-    #         config.app["narrato_batch_size"] = narrato_batch_size
-    #         st.session_state['narrato_batch_size'] = narrato_batch_size
-

 def test_text_model_connection(api_key, base_url, model_name, provider, tr):
    """测试文本模型连接
@ -328,6 +306,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
    except Exception as e:
        return False, f"{tr('Connection failed')}: {str(e)}"

+
 def render_text_llm_settings(tr):
    """渲染文案生成模型设置"""
    st.subheader(tr("Text Generation Model Settings"))
--- a/webui/components/merge_settings.py
+++ b/webui/components/merge_settings.py
@ -0,0 +1,303 @@
+import os
+import time
+import math
+import sys
+import tempfile
+import traceback
+import shutil
+
+import streamlit as st
+from loguru import logger
+from typing import List, Dict, Tuple
+from dataclasses import dataclass
+from streamlit.runtime.uploaded_file_manager import UploadedFile
+
+from webui.utils.merge_video import merge_videos_and_subtitles
+from app.utils.utils import video_dir, srt_dir
+from app.services.subtitle import extract_audio_and_create_subtitle
+
+# Scratch directory where uploaded videos and subtitles are staged before merging.
+# NOTE(review): the path is relative, so it resolves against the process's
+# current working directory — confirm the app is always started from the same place.
+TEMP_MERGE_DIR = os.path.join("storage", "temp", "merge")
+
+# Make sure the scratch directory exists (this runs when the module is imported).
+os.makedirs(TEMP_MERGE_DIR, exist_ok=True)
+
+
+@dataclass
+class VideoSubtitlePair:
+    """An uploaded video together with the subtitle that shares its base file name."""
+    # NOTE(review): the `X | None` annotations below are evaluated when the class
+    # is created (this module has no `from __future__ import annotations`), which
+    # needs Python 3.10+ — confirm the supported interpreter version.
+    # Uploaded video object; None until a matching .mp4 has been seen.
+    video_file: UploadedFile | None
+    # Path of the staged .srt file; None when no subtitle was uploaded for this video.
+    subtitle_file: str | None
+    # File name without its extension; used as the pairing key.
+    base_name: str
+    # Initial sort position of the pair (assigned in upload order by group_files).
+    order: int = 0
+
+
+def save_uploaded_file(uploaded_file: UploadedFile, target_dir: str) -> str:
+    """Write an uploaded file into ``target_dir`` and return the path written.
+
+    The file is stored under its upload name (``uploaded_file.name``); a file
+    that already exists at that path is replaced.
+
+    Args:
+        uploaded_file: Upload object providing ``name`` and ``getvalue()``.
+        target_dir: Directory to write into.
+
+    Returns:
+        The path of the file that was written.
+    """
+    # NOTE(review): the upload name is joined into the path unsanitised —
+    # confirm it can never contain directory components.
+    file_path = os.path.join(target_dir, uploaded_file.name)
+    # If the file already exists, delete it first
+    # (opening with "wb" below would truncate it anyway).
+    if os.path.exists(file_path):
+        os.remove(file_path)
+    with open(file_path, "wb") as f:
+        f.write(uploaded_file.getvalue())
+    return file_path
+
+
+def clean_temp_dir():
+    """Delete every regular file directly inside ``TEMP_MERGE_DIR``.
+
+    Subdirectories are left untouched. A failure to delete one file is logged
+    and does not stop the cleanup of the remaining files.
+    """
+    if os.path.exists(TEMP_MERGE_DIR):
+        for file in os.listdir(TEMP_MERGE_DIR):
+            file_path = os.path.join(TEMP_MERGE_DIR, file)
+            try:
+                # Only plain files are removed; directories are skipped.
+                if os.path.isfile(file_path):
+                    os.unlink(file_path)
+            except Exception as e:
+                logger.error(f"清理临时文件失败: {str(e)}")
+
+
+def group_files(files: List[UploadedFile]) -> Dict[str, VideoSubtitlePair]:
+    """Group uploaded files into video/subtitle pairs keyed by base name.
+
+    Every .mp4 and .srt upload is written to ``TEMP_MERGE_DIR`` as a side
+    effect. A pair is created only for base names that have a video; a
+    subtitle without a matching video is saved to disk but is not returned.
+
+    Args:
+        files: The uploaded files to group.
+
+    Returns:
+        Mapping of base name to its ``VideoSubtitlePair``; ``order`` reflects
+        the order in which the videos were first encountered.
+    """
+    pairs = {}
+    order_counter = 0
+    
+    # First pass: handle all video files
+    for file in files:
+        base_name = os.path.splitext(file.name)[0]
+        ext = os.path.splitext(file.name)[1].lower()
+        
+        if ext == ".mp4":
+            if base_name not in pairs:
+                pairs[base_name] = VideoSubtitlePair(None, None, base_name, order_counter)
+                order_counter += 1
+            pairs[base_name].video_file = file
+            # Save the video file to the temp directory
+            # NOTE(review): `video_path` is assigned but never read afterwards.
+            video_path = save_uploaded_file(file, TEMP_MERGE_DIR)
+    
+    # Second pass: handle all subtitle files
+    for file in files:
+        base_name = os.path.splitext(file.name)[0]
+        ext = os.path.splitext(file.name)[1].lower()
+        
+        if ext == ".srt":
+            # Save the subtitle file even when there is no matching video
+            # NOTE(review): this path is rebuilt with a lowercase ".srt" while the
+            # file is saved under its original name; an upload with an upper-case
+            # extension would not match on a case-sensitive filesystem — confirm.
+            subtitle_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt")
+            save_uploaded_file(file, TEMP_MERGE_DIR)
+            
+            if base_name in pairs:  # a matching video exists
+                pairs[base_name].subtitle_file = subtitle_path
+            
+    return pairs
+
+
+def render_merge_settings(tr):
+    """Render the "Video Subtitle Merge" expander.
+
+    Lets the user upload .mp4/.srt files, previews each video/subtitle pair in
+    a 5-column grid with a per-pair order input, offers transcription for a
+    video that has no subtitle, and merges every complete pair into one video
+    file and one subtitle file.
+
+    Args:
+        tr: Callable applied to every UI label before it is displayed
+            (presumably the i18n lookup — confirm against the caller).
+    """
+    with st.expander(tr("Video Subtitle Merge"), expanded=False):
+        # File upload area
+        uploaded_files = st.file_uploader(
+            tr("Upload Video and Subtitle Files"),
+            type=["mp4", "srt"],
+            accept_multiple_files=True,
+            key="merge_files"
+        )
+        
+        if uploaded_files:
+            all_pairs = group_files(uploaded_files)
+            
+            if all_pairs:
+                st.write(tr("All Uploaded Files"))
+                
+                # Initialise or update the ordering info kept in session state
+                if 'file_orders' not in st.session_state:
+                    st.session_state.file_orders = {
+                        name: pair.order for name, pair in all_pairs.items()
+                    }
+                    st.session_state.needs_reorder = False
+                
+                # Make sure every newly uploaded file has an order value
+                for name, pair in all_pairs.items():
+                    if name not in st.session_state.file_orders:
+                        st.session_state.file_orders[name] = pair.order
+                
+                # Drop order values of files that are no longer present
+                st.session_state.file_orders = {
+                    k: v for k, v in st.session_state.file_orders.items() 
+                    if k in all_pairs
+                }
+                
+                # Sort the pairs by their order value
+                sorted_pairs = sorted(
+                    all_pairs.items(),
+                    key=lambda x: st.session_state.file_orders[x[0]]
+                )
+                
+                # Work out how many rows are needed to show all videos (5 per row)
+                num_pairs = len(sorted_pairs)
+                num_rows = (num_pairs + 4) // 5  # ceiling division, 5 per row
+                
+                # Iterate over each row
+                for row in range(num_rows):
+                    # Create 5 columns
+                    cols = st.columns(5)
+                    
+                    # Fill this row with videos (at most 5)
+                    for col_idx in range(5):
+                        pair_idx = row * 5 + col_idx
+                        if pair_idx < num_pairs:
+                            base_name, pair = sorted_pairs[pair_idx]
+                            with cols[col_idx]:
+                                st.caption(base_name)
+                                
+                                # Show the video preview (if it exists)
+                                video_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4")
+                                if os.path.exists(video_path):
+                                    st.video(video_path)
+                                else:
+                                    st.warning(tr("Missing Video"))
+                                
+                                # Show the subtitle preview (if it exists)
+                                subtitle_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt")
+                                if os.path.exists(subtitle_path):
+                                    # NOTE(review): this read is outside any try block, so a
+                                    # subtitle that is not valid UTF-8 raises UnicodeDecodeError here.
+                                    with open(subtitle_path, 'r', encoding='utf-8') as f:
+                                        subtitle_content = f.read()
+                                        st.markdown(tr("Subtitle Preview"))
+                                        st.text_area(
+                                            "Subtitle Content",
+                                            value=subtitle_content,
+                                            height=100,  # reduced height to fit the 5-column layout
+                                            label_visibility="collapsed",
+                                            key=f"subtitle_preview_{base_name}"
+                                        )
+                                else:
+                                    st.warning(tr("Missing Subtitle"))
+                                    # If there is a video but no subtitle, show the one-click transcribe button
+                                    if os.path.exists(video_path):
+                                        if st.button(tr("One-Click Transcribe"), key=f"transcribe_{base_name}"):
+                                            with st.spinner(tr("Transcribing...")):
+                                                try:
+                                                    # Generate the subtitle file
+                                                    result = extract_audio_and_create_subtitle(video_path, subtitle_path)
+                                                    if result:
+                                                        # Read the generated subtitle file and show a preview
+                                                        with open(subtitle_path, 'r', encoding='utf-8') as f:
+                                                            subtitle_content = f.read()
+                                                            st.markdown(tr("Subtitle Preview"))
+                                                            st.text_area(
+                                                                "Subtitle Content",
+                                                                value=subtitle_content,
+                                                                height=150,
+                                                                label_visibility="collapsed",
+                                                                key=f"subtitle_preview_transcribed_{base_name}"
+                                                            )
+                                                            st.success(tr("Transcription Complete!"))
+                                                            # Update the pair's subtitle file path
+                                                            pair.subtitle_file = subtitle_path
+                                                    else:
+                                                        st.error(tr("Transcription Failed. Please try again."))
+                                                except Exception as e:
+                                                    error_message = str(e)
+                                                    logger.error(traceback.format_exc())
+                                                    # Pick a user-facing message from the text of the exception
+                                                    if "rate limit exceeded" in error_message.lower():
+                                                        st.error(tr("API rate limit exceeded. Please wait about an hour and try again."))
+                                                    elif "resource_exhausted" in error_message.lower():
+                                                        st.error(tr("Resources exhausted. Please try again later."))
+                                                    else:
+                                                        st.error(f"{tr('Transcription Failed')}: {str(e)}")
+                                
+                                # Order input box
+                                order = st.number_input(
+                                    tr("Order"),
+                                    min_value=0,
+                                    value=st.session_state.file_orders[base_name],
+                                    key=f"order_{base_name}",
+                                    on_change=lambda: setattr(st.session_state, 'needs_reorder', True)
+                                )
+                                if order != st.session_state.file_orders[base_name]:
+                                    st.session_state.file_orders[base_name] = order
+                                    st.session_state.needs_reorder = True
+                
+                # If a reorder is needed, rerun the page
+                if st.session_state.needs_reorder:
+                    st.session_state.needs_reorder = False
+                    st.rerun()
+                
+                # Find the pairs that have both a video and a subtitle on disk
+                complete_pairs = {
+                    k: v for k, v in all_pairs.items()
+                    if os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.mp4")) and 
+                    os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.srt"))
+                }
+                
+                # Merge button and result display
+                cols = st.columns([1, 2, 1])
+                with cols[0]:
+                    st.write(f"{tr('Mergeable Files')}: {len(complete_pairs)}")
+                
+                # Set to (video path, subtitle path) only after a successful merge
+                merge_videos_result = None
+                
+                with cols[1]:
+                    if st.button(tr("Merge All Files"), type="primary", use_container_width=True):
+                        try:
+                            # Get the complete pairs in sorted order
+                            sorted_complete_pairs = sorted(
+                                [(k, v) for k, v in complete_pairs.items()],
+                                key=lambda x: st.session_state.file_orders[x[0]]
+                            )
+                            
+                            video_paths = []
+                            subtitle_paths = []
+                            for base_name, _ in sorted_complete_pairs:
+                                video_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4"))
+                                subtitle_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt"))
+                            
+                            # Build the output file paths
+                            # NOTE(review): the two names are stamped by separate strftime calls
+                            # using only minutes and seconds, so they can differ from each other
+                            # and repeat every hour — confirm this is acceptable.
+                            output_video = os.path.join(video_dir(), f"merged_video_{time.strftime('%M%S')}.mp4")
+                            output_subtitle = os.path.join(srt_dir(), f"merged_subtitle_{time.strftime('%M%S')}.srt")
+                            
+                            with st.spinner(tr("Merging files...")):
+                                # Merge the files
+                                merge_videos_and_subtitles(
+                                    video_paths,
+                                    subtitle_paths,
+                                    output_video,
+                                    output_subtitle
+                                )
+                                
+                                success = True
+                                error_msg = ""
+                                
+                                # Check that the output files were actually generated
+                                if not os.path.exists(output_video):
+                                    success = False
+                                    error_msg += tr("Failed to generate merged video. ")
+                                if not os.path.exists(output_subtitle):
+                                    success = False
+                                    error_msg += tr("Failed to generate merged subtitle. ")
+                                
+                                if success:
+                                    # Show the success message
+                                    st.success(tr("Merge completed!"))
+                                    merge_videos_result = (output_video, output_subtitle)
+                                    # Clean up the temp directory
+                                    clean_temp_dir()
+                                else:
+                                    st.error(error_msg)
+                                    
+                        except Exception as e:
+                            error_message = str(e)
+                            # Pick a user-facing message from the text of the exception
+                            if "moviepy" in error_message.lower():
+                                st.error(tr("Error processing video files. Please check if the videos are valid MP4 files."))
+                            elif "pysrt" in error_message.lower():
+                                st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
+                            else:
+                                st.error(f"{tr('Error during merge')}: {error_message}")
+                
+                # Show the merge result preview below the merge button
+                if merge_videos_result:
+                    st.markdown(f"<h3 style='text-align: center'>{tr('Merge Result Preview')}</h3>", unsafe_allow_html=True)
+                    # Use a column layout to centre the video
+                    col1, col2, col3 = st.columns([1,2,1])
+                    with col2:
+                        st.video(merge_videos_result[0])
+                        st.code(f"{tr('Video Path')}: {merge_videos_result[0]}")
+                        st.code(f"{tr('Subtitle Path')}: {merge_videos_result[1]}")
+            else:
+                st.warning(tr("No Files Found"))
--- a/webui/components/script_settings.py
+++ b/webui/components/script_settings.py
@ -1,86 +1,15 @@
 import os
-import ssl
 import glob
 import json
 import time
-import asyncio
-import traceback
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-import requests
 import streamlit as st
-from loguru import logger

 from app.config import config
 from app.models.schema import VideoClipParams
-from app.utils.script_generator import ScriptProcessor
-from app.utils import utils, check_script, vision_analyzer, video_processor, video_processor_v2
-from webui.utils import file_utils
+from app.utils import utils, check_script
+from webui.tools.generate_script_docu import generate_script_docu


-def get_batch_timestamps(batch_files, prev_batch_files=None):
-    """
-    获取一批文件的时间戳范围
-    返回: (first_timestamp, last_timestamp, timestamp_range)
-    
-    文件名格式: keyframe_001253_000050.jpg
-    其中 000050 表示 00:00:50 (50秒)
-         000101 表示 00:01:01 (1分1秒)
-         
-    Args:
-        batch_files: 当前批次的文件列表
-        prev_batch_files: 上一个批次的文件列表，用于处理单张图片的情况
-    """
-    if not batch_files:
-        logger.warning("Empty batch files")
-        return "00:00", "00:00", "00:00-00:00"
-        
-    # 如果当前批次只有一张图片，且有上一个批次的文件，则使用上一批次的最后一张作为首帧
-    if len(batch_files) == 1 and prev_batch_files and len(prev_batch_files) > 0:
-        first_frame = os.path.basename(prev_batch_files[-1])
-        last_frame = os.path.basename(batch_files[0])
-        logger.debug(f"单张图片批次，使用上一批次最后一帧作为首帧: {first_frame}")
-    else:
-        # 提取首帧和尾帧的时间戳
-        first_frame = os.path.basename(batch_files[0])
-        last_frame = os.path.basename(batch_files[-1])
-    
-    # 从文件名中提取时间信息
-    first_time = first_frame.split('_')[2].replace('.jpg', '')  # 000050
-    last_time = last_frame.split('_')[2].replace('.jpg', '')    # 000101
-    
-    # 转换为分:秒格式
-    def format_timestamp(time_str):
-        # 时间格式为 MMSS，如 0050 表示 00:50, 0101 表示 01:01
-        if len(time_str) < 4:
-            logger.warning(f"Invalid timestamp format: {time_str}")
-            return "00:00"
-            
-        minutes = int(time_str[-4:-2])  # 取后4位的前2位作为分钟
-        seconds = int(time_str[-2:])    # 取后2位作为秒数
-        
-        # 处理进位
-        if seconds >= 60:
-            minutes += seconds // 60
-            seconds = seconds % 60
-            
-        return f"{minutes:02d}:{seconds:02d}"
-    
-    first_timestamp = format_timestamp(first_time)
-    last_timestamp = format_timestamp(last_time)
-    timestamp_range = f"{first_timestamp}-{last_timestamp}"
-    
-    logger.debug(f"解析时间戳: {first_frame} -> {first_timestamp}, {last_frame} -> {last_timestamp}")
-    return first_timestamp, last_timestamp, timestamp_range
-
-def get_batch_files(keyframe_files, result, batch_size=5):
-    """
-    获取当前批次的图片文件
-    """
-    batch_start = result['batch_index'] * batch_size
-    batch_end = min(batch_start + batch_size, len(keyframe_files))
-    return keyframe_files[batch_start:batch_end]
-
 def render_script_panel(tr):
    """渲染脚本配置面板"""
    with st.container(border=True):
@ -102,7 +31,11 @@ def render_script_panel(tr):

 def render_script_file(tr, params):
    """渲染脚本文件选择"""
-    script_list = [(tr("None"), ""), (tr("Auto Generate"), "auto")]
+    script_list = [
+        (tr("None"), ""), 
+        (tr("Auto Generate"), "auto"), 
+        (tr("Upload Script"), "upload_script")  # 新增上传脚本选项
+    ]

    # 获取已有脚本文件
    suffix = "*.json"
@ -132,7 +65,7 @@ def render_script_file(tr, params):

    selected_script_index = st.selectbox(
        tr("Script Files"),
-        index=selected_index,  # 使用找到的索引
+        index=selected_index,
        options=range(len(script_list)),
        format_func=lambda x: script_list[x][0]
    )
@ -141,10 +74,50 @@ def render_script_file(tr, params):
    st.session_state['video_clip_json_path'] = script_path
    params.video_clip_json_path = script_path

+    # 处理脚本上传
+    if script_path == "upload_script":
+        uploaded_file = st.file_uploader(
+            tr("Upload Script File"),
+            type=["json"],
+            accept_multiple_files=False,
+        )
+
+        if uploaded_file is not None:
+            try:
+                # 读取上传的JSON内容并验证格式
+                script_content = uploaded_file.read().decode('utf-8')
+                json_data = json.loads(script_content)
+                
+                # 保存到脚本目录
+                script_file_path = os.path.join(script_dir, uploaded_file.name)
+                file_name, file_extension = os.path.splitext(uploaded_file.name)
+                
+                # 如果文件已存在,添加时间戳
+                if os.path.exists(script_file_path):
+                    timestamp = time.strftime("%Y%m%d%H%M%S")
+                    file_name_with_timestamp = f"{file_name}_{timestamp}"
+                    script_file_path = os.path.join(script_dir, file_name_with_timestamp + file_extension)
+
+                # 写入文件
+                with open(script_file_path, "w", encoding='utf-8') as f:
+                    json.dump(json_data, f, ensure_ascii=False, indent=2)
+                
+                # 更新状态
+                st.success(tr("Script Uploaded Successfully"))
+                st.session_state['video_clip_json_path'] = script_file_path
+                params.video_clip_json_path = script_file_path
+                time.sleep(1)
+                st.rerun()
+                
+            except json.JSONDecodeError:
+                st.error(tr("Invalid JSON format"))
+            except Exception as e:
+                st.error(f"{tr('Upload failed')}: {str(e)}")
+

 def render_video_file(tr, params):
    """渲染视频文件选择"""
-    video_list = [(tr("None"), ""), (tr("Upload Local Files"), "local")]
+    video_list = [(tr("None"), ""), (tr("Upload Local Files"), "upload_local")]

    # 获取已有视频文件
    for suffix in ["*.mp4", "*.mov", "*.avi", "*.mkv"]:
@ -164,7 +137,7 @@ def render_video_file(tr, params):
    st.session_state['video_origin_path'] = video_path
    params.video_origin_path = video_path

-    if video_path == "local":
+    if video_path == "upload_local":
        uploaded_file = st.file_uploader(
            tr("Upload Local Files"),
            type=["mp4", "mov", "avi", "flv", "mkv"],
@ -250,7 +223,7 @@ def render_script_buttons(tr, params):

    if st.button(button_name, key="script_action", disabled=not script_path):
        if script_path == "auto":
-            generate_script(tr, params)
+            generate_script_docu(tr, params)
        else:
            load_script(tr, script_path)

@ -305,379 +278,6 @@ def load_script(tr, script_path):
        st.error(f"{tr('Failed to load script')}: {str(e)}")


-def generate_script(tr, params):
-    """生成视频脚本"""
-    progress_bar = st.progress(0)
-    status_text = st.empty()
-
-    def update_progress(progress: float, message: str = ""):
-        progress_bar.progress(progress)
-        if message:
-            status_text.text(f"{progress}% - {message}")
-        else:
-            status_text.text(f"进度: {progress}%")
-
-    try:
-        with st.spinner("正在生成脚本..."):
-            if not params.video_origin_path:
-                st.error("请先选择视频文件")
-                return
-            
-            # ===================提取键帧===================
-            update_progress(10, "正在提取关键帧...")
-            
-            # 创建临时目录用于存储关键帧
-            keyframes_dir = os.path.join(utils.temp_dir(), "keyframes")
-            video_hash = utils.md5(params.video_origin_path + str(os.path.getmtime(params.video_origin_path)))
-            video_keyframes_dir = os.path.join(keyframes_dir, video_hash)
-            
-            # 检查是否已经提取过关键帧
-            keyframe_files = []
-            if os.path.exists(video_keyframes_dir):
-                # 取已有的关键帧文件
-                for filename in sorted(os.listdir(video_keyframes_dir)):
-                    if filename.endswith('.jpg'):
-                        keyframe_files.append(os.path.join(video_keyframes_dir, filename))
-                
-                if keyframe_files:
-                    logger.info(f"使用已缓存的关键帧: {video_keyframes_dir}")
-                    st.info(f"使用已缓存的关键帧，如需重新提取请删除目录: {video_keyframes_dir}")
-                    update_progress(20, f"使用已缓存关键帧，共 {len(keyframe_files)} 帧")
-            
-            # 如果没有缓存的关键帧，则进行提取
-            if not keyframe_files:
-                try:
-                    # 确保目录存在
-                    os.makedirs(video_keyframes_dir, exist_ok=True)
-                    
-                    # 初始化视频处理器
-                    if config.frames.get("version") == "v2":
-                        processor = video_processor_v2.VideoProcessor(params.video_origin_path)
-                        # 处理视频并提取关键帧
-                        processor.process_video_pipeline(
-                            output_dir=video_keyframes_dir,
-                            skip_seconds=st.session_state.get('skip_seconds'),
-                            threshold=st.session_state.get('threshold')
-                        )
-                    else:
-                        processor = video_processor.VideoProcessor(params.video_origin_path)
-                        # 处理视频并提取关键帧
-                        processor.process_video(
-                            output_dir=video_keyframes_dir,
-                            skip_seconds=0
-                        )
-                    
-                    # 获取所有关键帧文件路径
-                    for filename in sorted(os.listdir(video_keyframes_dir)):
-                        if filename.endswith('.jpg'):
-                            keyframe_files.append(os.path.join(video_keyframes_dir, filename))
-                    
-                    if not keyframe_files:
-                        raise Exception("未提取到任何关键帧")
-                        
-                    update_progress(20, f"关键帧提取完成，共 {len(keyframe_files)} 帧")
-                    
-                except Exception as e:
-                    # 如果提取失败，清理创建的目录
-                    try:
-                        if os.path.exists(video_keyframes_dir):
-                            import shutil
-                            shutil.rmtree(video_keyframes_dir)
-                    except Exception as cleanup_err:
-                        logger.error(f"清理失败的关键帧目录时出错: {cleanup_err}")
-                    
-                    raise Exception(f"关键帧提取失败: {str(e)}")
-
-            # 根据不同的 LLM 提供商处理
-            vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
-            logger.debug(f"Vision LLM 提供商: {vision_llm_provider}")
-            
-            if vision_llm_provider == 'gemini':
-                try:
-                    # ===================初始化视觉分析器===================
-                    update_progress(30, "正在初始化视觉分析器...")
-                    
-                    # 从配置中获取 Gemini 相关配置
-                    vision_api_key = st.session_state.get('vision_gemini_api_key')
-                    vision_model = st.session_state.get('vision_gemini_model_name')
-                    vision_base_url = st.session_state.get('vision_gemini_base_url')
-                    
-                    if not vision_api_key or not vision_model:
-                        raise ValueError("未配置 Gemini API Key 或者 模型，请在基础设置中配置")
-
-                    analyzer = vision_analyzer.VisionAnalyzer(
-                        model_name=vision_model,
-                        api_key=vision_api_key,
-                    )
-
-                    update_progress(40, "正在分析关键帧...")
-
-                    # ===================创建异步事件循环===================
-                    loop = asyncio.new_event_loop()
-                    asyncio.set_event_loop(loop)
-                    
-                    # 执行异步分析
-                    vision_batch_size = st.session_state.get('vision_batch_size') or config.frames.get("vision_batch_size")
-                    results = loop.run_until_complete(
-                        analyzer.analyze_images(
-                            images=keyframe_files,
-                            prompt=config.app.get('vision_analysis_prompt'),
-                            batch_size=vision_batch_size
-                        )
-                    )
-                    loop.close()
-
-                    # ===================处理分析结果===================
-                    update_progress(60, "正在整理分析结果...")
-                    
-                    # 合并所有批次的析结果
-                    frame_analysis = ""
-                    prev_batch_files = None
-
-                    for result in results:
-                        if 'error' in result:
-                            logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
-                            continue
-                        # 获取当前批次的文件列表
-                        batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
-                        logger.debug(f"批次 {result['batch_index']} 处理完成，共 {len(batch_files)} 张图片")
-                        logger.debug(batch_files)
-                        
-                        first_timestamp, last_timestamp, _ = get_batch_timestamps(batch_files, prev_batch_files)
-                        logger.debug(f"处理时间戳: {first_timestamp}-{last_timestamp}")
-                        
-                        # 添加带时间戳的分析结果
-                        frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
-                        frame_analysis += result['response']
-                        frame_analysis += "\n"
-                        
-                        # 更新上一个批次的文件
-                        prev_batch_files = batch_files
-                    
-                    if not frame_analysis.strip():
-                        raise Exception("未能生成有效的帧分析结果")
-                    
-                    # 保存分析结果
-                    analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt")
-                    with open(analysis_path, 'w', encoding='utf-8') as f:
-                        f.write(frame_analysis)
-                    
-                    update_progress(70, "正在生成脚本...")
-
-                    # 从配置中获取文本生成相关配置
-                    text_provider = config.app.get('text_llm_provider', 'gemini').lower()
-                    text_api_key = config.app.get(f'text_{text_provider}_api_key')
-                    text_model = config.app.get(f'text_{text_provider}_model_name')
-                    text_base_url = config.app.get(f'text_{text_provider}_base_url')
-                    
-                    # 构建帧内容列表
-                    frame_content_list = []
-                    prev_batch_files = None
-
-                    for i, result in enumerate(results):
-                        if 'error' in result:
-                            continue
-                        
-                        batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
-                        _, _, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
-                        
-                        frame_content = {
-                            "timestamp": timestamp_range,
-                            "picture": result['response'],
-                            "narration": "",
-                            "OST": 2
-                        }
-                        frame_content_list.append(frame_content)
-                        
-                        logger.debug(f"添加帧内容: 时间范围={timestamp_range}, 分析结果长度={len(result['response'])}")
-                        
-                        # 更新上一个批次的文件
-                        prev_batch_files = batch_files
-                    
-                    if not frame_content_list:
-                        raise Exception("没有有效的帧内容可以处理")
-
-                    # ===================开始生成文案===================
-                    update_progress(80, "正在生成文案...")
-                    # 校验配置
-                    api_params = {
-                        "vision_api_key": vision_api_key,
-                        "vision_model_name": vision_model, 
-                        "vision_base_url": vision_base_url or "",
-                        "text_api_key": text_api_key,
-                        "text_model_name": text_model,
-                        "text_base_url": text_base_url or ""
-                    }
-                    headers = {
-                        'accept': 'application/json',
-                        'Content-Type': 'application/json'
-                    }
-                    session = requests.Session()
-                    retry_strategy = Retry(
-                        total=3,
-                        backoff_factor=1,
-                        status_forcelist=[500, 502, 503, 504]
-                    )
-                    adapter = HTTPAdapter(max_retries=retry_strategy)
-                    session.mount("https://", adapter)
-                    try:
-                        response = session.post(
-                            f"{config.app.get('narrato_api_url')}/video/config",
-                            headers=headers,
-                            json=api_params,
-                            timeout=30,
-                            verify=True
-                        )
-                    except Exception as e:
-                        pass
-                    custom_prompt = st.session_state.get('custom_prompt', '')
-                    processor = ScriptProcessor(
-                        model_name=text_model,
-                        api_key=text_api_key,
-                        prompt=custom_prompt,
-                        base_url=text_base_url or "",
-                        video_theme=st.session_state.get('video_theme', '')
-                    )
-
-                    # 处理帧内容生成脚本
-                    script_result = processor.process_frames(frame_content_list)
-
-                    # <20><>结果转换为JSON字符串
-                    script = json.dumps(script_result, ensure_ascii=False, indent=2)
-                    
-                except Exception as e:
-                    logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}")
-                    raise Exception(f"分析失败: {str(e)}")
-
-            elif vision_llm_provider == 'narratoapi':  # NarratoAPI
-                try:
-                    # 创建临时目录
-                    temp_dir = utils.temp_dir("narrato")
-                    
-                    # 打包关键帧
-                    update_progress(30, "正在打包关键帧...")
-                    zip_path = os.path.join(temp_dir, f"keyframes_{int(time.time())}.zip")
-                    if not file_utils.create_zip(keyframe_files, zip_path):
-                        raise Exception("打包关键帧失败")
-                    
-                    # 获取API配置
-                    api_url = st.session_state.get('narrato_api_url')
-                    api_key = st.session_state.get('narrato_api_key')
-                    
-                    if not api_key:
-                        raise ValueError("未配置 Narrato API Key，请在基础设置中配置")
-                    
-                    # 准<><E58786><EFBFBD>API请求
-                    headers = {
-                        'X-API-Key': api_key,
-                        'accept': 'application/json'
-                    }
-                    
-                    api_params = {
-                        'batch_size': st.session_state.get('narrato_batch_size', 10),
-                        'use_ai': False,
-                        'start_offset': 0,
-                        'vision_model': st.session_state.get('narrato_vision_model', 'gemini-1.5-flash'),
-                        'vision_api_key': st.session_state.get('narrato_vision_key'),
-                        'llm_model': st.session_state.get('narrato_llm_model', 'qwen-plus'),
-                        'llm_api_key': st.session_state.get('narrato_llm_key'),
-                        'custom_prompt': st.session_state.get('custom_prompt', '')
-                    }
-                    
-                    # 发送API请求
-                    logger.info(f"请求NarratoAPI: {api_url}")
-                    update_progress(40, "正在上传文件...")
-                    with open(zip_path, 'rb') as f:
-                        files = {'file': (os.path.basename(zip_path), f, 'application/x-zip-compressed')}
-                        try:
-                            response = requests.post(
-                                f"{api_url}/video/analyze",
-                                headers=headers, 
-                                params=api_params, 
-                                files=files,
-                                timeout=30  # 设置超时时间
-                            )
-                            response.raise_for_status()
-                        except requests.RequestException as e:
-                            logger.error(f"Narrato API 请求失败:\n{traceback.format_exc()}")
-                            raise Exception(f"API请求失败: {str(e)}")
-                    
-                    task_data = response.json()
-                    task_id = task_data["data"].get('task_id')
-                    if not task_id:
-                        raise Exception(f"无效的API响应: {response.text}")
-                    
-                    # 轮询任务状态
-                    update_progress(50, "正在等待分析结果...")
-                    retry_count = 0
-                    max_retries = 60  # 最多等待2分钟
-                    
-                    while retry_count < max_retries:
-                        try:
-                            status_response = requests.get(
-                                f"{api_url}/video/tasks/{task_id}",
-                                headers=headers,
-                                timeout=10
-                            )
-                            status_response.raise_for_status()
-                            task_status = status_response.json()['data']
-                            
-                            if task_status['status'] == 'SUCCESS':
-                                script = task_status['result']['data']
-                                break
-                            elif task_status['status'] in ['FAILURE', 'RETRY']:
-                                raise Exception(f"任务失败: {task_status.get('error')}")
-                            
-                            retry_count += 1
-                            time.sleep(2)
-                            
-                        except requests.RequestException as e:
-                            logger.warning(f"获取任务状态失败，重试中: {str(e)}")
-                            retry_count += 1
-                            time.sleep(2)
-                            continue
-                    
-                    if retry_count >= max_retries:
-                        raise Exception("任务执行超时")
-                    
-                except Exception as e:
-                    logger.exception(f"NarratoAPI 处理过程中发生错误\n{traceback.format_exc()}")
-                    raise Exception(f"NarratoAPI 处理失败: {str(e)}")
-                finally:
-                    # 清理临时文件
-                    try:
-                        if os.path.exists(zip_path):
-                            os.remove(zip_path)
-                    except Exception as e:
-                        logger.warning(f"清理临时文件失败: {str(e)}")
-
-            else:
-                logger.exception("Vision Model 未启用，请检查配置")
-
-            if script is None:
-                st.error("生成脚本失败，请检查日志")
-                st.stop()
-            logger.info(f"脚本生成完成")
-            if isinstance(script, list):
-                st.session_state['video_clip_json'] = script
-            elif isinstance(script, str):
-                st.session_state['video_clip_json'] = json.loads(script)
-            update_progress(80, "脚本生成完成")
-
-        time.sleep(0.1)
-        progress_bar.progress(100)
-        status_text.text("脚本生成完成！")
-        st.success("视频脚本生成成功！")
-        
-    except Exception as err:
-        st.error(f"生成过程中发生错误: {str(err)}")
-        logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
-    finally:
-        time.sleep(2)
-        progress_bar.empty()
-        status_text.empty()
-
-
 def save_script(tr, video_clip_json_details):
    """保存视频脚本"""
    if not video_clip_json_details:
@ -724,7 +324,7 @@ def crop_video(tr, params):
        utils.cut_video(params, update_progress)
        time.sleep(0.5)
        progress_bar.progress(100)
-        status_text.text("剪辑完成！")
+        status_text.text("剪辑完成！")
        st.success("视频剪辑成功完成！")
    except Exception as e:
        st.error(f"剪辑过程中发生错误: {str(e)}")
@ -732,14 +332,3 @@ def crop_video(tr, params):
        time.sleep(2)
        progress_bar.empty()
        status_text.empty()
-
-
-def get_script_params():
-    """获取脚本参数"""
-    return {
-        'video_language': st.session_state.get('video_language', ''),
-        'video_clip_json_path': st.session_state.get('video_clip_json_path', ''),
-        'video_origin_path': st.session_state.get('video_origin_path', ''),
-        'video_name': st.session_state.get('video_name', ''),
-        'video_plot': st.session_state.get('video_plot', '')
-    }
--- a/webui/components/system_settings.py
+++ b/webui/components/system_settings.py
@ -0,0 +1,45 @@
+import streamlit as st
+import os
+import shutil
+from loguru import logger
+
+from app.utils.utils import storage_dir
+
+
+def clear_directory(dir_path, tr):
+    """Delete every entry inside ``dir_path`` while keeping the directory itself.
+
+    Args:
+        dir_path: Path of the directory whose contents should be removed.
+        tr: Translation callable mapping an i18n key to a display string.
+
+    A failure on a single entry is logged and does not stop the sweep. The
+    success message is shown only when every entry was removed; otherwise the
+    names of the entries that could not be deleted are reported to the user.
+    """
+    if not os.path.exists(dir_path):
+        st.warning(tr("Directory does not exist"))
+        return
+
+    failed_items = []
+    try:
+        for item in os.listdir(dir_path):
+            item_path = os.path.join(dir_path, item)
+            try:
+                # Test islink first: shutil.rmtree refuses to operate on a
+                # symbolic link, and a broken link is neither file nor dir,
+                # so links are always removed with unlink.
+                if os.path.islink(item_path) or os.path.isfile(item_path):
+                    os.unlink(item_path)
+                elif os.path.isdir(item_path):
+                    shutil.rmtree(item_path)
+            except Exception as e:
+                failed_items.append(item)
+                logger.error(f"Failed to delete {item_path}: {e}")
+    except Exception as e:
+        # Listing the directory itself failed (e.g. not a directory, no access).
+        st.error(f"{tr('Failed to clear directory')}: {str(e)}")
+        logger.error(f"Failed to clear directory {dir_path}: {e}")
+        return
+
+    if failed_items:
+        # Do not claim success when some entries are still on disk.
+        st.error(f"{tr('Failed to clear directory')}: {', '.join(failed_items)}")
+        logger.error(
+            f"Failed to clear directory {dir_path}: "
+            f"{len(failed_items)} item(s) could not be deleted"
+        )
+    else:
+        st.success(tr("Directory cleared"))
+        logger.info(f"Cleared directory: {dir_path}")
+
+def render_system_panel(tr):
+    """Render the collapsible system settings panel with its cleanup buttons.
+
+    Args:
+        tr: Translation callable mapping an i18n key to a display string.
+    """
+    # (button label key, sub-path below the storage directory), one per column.
+    cleanup_actions = (
+        ("Clear frames", "temp/keyframes"),
+        ("Clear clip videos", "temp/clip_video"),
+        ("Clear tasks", "tasks"),
+    )
+
+    with st.expander(tr("System settings"), expanded=False):
+        columns = st.columns(3)
+        for column, (label_key, sub_path) in zip(columns, cleanup_actions):
+            with column:
+                clicked = st.button(tr(label_key), use_container_width=True)
+                if clicked:
+                    # Resolve the storage directory only when a click happens.
+                    clear_directory(os.path.join(storage_dir(), sub_path), tr)
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@ -15,7 +15,7 @@
    "Crop Video": "裁剪视频",
    "Video File": "视频文件（:blue[1️⃣支持上传视频文件(限制2G) 2️⃣大文件建议直接导入 ./resource/videos 目录]）",
    "Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])",
-    "Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键<EFBFBD><EFBFBD>】",
+    "Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键词】",
    "Please Enter the Video Subject": "请先填写视频文案",
    "Generating Video Script and Keywords": "AI正在生成视频文案和关键词...",
    "Generating Video Keywords": "AI正在生成视频关键词...",
@ -95,7 +95,7 @@
    "Check Format": "脚本格式检查",
    "Script Loaded Successfully": "脚本加载成功",
    "Script format check passed": "脚本格式检查通过",
-    "Script format check failed": "脚本格式检查失<EFBFBD><EFBFBD>",
+    "Script format check failed": "脚本格式检查失败",
    "Failed to Load Script": "加载脚本失败",
    "Failed to Save Script": "保存脚本失败",
    "Script saved successfully": "脚本保存成功",
@ -103,7 +103,6 @@
    "Video Quality": "视频质量",
    "Custom prompt for LLM, leave empty to use default prompt": "自定义提示词，留空则使用默认提示词",
    "Proxy Settings": "代理设置",
-    "Language": "界面语言",
    "HTTP_PROXY": "HTTP 代理",
    "HTTPs_PROXY": "HTTPS 代理",
    "Vision Model Settings": "视频分析模型设置",
@ -134,6 +133,61 @@
    "Unsupported provider": "不支持的提供商",
    "0: Keep the audio only, 1: Keep the original sound only, 2: Keep the original sound and audio": "0: 仅保留音频，1: 仅保留原声，2: 保留原声和音频",
    "Text model is not available": "文案生成模型不可用",
-    "Text model is available": "文案生成模型可用"
+    "Text model is available": "文案生成模型可用",
+    "Upload Script": "上传脚本",
+    "Upload Script File": "上传脚本文件",
+    "Script Uploaded Successfully": "脚本上传成功",
+    "Invalid JSON format": "无效的JSON格式",
+    "Upload failed": "上传失败",
+    "Video Subtitle Merge": "**合并视频与字幕**",
+    "Upload Video and Subtitle Files": "上传视频和字幕文件",
+    "Matched File Pairs": "已匹配的文件对",
+    "Merge All Files": "合并所有文件",
+    "Merge Function Not Implemented": "合并功能待实现",
+    "No Matched Pairs Found": "未找到匹配的文件对",
+    "Missing Subtitle": "缺少对应的字幕文件",
+    "Missing Video": "缺少对应的视频文件",
+    "All Uploaded Files": "所有上传的文件",
+    "Order": "排序序号",
+    "Reorder": "重新排序",
+    "Merging files...": "正在合并文件...",
+    "Merge completed!": "合并完成！",
+    "Download Merged Video": "下载合并后的视频",
+    "Download Merged Subtitle": "下载合并后的字幕",
+    "Error during merge": "合并过程中出错",
+    "Failed to generate merged video.": "生成合并视频失败。",
+    "Failed to generate merged subtitle.": "生成合并字幕失败。",
+    "Error reading merged video file": "读取合并后的视频文件时出错",
+    "Error reading merged subtitle file": "读取合并后的字幕文件时出错",
+    "Error processing video files. Please check if the videos are valid MP4 files.": "处理视频文件时出错。请检查视频是否为有效的MP4文件。",
+    "Error processing subtitle files. Please check if the subtitles are valid SRT files.": "处理字幕文件时出错。请检查字幕是否为有效的SRT文件。",
+    "Preview Merged Video": "预览合并后的视频",
+    "Video Path": "视频路径",
+    "Subtitle Path": "字幕路径",
+    "Enable Proxy": "启用代理",
+    "QwenVL model is available": "QwenVL 模型可用",
+    "QwenVL model is not available": "QwenVL 模型不可用",
+    "System settings": "系统设置",
+    "Clear Cache": "清理缓存",
+    "Cache cleared": "缓存清理完成",
+    "storage directory does not exist": "storage目录不存在",
+    "Failed to clear cache": "清理缓存失败",
+    "Clear frames": "清理关键帧",
+    "Clear clip videos": "清理裁剪视频",
+    "Clear tasks": "清理任务",
+    "Directory cleared": "目录清理完成",
+    "Directory does not exist": "目录不存在",
+    "Failed to clear directory": "清理目录失败",
+    "Subtitle Preview": "字幕预览",
+    "One-Click Transcribe": "一键转录",
+    "Transcribing...": "正在转录中...",
+    "Transcription Complete!": "转录完成！",
+    "Transcription Failed. Please try again.": "转录失败，请重试。",
+    "API rate limit exceeded. Please wait about an hour and try again.": "API 调用次数已达到限制，请等待约一小时后再试。",
+    "Resources exhausted. Please try again later.": "资源已耗尽，请稍后再试。",
+    "Transcription Failed": "转录失败",
+    "Mergeable Files": "可合并文件数",
+    "Subtitle Content": "字幕内容",
+    "Merge Result Preview": "合并结果预览"
  }
 }
--- a/webui/tools/base.py
+++ b/webui/tools/base.py
@ -0,0 +1,141 @@
+import os
+import streamlit as st
+from loguru import logger
+
+from app.utils import gemini_analyzer, qwenvl_analyzer
+
+
+def create_vision_analyzer(provider, api_key, model, base_url):
+    """
+    Build the vision analyzer that matches the requested provider.
+
+    Args:
+        provider: Provider name, either 'gemini' or 'qwenvl'.
+        api_key: API key handed to the analyzer.
+        model: Model name handed to the analyzer.
+        base_url: API base URL; only forwarded to the 'qwenvl' analyzer.
+
+    Returns:
+        A gemini_analyzer.VisionAnalyzer or qwenvl_analyzer.QwenAnalyzer instance.
+
+    Raises:
+        ValueError: If the provider is neither 'gemini' nor 'qwenvl'.
+    """
+    if provider == 'gemini':
+        return gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key)
+
+    if provider == 'qwenvl':
+        # Only the parameters the Qwen analyzer needs are passed along.
+        qwen_options = {
+            "model_name": model,
+            "api_key": api_key,
+            "base_url": base_url,
+        }
+        return qwenvl_analyzer.QwenAnalyzer(**qwen_options)
+
+    raise ValueError(f"不支持的视觉分析提供商: {provider}")
+
+
+def get_script_params():
+    """Collect the script-related parameters stored in the Streamlit session.
+
+    Returns:
+        dict: One entry per parameter name; a parameter that is missing from
+        ``st.session_state`` maps to an empty string.
+    """
+    param_names = (
+        'video_language',
+        'video_clip_json_path',
+        'video_origin_path',
+        'video_name',
+        'video_plot',
+    )
+    session = st.session_state
+    return {name: session.get(name, '') for name in param_names}
+
+
+def get_batch_timestamps(batch_files, prev_batch_files=None):
+    """
+    Parse the timestamp range covered by one batch of keyframe files.
+
+    File names are expected to look like ``keyframe_001253_000050100.jpg``,
+    where the third underscore-separated field is HHMMSSmmm
+    (here 00:00:50,100, i.e. 50 seconds and 100 milliseconds).
+
+    Args:
+        batch_files: File paths of the current batch.
+        prev_batch_files: File paths of the previous batch. When the current
+            batch holds a single image, the last frame of the previous batch
+            is used as the start of the range.
+
+    Returns:
+        tuple: (first_timestamp, last_timestamp, timestamp_range). Each
+        timestamp is formatted as HH:MM:SS,mmm and timestamp_range is
+        "first_timestamp-last_timestamp". A name that cannot be parsed falls
+        back to "00:00:00,000".
+    """
+    zero_timestamp = "00:00:00,000"
+
+    if not batch_files:
+        logger.warning("Empty batch files")
+        return zero_timestamp, zero_timestamp, f"{zero_timestamp}-{zero_timestamp}"
+
+    def get_frame_files():
+        """Return the base names of the first and last frame of the range."""
+        if len(batch_files) == 1 and prev_batch_files:
+            # Single-image batch: start the range at the previous batch's
+            # last frame so the range is not zero-length.
+            first = os.path.basename(prev_batch_files[-1])
+            last = os.path.basename(batch_files[0])
+            logger.debug(f"单张图片批次,使用上一批次最后一帧作为首帧: {first}")
+        else:
+            first = os.path.basename(batch_files[0])
+            last = os.path.basename(batch_files[-1])
+        return first, last
+
+    def extract_time(filename):
+        """Return the 9-digit HHMMSSmmm field taken from a frame file name."""
+        try:
+            # Strip whatever extension the file has, not only ".jpg", so the
+            # extension can never leak into the millisecond digits.
+            stem = os.path.splitext(filename)[0]
+            time_str = stem.split('_')[2]
+            if len(time_str) < 9:  # older names carry no millisecond digits
+                time_str = time_str.ljust(9, '0')
+            return time_str
+        except (IndexError, AttributeError) as e:
+            logger.warning(f"Invalid filename format: {filename}, error: {e}")
+            return "000000000"
+
+    def format_timestamp(time_str):
+        """
+        Convert an HHMMSSmmm digit string into HH:MM:SS,mmm.
+
+        Args:
+            time_str: Digit string such as 000043039 (00h 00m 43s 039ms).
+
+        Returns:
+            str: The formatted timestamp, or "00:00:00,000" when the input is
+            too short or not numeric.
+        """
+        try:
+            if len(time_str) < 9:
+                logger.warning(f"Invalid timestamp format: {time_str}")
+                return zero_timestamp
+
+            hours = int(time_str[0:2])
+            minutes = int(time_str[2:4])
+            seconds = int(time_str[4:6])
+            # Bound the slice so the millisecond field is exactly three digits.
+            milliseconds = int(time_str[6:9])
+
+            return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+
+        except ValueError as e:
+            logger.warning(f"时间戳格式转换失败: {time_str}, error: {e}")
+            return zero_timestamp
+
+    first_frame, last_frame = get_frame_files()
+
+    first_timestamp = format_timestamp(extract_time(first_frame))
+    last_timestamp = format_timestamp(extract_time(last_frame))
+    timestamp_range = f"{first_timestamp}-{last_timestamp}"
+
+    return first_timestamp, last_timestamp, timestamp_range
+
+
+def get_batch_files(keyframe_files, result, batch_size=5):
+    """
+    Return the keyframe files that belong to the batch described by ``result``.
+
+    Args:
+        keyframe_files: All keyframe file paths, in batch order.
+        result: Batch result dict; its 'batch_index' entry selects the batch.
+        batch_size: Number of files per batch.
+
+    Returns:
+        list: The slice of ``keyframe_files`` for that batch; the final batch
+        may hold fewer than ``batch_size`` files.
+    """
+    first = result['batch_index'] * batch_size
+    # Slicing already clamps the upper bound to the length of the list.
+    return keyframe_files[first:first + batch_size]
--- a/webui/tools/generate_script_docu.py
+++ b/webui/tools/generate_script_docu.py
@ -0,0 +1,293 @@
+# 纪录片脚本生成
+import os
+import json
+import time
+import asyncio
+import traceback
+import requests
+import streamlit as st
+from loguru import logger
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from app.config import config
+from app.utils.script_generator import ScriptProcessor
+from app.utils import utils, video_processor, video_processor_v2, qwenvl_analyzer
+from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps
+
+
+def _list_keyframe_files(directory):
+    """Return the sorted full paths of the ``.jpg`` files directly inside *directory*."""
+    return [
+        os.path.join(directory, filename)
+        for filename in sorted(os.listdir(directory))
+        if filename.endswith('.jpg')
+    ]
+
+
+def generate_script_docu(tr, params):
+    """
+    Generate a documentary-style video script and store it in the session.
+
+    Steps: extract keyframes from the video (or reuse the ones cached for the
+    same path and modification time), analyze them in batches with the
+    configured vision provider, label each batch description with its
+    timestamp range, and pass the result to ``ScriptProcessor.process_frames``.
+    The generated script is written to ``st.session_state['video_clip_json']``.
+
+    Args:
+        tr: Not referenced by this function; kept so existing callers keep
+            working.
+        params: Object whose ``video_origin_path`` attribute is the path of
+            the source video.
+
+    Returns:
+        None. Failures are shown with ``st.error`` and logged; they are not
+        propagated to the caller.
+    """
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    def update_progress(progress: float, message: str = ""):
+        progress_bar.progress(progress)
+        if message:
+            status_text.text(f"{progress}% - {message}")
+        else:
+            status_text.text(f"进度: {progress}%")
+
+    try:
+        with st.spinner("正在生成脚本..."):
+            if not params.video_origin_path:
+                st.error("请先选择视频文件")
+                return
+
+            # =================== Keyframe extraction ===================
+            update_progress(10, "正在提取关键帧...")
+
+            # Cache directory keyed by the video path plus its modification time
+            keyframes_dir = os.path.join(utils.temp_dir(), "keyframes")
+            video_hash = utils.md5(params.video_origin_path + str(os.path.getmtime(params.video_origin_path)))
+            video_keyframes_dir = os.path.join(keyframes_dir, video_hash)
+
+            # Reuse previously extracted keyframes when present
+            keyframe_files = []
+            if os.path.exists(video_keyframes_dir):
+                keyframe_files = _list_keyframe_files(video_keyframes_dir)
+
+                if keyframe_files:
+                    logger.info(f"使用已缓存的关键帧: {video_keyframes_dir}")
+                    st.info(f"使用已缓存的关键帧，如需重新提取请删除目录: {video_keyframes_dir}")
+                    update_progress(20, f"使用已缓存关键帧，共 {len(keyframe_files)} 帧")
+
+            # Nothing cached: extract now
+            if not keyframe_files:
+                try:
+                    os.makedirs(video_keyframes_dir, exist_ok=True)
+
+                    if config.frames.get("version") == "v2":
+                        processor = video_processor_v2.VideoProcessor(params.video_origin_path)
+                        processor.process_video_pipeline(
+                            output_dir=video_keyframes_dir,
+                            skip_seconds=st.session_state.get('skip_seconds'),
+                            threshold=st.session_state.get('threshold')
+                        )
+                    else:
+                        processor = video_processor.VideoProcessor(params.video_origin_path)
+                        processor.process_video(
+                            output_dir=video_keyframes_dir,
+                            skip_seconds=0
+                        )
+
+                    keyframe_files = _list_keyframe_files(video_keyframes_dir)
+
+                    if not keyframe_files:
+                        raise Exception("未提取到任何关键帧")
+
+                    update_progress(20, f"关键帧提取完成，共 {len(keyframe_files)} 帧")
+
+                except Exception as e:
+                    # Remove the directory so a failed run is not mistaken
+                    # for a valid cache on the next attempt.
+                    try:
+                        if os.path.exists(video_keyframes_dir):
+                            import shutil
+                            shutil.rmtree(video_keyframes_dir)
+                    except Exception as cleanup_err:
+                        logger.error(f"清理失败的关键帧目录时出错: {cleanup_err}")
+
+                    raise Exception(f"关键帧提取失败: {str(e)}") from e
+
+            # A missing provider becomes "" so it is reported as an
+            # unsupported provider instead of failing on None.lower().
+            vision_llm_provider = (st.session_state.get('vision_llm_providers') or "").lower()
+            logger.debug(f"Vision LLM 提供商: {vision_llm_provider}")
+
+            try:
+                # =================== Vision analyzer setup ===================
+                update_progress(30, "正在初始化视觉分析器...")
+
+                if vision_llm_provider == 'gemini':
+                    vision_api_key = st.session_state.get('vision_gemini_api_key')
+                    vision_model = st.session_state.get('vision_gemini_model_name')
+                    vision_base_url = st.session_state.get('vision_gemini_base_url')
+                elif vision_llm_provider == 'qwenvl':
+                    vision_api_key = st.session_state.get('vision_qwenvl_api_key')
+                    vision_model = st.session_state.get('vision_qwenvl_model_name', 'qwen-vl-max-latest')
+                    vision_base_url = st.session_state.get('vision_qwenvl_base_url',
+                                                           'https://dashscope.aliyuncs.com/compatible-mode/v1')
+                else:
+                    raise ValueError(f"不支持的视觉分析提供商: {vision_llm_provider}")
+
+                analyzer = create_vision_analyzer(
+                    provider=vision_llm_provider,
+                    api_key=vision_api_key,
+                    model=vision_model,
+                    base_url=vision_base_url
+                )
+
+                update_progress(40, "正在分析关键帧...")
+
+                # =================== Run the async analysis ===================
+                vision_batch_size = st.session_state.get('vision_batch_size') or config.frames.get("vision_batch_size")
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+                try:
+                    results = loop.run_until_complete(
+                        analyzer.analyze_images(
+                            images=keyframe_files,
+                            prompt=config.app.get('vision_analysis_prompt'),
+                            batch_size=vision_batch_size
+                        )
+                    )
+                finally:
+                    # Close the loop even when the analysis raises
+                    loop.close()
+
+                # =================== Collect the analysis results ===================
+                update_progress(60, "正在整理分析结果...")
+
+                analysis_parts = []
+                frame_content_list = []
+                prev_batch_files = None
+
+                for result in results:
+                    if 'error' in result:
+                        logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
+                        # Failed batches are skipped for both the text dump
+                        # and the frame list, so 'response' is never read
+                        # from an error entry.
+                        continue
+
+                    batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
+                    logger.debug(f"批次 {result['batch_index']} 处理完成，共 {len(batch_files)} 张图片")
+
+                    first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(
+                        batch_files, prev_batch_files
+                    )
+                    logger.debug(f"处理时间戳: {first_timestamp}-{last_timestamp}")
+
+                    # Timestamp-labelled section for the plain-text dump
+                    analysis_parts.append(
+                        f"\n=== {first_timestamp}-{last_timestamp} ===\n{result['response']}\n"
+                    )
+
+                    # Structured entry consumed by the script processor
+                    frame_content_list.append({
+                        "timestamp": timestamp_range,
+                        "picture": result['response'],
+                        "narration": "",
+                        "OST": 2
+                    })
+                    logger.debug(f"添加帧内容: 时间范围={timestamp_range}, 分析结果长度={len(result['response'])}")
+
+                    prev_batch_files = batch_files
+
+                frame_analysis = "".join(analysis_parts)
+                if not frame_analysis.strip():
+                    raise Exception("未能生成有效的帧分析结果")
+
+                # Keep a copy of the raw analysis in the temp directory
+                analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt")
+                with open(analysis_path, 'w', encoding='utf-8') as f:
+                    f.write(frame_analysis)
+
+                update_progress(70, "正在生成脚本...")
+
+                # Text-generation settings come from the app configuration
+                text_provider = config.app.get('text_llm_provider', 'gemini').lower()
+                text_api_key = config.app.get(f'text_{text_provider}_api_key')
+                text_model = config.app.get(f'text_{text_provider}_model_name')
+                text_base_url = config.app.get(f'text_{text_provider}_base_url')
+
+                # =================== Generate the narration ===================
+                update_progress(80, "正在生成文案...")
+
+                # NOTE(review): this request sends the vision and text API keys
+                # to the configured narrato_api_url - confirm that is intended.
+                api_params = {
+                    "vision_api_key": vision_api_key,
+                    "vision_model_name": vision_model,
+                    "vision_base_url": vision_base_url or "",
+                    "text_api_key": text_api_key,
+                    "text_model_name": text_model,
+                    "text_base_url": text_base_url or ""
+                }
+                headers = {
+                    'accept': 'application/json',
+                    'Content-Type': 'application/json'
+                }
+                retry_strategy = Retry(
+                    total=3,
+                    backoff_factor=1,
+                    status_forcelist=[500, 502, 503, 504]
+                )
+                with requests.Session() as session:
+                    session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
+                    try:
+                        session.post(
+                            f"{config.app.get('narrato_api_url')}/video/config",
+                            headers=headers,
+                            json=api_params,
+                            timeout=30,
+                            verify=True
+                        )
+                    except Exception as e:
+                        # Best effort: a failure here must not abort script
+                        # generation, but it should be visible in the log.
+                        logger.warning(f"Failed to submit configuration to narrato API: {e}")
+
+                custom_prompt = st.session_state.get('custom_prompt', '')
+                processor = ScriptProcessor(
+                    model_name=text_model,
+                    api_key=text_api_key,
+                    prompt=custom_prompt,
+                    base_url=text_base_url or "",
+                    video_theme=st.session_state.get('video_theme', '')
+                )
+
+                script_result = processor.process_frames(frame_content_list)
+
+                # Serialize inside the guarded section so a non-serializable
+                # result is reported like any other analysis failure.
+                script = json.dumps(script_result, ensure_ascii=False, indent=2)
+
+            except Exception as e:
+                logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}")
+                raise Exception(f"分析失败: {str(e)}") from e
+
+            # json.dumps always returns a string, so test the processor's
+            # result itself to detect a failed generation.
+            if script_result is None:
+                st.error("生成脚本失败，请检查日志")
+                st.stop()
+            logger.info("脚本生成完成")
+            st.session_state['video_clip_json'] = json.loads(script)
+            update_progress(80, "脚本生成完成")
+
+        time.sleep(0.1)
+        progress_bar.progress(100)
+        status_text.text("脚本生成完成！")
+        st.success("视频脚本生成成功！")
+
+    except Exception as err:
+        st.error(f"生成过程中发生错误: {str(err)}")
+        logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
+    finally:
+        # Leave the final status visible briefly before clearing the widgets
+        time.sleep(2)
+        progress_bar.empty()
+        status_text.empty()
--- a/webui/utils/merge_video.py
+++ b/webui/utils/merge_video.py
@ -0,0 +1,115 @@
+"""
+合并视频和字幕文件
+"""
+from moviepy.editor import VideoFileClip, concatenate_videoclips
+import pysrt
+import os
+
+
+def get_video_duration(video_path):
+    """Return the duration, in seconds, of the video at *video_path*."""
+    clip = VideoFileClip(video_path)
+    seconds = clip.duration
+    # Close the clip explicitly once the duration has been read
+    clip.close()
+    return seconds
+
+
+def adjust_subtitle_timing(subtitle_path, time_offset):
+    """
+    Load an SRT file and shift every cue by *time_offset* seconds.
+
+    Args:
+        subtitle_path: Path of the subtitle file to read.
+        time_offset: Offset in seconds (may be fractional) applied to the
+            start and end time of each cue.
+
+    Returns:
+        The ``pysrt`` subtitle object with shifted timings. Nothing is
+        written back to disk.
+    """
+    subs = pysrt.open(subtitle_path)
+
+    # Convert the offset once to whole milliseconds and let pysrt apply it.
+    # Rounding, instead of truncating separate hour/minute/second/millisecond
+    # parts, avoids losing a millisecond to float noise and also yields the
+    # correct shift for negative offsets.
+    subs.shift(milliseconds=round(time_offset * 1000))
+
+    return subs
+
+
+def merge_videos_and_subtitles(video_paths, subtitle_paths, output_video_path, output_subtitle_path):
+    """
+    Concatenate videos and merge their subtitle files into one timeline.
+
+    Each subtitle file after the first is shifted by the total duration of
+    the videos that precede it. The merged video is only rendered when
+    ``output_video_path`` does not exist yet; the merged subtitle file is
+    always written.
+
+    Args:
+        video_paths: Video file paths, in playback order.
+        subtitle_paths: Subtitle file paths, one per video, in the same order.
+        output_video_path: Destination of the merged video.
+        output_subtitle_path: Destination of the merged subtitle file.
+
+    Raises:
+        ValueError: If the two path lists differ in length.
+    """
+    if len(video_paths) != len(subtitle_paths):
+        raise ValueError("视频文件数量与字幕文件数量不匹配")
+
+    video_clips = []
+    final_video = None
+    accumulated_duration = 0
+    merged_subs = pysrt.SubRipFile()
+
+    try:
+        for i, (video_path, subtitle_path) in enumerate(zip(video_paths, subtitle_paths)):
+            print(f"处理视频 {i + 1}/{len(video_paths)}: {video_path}")
+            video_clip = VideoFileClip(video_path)
+            video_clips.append(video_clip)
+
+            print(f"处理字幕 {i + 1}/{len(subtitle_paths)}: {subtitle_path}")
+            if i == 0:
+                # The first file already starts at time zero
+                current_subs = pysrt.open(subtitle_path)
+            else:
+                # Later files are moved behind everything merged so far
+                current_subs = adjust_subtitle_timing(subtitle_path, accumulated_duration)
+
+            merged_subs.extend(current_subs)
+            accumulated_duration += video_clip.duration
+
+        # Do not render again when the merged video already exists
+        if not os.path.exists(output_video_path):
+            print("合并视频中...")
+            final_video = concatenate_videoclips(video_clips)
+
+            print("保存合并后的视频...")
+            final_video.write_videofile(output_video_path, audio_codec='aac')
+
+        # Every source file numbers its cues from its own start, so renumber
+        # the merged list to avoid repeated cue indexes in the output.
+        merged_subs.clean_indexes()
+
+        print("保存合并后的字幕...")
+        merged_subs.save(output_subtitle_path, encoding='utf-8')
+
+        print("合并完成")
+
+    finally:
+        # Release the concatenated clip first, then the source clips
+        if final_video is not None:
+            final_video.close()
+        for clip in video_clips:
+            clip.close()
+
+
+def main():
+    """Example run: merge temp/1..5 (.mp4 with matching .srt) into one video and one subtitle file."""
+    clip_numbers = range(1, 6)
+    video_paths = [f"temp/{number}.mp4" for number in clip_numbers]
+    subtitle_paths = [f"temp/{number}.srt" for number in clip_numbers]
+
+    merge_videos_and_subtitles(
+        video_paths,
+        subtitle_paths,
+        "temp/merged_video.mp4",
+        "temp/merged_subtitle.srt",
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/webui/utils/vision_analyzer.py
+++ b/webui/utils/vision_analyzer.py
@ -0,0 +1,100 @@
+import logging
+from typing import List, Dict, Any, Optional
+from app.utils import gemini_analyzer, qwenvl_analyzer
+
+logger = logging.getLogger(__name__)
+
+class VisionAnalyzer:
+    """Wrapper that holds one provider-specific analyzer and forwards image analysis to it."""
+
+    def __init__(self):
+        # Every field stays unset until one of the initialize_* methods runs.
+        self.provider = self.api_key = self.model = self.base_url = None
+        self.analyzer = None
+
+    def initialize_gemini(self, api_key: str, model: str, base_url: str) -> None:
+        """
+        Configure this instance to use the Gemini analyzer.
+
+        Args:
+            api_key: Gemini API key.
+            model: Model name.
+            base_url: API base URL.
+        """
+        self.provider, self.api_key = 'gemini', api_key
+        self.model, self.base_url = model, base_url
+        # NOTE(review): base_url is stored but not passed to the underlying
+        # analyzer - confirm that is intended.
+        self.analyzer = gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key)
+
+    def initialize_qwenvl(self, api_key: str, model: str, base_url: str) -> None:
+        """
+        Configure this instance to use the QwenVL analyzer.
+
+        Args:
+            api_key: Alibaba Cloud API key.
+            model: Model name.
+            base_url: API base URL.
+        """
+        self.provider, self.api_key = 'qwenvl', api_key
+        self.model, self.base_url = model, base_url
+        # NOTE(review): base_url is stored but not passed to the underlying
+        # analyzer - confirm that is intended.
+        self.analyzer = qwenvl_analyzer.QwenAnalyzer(model_name=model, api_key=api_key)
+
+    async def analyze_images(self, images: List[str], prompt: str, batch_size: int = 5) -> Dict[str, Any]:
+        """
+        Analyze images by delegating to the configured analyzer.
+
+        Args:
+            images: List of image paths.
+            prompt: Analysis prompt.
+            batch_size: Number of images per batch; defaults to 5.
+
+        Returns:
+            Whatever the underlying analyzer's ``analyze_images`` returns.
+            NOTE(review): annotated as a dict - verify against the
+            analyzers' actual return type.
+
+        Raises:
+            ValueError: If no analyzer has been initialized.
+        """
+        backend = self.analyzer
+        if backend:
+            return await backend.analyze_images(
+                images=images,
+                prompt=prompt,
+                batch_size=batch_size
+            )
+        raise ValueError("未初始化视觉分析器")
+
+def create_vision_analyzer(provider: str, **kwargs) -> VisionAnalyzer:
+    """
+    Build a VisionAnalyzer configured for the given provider.
+
+    Args:
+        provider: Provider name, matched case-insensitively against
+            'gemini' and 'qwenvl'.
+        **kwargs: Provider settings; ``api_key``, ``model`` and ``base_url``
+            are read, and any that are missing are passed as None.
+
+    Returns:
+        VisionAnalyzer: The configured analyzer instance.
+
+    Raises:
+        ValueError: If the provider is not supported.
+    """
+    analyzer = VisionAnalyzer()
+    provider_key = provider.lower()
+
+    # Choose the initializer first; both take the same three settings.
+    if provider_key == 'gemini':
+        initialize = analyzer.initialize_gemini
+    elif provider_key == 'qwenvl':
+        initialize = analyzer.initialize_qwenvl
+    else:
+        raise ValueError(f"不支持的视觉分析提供商: {provider}")
+
+    initialize(
+        api_key=kwargs.get('api_key'),
+        model=kwargs.get('model'),
+        base_url=kwargs.get('base_url')
+    )
+    return analyzer