From 8eb961bc0ef4a4b3f1169a42d1d4cdb6ffefdd7d Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Mon, 18 Nov 2024 16:30:00 +0800
Subject: [PATCH 1/9] =?UTF-8?q?feat(app):=20=E6=96=B0=E5=A2=9E=E8=84=9A?=
 =?UTF-8?q?=E6=9C=AC=E7=94=9F=E6=88=90=20V2=20=E6=8E=A5=E5=8F=A3=E5=B9=B6?=
 =?UTF-8?q?=E9=87=8D=E6=9E=84=E7=9B=B8=E5=85=B3=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增 V2脚本生成接口和相关服务
- 重构脚本生成逻辑，提高可维护性和可扩展性
- 优化关键帧提取和处理流程
- 改进错误处理和日志记录
---
 app/controllers/v1/video.py    | 210 +++++++++---------
 app/controllers/v2/base.py     |  11 +
 app/controllers/v2/script.py   |  45 ++++
 app/models/schema_v2.py        |  15 ++
 app/router.py                  |   4 +
 app/services/script_service.py | 378 +++++++++++++++++++++++++++++++++
 webui/i18n/zh.json             |   1 -
 7 files changed, 558 insertions(+), 106 deletions(-)
 create mode 100644 app/controllers/v2/base.py
 create mode 100644 app/controllers/v2/script.py
 create mode 100644 app/models/schema_v2.py
 create mode 100644 app/services/script_service.py

diff --git a/app/controllers/v1/video.py b/app/controllers/v1/video.py
index 0430707..336084f 100644
--- a/app/controllers/v1/video.py
+++ b/app/controllers/v1/video.py
@@ -163,109 +163,109 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID
     )
 
 
-@router.get(
-    "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
-)
-def get_bgm_list(request: Request):
-    suffix = "*.mp3"
-    song_dir = utils.song_dir()
-    files = glob.glob(os.path.join(song_dir, suffix))
-    bgm_list = []
-    for file in files:
-        bgm_list.append(
-            {
-                "name": os.path.basename(file),
-                "size": os.path.getsize(file),
-                "file": file,
-            }
-        )
-    response = {"files": bgm_list}
-    return utils.get_response(200, response)
+# @router.get(
+#     "/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
+# )
+# def get_bgm_list(request: Request):
+#     suffix = "*.mp3"
+#     song_dir = utils.song_dir()
+#     files = glob.glob(os.path.join(song_dir, suffix))
+#     bgm_list = []
+#     for file in files:
+#         bgm_list.append(
+#             {
+#                 "name": os.path.basename(file),
+#                 "size": os.path.getsize(file),
+#                 "file": file,
+#             }
+#         )
+#     response = {"files": bgm_list}
+#     return utils.get_response(200, response)
+#
 
-
-@router.post(
-    "/musics",
-    response_model=BgmUploadResponse,
-    summary="Upload the BGM file to the songs directory",
-)
-def upload_bgm_file(request: Request, file: UploadFile = File(...)):
-    request_id = base.get_task_id(request)
-    # check file ext
-    if file.filename.endswith("mp3"):
-        song_dir = utils.song_dir()
-        save_path = os.path.join(song_dir, file.filename)
-        # save file
-        with open(save_path, "wb+") as buffer:
-            # If the file already exists, it will be overwritten
-            file.file.seek(0)
-            buffer.write(file.file.read())
-        response = {"file": save_path}
-        return utils.get_response(200, response)
-
-    raise HttpException(
-        "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded"
-    )
-
-
-@router.get("/stream/{file_path:path}")
-async def stream_video(request: Request, file_path: str):
-    tasks_dir = utils.task_dir()
-    video_path = os.path.join(tasks_dir, file_path)
-    range_header = request.headers.get("Range")
-    video_size = os.path.getsize(video_path)
-    start, end = 0, video_size - 1
-
-    length = video_size
-    if range_header:
-        range_ = range_header.split("bytes=")[1]
-        start, end = [int(part) if part else None for part in range_.split("-")]
-        if start is None:
-            start = video_size - end
-            end = video_size - 1
-        if end is None:
-            end = video_size - 1
-        length = end - start + 1
-
-    def file_iterator(file_path, offset=0, bytes_to_read=None):
-        with open(file_path, "rb") as f:
-            f.seek(offset, os.SEEK_SET)
-            remaining = bytes_to_read or video_size
-            while remaining > 0:
-                bytes_to_read = min(4096, remaining)
-                data = f.read(bytes_to_read)
-                if not data:
-                    break
-                remaining -= len(data)
-                yield data
-
-    response = StreamingResponse(
-        file_iterator(video_path, start, length), media_type="video/mp4"
-    )
-    response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}"
-    response.headers["Accept-Ranges"] = "bytes"
-    response.headers["Content-Length"] = str(length)
-    response.status_code = 206  # Partial Content
-
-    return response
-
-
-@router.get("/download/{file_path:path}")
-async def download_video(_: Request, file_path: str):
-    """
-    download video
-    :param _: Request request
-    :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4
-    :return: video file
-    """
-    tasks_dir = utils.task_dir()
-    video_path = os.path.join(tasks_dir, file_path)
-    file_path = pathlib.Path(video_path)
-    filename = file_path.stem
-    extension = file_path.suffix
-    headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"}
-    return FileResponse(
-        path=video_path,
-        headers=headers,
-        filename=f"{filename}{extension}",
-        media_type=f"video/{extension[1:]}",
-    )
+# @router.post(
+#     "/musics",
+#     response_model=BgmUploadResponse,
+#     summary="Upload the BGM file to the songs directory",
+# )
+# def upload_bgm_file(request: Request, file: UploadFile = File(...)):
+#     request_id = base.get_task_id(request)
+#     # check file ext
+#     if file.filename.endswith("mp3"):
+#         song_dir = utils.song_dir()
+#         save_path = os.path.join(song_dir, file.filename)
+#         # save file
+#         with open(save_path, "wb+") as buffer:
+#             # If the file already exists, it will be overwritten
+#             file.file.seek(0)
+#             buffer.write(file.file.read())
+#         response = {"file": save_path}
+#         return utils.get_response(200, response)
+#
+#     raise HttpException(
+#         "", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded"
+#     )
+#
+#
+# @router.get("/stream/{file_path:path}")
+# async def stream_video(request: Request, file_path: str):
+#     tasks_dir = utils.task_dir()
+#     video_path = os.path.join(tasks_dir, file_path)
+#     range_header = request.headers.get("Range")
+#     video_size = os.path.getsize(video_path)
+#     start, end = 0, video_size - 1
+#
+#     length = video_size
+#     if range_header:
+#         range_ = range_header.split("bytes=")[1]
+#         start, end = [int(part) if part else None for part in range_.split("-")]
+#         if start is None:
+#             start = video_size - end
+#             end = video_size - 1
+#         if end is None:
+#             end = video_size - 1
+#         length = end - start + 1
+#
+#     def file_iterator(file_path, offset=0, bytes_to_read=None):
+#         with open(file_path, "rb") as f:
+#             f.seek(offset, os.SEEK_SET)
+#             remaining = bytes_to_read or video_size
+#             while remaining > 0:
+#                 bytes_to_read = min(4096, remaining)
+#                 data = f.read(bytes_to_read)
+#                 if not data:
+#                     break
+#                 remaining -= len(data)
+#                 yield data
+#
+#     response = StreamingResponse(
+#         file_iterator(video_path, start, length), media_type="video/mp4"
+#     )
+#     response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}"
+#     response.headers["Accept-Ranges"] = "bytes"
+#     response.headers["Content-Length"] = str(length)
+#     response.status_code = 206  # Partial Content
+#
+#     return response
+#
+#
+# @router.get("/download/{file_path:path}")
+# async def download_video(_: Request, file_path: str):
+#     """
+#     download video
+#     :param _: Request request
+#     :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4
+#     :return: video file
+#     """
+#     tasks_dir = utils.task_dir()
+#     video_path = os.path.join(tasks_dir, file_path)
+#     file_path = pathlib.Path(video_path)
+#     filename = file_path.stem
+#     extension = file_path.suffix
+#     headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"}
+#     return FileResponse(
+#         path=video_path,
+#         headers=headers,
+#         filename=f"{filename}{extension}",
+#         media_type=f"video/{extension[1:]}",
+#     )
diff --git a/app/controllers/v2/base.py b/app/controllers/v2/base.py
new file mode 100644
index 0000000..4612983
--- /dev/null
+++ b/app/controllers/v2/base.py
@@ -0,0 +1,11 @@
+from fastapi import APIRouter, Depends
+
+
+def v2_router(dependencies=None):
+    router = APIRouter()
+    router.tags = ["V2"]
+    router.prefix = "/api/v2"
+    # 将认证依赖项应用于所有路由
+    if dependencies:
+        router.dependencies = dependencies
+    return router
diff --git a/app/controllers/v2/script.py b/app/controllers/v2/script.py
new file mode 100644
index 0000000..85f4238
--- /dev/null
+++ b/app/controllers/v2/script.py
@@ -0,0 +1,45 @@
+from fastapi import APIRouter, BackgroundTasks
+from loguru import logger
+
+from app.models.schema_v2 import GenerateScriptRequest, GenerateScriptResponse
+from app.services.script_service import ScriptGenerator
+from app.utils import utils
+from app.controllers.v2.base import v2_router
+
+# router = APIRouter(prefix="/api/v2", tags=["Script Generation V2"])
+router = v2_router()
+
+@router.post(
+    "/scripts/generate",
+    response_model=GenerateScriptResponse,
+    summary="生成视频脚本 (V2)"
+)
+async def generate_script(
+    request: GenerateScriptRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    生成视频脚本的V2版本API
+    """
+    task_id = utils.get_uuid()
+    
+    try:
+        generator = ScriptGenerator()
+        script = await generator.generate_script(
+            video_path=request.video_path,
+            video_theme=request.video_theme,
+            custom_prompt=request.custom_prompt,
+            skip_seconds=request.skip_seconds,
+            threshold=request.threshold,
+            vision_batch_size=request.vision_batch_size,
+            vision_llm_provider=request.vision_llm_provider
+        )
+        
+        return {
+            "task_id": task_id,
+            "script": script
+        }
+        
+    except Exception as e:
+        logger.exception(f"Generate script failed: {str(e)}")
+        raise 
\ No newline at end of file
diff --git a/app/models/schema_v2.py b/app/models/schema_v2.py
new file mode 100644
index 0000000..786c018
--- /dev/null
+++ b/app/models/schema_v2.py
@@ -0,0 +1,15 @@
+from typing import Optional, List
+from pydantic import BaseModel
+
+class GenerateScriptRequest(BaseModel):
+    video_path: str
+    video_theme: Optional[str] = ""
+    custom_prompt: Optional[str] = ""
+    skip_seconds: Optional[int] = 0
+    threshold: Optional[int] = 30
+    vision_batch_size: Optional[int] = 5
+    vision_llm_provider: Optional[str] = "gemini"
+    
+class GenerateScriptResponse(BaseModel):
+    task_id: str
+    script: List[dict] 
\ No newline at end of file
diff --git a/app/router.py b/app/router.py
index cf84037..df60500 100644
--- a/app/router.py
+++ b/app/router.py
@@ -10,8 +10,12 @@ Resources:
 from fastapi import APIRouter
 
 from app.controllers.v1 import llm, video
+from app.controllers.v2 import script
 
 root_api_router = APIRouter()
 # v1
 root_api_router.include_router(video.router)
 root_api_router.include_router(llm.router)
+
+# v2
+root_api_router.include_router(script.router)
diff --git a/app/services/script_service.py b/app/services/script_service.py
new file mode 100644
index 0000000..1693cbc
--- /dev/null
+++ b/app/services/script_service.py
@@ -0,0 +1,378 @@
+import os
+import json
+import time
+import asyncio
+import requests
+from loguru import logger
+from typing import List, Dict, Any, Callable
+
+from app.utils import utils, vision_analyzer, video_processor, video_processor_v2
+from app.utils.script_generator import ScriptProcessor
+from app.config import config
+
+
+class ScriptGenerator:
+    def __init__(self):
+        self.temp_dir = utils.temp_dir()
+        self.keyframes_dir = os.path.join(self.temp_dir, "keyframes")
+        
+    async def generate_script(
+        self,
+        video_path: str,
+        video_theme: str = "",
+        custom_prompt: str = "",
+        skip_seconds: int = 0,
+        threshold: int = 30,
+        vision_batch_size: int = 5,
+        vision_llm_provider: str = "gemini",
+        progress_callback: Callable[[float, str], None] = None
+    ) -> List[Dict[Any, Any]]:
+        """
+        生成视频脚本的核心逻辑
+        
+        Args:
+            video_path: 视频文件路径
+            video_theme: 视频主题
+            custom_prompt: 自定义提示词
+            skip_seconds: 跳过开始的秒数
+            threshold: 差异阈值
+            vision_batch_size: 视觉处理批次大小
+            vision_llm_provider: 视觉模型提供商
+            progress_callback: 进度回调函数
+            
+        Returns:
+            List[Dict]: 生成的视频脚本
+        """
+        if progress_callback is None:
+            progress_callback = lambda p, m: None
+            
+        try:
+            # 提取关键帧
+            progress_callback(10, "正在提取关键帧...")
+            keyframe_files = await self._extract_keyframes(
+                video_path, 
+                skip_seconds,
+                threshold
+            )
+            
+            if vision_llm_provider == "gemini":
+                script = await self._process_with_gemini(
+                    keyframe_files,
+                    video_theme,
+                    custom_prompt,
+                    vision_batch_size,
+                    progress_callback
+                )
+            elif vision_llm_provider == "narratoapi":
+                script = await self._process_with_narrato(
+                    keyframe_files,
+                    video_theme,
+                    custom_prompt,
+                    vision_batch_size,
+                    progress_callback
+                )
+            else:
+                raise ValueError(f"Unsupported vision provider: {vision_llm_provider}")
+                
+            return json.loads(script) if isinstance(script, str) else script
+            
+        except Exception as e:
+            logger.exception("Generate script failed")
+            raise
+            
+    async def _extract_keyframes(
+        self,
+        video_path: str,
+        skip_seconds: int,
+        threshold: int
+    ) -> List[str]:
+        """提取视频关键帧"""
+        video_hash = utils.md5(video_path + str(os.path.getmtime(video_path)))
+        video_keyframes_dir = os.path.join(self.keyframes_dir, video_hash)
+        
+        # 检查缓存
+        keyframe_files = []
+        if os.path.exists(video_keyframes_dir):
+            for filename in sorted(os.listdir(video_keyframes_dir)):
+                if filename.endswith('.jpg'):
+                    keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+                    
+            if keyframe_files:
+                logger.info(f"Using cached keyframes: {video_keyframes_dir}")
+                return keyframe_files
+                
+        # 提取新的关键帧
+        os.makedirs(video_keyframes_dir, exist_ok=True)
+        
+        try:
+            if config.frames.get("version") == "v2":
+                processor = video_processor_v2.VideoProcessor(video_path)
+                processor.process_video_pipeline(
+                    output_dir=video_keyframes_dir,
+                    skip_seconds=skip_seconds,
+                    threshold=threshold
+                )
+            else:
+                processor = video_processor.VideoProcessor(video_path)
+                processor.process_video(
+                    output_dir=video_keyframes_dir,
+                    skip_seconds=skip_seconds
+                )
+                
+            for filename in sorted(os.listdir(video_keyframes_dir)):
+                if filename.endswith('.jpg'):
+                    keyframe_files.append(os.path.join(video_keyframes_dir, filename))
+                    
+            return keyframe_files
+            
+        except Exception as e:
+            if os.path.exists(video_keyframes_dir):
+                import shutil
+                shutil.rmtree(video_keyframes_dir)
+            raise
+            
+    async def _process_with_gemini(
+        self,
+        keyframe_files: List[str],
+        video_theme: str,
+        custom_prompt: str,
+        vision_batch_size: int,
+        progress_callback: Callable[[float, str], None]
+    ) -> str:
+        """使用Gemini处理视频帧"""
+        progress_callback(30, "正在初始化视觉分析器...")
+        
+        # 获取Gemini配置
+        vision_api_key = config.app.get("vision_gemini_api_key")
+        vision_model = config.app.get("vision_gemini_model_name")
+        
+        if not vision_api_key or not vision_model:
+            raise ValueError("未配置 Gemini API Key 或者模型")
+
+        analyzer = vision_analyzer.VisionAnalyzer(
+            model_name=vision_model,
+            api_key=vision_api_key,
+        )
+
+        progress_callback(40, "正在分析关键帧...")
+
+        # 执行异步分析
+        results = await analyzer.analyze_images(
+            images=keyframe_files,
+            prompt=config.app.get('vision_analysis_prompt'),
+            batch_size=vision_batch_size
+        )
+
+        progress_callback(60, "正在整理分析结果...")
+        
+        # 合并所有批次的分析结果
+        frame_analysis = ""
+        prev_batch_files = None
+
+        for result in results:
+            if 'error' in result:
+                logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
+                continue
+                
+            batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size)
+            first_timestamp, last_timestamp, _ = self._get_batch_timestamps(batch_files, prev_batch_files)
+            
+            # 添加带时间戳的分析结果
+            frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
+            frame_analysis += result['response']
+            frame_analysis += "\n"
+            
+            prev_batch_files = batch_files
+        
+        if not frame_analysis.strip():
+            raise Exception("未能生成有效的帧分析结果")
+        
+        progress_callback(70, "正在生成脚本...")
+
+        # 构建帧内容列表
+        frame_content_list = []
+        prev_batch_files = None
+
+        for result in results:
+            if 'error' in result:
+                continue
+            
+            batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size)
+            _, _, timestamp_range = self._get_batch_timestamps(batch_files, prev_batch_files)
+            
+            frame_content = {
+                "timestamp": timestamp_range,
+                "picture": result['response'],
+                "narration": "",
+                "OST": 2
+            }
+            frame_content_list.append(frame_content)
+            prev_batch_files = batch_files
+
+        if not frame_content_list:
+            raise Exception("没有有效的帧内容可以处理")
+
+        progress_callback(90, "正在生成文案...")
+        
+        # 获取文本生成配置
+        text_provider = config.app.get('text_llm_provider', 'gemini').lower()
+        text_api_key = config.app.get(f'text_{text_provider}_api_key')
+        text_model = config.app.get(f'text_{text_provider}_model_name')
+
+        processor = ScriptProcessor(
+            model_name=text_model,
+            api_key=text_api_key,
+            prompt=custom_prompt,
+            video_theme=video_theme
+        )
+
+        return processor.process_frames(frame_content_list)
+
+    async def _process_with_narrato(
+        self,
+        keyframe_files: List[str],
+        video_theme: str,
+        custom_prompt: str,
+        vision_batch_size: int,
+        progress_callback: Callable[[float, str], None]
+    ) -> str:
+        """使用NarratoAPI处理视频帧"""
+        # 创建临时目录
+        temp_dir = utils.temp_dir("narrato")
+        
+        # 打包关键帧
+        progress_callback(30, "正在打包关键帧...")
+        zip_path = os.path.join(temp_dir, f"keyframes_{int(time.time())}.zip")
+        
+        try:
+            if not utils.create_zip(keyframe_files, zip_path):
+                raise Exception("打包关键帧失败")
+            
+            # 获取API配置
+            api_url = config.app.get("narrato_api_url")
+            api_key = config.app.get("narrato_api_key")
+            
+            if not api_key:
+                raise ValueError("未配置 Narrato API Key")
+            
+            headers = {
+                'X-API-Key': api_key,
+                'accept': 'application/json'
+            }
+            
+            api_params = {
+                'batch_size': vision_batch_size,
+                'use_ai': False,
+                'start_offset': 0,
+                'vision_model': config.app.get('narrato_vision_model', 'gemini-1.5-flash'),
+                'vision_api_key': config.app.get('narrato_vision_key'),
+                'llm_model': config.app.get('narrato_llm_model', 'qwen-plus'),
+                'llm_api_key': config.app.get('narrato_llm_key'),
+                'custom_prompt': custom_prompt
+            }
+            
+            progress_callback(40, "正在上传文件...")
+            with open(zip_path, 'rb') as f:
+                files = {'file': (os.path.basename(zip_path), f, 'application/x-zip-compressed')}
+                response = requests.post(
+                    f"{api_url}/video/analyze",
+                    headers=headers, 
+                    params=api_params, 
+                    files=files,
+                    timeout=30
+                )
+                response.raise_for_status()
+            
+            task_data = response.json()
+            task_id = task_data["data"].get('task_id')
+            if not task_id:
+                raise Exception(f"无效的API响应: {response.text}")
+            
+            progress_callback(50, "正在等待分析结果...")
+            retry_count = 0
+            max_retries = 60
+            
+            while retry_count < max_retries:
+                try:
+                    status_response = requests.get(
+                        f"{api_url}/video/tasks/{task_id}",
+                        headers=headers,
+                        timeout=10
+                    )
+                    status_response.raise_for_status()
+                    task_status = status_response.json()['data']
+                    
+                    if task_status['status'] == 'SUCCESS':
+                        return task_status['result']['data']
+                    elif task_status['status'] in ['FAILURE', 'RETRY']:
+                        raise Exception(f"任务失败: {task_status.get('error')}")
+                    
+                    retry_count += 1
+                    time.sleep(2)
+                    
+                except requests.RequestException as e:
+                    logger.warning(f"获取任务状态失败，重试中: {str(e)}")
+                    retry_count += 1
+                    time.sleep(2)
+                    continue
+            
+            raise Exception("任务执行超时")
+            
+        finally:
+            # 清理临时文件
+            try:
+                if os.path.exists(zip_path):
+                    os.remove(zip_path)
+            except Exception as e:
+                logger.warning(f"清理临时文件失败: {str(e)}")
+
+    def _get_batch_files(
+        self, 
+        keyframe_files: List[str], 
+        result: Dict[str, Any], 
+        batch_size: int
+    ) -> List[str]:
+        """获取当前批次的图片文件"""
+        batch_start = result['batch_index'] * batch_size
+        batch_end = min(batch_start + batch_size, len(keyframe_files))
+        return keyframe_files[batch_start:batch_end]
+
+    def _get_batch_timestamps(
+        self, 
+        batch_files: List[str], 
+        prev_batch_files: List[str] = None
+    ) -> tuple[str, str, str]:
+        """获取一批文件的时间戳范围"""
+        if not batch_files:
+            logger.warning("Empty batch files")
+            return "00:00", "00:00", "00:00-00:00"
+            
+        if len(batch_files) == 1 and prev_batch_files and len(prev_batch_files) > 0:
+            first_frame = os.path.basename(prev_batch_files[-1])
+            last_frame = os.path.basename(batch_files[0])
+        else:
+            first_frame = os.path.basename(batch_files[0])
+            last_frame = os.path.basename(batch_files[-1])
+        
+        first_time = first_frame.split('_')[2].replace('.jpg', '')
+        last_time = last_frame.split('_')[2].replace('.jpg', '')
+        
+        def format_timestamp(time_str: str) -> str:
+            if len(time_str) < 4:
+                logger.warning(f"Invalid timestamp format: {time_str}")
+                return "00:00"
+                
+            minutes = int(time_str[-4:-2])
+            seconds = int(time_str[-2:])
+            
+            if seconds >= 60:
+                minutes += seconds // 60
+                seconds = seconds % 60
+                
+            return f"{minutes:02d}:{seconds:02d}"
+        
+        first_timestamp = format_timestamp(first_time)
+        last_timestamp = format_timestamp(last_time)
+        timestamp_range = f"{first_timestamp}-{last_timestamp}"
+        
+        return first_timestamp, last_timestamp, timestamp_range
\ No newline at end of file
diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json
index 68b968a..db17ccc 100644
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@@ -103,7 +103,6 @@
     "Video Quality": "视频质量",
     "Custom prompt for LLM, leave empty to use default prompt": "自定义提示词，留空则使用默认提示词",
     "Proxy Settings": "代理设置",
-    "Language": "界面语言",
     "HTTP_PROXY": "HTTP 代理",
     "HTTPs_PROXY": "HTTPS 代理",
     "Vision Model Settings": "视频分析模型设置",

From 45fae0b982dea83092b8685aa2807fec93f263d0 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Mon, 18 Nov 2024 17:38:30 +0800
Subject: [PATCH 2/9] =?UTF-8?q?feat(v2):=20=E6=96=B0=E5=A2=9E=E8=A7=86?=
 =?UTF-8?q?=E9=A2=91=E8=A3=81=E5=89=AA=E5=92=8CYouTube=E8=A7=86=E9=A2=91?=
 =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 在 schema_v2.py 中添加了新的请求和响应模型
- 在 script.py 中实现了 /scripts/crop 和 /youtube/download 接口
- 新增 video_service.py 和 youtube_service.py 服务模块
- 更新 utils.py 中的工具函数以支持新功能
---
 app/controllers/v2/script.py    |  84 +++++++++++++++++++-
 app/models/schema_v2.py         |  31 +++++++-
 app/services/video_service.py   |  58 ++++++++++++++
 app/services/youtube_service.py | 135 ++++++++++++++++++++++++++++++++
 requirements.txt                |   3 +-
 5 files changed, 304 insertions(+), 7 deletions(-)
 create mode 100644 app/services/video_service.py
 create mode 100644 app/services/youtube_service.py

diff --git a/app/controllers/v2/script.py b/app/controllers/v2/script.py
index 85f4238..94a337b 100644
--- a/app/controllers/v2/script.py
+++ b/app/controllers/v2/script.py
@@ -1,18 +1,29 @@
 from fastapi import APIRouter, BackgroundTasks
 from loguru import logger
+import os
 
-from app.models.schema_v2 import GenerateScriptRequest, GenerateScriptResponse
+from app.models.schema_v2 import (
+    GenerateScriptRequest, 
+    GenerateScriptResponse,
+    CropVideoRequest,
+    CropVideoResponse,
+    DownloadVideoRequest,
+    DownloadVideoResponse
+)
 from app.services.script_service import ScriptGenerator
+from app.services.video_service import VideoService
 from app.utils import utils
 from app.controllers.v2.base import v2_router
+from app.models.schema import VideoClipParams
+from app.services.youtube_service import YoutubeService
 
-# router = APIRouter(prefix="/api/v2", tags=["Script Generation V2"])
 router = v2_router()
 
+
 @router.post(
     "/scripts/generate",
     response_model=GenerateScriptResponse,
-    summary="生成视频脚本 (V2)"
+    summary="同步请求；生成视频脚本 (V2)"
 )
 async def generate_script(
     request: GenerateScriptRequest,
@@ -42,4 +53,69 @@ async def generate_script(
         
     except Exception as e:
         logger.exception(f"Generate script failed: {str(e)}")
-        raise 
\ No newline at end of file
+        raise
+
+
+@router.post(
+    "/scripts/crop",
+    response_model=CropVideoResponse,
+    summary="同步请求；裁剪视频 (V2)"
+)
+async def crop_video(
+    request: CropVideoRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    根据脚本裁剪视频的V2版本API
+    """
+    try:
+        # 调用视频裁剪服务
+        video_service = VideoService()
+        task_id, subclip_videos = await video_service.crop_video(
+            video_path=request.video_origin_path,
+            video_script=request.video_script
+        )
+        logger.debug(f"裁剪视频成功，视频片段路径: {subclip_videos}")
+        logger.debug(type(subclip_videos))
+        return {
+            "task_id": task_id,
+            "subclip_videos": subclip_videos
+        }
+        
+    except Exception as e:
+        logger.exception(f"Crop video failed: {str(e)}")
+        raise
+
+
+@router.post(
+    "/youtube/download",
+    response_model=DownloadVideoResponse,
+    summary="同步请求；下载YouTube视频 (V2)"
+)
+async def download_youtube_video(
+    request: DownloadVideoRequest,
+    background_tasks: BackgroundTasks
+):
+    """
+    下载指定分辨率的YouTube视频
+    """
+    try:
+        youtube_service = YoutubeService()
+        task_id, output_path, filename = await youtube_service.download_video(
+            url=request.url,
+            resolution=request.resolution,
+            output_format=request.output_format,
+            rename=request.rename
+        )
+        
+        return {
+            "task_id": task_id,
+            "output_path": output_path,
+            "resolution": request.resolution,
+            "format": request.output_format,
+            "filename": filename
+        }
+        
+    except Exception as e:
+        logger.exception(f"Download YouTube video failed: {str(e)}")
+        raise
diff --git a/app/models/schema_v2.py b/app/models/schema_v2.py
index 786c018..9894d89 100644
--- a/app/models/schema_v2.py
+++ b/app/models/schema_v2.py
@@ -1,6 +1,7 @@
 from typing import Optional, List
 from pydantic import BaseModel
 
+
 class GenerateScriptRequest(BaseModel):
     video_path: str
     video_theme: Optional[str] = ""
@@ -9,7 +10,33 @@ class GenerateScriptRequest(BaseModel):
     threshold: Optional[int] = 30
     vision_batch_size: Optional[int] = 5
     vision_llm_provider: Optional[str] = "gemini"
-    
+
+
 class GenerateScriptResponse(BaseModel):
     task_id: str
-    script: List[dict] 
\ No newline at end of file
+    script: List[dict]
+
+
+class CropVideoRequest(BaseModel):
+    video_origin_path: str
+    video_script: List[dict]
+
+
+class CropVideoResponse(BaseModel):
+    task_id: str
+    subclip_videos: dict
+
+
+class DownloadVideoRequest(BaseModel):
+    url: str
+    resolution: str
+    output_format: Optional[str] = "mp4"
+    rename: Optional[str] = None
+
+
+class DownloadVideoResponse(BaseModel):
+    task_id: str
+    output_path: str
+    resolution: str
+    format: str
+    filename: str
diff --git a/app/services/video_service.py b/app/services/video_service.py
new file mode 100644
index 0000000..2a0a9a6
--- /dev/null
+++ b/app/services/video_service.py
@@ -0,0 +1,58 @@
+import os
+from uuid import uuid4
+from loguru import logger
+from typing import Dict, List, Optional, Tuple
+
+from app.services import material
+from app.models.schema import VideoClipParams
+from app.utils import utils
+
+
+class VideoService:
+    @staticmethod
+    async def crop_video(
+        video_path: str,
+        video_script: List[dict]
+    ) -> Tuple[str, Dict[str, str]]:
+        """
+        Cut the source video into one clip per script scene.
+
+        Args:
+            video_path: path of the source video file
+            video_script: scene dicts with a 'timestamp' key; each gets a 'path' key added in place
+
+        Returns:
+            Tuple[str, Dict[str, str]]: (task_id, clips) - a fresh UUID4 task id and the
+            {timestamp: clip_path} dict from material.clip_videos (ValueError if it is None)
+        """
+        try:
+            task_id = str(uuid4())
+
+            # Collect the timestamp of every scene in the script
+            time_list = [scene['timestamp'] for scene in video_script]
+
+            # Delegate the actual cutting to the material service
+            subclip_videos = material.clip_videos(
+                task_id=task_id,
+                timestamp_terms=time_list,
+                origin_video=video_path
+            )
+
+            if subclip_videos is None:
+                raise ValueError("裁剪视频失败")
+
+            # Best effort: attach each clip path to its scene; a missing clip is only logged
+            for scene in video_script:
+                try:
+                    scene['path'] = subclip_videos[scene['timestamp']]
+                except KeyError as err:
+                    logger.error(f"更新视频路径失败: {err}")
+
+            logger.debug(f"裁剪视频成功，共生成 {len(subclip_videos)} 个视频片段")
+            logger.debug(f"视频片段路径: {subclip_videos}")
+
+            return task_id, subclip_videos
+
+        except Exception:
+            logger.exception("裁剪视频失败")
+            raise
\ No newline at end of file
diff --git a/app/services/youtube_service.py b/app/services/youtube_service.py
new file mode 100644
index 0000000..d478198
--- /dev/null
+++ b/app/services/youtube_service.py
@@ -0,0 +1,135 @@
+import yt_dlp
+import os
+from typing import List, Dict, Optional, Tuple
+from loguru import logger
+from uuid import uuid4
+
+from app.utils import utils
+
+
+class YoutubeService:
+    def __init__(self):
+        self.supported_formats = ['mp4', 'mkv', 'webm', 'flv', 'avi']
+
+    def _get_video_formats(self, url: str) -> List[Dict]:
+        """获取视频可用的格式列表"""
+        ydl_opts = {
+            'quiet': True,
+            'no_warnings': True
+        }
+
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+                formats = info.get('formats', [])
+
+                format_list = []
+                for f in formats:
+                    format_info = {
+                        'format_id': f.get('format_id', 'N/A'),
+                        'ext': f.get('ext', 'N/A'),
+                        'resolution': f.get('format_note', 'N/A'),
+                        'filesize': f.get('filesize', 'N/A'),
+                        'vcodec': f.get('vcodec', 'N/A'),
+                        'acodec': f.get('acodec', 'N/A')
+                    }
+                    format_list.append(format_info)
+
+                return format_list
+        except Exception as e:
+            logger.error(f"获取视频格式失败: {str(e)}")
+            raise
+
+    def _validate_format(self, output_format: str) -> None:
+        """验证输出格式是否支持"""
+        if output_format.lower() not in self.supported_formats:
+            raise ValueError(
+                f"不支持的视频格式: {output_format}。"
+                f"支持的格式: {', '.join(self.supported_formats)}"
+            )
+
+    async def download_video(
+            self,
+            url: str,
+            resolution: str,
+            output_format: str = 'mp4',
+            rename: Optional[str] = None
+    ) -> Tuple[str, str, str]:
+        """
+        下载指定分辨率的视频
+        
+        Args:
+            url: YouTube视频URL
+            resolution: 目标分辨率 ('2160p', '1440p', '1080p', '720p' etc.)
+            output_format: 输出视频格式
+            rename: 可选的重命名
+            
+        Returns:
+            Tuple[str, str, str]: (task_id, output_path, filename)
+        """
+        try:
+            task_id = str(uuid4())
+            self._validate_format(output_format)
+
+            # 获取所有可用格式
+            formats = self._get_video_formats(url)
+
+            # 查找指定分辨率的最佳视频格式
+            target_format = None
+            for fmt in formats:
+                if fmt['resolution'] == resolution and fmt['vcodec'] != 'none':
+                    target_format = fmt
+                    break
+
+            if target_format is None:
+                available_resolutions = set(
+                    fmt['resolution'] for fmt in formats
+                    if fmt['resolution'] != 'N/A' and fmt['vcodec'] != 'none'
+                )
+                raise ValueError(
+                    f"未找到 {resolution} 分辨率的视频。"
+                    f"可用分辨率: {', '.join(sorted(available_resolutions))}"
+                )
+
+            # 创建输出目录
+            output_dir = utils.video_dir()
+            os.makedirs(output_dir, exist_ok=True)
+
+            # 设置下载选项
+            if rename:
+                # 如果指定了重命名，直接使用新名字
+                filename = f"{rename}.{output_format}"
+                output_template = os.path.join(output_dir, filename)
+            else:
+                # 否则使用任务ID和原标题
+                output_template = os.path.join(output_dir, f'{task_id}_%(title)s.%(ext)s')
+
+            ydl_opts = {
+                'format': f"{target_format['format_id']}+bestaudio[ext=m4a]/best",
+                'outtmpl': output_template,
+                'merge_output_format': output_format.lower(),
+                'postprocessors': [{
+                    'key': 'FFmpegVideoConvertor',
+                    'preferedformat': output_format.lower(),
+                }]
+            }
+
+            # 执行下载
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=True)
+                if rename:
+                    # The template is a fixed name here, so it already is the final path.
+                    output_path = output_template
+                else:
+                    # yt-dlp sanitizes the title while expanding the template, so the raw
+                    # title cannot be used to rebuild the name. Ask yt-dlp for the path it
+                    # used and swap in the extension left by the merge/convert step.
+                    output_path = os.path.splitext(ydl.prepare_filename(info))[0] + f".{output_format.lower()}"
+                filename = os.path.basename(output_path)
+
+            logger.info(f"视频下载成功: {output_path}")
+            return task_id, output_path, filename
+
+        except Exception as e:
+            logger.exception("下载视频失败")
+            raise
diff --git a/requirements.txt b/requirements.txt
index 2ae1f29..3024e71 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,4 +31,5 @@ python-dotenv~=1.0.1
 openai~=1.53.0
 tqdm>=4.66.6
 tenacity>=9.0.0
-tiktoken==0.8.0
\ No newline at end of file
+tiktoken==0.8.0
+yt-dlp==2024.11.18

From 58773d605c782604f150705fb5d74703031891f0 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Mon, 18 Nov 2024 18:01:31 +0800
Subject: [PATCH 3/9] =?UTF-8?q?feat(v2):=20=E6=B7=BB=E5=8A=A0=E5=BC=80?=
 =?UTF-8?q?=E5=A7=8B=E8=A7=86=E9=A2=91=E5=89=AA=E8=BE=91=E4=BB=BB=E5=8A=A1?=
 =?UTF-8?q?=E7=9A=84=20API=20=E6=8E=A5=E5=8F=A3-=20=E6=96=B0=E5=A2=9E=20St?=
 =?UTF-8?q?artSubclipRequest=20=E5=92=8C=20StartSubclipResponse=20?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B-=20=E5=AE=9E=E7=8E=B0=20/scripts/start-subcl?=
 =?UTF-8?q?ip=20=E6=8E=A5=E5=8F=A3=EF=BC=8C=E7=94=A8=E4=BA=8E=E5=90=AF?=
 =?UTF-8?q?=E5=8A=A8=E8=A7=86=E9=A2=91=E5=89=AA=E8=BE=91=E4=BB=BB=E5=8A=A1?=
 =?UTF-8?q?=20-=20=E6=94=AF=E6=8C=81=E5=BC=82=E6=AD=A5=E5=A4=84=E7=90=86?=
 =?UTF-8?q?=EF=BC=8C=E8=BF=94=E5=9B=9E=E4=BB=BB=E5=8A=A1=20ID=20=E5=92=8C?=
 =?UTF-8?q?=E5=88=9D=E5=A7=8B=E7=8A=B6=E6=80=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/controllers/v2/script.py | 49 +++++++++++++++++++++++++++++++++++-
 app/models/schema_v2.py      | 20 +++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/app/controllers/v2/script.py b/app/controllers/v2/script.py
index 94a337b..c50ee0e 100644
--- a/app/controllers/v2/script.py
+++ b/app/controllers/v2/script.py
@@ -8,14 +8,18 @@ from app.models.schema_v2 import (
     CropVideoRequest,
     CropVideoResponse,
     DownloadVideoRequest,
-    DownloadVideoResponse
+    DownloadVideoResponse,
+    StartSubclipRequest,
+    StartSubclipResponse
 )
+from app.models.schema import VideoClipParams
 from app.services.script_service import ScriptGenerator
 from app.services.video_service import VideoService
 from app.utils import utils
 from app.controllers.v2.base import v2_router
 from app.models.schema import VideoClipParams
 from app.services.youtube_service import YoutubeService
+from app.services import task as task_service
 
 router = v2_router()
 
@@ -119,3 +123,46 @@ async def download_youtube_video(
     except Exception as e:
         logger.exception(f"Download YouTube video failed: {str(e)}")
         raise
+
+
+@router.post(
+    "/scripts/start-subclip",
+    response_model=StartSubclipResponse,
+    summary="异步请求；开始视频剪辑任务 (V2)"
+)
+async def start_subclip(
+    request: VideoClipParams,
+    background_tasks: BackgroundTasks
+):
+    """
+    开始视频剪辑任务的V2版本API
+    """
+    try:
+        # Forward the validated request model unchanged.
+        #
+        # The request body is already a VideoClipParams instance, so there is
+        # nothing to build. Re-creating it from a hand-picked subset of fields
+        # (video_origin_path, video_clip_json_path, voice_name, voice_rate,
+        # voice_pitch, subtitle_enabled, video_aspect, n_threads) discarded
+        # every other field the caller supplied.
+        #
+        # NOTE(review): if some fields must be ignored for V2 requests, strip
+        # them here explicitly rather than rebuilding the object.
+        params = request
+        
+        # 在后台任务中执行视频剪辑
+        background_tasks.add_task(
+            task_service.start_subclip,
+            task_id=request.task_id,
+            params=params,
+            subclip_path_videos=request.subclip_videos
+        )
+        
+        return {
+            "task_id": request.task_id,
+            "state": "PROCESSING"  # 初始状态
+        }
+        
+    except Exception as e:
+        logger.exception(f"Start subclip task failed: {str(e)}")
+        raise
diff --git a/app/models/schema_v2.py b/app/models/schema_v2.py
index 9894d89..1611a3b 100644
--- a/app/models/schema_v2.py
+++ b/app/models/schema_v2.py
@@ -40,3 +40,23 @@ class DownloadVideoResponse(BaseModel):
     resolution: str
     format: str
     filename: str
+
+
+class StartSubclipRequest(BaseModel):
+    task_id: str
+    video_origin_path: str
+    video_clip_json_path: str
+    voice_name: Optional[str] = None
+    voice_rate: Optional[int] = 0
+    voice_pitch: Optional[int] = 0
+    subtitle_enabled: Optional[bool] = True
+    video_aspect: Optional[str] = "16:9"
+    n_threads: Optional[int] = 4
+    subclip_videos: list  # 从裁剪视频接口获取的视频片段字典
+
+
+class StartSubclipResponse(BaseModel):
+    task_id: str
+    state: str
+    videos: Optional[List[str]] = None
+    combined_videos: Optional[List[str]] = None

From b34d9fe14c18ebfb599afbace3663e862fa3ec11 Mon Sep 17 00:00:00 2001
From: linyqh <linyqemail@163.com>
Date: Tue, 19 Nov 2024 01:23:20 +0800
Subject: [PATCH 4/9] =?UTF-8?q?refactor(webui):=20=E4=BC=98=E5=8C=96?=
 =?UTF-8?q?=E9=9F=B3=E9=A2=91=E8=AE=BE=E7=BD=AE=E7=95=8C=E9=9D=A2=E5=B9=B6?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BB=A3=E7=90=86=E9=85=8D=E7=BD=AE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 修改支持的语音列表，仅保留中文语音
- 在主程序中添加代理配置环境变量
- 优化剪辑视频函数，改为返回字典类型
- 更新任务服务中的剪辑视频函数，适应新的参数类型
- 修改测试用例中的视频剪辑函数，增加输出路径参数
- 更新脚本控制器中的剪辑视频函数，集成任务 ID 和子视频字典
---
 app/controllers/v2/script.py       |   8 +-
 app/pipeline/video_pipeline.py     | 162 +++++++++++++++
 app/services/material.py           |   2 +-
 app/services/task.py               |   2 +-
 app/services/voice.py              |   7 +-
 app/services/youtube_service.py    |  21 +-
 app/test/test_moviepy.py           |   8 +-
 main.py                            |   3 +
 webui.txt                          | 323 +++++++++++++++++++++++++++++
 webui/components/audio_settings.py |   2 +-
 10 files changed, 522 insertions(+), 16 deletions(-)
 create mode 100644 app/pipeline/video_pipeline.py

diff --git a/app/controllers/v2/script.py b/app/controllers/v2/script.py
index c50ee0e..c3501eb 100644
--- a/app/controllers/v2/script.py
+++ b/app/controllers/v2/script.py
@@ -132,6 +132,8 @@ async def download_youtube_video(
 )
 async def start_subclip(
     request: VideoClipParams,
+    task_id: str,
+    subclip_videos: dict,
     background_tasks: BackgroundTasks
 ):
     """
@@ -153,13 +155,13 @@ async def start_subclip(
         # 在后台任务中执行视频剪辑
         background_tasks.add_task(
             task_service.start_subclip,
-            task_id=request.task_id,
+            task_id=task_id,
             params=params,
-            subclip_path_videos=request.subclip_videos
+            subclip_path_videos=subclip_videos
         )
         
         return {
-            "task_id": request.task_id,
+            "task_id": task_id,
             "state": "PROCESSING"  # 初始状态
         }
         
diff --git a/app/pipeline/video_pipeline.py b/app/pipeline/video_pipeline.py
new file mode 100644
index 0000000..1c54bad
--- /dev/null
+++ b/app/pipeline/video_pipeline.py
@@ -0,0 +1,162 @@
+import requests
+import json
+import time
+from typing import Dict, Any
+
+class VideoPipeline:
+    def __init__(self, base_url: str = "http://127.0.0.1:8080"):
+        self.base_url = base_url
+        
+    def download_video(self, url: str, resolution: str = "1080p", 
+                      output_format: str = "mp4", rename: str = None) -> Dict[str, Any]:
+        """下载视频的第一步"""
+        endpoint = f"{self.base_url}/api/v2/youtube/download"
+        payload = {
+            "url": url,
+            "resolution": resolution,
+            "output_format": output_format,
+            "rename": rename or time.strftime("%Y-%m-%d")
+        }
+        
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def generate_script(self, video_path: str, skip_seconds: int = 0,
+                       threshold: int = 30, vision_batch_size: int = 10,
+                       vision_llm_provider: str = "gemini") -> Dict[str, Any]:
+        """生成脚本的第二步"""
+        endpoint = f"{self.base_url}/api/v2/scripts/generate"
+        payload = {
+            "video_path": video_path,
+            "skip_seconds": skip_seconds,
+            "threshold": threshold,
+            "vision_batch_size": vision_batch_size,
+            "vision_llm_provider": vision_llm_provider
+        }
+        
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def crop_video(self, video_path: str, script: list) -> Dict[str, Any]:
+        """剪辑视频的第三步"""
+        endpoint = f"{self.base_url}/api/v2/scripts/crop"
+        payload = {
+            "video_origin_path": video_path,
+            "video_script": script
+        }
+        
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def generate_final_video(self, task_id: str, video_path: str, 
+                           script_path: str, script: list, subclip_videos: Dict[str, str]) -> Dict[str, Any]:
+        """生成最终视频的第四步"""
+        endpoint = f"{self.base_url}/api/v2/scripts/start-subclip"
+        
+        request_data = {
+            "video_clip_json": script,
+            "video_clip_json_path": script_path,
+            "video_origin_path": video_path,
+            "video_aspect": "16:9",
+            "video_language": "zh-CN",
+            "voice_name": "zh-CN-YunjianNeural",
+            "voice_volume": 1,
+            "voice_rate": 1.2,
+            "voice_pitch": 1,
+            "bgm_name": "random",
+            "bgm_type": "random",
+            "bgm_file": "",
+            "bgm_volume": 0.3,
+            "subtitle_enabled": True,
+            "subtitle_position": "bottom",
+            "font_name": "STHeitiMedium.ttc",
+            "text_fore_color": "#FFFFFF",
+            "text_background_color": "transparent",
+            "font_size": 75,
+            "stroke_color": "#000000",
+            "stroke_width": 1.5,
+            "custom_position": 70,
+            "n_threads": 8
+        }
+        
+        payload = {
+            "request": request_data,
+            "subclip_videos": subclip_videos
+        }
+        
+        params = {"task_id": task_id}
+        response = requests.post(endpoint, params=params, json=payload)
+        response.raise_for_status()
+        return response.json()
+    
+    def save_script_to_json(self, script: list, script_dir: str = None) -> str:
+        """Save the script as <script_dir>/<timestamp>.json (default dir: <repo>/resource/scripts) and return the path."""
+        import os  # function-scope import: this module does not import os at file level
+        script_dir = script_dir or os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "resource", "scripts")
+        script_path = os.path.join(script_dir, time.strftime("%Y-%m%d-%H%M%S") + ".json")
+        try:
+            with open(script_path, 'w', encoding='utf-8') as f:
+                json.dump(script, f, ensure_ascii=False, indent=2)
+            print(f"脚本已保存到: {script_path}")
+            return script_path
+        except Exception as e:
+            print(f"保存脚本失败: {str(e)}")
+            raise
+    
+    def run_pipeline(self, youtube_url: str) -> Dict[str, Any]:
+        """运行完整的pipeline"""
+        try:
+            # 1. 下载视频
+            print("开始下载视频...")
+            download_result = self.download_video(youtube_url)
+            video_path = download_result["output_path"]
+            
+            # 2. 生成脚本
+            print("开始生成脚本...")
+            script_result = self.generate_script(video_path)
+            script = script_result["script"]
+            
+            # 2.1 保存脚本到json文件
+            print("保存脚本到json文件...")
+            script_path = self.save_script_to_json(script)
+            script_result["script_path"] = script_path
+            
+            # 3. 剪辑视频
+            print("开始剪辑视频...")
+            crop_result = self.crop_video(video_path, script)
+            subclip_videos = crop_result["subclip_videos"]
+            
+            # 4. 生成最终视频
+            print("开始生成最终视频...")
+            final_result = self.generate_final_video(
+                crop_result["task_id"],
+                video_path,
+                script_path,
+                script,
+                subclip_videos
+            )
+            
+            return {
+                "status": "success",
+                "download_result": download_result,
+                "script_result": script_result,
+                "crop_result": crop_result,
+                "final_result": final_result
+            }
+            
+        except Exception as e:
+            return {
+                "status": "error",
+                "error": str(e)
+            }
+
+# 使用示例
+if __name__ == "__main__":
+    pipeline = VideoPipeline()
+    result = pipeline.run_pipeline("https://www.youtube.com/watch?v=Kenm35gdqtk")
+    print(json.dumps(result, indent=2, ensure_ascii=False))
+    result2 = pipeline.run_pipeline("https://www.youtube.com/watch?v=aEsHAcedzgw")
+    print(json.dumps(result2, indent=2, ensure_ascii=False))
diff --git a/app/services/material.py b/app/services/material.py
index bab1aba..696eda8 100644
--- a/app/services/material.py
+++ b/app/services/material.py
@@ -363,7 +363,7 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
     return {}
 
 
-def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None):
+def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict:
     """
     剪辑视频
     Args:
diff --git a/app/services/task.py b/app/services/task.py
index c903047..c030574 100644
--- a/app/services/task.py
+++ b/app/services/task.py
@@ -324,7 +324,7 @@ def start(task_id, params: VideoParams, stop_at: str = "video"):
     return kwargs
 
 
-def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: list):
+def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
     """
     后台任务（自动剪辑视频进行剪辑）
 
diff --git a/app/services/voice.py b/app/services/voice.py
index 02245f6..21082c1 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -989,6 +989,9 @@ Gender: Female
 
 Name: zh-CN-XiaoxiaoMultilingualNeural-V2
 Gender: Female
+
+Name: zh-CN-YunxiNeural-V2
+Gender: Male
     """.strip()
     voices = []
     name = ""
@@ -1034,8 +1037,8 @@ def is_azure_v2_voice(voice_name: str):
 def tts(
     text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> [SubMaker, None]:
-    # if is_azure_v2_voice(voice_name):
-    #     return azure_tts_v2(text, voice_name, voice_file)
+    if is_azure_v2_voice(voice_name):
+        return azure_tts_v2(text, voice_name, voice_file)
     return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
 
 
diff --git a/app/services/youtube_service.py b/app/services/youtube_service.py
index d478198..e4a7a79 100644
--- a/app/services/youtube_service.py
+++ b/app/services/youtube_service.py
@@ -5,6 +5,7 @@ from loguru import logger
 from uuid import uuid4
 
 from app.utils import utils
+from app.services import video as VideoService
 
 
 class YoutubeService:
@@ -61,6 +62,7 @@ class YoutubeService:
         Args:
             url: YouTube视频URL
             resolution: 目标分辨率 ('2160p', '1440p', '1080p', '720p' etc.)
+                       注意：对于类似'1080p60'的输入会被处理为'1080p'
             output_format: 输出视频格式
             rename: 可选的重命名
             
@@ -71,23 +73,32 @@ class YoutubeService:
             task_id = str(uuid4())
             self._validate_format(output_format)
 
+            # 标准化分辨率格式
+            base_resolution = resolution.split('p')[0] + 'p'
+            
             # 获取所有可用格式
             formats = self._get_video_formats(url)
 
             # 查找指定分辨率的最佳视频格式
             target_format = None
             for fmt in formats:
-                if fmt['resolution'] == resolution and fmt['vcodec'] != 'none':
-                    target_format = fmt
-                    break
+                fmt_resolution = fmt['resolution']
+                # 将格式的分辨率也标准化后进行比较
+                if fmt_resolution != 'N/A':
+                    fmt_base_resolution = fmt_resolution.split('p')[0] + 'p'
+                    if fmt_base_resolution == base_resolution and fmt['vcodec'] != 'none':
+                        target_format = fmt
+                        break
 
             if target_format is None:
+                # 收集可用分辨率时也进行标准化
                 available_resolutions = set(
-                    fmt['resolution'] for fmt in formats
+                    fmt['resolution'].split('p')[0] + 'p'
+                    for fmt in formats
                     if fmt['resolution'] != 'N/A' and fmt['vcodec'] != 'none'
                 )
                 raise ValueError(
-                    f"未找到 {resolution} 分辨率的视频。"
+                    f"未找到 {base_resolution} 分辨率的视频。"
                     f"可用分辨率: {', '.join(sorted(available_resolutions))}"
                 )
 
diff --git a/app/test/test_moviepy.py b/app/test/test_moviepy.py
index d37d518..208b708 100644
--- a/app/test/test_moviepy.py
+++ b/app/test/test_moviepy.py
@@ -31,7 +31,7 @@ def format_duration(seconds: float) -> str:
     return f"{minutes:02d}:{remaining_seconds:02d}"
 
 
-def cut_video(video_path: str, start_time: str, end_time: str) -> None:
+def cut_video(video_path: str, start_time: str, end_time: str, output_path: str) -> None:
     """
     剪辑视频
     参数:
@@ -53,11 +53,13 @@ def cut_video(video_path: str, start_time: str, end_time: str) -> None:
     
     # 剪辑视频
     video = video.subclip(start_seconds, end_seconds)
-    video.write_videofile("../../resource/videos/cut_video2.mp4")
+    video.write_videofile("../../resource/videos/cut_video3.mp4")
     
     # 释放资源
     video.close()
 
 
 if __name__ == "__main__":
-    cut_video("../../resource/videos/best.mp4", "00:40", "02:40")
+    # cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "00:00", "07:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-1")
+    # cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "07:00", "14:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-2")
+    cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "14:00", "22:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-3")
diff --git a/main.py b/main.py
index e84f32b..bfec175 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,4 @@
+import os
 import uvicorn
 from loguru import logger
 
@@ -7,6 +8,8 @@ if __name__ == "__main__":
     logger.info(
         "start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs"
     )
+    # Export only the proxies that are configured: os.environ rejects None (TypeError) when a key is absent.
+    os.environ.update({f"{s.upper()}_PROXY": config.proxy[s] for s in ("http", "https") if config.proxy.get(s)})
     uvicorn.run(
         app="app.asgi:app",
         host=config.listen_host,
diff --git a/webui.txt b/webui.txt
index e835524..b64b320 100644
--- a/webui.txt
+++ b/webui.txt
@@ -47,3 +47,326 @@ pause
 
 rem set HF_ENDPOINT=https://hf-mirror.com
 streamlit run webui.py --browser.serverAddress="127.0.0.1" --server.enableCORS=True  --server.maxUploadSize=2048 --browser.gatherUsageStats=False
+
+请求0：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/youtube/download' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}'
+{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}
+
+请求1：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/generate' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}'
+{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}
+
+请求2：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/crop' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}'
+{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}
+
+请求3：
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}'
+{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}
+
+
+请在最外层新建一个pipeline 工作流执行逻辑的代码；
+它会按照下面的顺序请求接口
+1.下载视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/youtube/download' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "url": "https://www.youtube.com/watch?v=Kenm35gdqtk",
+  "resolution": "1080p",
+  "output_format": "mp4",
+  "rename": "2024-11-19"
+}'
+2.生成脚本
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/generate' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "skip_seconds": 0,
+  "threshold": 30,
+  "vision_batch_size": 10,
+  "vision_llm_provider": "gemini"
+}'
+3. 剪辑视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/crop' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}'
+4.生成视频
+curl -X 'POST' \
+  'http://127.0.0.1:8080/api/v2/scripts/start-subclip?task_id=12121' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "request": {
+  "video_clip_json": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频展现一名留着胡须的男子在森林里挖掘。\n\n画面首先展现男子从后方视角，背着军绿色背包，穿着卡其色长裤和深色T恤，走向一个泥土斜坡。背包上似乎有一个镐头。\n\n下一个镜头特写展现了该背包，一个镐头从背包里伸出来，包里还有一些其他工具。\n\n然后，视频显示该男子用镐头挖掘泥土斜坡。\n\n接下来是一些近景镜头，展现男子的靴子在泥土中行走，以及男子用手清理泥土。\n\n其他镜头从不同角度展现该男子在挖掘，包括从侧面和上方。\n\n可以看到他用工具挖掘，清理泥土，并检查挖出的土壤。\n\n最后，一个镜头展现了挖出的土壤的质地和颜色。",
+      "narration": "好的，接下来就是我们这位“胡须大侠”的精彩冒险了！只见他背着军绿色的背包，迈着比我上班还不情愿的步伐走向那泥土斜坡。哎呀，这个背包可真是个宝贝，里面藏着一把镐头和一些工具，简直像是个随身携带的“建筑工具箱”！ \n\n看他挥舞着镐头，挖掘泥土的姿势，仿佛在进行一场“挖土大赛”，结果却比我做饭还要糟糕。泥土飞扬中，他的靴子也成了“泥巴艺术家”。最后，那堆色泽各异的土壤就像他心情的写照——五彩斑斓又略显混乱！真是一次让人捧腹的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是视频画面的客观描述：\n\n视频以一系列森林环境的镜头开头。\n\n第一个镜头是一个特写镜头，镜头中显示的是一些带有水滴的绿色叶子。\n\n第二个镜头显示一个留着胡须的男子在森林中挖掘一个洞。 他跪在地上，用工具挖土。\n\n第三个镜头是一个中等镜头，显示同一个人坐在他挖好的洞边休息。\n\n第四个镜头显示该洞的内部结构，该洞在树根和地面之间。\n\n第五个镜头显示该男子用斧头砍树枝。\n\n第六个镜头显示一堆树枝横跨一个泥泞的小水坑。\n\n第七个镜头显示更多茂盛的树叶和树枝在阳光下。\n\n第八个镜头显示更多茂盛的树叶和树枝。\n\n\n",
+      "narration": "接下来，我们的“挖土大师”又开始了他的森林探险。看这镜头，水滴在叶子上闪烁，仿佛在说：“快来，快来，这里有故事！”他一边挖洞，一边像个新手厨师试图切洋葱——每一下都小心翼翼，生怕自己不小心挖出个“历史遗址”。坐下休息的时候，脸上的表情就像发现新大陆一样！然后，他拿起斧头砍树枝，简直是现代版的“神雕侠侣”，只不过对象是树木。最后，那堆树枝架过泥泞的小水坑，仿佛在说：“我就是不怕湿脚的勇士！”这就是我们的建造之旅！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ],
+  "video_clip_json_path": "E:\\projects\\NarratoAI\\resource\\scripts\\2024-1118-230421.json",
+  "video_origin_path": "E:\\projects\\NarratoAI\\resource\\videos\\test.mp4",
+  "video_aspect": "16:9",
+  "video_language": "zh-CN",
+  "voice_name": "zh-CN-YunjianNeural",
+  "voice_volume": 1,
+  "voice_rate": 1.2,
+  "voice_pitch": 1,
+  "bgm_name": "random",
+  "bgm_type": "random",
+  "bgm_file": "",
+  "bgm_volume": 0.3,
+  "subtitle_enabled": true,
+  "subtitle_position": "bottom",
+  "font_name": "STHeitiMedium.ttc",
+  "text_fore_color": "#FFFFFF",
+  "text_background_color": "transparent",
+  "font_size": 75,
+  "stroke_color": "#000000",
+  "stroke_width": 1.5,
+  "custom_position": 70,
+  "n_threads": 8
+  },
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}'
+
+请求1，返回的参数是：
+{
+  "task_id": "4e9b575f-68c0-4ae1-b218-db42b67993d0",
+  "output_path": "E:\\projects\\NarratoAI\\resource\\videos\\2024-11-19.mp4",
+  "resolution": "1080p",
+  "format": "mp4",
+  "filename": "2024-11-19.mp4"
+}
+output_path需要传递给请求2
+请求2，返回数据为：
+{
+  "task_id": "04497017-953c-44b4-bf1d-9d8ed3ebbbce",
+  "script": [
+    {
+      "timestamp": "00:10-01:01",
+      "picture": "好的，以下是對影片畫面的客觀描述：\n\n影片顯示一名留著鬍鬚的男子在一處樹林茂密的斜坡上挖掘。\n\n畫面一：男子從後方出現，背著一個軍綠色的背包，背包裡似乎裝有工具。他穿著卡其色的長褲和深色的登山鞋。\n\n畫面二：特寫鏡頭顯示男子的背包，一個舊的鎬頭從包裡露出來，包裡還有其他工具，包括一個鏟子。\n\n畫面三：男子用鎬頭在斜坡上挖土，背包放在他旁邊。\n\n畫面四：特寫鏡頭顯示男子的登山鞋在泥土中。\n\n畫面五：男子坐在斜坡上，用手清理樹根和泥土。\n\n畫面六：地上有一些鬆動的泥土和落葉。\n\n畫面七：男子的背包近景鏡頭，他正在挖掘。\n\n畫面八：男子在斜坡上挖掘，揚起一陣塵土。\n\n畫面九：特寫鏡頭顯示男子用手清理泥土。\n\n畫面十：特寫鏡頭顯示挖出的泥土剖面，可以看到土壤的層次。",
+      "narration": "上一个画面是我在绝美的自然中，准备开启我的“土豪”挖掘之旅。现在，你们看到这位留着胡子的“大哥”，他背着个军绿色的包，里面装的可不仅仅是工具，还有我对生活的无限热爱（以及一丝不安）。看！这把旧镐头就像我的前任——用起来费劲，但又舍不得扔掉。\n\n他在斜坡上挖土，泥土飞扬，仿佛在跟大地进行一场“泥巴大战”。每一铲下去，都能听到大地微微的呻吟：哎呀，我这颗小树根可比我当年的情感纠葛还难处理呢！别担心，这些泥土层次分明，简直可以开个“泥土博物馆”。所以，朋友们，跟着我一起享受这场泥泞中的乐趣吧！",
+      "OST": 2,
+      "new_timestamp": "00:00-00:51"
+    },
+    {
+      "timestamp": "01:07-01:53",
+      "picture": "好的，以下是對影片畫面內容的客觀描述：\n\n影片以一系列森林環境的鏡頭開始。第一個鏡頭展示了綠葉植物的特寫鏡頭，葉子上有一些水珠。接下來的鏡頭是一個男人在森林裡挖掘一個小坑，他跪在地上，用鏟子挖土。\n\n接下來的鏡頭是同一個男人坐在他挖的坑旁邊，望著前方。然後，鏡頭顯示該坑的廣角鏡頭，顯示其結構和大小。\n\n之後的鏡頭，同一個男人在樹林裡劈柴。鏡頭最後呈現出一潭渾濁的水，周圍環繞著樹枝。然後鏡頭又回到了森林裡生長茂盛的植物特寫鏡頭。",
+      "narration": "好嘞，朋友们，我们已经在泥土博物馆里捣鼓了一阵子，现在是时候跟大自然亲密接触了！看看这片森林，绿叶上水珠闪闪发光，就像我曾经的爱情，虽然短暂，却美得让人心碎。\n\n现在，我在这里挖个小坑，感觉自己就像是一位新晋“挖土大王”，不过说实话，这手艺真不敢恭维，连铲子都快对我崩溃了。再说劈柴，这动作简直比我前任的情绪波动还要激烈！最后这一潭浑浊的水，别担心，它只是告诉我：生活就像这水，总有些杂质，但也别忘了，要勇敢面对哦！",
+      "OST": 2,
+      "new_timestamp": "00:51-01:37"
+    }
+  ]
+}
+output_path和script参数需要传递给请求3
+请求3返回参数是
+{
+  "task_id": "b6f5a98a-b2e0-4e3d-89c5-64fb90db2ec1",
+  "subclip_videos": {
+    "00:10-01:01": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_10-01_01.mp4",
+    "01:07-01:53": "E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_07-01_53.mp4"
+  }
+}
+subclip_videos和 output_path和script参数需要传递给请求4
+最后完成工作流
\ No newline at end of file
diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py
index a189f65..f81effe 100644
--- a/webui/components/audio_settings.py
+++ b/webui/components/audio_settings.py
@@ -20,7 +20,7 @@ def render_audio_panel(tr):
 def render_tts_settings(tr):
     """渲染TTS(文本转语音)设置"""
     # 获取支持的语音列表
-    support_locales = ["zh-CN", "zh-HK", "zh-TW", "en-US"]
+    support_locales = ["zh-CN"]
     voices = voice.get_all_azure_voices(filter_locals=support_locales)
     
     # 创建友好的显示名称

From 38f23983ef0c80f270fd5c6bef118acdeb76cd34 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Tue, 19 Nov 2024 14:50:30 +0800
Subject: [PATCH 5/9] =?UTF-8?q?refactor(video=5Fpipeline):=20=E9=87=8D?=
 =?UTF-8?q?=E6=9E=84=E8=A7=86=E9=A2=91=E5=A4=84=E7=90=86=E7=AE=A1=E9=81=93?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增参数以支持更灵活的配置
- 优化脚本保存逻辑，支持自定义脚本名称
- 增加视频下载和脚本生成的条件判断，提高效率
- 异步生成最终视频，返回任务路径
- 更新示例使用新的管道配置
---
 .../video_pipeline.py => video_pipeline.py    | 81 ++++++++++++-------
 1 file changed, 50 insertions(+), 31 deletions(-)
 rename app/pipeline/video_pipeline.py => video_pipeline.py (62%)

diff --git a/app/pipeline/video_pipeline.py b/video_pipeline.py
similarity index 62%
rename from app/pipeline/video_pipeline.py
rename to video_pipeline.py
index 1c54bad..3e35544 100644
--- a/app/pipeline/video_pipeline.py
+++ b/video_pipeline.py
@@ -1,5 +1,6 @@
 import requests
 import json
+import os
 import time
 from typing import Dict, Any
 
@@ -52,7 +53,7 @@ class VideoPipeline:
         return response.json()
     
     def generate_final_video(self, task_id: str, video_path: str, 
-                           script_path: str, script: list, subclip_videos: Dict[str, str]) -> Dict[str, Any]:
+                           script_path: str, script: list, subclip_videos: Dict[str, str], voice_name: str) -> Dict[str, Any]:
         """生成最终视频的第四步"""
         endpoint = f"{self.base_url}/api/v2/scripts/start-subclip"
         
@@ -62,7 +63,7 @@ class VideoPipeline:
             "video_origin_path": video_path,
             "video_aspect": "16:9",
             "video_language": "zh-CN",
-            "voice_name": "zh-CN-YunjianNeural",
+            "voice_name": voice_name,
             "voice_volume": 1,
             "voice_rate": 1.2,
             "voice_pitch": 1,
@@ -92,10 +93,9 @@ class VideoPipeline:
         response.raise_for_status()
         return response.json()
     
-    def save_script_to_json(self, script: list) -> str:
+    def save_script_to_json(self, script: list, script_name: str) -> str:
         """保存脚本到json文件"""
-        timestamp = time.strftime("%Y-%m%d-%H%M%S")
-        script_path = f"E:\\projects\\NarratoAI\\resource\\scripts\\{timestamp}.json"
+        script_path = f"E:\\projects\\NarratoAI\\resource\\scripts\\{script_name}.json"
         
         try:
             with open(script_path, 'w', encoding='utf-8') as f:
@@ -106,45 +106,55 @@ class VideoPipeline:
             print(f"保存脚本失败: {str(e)}")
             raise
     
-    def run_pipeline(self, youtube_url: str) -> Dict[str, Any]:
+    def run_pipeline(self, task_id: str, script_name: str, youtube_url: str, video_name: str="null", skip_seconds: int = 0, threshold: int = 30, vision_batch_size: int = 10, vision_llm_provider: str = "gemini", voice_name: str = "zh-CN-YunjianNeural") -> Dict[str, Any]:
         """运行完整的pipeline"""
         try:
-            # 1. 下载视频
-            print("开始下载视频...")
-            download_result = self.download_video(youtube_url)
-            video_path = download_result["output_path"]
+            current_path = os.path.dirname(os.path.abspath(__file__))
+            video_path = os.path.join(current_path, "resource", "videos", video_name)
+            # 判断视频是否存在
+            if not os.path.exists(video_path):
+                # 1. 下载视频
+                print(f"视频不存在, 开始下载视频: {video_path}")
+                download_result = self.download_video(youtube_url=youtube_url, resolution="1080p", output_format="mp4", rename=video_name)
+                video_path = download_result["output_path"]
+            else:
+                print(f"视频已存在: {video_path}")
             
-            # 2. 生成脚本
-            print("开始生成脚本...")
-            script_result = self.generate_script(video_path)
-            script = script_result["script"]
+            # 2. 判断script_name是否存在
+            # 2.1.1 拼接脚本路径 NarratoAI/resource/scripts
+            script_path = os.path.join(current_path, "resource", "scripts", script_name)
+            if os.path.exists(script_path):
+                script = json.load(open(script_path, "r", encoding="utf-8"))
+            else:
+                # 2.1.2 生成脚本
+                print("开始生成脚本...")
+                script_result = self.generate_script(video_path=video_path, skip_seconds=skip_seconds, threshold=threshold, vision_batch_size=vision_batch_size, vision_llm_provider=vision_llm_provider)
+                script = script_result["script"]
             
-            # 2.1 保存脚本到json文件
+            # 2.2 保存脚本到json文件
             print("保存脚本到json文件...")
-            script_path = self.save_script_to_json(script)
+            script_path = self.save_script_to_json(script, script_name)
             script_result["script_path"] = script_path
             
             # 3. 剪辑视频
             print("开始剪辑视频...")
-            crop_result = self.crop_video(video_path, script)
+            crop_result = self.crop_video(video_path=video_path, script=script)
             subclip_videos = crop_result["subclip_videos"]
             
             # 4. 生成最终视频
             print("开始生成最终视频...")
             final_result = self.generate_final_video(
-                crop_result["task_id"],
-                video_path,
-                script_path,
-                script,
-                subclip_videos
+                task_id=task_id,
+                video_path=video_path,
+                script_path=script_path,
+                script=script,
+                subclip_videos=subclip_videos,
+                voice_name=voice_name
             )
             
             return {
-                "status": "success",
-                "download_result": download_result,
-                "script_result": script_result,
-                "crop_result": crop_result,
-                "final_result": final_result
+                "status": "等待异步生成视频",
+                "path": os.path.join(current_path, "storage", "tasks", task_id)
             }
             
         except Exception as e:
@@ -153,10 +163,19 @@ class VideoPipeline:
                 "error": str(e)
             }
 
+
 # 使用示例
 if __name__ == "__main__":
     pipeline = VideoPipeline()
-    result = pipeline.run_pipeline("https://www.youtube.com/watch?v=Kenm35gdqtk")
-    print(json.dumps(result, indent=2, ensure_ascii=False))
-    result2 = pipeline.run_pipeline("https://www.youtube.com/watch?v=aEsHAcedzgw")
-    print(json.dumps(result2, indent=2, ensure_ascii=False))
+    result = pipeline.run_pipeline(
+        task_id="test_123",
+        script_name="test.json",
+        youtube_url="https://www.youtube.com/watch?v=Kenm35gdqtk",
+        video_name="test.mp4",
+        skip_seconds=0,
+        threshold=30,
+        vision_batch_size=10,
+        vision_llm_provider="gemini",
+        voice_name="zh-CN-YunjianNeural",
+    )
+    print(result)

From 1be304a696c530da50d5b7da0fa33b867af13421 Mon Sep 17 00:00:00 2001
From: linyqh <linyqemail@163.com>
Date: Wed, 20 Nov 2024 00:34:11 +0800
Subject: [PATCH 6/9] =?UTF-8?q?feat(subtitle):=20=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E4=BB=8E=E8=A7=86=E9=A2=91=E6=8F=90=E5=8F=96=E9=9F=B3=E9=A2=91?=
 =?UTF-8?q?=E5=B9=B6=E7=94=9F=E6=88=90=E5=AD=97=E5=B9=95=E7=9A=84=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增 extract_audio_and_create_subtitle函数，用于从视频文件中提取音频并生成字幕文件
- 更新 video_pipeline.py，修改视频下载和处理的相关参数
---
 app/services/subtitle.py | 95 ++++++++++++++++++++++++++++++++--------
 video_pipeline.py        | 14 +++---
 2 files changed, 84 insertions(+), 25 deletions(-)

diff --git a/app/services/subtitle.py b/app/services/subtitle.py
index f37eb65..7b18e8d 100644
--- a/app/services/subtitle.py
+++ b/app/services/subtitle.py
@@ -8,6 +8,8 @@ from faster_whisper import WhisperModel
 from timeit import default_timer as timer
 from loguru import logger
 import google.generativeai as genai
+from moviepy.editor import VideoFileClip
+import os
 
 from app.config import config
 from app.utils import utils
@@ -362,29 +364,86 @@ def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Option
         return None
 
 
+def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "") -> Optional[str]:
+    """
+    Extract the audio track from a video file and generate a subtitle file from it.
+
+    Args:
+    - video_file: path to the MP4 video file
+    - subtitle_file: output subtitle path (optional). If empty, "<video name>.srt" next to the video is used.
+
+    Returns:
+    - str: path of the generated subtitle file
+    - None: if an exception is raised while processing (it is logged, not re-raised)
+    """
+    try:
+        # Directory and extension-less base name of the video file
+        video_dir = os.path.dirname(video_file)
+        video_name = os.path.splitext(os.path.basename(video_file))[0]
+        
+        # Temporary audio file, written next to the video as "<video name>_audio.wav"
+        audio_file = os.path.join(video_dir, f"{video_name}_audio.wav")
+        
+        # Default the subtitle path to "<video name>.srt" in the same directory
+        if not subtitle_file:
+            subtitle_file = os.path.join(video_dir, f"{video_name}.srt")
+        
+        logger.info(f"开始从视频提取音频: {video_file}")
+        
+        # Load the video file
+        video = VideoFileClip(video_file)
+        
+        # Extract the audio track and save it as WAV (pcm_s16le codec)
+        logger.info(f"正在提取音频到: {audio_file}")
+        video.audio.write_audiofile(audio_file, codec='pcm_s16le')
+        
+        # Close the video file. NOTE(review): not reached if the write above raises
+        video.close()
+        
+        logger.info("音频提取完成，开始生成字幕")
+        
+        # Generate the subtitle file with create()
+        create(audio_file, subtitle_file)
+        
+        # Remove the temporary audio file
+        if os.path.exists(audio_file):
+            os.remove(audio_file)
+            logger.info("已清理临时音频文件")
+        
+        return subtitle_file
+        
+    except Exception as e:
+        logger.error(f"处理视频文件时出错: {str(e)}")
+        logger.error(traceback.format_exc())
+        return None
+
+
 if __name__ == "__main__":
-    task_id = "test456"
+    task_id = "12121"
     task_dir = utils.task_dir(task_id)
     subtitle_file = f"{task_dir}/subtitle.srt"
     audio_file = f"{task_dir}/audio.wav"
+    video_file = f"{task_dir}/duanju_demo.mp4"
 
-    subtitles = file_to_subtitles(subtitle_file)
-    print(subtitles)
+    extract_audio_and_create_subtitle(video_file, subtitle_file)
 
-    # script_file = f"{task_dir}/script.json"
-    # with open(script_file, "r") as f:
-    #     script_content = f.read()
-    # s = json.loads(script_content)
-    # script = s.get("script")
-    #
-    # correct(subtitle_file, script)
+    # subtitles = file_to_subtitles(subtitle_file)
+    # print(subtitles)
 
-    subtitle_file = f"{task_dir}/subtitle111.srt"
-    create(audio_file, subtitle_file)
+    # # script_file = f"{task_dir}/script.json"
+    # # with open(script_file, "r") as f:
+    # #     script_content = f.read()
+    # # s = json.loads(script_content)
+    # # script = s.get("script")
+    # #
+    # # correct(subtitle_file, script)
 
-    # # 使用Gemini模型处理音频
-    # gemini_api_key = config.app.get("gemini_api_key")  # 请替换为实际的API密钥
-    # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
-    #
-    # if gemini_subtitle_file:
-    #     print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
+    # subtitle_file = f"{task_dir}/subtitle111.srt"
+    # create(audio_file, subtitle_file)
+
+    # # # 使用Gemini模型处理音频
+    # # gemini_api_key = config.app.get("gemini_api_key")  # 请替换为实际的API密钥
+    # # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
+    # #
+    # # if gemini_subtitle_file:
+    # #     print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
diff --git a/video_pipeline.py b/video_pipeline.py
index 3e35544..5dca576 100644
--- a/video_pipeline.py
+++ b/video_pipeline.py
@@ -110,12 +110,12 @@ class VideoPipeline:
         """运行完整的pipeline"""
         try:
             current_path = os.path.dirname(os.path.abspath(__file__))
-            video_path = os.path.join(current_path, "resource", "videos", video_name)
+            video_path = os.path.join(current_path, "resource", "videos", f"{video_name}.mp4")
             # 判断视频是否存在
             if not os.path.exists(video_path):
                 # 1. 下载视频
                 print(f"视频不存在, 开始下载视频: {video_path}")
-                download_result = self.download_video(youtube_url=youtube_url, resolution="1080p", output_format="mp4", rename=video_name)
+                download_result = self.download_video(url=youtube_url, resolution="1080p", output_format="mp4", rename=video_name)
                 video_path = download_result["output_path"]
             else:
                 print(f"视频已存在: {video_path}")
@@ -168,12 +168,12 @@ class VideoPipeline:
 if __name__ == "__main__":
     pipeline = VideoPipeline()
     result = pipeline.run_pipeline(
-        task_id="test_123",
+        task_id="test_111901",
         script_name="test.json",
-        youtube_url="https://www.youtube.com/watch?v=Kenm35gdqtk",
-        video_name="test.mp4",
-        skip_seconds=0,
-        threshold=30,
+        youtube_url="https://www.youtube.com/watch?v=vLJ7Yed6FQ4",
+        video_name="2024-11-19-01",
+        skip_seconds=50,
+        threshold=35,
         vision_batch_size=10,
         vision_llm_provider="gemini",
         voice_name="zh-CN-YunjianNeural",

From f3248ef03a57432fab0ec9e4f7162810cb712a4b Mon Sep 17 00:00:00 2001
From: linyqh <linyqemail@163.com>
Date: Wed, 20 Nov 2024 02:45:52 +0800
Subject: [PATCH 7/9] =?UTF-8?q?feat(test):=20=E6=B7=BB=E5=8A=A0=E4=B8=8E?=
 =?UTF-8?q?=E9=80=9A=E4=B9=89=E5=8D=83=E9=97=AEAI=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=E5=AF=B9=E8=AF=9D=E7=9A=84=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 新增 chat_with_qwen 函数，用于与通义千问AI模型进行对话
- 添加错误处理和资源管理，提高代码健壮性
- 优化视频剪辑功能，增加输出路径参数
- 读取字幕文件并将其作为输入发送给AI模型
- 处理API调用异常，并提供错误文档链接
---
 app/test/test_moviepy.py | 68 +++++++++++++++++++++--------
 app/test/test_qwen.py    | 93 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 18 deletions(-)
 create mode 100644 app/test/test_qwen.py

diff --git a/app/test/test_moviepy.py b/app/test/test_moviepy.py
index 208b708..5b24ebf 100644
--- a/app/test/test_moviepy.py
+++ b/app/test/test_moviepy.py
@@ -4,6 +4,7 @@
 
 from moviepy.editor import VideoFileClip
 from datetime import datetime
+import os
 
 
 def time_str_to_seconds(time_str: str) -> float:
@@ -38,25 +39,56 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
         video_path: 视频文件路径
         start_time: 开始时间 (格式: "MM:SS")
         end_time: 结束时间 (格式: "MM:SS")
+        output_path: 输出文件路径
     """
-    # 转换时间字符串为秒数
-    start_seconds = time_str_to_seconds(start_time)
-    end_seconds = time_str_to_seconds(end_time)
-    
-    # 加载视频文件
-    video = VideoFileClip(video_path)
-    
-    # 计算剪辑时长
-    clip_duration = end_seconds - start_seconds
-    print(f"原视频总长度: {format_duration(video.duration)}")
-    print(f"剪辑时长: {format_duration(clip_duration)}")
-    
-    # 剪辑视频
-    video = video.subclip(start_seconds, end_seconds)
-    video.write_videofile("../../resource/videos/cut_video3.mp4")
-    
-    # 释放资源
-    video.close()
+    try:
+        # 确保输出目录存在
+        output_dir = os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+            
+        # 如果输出文件已存在，先尝试删除
+        if os.path.exists(output_path):
+            try:
+                os.remove(output_path)
+            except PermissionError:
+                print(f"无法删除已存在的文件：{output_path}，请确保文件未被其他程序占用")
+                return
+        
+        # 转换时间字符串为秒数
+        start_seconds = time_str_to_seconds(start_time)
+        end_seconds = time_str_to_seconds(end_time)
+        
+        # 加载视频文件
+        video = VideoFileClip(video_path)
+        
+        # 计算剪辑时长
+        clip_duration = end_seconds - start_seconds
+        print(f"原视频总长度: {format_duration(video.duration)}")
+        print(f"剪辑时长: {format_duration(clip_duration)}")
+        
+        # 剪辑视频
+        video = video.subclip(start_seconds, end_seconds)
+        
+        # 添加错误处理的写入过程
+        try:
+            video.write_videofile(
+                output_path,
+                codec='libx264',
+                audio_codec='aac',
+                temp_audiofile='temp-audio.m4a',
+                remove_temp=True
+            )
+        except IOError as e:
+            print(f"写入视频文件时发生错误：{str(e)}")
+            raise
+        finally:
+            # 确保资源被释放
+            video.close()
+            
+    except Exception as e:
+        print(f"视频剪辑过程中发生错误：{str(e)}")
+        raise
 
 
 if __name__ == "__main__":
diff --git a/app/test/test_qwen.py b/app/test/test_qwen.py
new file mode 100644
index 0000000..77bca56
--- /dev/null
+++ b/app/test/test_qwen.py
@@ -0,0 +1,93 @@
+import os
+import traceback
+import json
+from openai import OpenAI
+from test_moviepy import cut_video
+from app.utils import utils
+from app.services.subtitle import extract_audio_and_create_subtitle
+
+
+def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
+    """
+    与通义千问AI模型进行对话
+    
+    Args:
+        prompt (str): 用户输入的问题或提示
+        system_message (str): 系统提示信息，用于设定AI助手的行为。默认为"You are a helpful assistant."
+        subtitle_path (str): 字幕文件路径
+    Returns:
+        str: AI助手的回复内容
+
+    Raises:
+        Exception: 当API调用失败时抛出异常
+    """
+    try:
+        client = OpenAI(
+            api_key="sk-",
+            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+        )
+
+        # 读取字幕文件
+        with open(subtitle_path, "r", encoding="utf-8") as file:
+            subtitle_content = file.read()
+
+        completion = client.chat.completions.create(
+            model="qwen-turbo-2024-11-01",
+            messages=[
+                {'role': 'system', 'content': system_message},
+                {'role': 'user', 'content': prompt + subtitle_content}
+            ]
+        )
+        return completion.choices[0].message.content
+
+    except Exception as e:
+        error_message = f"调用千问API时发生错误：{str(e)}"
+        print(error_message)
+        print("请参考文档：https://help.aliyun.com/zh/model-studio/developer-reference/error-code")
+        raise Exception(error_message)
+
+
+# 使用示例
+if __name__ == "__main__":
+    try:
+        # video_path = utils.video_dir("duanju_yuansp.mp4")
+        # # 判断视频是否存在
+        # if not os.path.exists(video_path):
+        #     print(f"视频文件不存在：{video_path}")
+        #     exit(1)
+        # 提取字幕
+        subtitle_path = os.path.join(utils.video_dir(""), f"duanju_yuan.srt")
+        # extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
+        # 分析字幕
+        system_message = """
+        你是一个视频srt字幕分析剪辑器, 输入视频的srt字幕, 分析其中的精彩且尽可能连续的片段并裁剪出来, 注意确保文字与时间戳的正确匹配。
+        输出需严格按照如下 json 格式: 
+        [
+            {
+                "timestamp": "00:50-01:44",
+                "picture": "画面1",
+                "narration": "播放原声",
+                "OST": 0,
+                "new_timestamp": "00:00-00:54"
+            },
+            {
+                "timestamp": "01:49-02:30",
+                "picture": "画面2",
+                "narration": "播放原声",
+                "OST": 2,
+                "new_timestamp": "00:54-01:35"
+            },
+        ]
+        """
+        prompt = "字幕如下：\n"
+        response = chat_with_qwen(prompt, system_message, subtitle_path)
+        print(response)
+        # 保存json，注意json中是时间戳需要转换为 分:秒(现在的时间是 "timestamp": "00:00:00,020-00:00:01,660", 需要转换为 "timestamp": "00:00-01:66")
+        # response = json.loads(response)
+        # for item in response:
+        #     item["timestamp"] = item["timestamp"].replace(":", "-")
+        # with open(os.path.join(utils.video_dir(""), "duanju_yuan.json"), "w", encoding="utf-8") as file:
+        #     json.dump(response, file, ensure_ascii=False)
+
+    except Exception as e:
+        print(traceback.format_exc())

From 86d398d8fd5eee96d73f8b53fb46d55c606c801e Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Wed, 20 Nov 2024 18:12:45 +0800
Subject: [PATCH 8/9] =?UTF-8?q?feat(audio):=20=E6=94=B9=E8=BF=9B=E9=9F=B3?=
 =?UTF-8?q?=E9=A2=91=E5=90=88=E5=B9=B6=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=20OST=20=E8=AE=BE=E7=BD=AE=EF=BC=8C=E6=8F=90=E5=8D=87?=
 =?UTF-8?q?=E6=97=B6=E9=97=B4=E6=88=B3=E7=B2=BE=E5=BA=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 重构了 merge_audio_files 函数，增加了对 OST 设置的支持
- 新增 time_to_seconds 函数，支持多种时间格式的转换
- 修改了 audio_merger 模块的逻辑，根据 OST 设置处理音频
- 更新了 task 模块中的 start_subclip 函数，传入 OST 信息
- 优化了 subtitle 和 video 模块的逻辑，适应新的音频处理方式
---
 app/models/schema.py         |   2 +
 app/services/audio_merger.py | 174 +++++++++--------
 app/services/material.py     |  73 +++++--
 app/services/task.py         | 203 +++++---------------
 app/services/video.py        | 360 +++++++++++++++++++++++------------
 app/test/test_moviepy.py     |  53 ++++--
 app/test/test_qwen.py        |  26 ++-
 app/utils/utils.py           |  72 ++++++-
 video_pipeline.py            |  11 +-
 webui.txt                    |   4 +-
 10 files changed, 566 insertions(+), 412 deletions(-)

diff --git a/app/models/schema.py b/app/models/schema.py
index 9d0c5d4..6621772 100644
--- a/app/models/schema.py
+++ b/app/models/schema.py
@@ -366,6 +366,8 @@ class VideoClipParams(BaseModel):
     custom_position: float = Field(default=70.0, description="自定义位置")
 
     n_threads: Optional[int] = 8    # 线程数，有助于提升视频处理速度
+    tts_volume: float = 1.0  # TTS音频音量
+    video_volume: float = 0.1  # 视频原声音量
 
 class VideoTranscriptionRequest(BaseModel):
     video_name: str
diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py
index f0face0..c7edc77 100644
--- a/app/services/audio_merger.py
+++ b/app/services/audio_merger.py
@@ -18,95 +18,119 @@ def check_ffmpeg():
         return False
 
 
-def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list):
+def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list):
     """
-    合并多个音频文件到一个指定总时长的音频文件中，并生成相应的字幕
-    :param task_id: 任务ID
-    :param audio_file_paths: 音频文件路径列表
-    :param total_duration: 最终音频文件的总时长（秒）
-    :param video_script: JSON格式的视频脚本
+    合并音频文件，根据OST设置处理不同的音频轨道
+    
+    Args:
+        task_id: 任务ID
+        audio_files: TTS生成的音频文件列表
+        total_duration: 总时长
+        list_script: 完整脚本信息，包含OST设置
+    
+    Returns:
+        str: 合并后的音频文件路径
     """
-    output_dir = utils.task_dir(task_id)
-
+    # 检查FFmpeg是否安装
     if not check_ffmpeg():
-        logger.error("错误：FFmpeg未安装。请安装FFmpeg后再运行此脚本。")
-        return None, None
+        logger.error("FFmpeg未安装，无法合并音频文件")
+        return None
 
-    # 创建一个总时长为total_duration的空白音频
-    blank_audio = AudioSegment.silent(duration=total_duration * 1000)  # pydub使用毫秒
+    # 创建一个空的音频片段
+    final_audio = AudioSegment.silent(duration=total_duration * 1000)  # 总时长以毫秒为单位
 
-    for audio_path in audio_file_paths:
-        if not os.path.exists(audio_path):
-            logger.info(f"警告：文件 {audio_path} 不存在，已跳过。")
+    # 遍历脚本中的每个片段
+    for segment, audio_file in zip(list_script, audio_files):
+        try:
+            # 加载TTS音频文件
+            tts_audio = AudioSegment.from_file(audio_file)
+
+            # 获取片段的开始和结束时间
+            start_time, end_time = segment['new_timestamp'].split('-')
+            start_seconds = utils.time_to_seconds(start_time)
+            end_seconds = utils.time_to_seconds(end_time)
+
+            # 根据OST设置处理音频
+            if segment['OST'] == 0:
+                # 只使用TTS音频
+                final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000)
+            elif segment['OST'] == 1:
+                # 只使用原声（假设原声已经在视频中）
+                continue
+            elif segment['OST'] == 2:
+                # 混合TTS音频和原声
+                original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000)
+                mixed_audio = original_audio.overlay(tts_audio)
+                final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000)
+
+        except Exception as e:
+            logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}")
             continue
 
-        # 从文件名中提取时间戳
-        filename = os.path.basename(audio_path)
-        start_time, end_time = extract_timestamp(filename)
+    # 保存合并后的音频文件
+    output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3")
+    final_audio.export(output_audio_path, format="mp3")
+    logger.info(f"合并后的音频文件已保存: {output_audio_path}")
 
-        # 读取音频文件
-        try:
-            audio = AudioSegment.from_mp3(audio_path)
-        except Exception as e:
-            logger.error(f"错误：无法读取文件 {audio_path}。错误信息：{str(e)}")
-            continue
-        
-        # 将音频插入到空白音频的指定位置
-        blank_audio = blank_audio.overlay(audio, position=start_time * 1000)
-
-    # 尝试导出为WAV格式
-    try:
-        output_file = os.path.join(output_dir, "audio.wav")
-        blank_audio.export(output_file, format="wav")
-        logger.info(f"音频合并完成，已保存为 {output_file}")
-    except Exception as e:
-        logger.info(f"导出为WAV格式失败，尝试使用MP3格式：{str(e)}")
-        try:
-            output_file = os.path.join(output_dir, "audio.mp3")
-            blank_audio.export(output_file, format="mp3", codec="libmp3lame")
-            logger.info(f"音频合并完成，已保存为 {output_file}")
-        except Exception as e:
-            logger.error(f"导出音频失败：{str(e)}")
-            return None, None
-
-    return output_file
-
-def parse_timestamp(timestamp: str):
-    """解析时间戳字符串为秒数"""
-    # 确保使用冒号作为分隔符
-    timestamp = timestamp.replace('_', ':')
-    return time_to_seconds(timestamp)
-
-def extract_timestamp(filename):
-    """从文件名中提取开始和结束时间戳"""
-    # 从 "audio_00_06-00_24.mp3" 这样的格式中提取时间
-    time_part = filename.split('_', 1)[1].split('.')[0]  # 获取 "00_06-00_24" 部分
-    start_time, end_time = time_part.split('-')  # 分割成 "00_06" 和 "00_24"
-    
-    # 将下划线格式转换回冒号格式
-    start_time = start_time.replace('_', ':')
-    end_time = end_time.replace('_', ':')
-    
-    # 将时间戳转换为秒
-    start_seconds = time_to_seconds(start_time)
-    end_seconds = time_to_seconds(end_time)
-
-    return start_seconds, end_seconds
+    return output_audio_path
 
 
 def time_to_seconds(time_str):
-    """将 "00:06" 或 "00_06" 格式转换为总秒数"""
-    # 确保使用冒号作为分隔符
-    time_str = time_str.replace('_', ':')
+    """
+    将时间字符串转换为秒数，支持多种格式：
+    1. 'HH:MM:SS,mmm' (时:分:秒,毫秒)
+    2. 'MM:SS,mmm' (分:秒,毫秒)
+    3. 'SS,mmm' (秒,毫秒)
+    """
     try:
-        parts = time_str.split(':')
-        if len(parts) != 2:
-            logger.error(f"Invalid time format: {time_str}")
-            return 0
-        return int(parts[0]) * 60 + int(parts[1])
+        # 处理毫秒部分
+        if ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 分割时间部分
+        parts = time_part.split(':')
+        
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(int, parts)
+            seconds = h * 3600 + m * 60 + s
+        elif len(parts) == 2:  # MM:SS
+            m, s = map(int, parts)
+            seconds = m * 60 + s
+        else:  # SS
+            seconds = int(parts[0])
+
+        return seconds + ms
     except (ValueError, IndexError) as e:
         logger.error(f"Error parsing time {time_str}: {str(e)}")
-        return 0
+        return 0.0
+
+
+def extract_timestamp(filename):
+    """
+    从文件名中提取开始和结束时间戳
+    例如: "audio_00_06,500-00_24,800.mp3" -> (6.5, 24.8)
+    """
+    try:
+        # 从文件名中提取时间部分
+        time_part = filename.split('_', 1)[1].split('.')[0]  # 获取 "00_06,500-00_24,800" 部分
+        start_time, end_time = time_part.split('-')  # 分割成开始和结束时间
+        
+        # 将下划线格式转换回冒号格式
+        start_time = start_time.replace('_', ':')
+        end_time = end_time.replace('_', ':')
+        
+        # 将时间戳转换为秒
+        start_seconds = time_to_seconds(start_time)
+        end_seconds = time_to_seconds(end_time)
+
+        return start_seconds, end_seconds
+    except Exception as e:
+        logger.error(f"Error extracting timestamp from {filename}: {str(e)}")
+        return 0.0, 0.0
 
 
 if __name__ == "__main__":
diff --git a/app/services/material.py b/app/services/material.py
index 696eda8..5ec6ee4 100644
--- a/app/services/material.py
+++ b/app/services/material.py
@@ -3,6 +3,7 @@ import subprocess
 import random
 import traceback
 from urllib.parse import urlencode
+from datetime import datetime
 
 import requests
 from typing import List
@@ -253,34 +254,58 @@ def download_videos(
 
 def time_to_seconds(time_str: str) -> float:
     """
-    将时间字符串转换为秒数
-    支持格式：
-    1. "MM:SS" (分:秒)
-    2. "SS" (纯秒数)
+    将时间字符串转换为秒数，支持多种格式：
+    1. 'HH:MM:SS,mmm' (时:分:秒,毫秒)
+    2. 'MM:SS' (分:秒)
+    3. 'SS' (秒)
     """
-    parts = time_str.split(':')
-    if len(parts) == 2:
-        minutes, seconds = map(float, parts)
-        return minutes * 60 + seconds
-    return float(time_str)
+    try:
+        # 处理毫秒部分
+        if ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = int(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 根据格式分别处理
+        parts = time_part.split(':')
+        if len(parts) == 3:  # HH:MM:SS
+            time_obj = datetime.strptime(time_part, "%H:%M:%S")
+            seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second
+        elif len(parts) == 2:  # MM:SS
+            time_obj = datetime.strptime(time_part, "%M:%S")
+            seconds = time_obj.minute * 60 + time_obj.second
+        else:  # SS
+            seconds = float(time_part)
+
+        return seconds + ms
+    except ValueError as e:
+        logger.error(f"时间格式错误: {time_str}")
+        raise ValueError(f"时间格式错误，支持的格式：HH:MM:SS,mmm 或 MM:SS 或 SS") from e
 
 
 def format_timestamp(seconds: float) -> str:
     """
-    将秒数转换为 "MM:SS" 格式的时间字符串
+    将秒数转换为可读的时间格式 (HH:MM:SS,mmm)
     """
-    minutes = int(seconds) // 60
-    secs = int(seconds) % 60
-    return f"{minutes:02d}:{secs:02d}"
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    seconds_remain = seconds % 60
+    whole_seconds = int(seconds_remain)
+    milliseconds = int((seconds_remain - whole_seconds) * 1000)
+    
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
 
 
 def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
     """
     保存剪辑后的视频
     Args:
-        timestamp: 需要裁剪的单个时间戳，支持两种格式：
-                  1. '00:36-00:40' (分:秒-分:秒)
-                  2. 'SS-SS' (秒-秒)
+        timestamp: 需要裁剪的单个时间戳，支持格式：
+                  1. 'HH:MM:SS,mmm-HH:MM:SS,mmm' (时:分:秒,毫秒)
+                  2. 'MM:SS-MM:SS' (分:秒-分:秒)
+                  3. 'SS-SS' (秒-秒)
         origin_video: 原视频路径
         save_dir: 存储目录
 
@@ -293,7 +318,7 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
     if not os.path.exists(save_dir):
         os.makedirs(save_dir)
 
-    video_id = f"vid-{timestamp.replace(':', '_')}"
+    video_id = f"vid-{timestamp.replace(':', '_').replace(',', '-')}"
     video_path = f"{save_dir}/{video_id}.mp4"
 
     if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
@@ -312,12 +337,12 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
         
         # 验证时间段是否有效
         if start >= total_duration:
-            logger.warning(f"起始时间 {format_timestamp(start)} ({start:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)")
+            logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
             video.close()
             return {}
             
         if end > total_duration:
-            logger.warning(f"结束时间 {format_timestamp(end)} ({end:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)，将自动调整为视频结尾")
+            logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)，将自动调整为视频结尾")
             end = total_duration
             
         if end <= start:
@@ -332,7 +357,15 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
         
         try:
             # 检查视频是否有音频轨道并写入文件
-            subclip.write_videofile(video_path, audio=(subclip.audio is not None), logger=None)
+            subclip.write_videofile(
+                video_path,
+                codec='libx264',
+                audio_codec='aac',
+                temp_audiofile='temp-audio.m4a',
+                remove_temp=True,
+                audio=(subclip.audio is not None),
+                logger=None
+            )
             
             # 验证生成的视频文件
             if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
diff --git a/app/services/task.py b/app/services/task.py
index c030574..5cd31ed 100644
--- a/app/services/task.py
+++ b/app/services/task.py
@@ -206,134 +206,14 @@ def generate_final_videos(
     return final_video_paths, combined_video_paths
 
 
-def start(task_id, params: VideoParams, stop_at: str = "video"):
-    logger.info(f"start task: {task_id}, stop_at: {stop_at}")
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
-
-    if type(params.video_concat_mode) is str:
-        params.video_concat_mode = VideoConcatMode(params.video_concat_mode)
-
-    # 1. Generate script
-    video_script = generate_script(task_id, params)
-    if not video_script:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
-
-    if stop_at == "script":
-        sm.state.update_task(
-            task_id, state=const.TASK_STATE_COMPLETE, progress=100, script=video_script
-        )
-        return {"script": video_script}
-
-    # 2. Generate terms
-    video_terms = ""
-    if params.video_source != "local":
-        video_terms = generate_terms(task_id, params, video_script)
-        if not video_terms:
-            sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-            return
-
-    save_script_data(task_id, video_script, video_terms, params)
-
-    if stop_at == "terms":
-        sm.state.update_task(
-            task_id, state=const.TASK_STATE_COMPLETE, progress=100, terms=video_terms
-        )
-        return {"script": video_script, "terms": video_terms}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
-
-    # 3. Generate audio
-    audio_file, audio_duration, sub_maker = generate_audio(task_id, params, video_script)
-    if not audio_file:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
-
-    if stop_at == "audio":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            audio_file=audio_file,
-        )
-        return {"audio_file": audio_file, "audio_duration": audio_duration}
-
-    # 4. Generate subtitle
-    subtitle_path = generate_subtitle(task_id, params, video_script, sub_maker, audio_file)
-
-    if stop_at == "subtitle":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            subtitle_path=subtitle_path,
-        )
-        return {"subtitle_path": subtitle_path}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
-
-    # 5. Get video materials
-    downloaded_videos = get_video_materials(
-        task_id, params, video_terms, audio_duration
-    )
-    if not downloaded_videos:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    if stop_at == "materials":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            materials=downloaded_videos,
-        )
-        return {"materials": downloaded_videos}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
-
-    # 6. Generate final videos
-    final_video_paths, combined_video_paths = generate_final_videos(
-        task_id, params, downloaded_videos, audio_file, subtitle_path
-    )
-
-    if not final_video_paths:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    logger.success(
-        f"task {task_id} finished, generated {len(final_video_paths)} videos."
-    )
-
-    kwargs = {
-        "videos": final_video_paths,
-        "combined_videos": combined_video_paths,
-        "script": video_script,
-        "terms": video_terms,
-        "audio_file": audio_file,
-        "audio_duration": audio_duration,
-        "subtitle_path": subtitle_path,
-        "materials": downloaded_videos,
-    }
-    sm.state.update_task(
-        task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs
-    )
-    return kwargs
-
-
 def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
-    """
-    后台任务（自动剪辑视频进行剪辑）
-
-        task_id: 任务ID
-        params: 剪辑参数
-        subclip_path_videos: 视频文件路径
-
-    """
+    """后台任务（自动剪辑视频进行剪辑）"""
     logger.info(f"\n\n## 开始任务: {task_id}")
+    
+    # 初始化 ImageMagick
+    if not utils.init_imagemagick():
+        logger.warning("ImageMagick 初始化失败，字幕可能无法正常显示")
+    
     sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
 
     # tts 角色名称
@@ -341,8 +221,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
 
     logger.info("\n\n## 1. 加载视频脚本")
     video_script_path = path.join(params.video_clip_json_path)
-    # video_script_path = video_clip_json_path
-    # 判断json文件是否存在
+    
     if path.exists(video_script_path):
         try:
             with open(video_script_path, "r", encoding="utf-8") as f:
@@ -355,10 +234,12 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
                 logger.debug(f"解说完整脚本: \n{video_script}")
                 logger.debug(f"解说 OST 列表: \n{video_ost}")
                 logger.debug(f"解说时间戳列表: \n{time_list}")
+                
                 # 获取视频总时长(单位 s)
-                total_duration = list_script[-1]['new_timestamp']
-                total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int(
-                    total_duration.split("-")[1].split(":")[1])
+                last_timestamp = list_script[-1]['new_timestamp']
+                end_time = last_timestamp.split("-")[1]
+                total_duration = utils.time_to_seconds(end_time)
+                
         except Exception as e:
             logger.error(f"无法读取视频json脚本，请检查配置是否正确。{e}")
             raise ValueError("无法读取视频json脚本，请检查配置是否正确")
@@ -366,32 +247,51 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
         logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
         raise ValueError("解说脚本不存在！请检查配置是否正确。")
 
-    logger.info("\n\n## 2. 生成音频列表")
-    audio_files, sub_maker_list = voice.tts_multiple(
-        task_id=task_id,
-        list_script=list_script,
-        voice_name=voice_name,
-        voice_rate=params.voice_rate,
-        voice_pitch=params.voice_pitch,
-        force_regenerate=True
+    logger.info("\n\n## 2. 根据OST设置生成音频列表")
+    # 只为OST=0或2的片段生成TTS音频
+    tts_segments = [
+        segment for segment in list_script 
+        if segment['OST'] in [0, 2]
+    ]
+    logger.debug(f"tts_segments: {tts_segments}")
+    if tts_segments:
+        audio_files, sub_maker_list = voice.tts_multiple(
+            task_id=task_id,
+            list_script=tts_segments,  # 只传入需要TTS的片段
+            voice_name=voice_name,
+            voice_rate=params.voice_rate,
+            voice_pitch=params.voice_pitch,
+            force_regenerate=True
+        )
+        if audio_files is None:
+            sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+            logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
+            return
+    else:
+        audio_files = []
+        
+    logger.info(f"合并音频文件:\n{audio_files}")
+    # 传入OST信息以便正确处理音频
+    final_audio = audio_merger.merge_audio_files(
+        task_id=task_id, 
+        audio_files=audio_files, 
+        total_duration=total_duration, 
+        list_script=list_script  # 传入完整脚本以便处理OST
     )
-    if audio_files is None:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        logger.error(
-            "TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
-        return
-    logger.info(f"合并音频:\n\n {audio_files}")
-    audio_file = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script)
 
     sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
 
+    # 只为OST=0或2的片段生成字幕
     subtitle_path = ""
     if params.subtitle_enabled:
         subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
         subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
         logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
-        # 使用 faster-whisper-large-v2 模型生成字幕
-        subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
+         
+        subtitle.create(
+            audio_file=final_audio,
+            subtitle_file=subtitle_path,
+        )
 
         subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
         if not subtitle_lines:
@@ -434,14 +334,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
 
     final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
 
-    logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}")
-    # 把所有东西合到在一起
+    logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
+    # 传入OST信息以便正确处理音频和视频
     video.generate_video_v2(
         video_path=combined_video_path,
-        audio_path=audio_file,
+        audio_path=final_audio,
         subtitle_path=subtitle_path,
         output_file=final_video_path,
         params=params,
+        list_script=list_script  # 传入完整脚本以便处理OST
     )
 
     _progress += 50 / 2
diff --git a/app/services/video.py b/app/services/video.py
index 1d270fa..8e6e32d 100644
--- a/app/services/video.py
+++ b/app/services/video.py
@@ -173,7 +173,7 @@ def wrap_text(text, max_width, font, fontsize=60):
     if width <= max_width:
         return text, height
 
-    logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 文本: {text}")
+    logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 本: {text}")
 
     processed = True
 
@@ -228,131 +228,93 @@ def manage_clip(clip):
 
 
 def generate_video_v2(
-        video_path: str,
-        audio_path: str,
-        subtitle_path: str,
-        output_file: str,
-        params: Union[VideoParams, VideoClipParams],
-        progress_callback=None,
+    video_path: str,
+    audio_path: str,
+    subtitle_path: str,
+    output_file: str,
+    params: VideoClipParams,
+    list_script: list = None
 ):
     """
-    合并所有素材
+    生成最终视频，处理音频和字幕
+
     Args:
-        video_path: 视频路径
-        audio_path: 单个音频文件路径
+        video_path: 视频文件路径
+        audio_path: 音频文件路径
         subtitle_path: 字幕文件路径
         output_file: 输出文件路径
         params: 视频参数
-        progress_callback: 进度回调函数，接收 0-100 的进度值
-
-    Returns:
-
+        list_script: 视频脚本列表，包含OST设置
     """
-    total_steps = 4
-    current_step = 0
-    
-    def update_progress(step_name):
-        nonlocal current_step
-        current_step += 1
-        if progress_callback:
-            progress_callback(int(current_step * 100 / total_steps))
-        logger.info(f"完成步骤: {step_name}")
-
     try:
-        validate_params(video_path, audio_path, output_file, params)
+        video_clip = VideoFileClip(video_path)
         
-        with manage_clip(VideoFileClip(video_path)) as video_clip:
-            aspect = VideoAspect(params.video_aspect)
-            video_width, video_height = aspect.to_resolution()
-
-            logger.info(f"开始，视频尺寸: {video_width} x {video_height}")
-            logger.info(f"  ① 视频: {video_path}")
-            logger.info(f"  ② 音频: {audio_path}")
-            logger.info(f"  ③ 字幕: {subtitle_path}")
-            logger.info(f"  ④ 输出: {output_file}")
-
-            output_dir = os.path.dirname(output_file)
-            update_progress("初始化完成")
-
-            # 字体设置
-            font_path = ""
-            if params.subtitle_enabled:
-                if not params.font_name:
-                    params.font_name = "STHeitiMedium.ttc"
-                font_path = os.path.join(utils.font_dir(), params.font_name)
-                if os.name == "nt":
-                    font_path = font_path.replace("\\", "/")
-                logger.info(f"使用字体: {font_path}")
-
-            def create_text_clip(subtitle_item):
-                phrase = subtitle_item[1]
-                max_width = video_width * 0.9
-                wrapped_txt, txt_height = wrap_text(
-                    phrase, max_width=max_width, font=font_path, fontsize=params.font_size
-                )
-                _clip = TextClip(
-                    wrapped_txt,
-                    font=font_path,
-                    fontsize=params.font_size,
-                    color=params.text_fore_color,
-                    bg_color=params.text_background_color,
-                    stroke_color=params.stroke_color,
-                    stroke_width=params.stroke_width,
-                    print_cmd=False,
-                )
-                duration = subtitle_item[0][1] - subtitle_item[0][0]
-                _clip = _clip.set_start(subtitle_item[0][0])
-                _clip = _clip.set_end(subtitle_item[0][1])
-                _clip = _clip.set_duration(duration)
-                
-                if params.subtitle_position == "bottom":
-                    _clip = _clip.set_position(("center", video_height * 0.95 - _clip.h))
-                elif params.subtitle_position == "top":
-                    _clip = _clip.set_position(("center", video_height * 0.05))
-                elif params.subtitle_position == "custom":
-                    margin = 10
-                    max_y = video_height - _clip.h - margin
-                    min_y = margin
-                    custom_y = (video_height - _clip.h) * (params.custom_position / 100)
-                    custom_y = max(min_y, min(custom_y, max_y))
-                    _clip = _clip.set_position(("center", custom_y))
-                else:  # center
-                    _clip = _clip.set_position(("center", "center"))
-                return _clip
-
-            update_progress("字体设置完成")
-
-            # 处理音频
-            original_audio = video_clip.audio
-            video_duration = video_clip.duration
-            new_audio = AudioFileClip(audio_path)
-            final_audio = process_audio_tracks(original_audio, new_audio, params, video_duration)
-            update_progress("音频处理完成")
-
-            # 处理字幕
-            if subtitle_path and os.path.exists(subtitle_path):
-                video_clip = process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip)
-            update_progress("字幕处理完成")
-
-            # 合并音频和导出
-            video_clip = video_clip.set_audio(final_audio)
-            video_clip.write_videofile(
-                output_file,
-                audio_codec="aac",
-                temp_audiofile=os.path.join(output_dir, "temp-audio.m4a"),
-                threads=params.n_threads,
-                logger=None,
-                fps=30,
-            )
+        # 处理音频
+        if audio_path and os.path.exists(audio_path):
+            audio_clip = AudioFileClip(audio_path)
             
-    except FileNotFoundError as e:
-        logger.error(f"文件不存在: {str(e)}")
-        raise
+            if list_script:
+                # 根据OST设置处理音频
+                # OST=0: 只使用TTS音频
+                # OST=1: 只使用视频原声
+                # OST=2: 混合TTS音频和视频原声
+                original_audio = video_clip.audio
+                
+                # 设置音频音量
+                tts_volume = params.tts_volume if hasattr(params, 'tts_volume') else 1.0
+                video_volume = params.video_volume if hasattr(params, 'video_volume') else 0.1
+                
+                # 创建最终音频
+                if original_audio:
+                    # 有些片段需要原声，有些需要TTS
+                    final_audio = CompositeAudioClip([
+                        audio_clip.volumex(tts_volume),  # TTS音频
+                        original_audio.volumex(video_volume)  # 原声音频
+                    ])
+                else:
+                    final_audio = audio_clip.volumex(tts_volume)
+            else:
+                # 如果没有OST设置，使用默认行为
+                final_audio = audio_clip
+                
+            video_clip = video_clip.set_audio(final_audio)
+
+        # 处理字幕
+        if subtitle_path and os.path.exists(subtitle_path):
+            # 添加字幕
+            video_clip = add_subtitles(
+                video_clip,
+                subtitle_path,
+                params.font_size,
+                params.font_name,
+                params.text_fore_color,
+                params.subtitle_position,
+                params.stroke_color,
+                params.stroke_width
+            )
+
+        # 写入最终视频文件
+        video_clip.write_videofile(
+            output_file,
+            codec="libx264",
+            audio_codec="aac",
+            temp_audiofile="temp-audio.m4a",
+            remove_temp=True,
+            threads=params.n_threads
+        )
+
     except Exception as e:
-        logger.error(f"视频生成失败: {str(e)}")
-        raise
+        logger.error(f"生成视频时发生错误: {str(e)}")
+        raise e
+
     finally:
-        logger.success("完成")
+        # 清理资源
+        if 'video_clip' in locals():
+            video_clip.close()
+        if 'audio_clip' in locals():
+            audio_clip.close()
+        if 'final_audio' in locals():
+            final_audio.close()
 
 
 def process_audio_tracks(original_audio, new_audio, params, video_duration):
@@ -389,7 +351,7 @@ def process_subtitles(subtitle_path, video_clip, video_duration, create_text_cli
     for item in sub.subtitles:
         clip = create_text_clip(subtitle_item=item)
         
-        # 时间范围调整
+        # 时间范围调整
         start_time = max(clip.start, 0)
         if start_time >= video_duration:
             continue
@@ -450,12 +412,12 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
 
 
 def combine_clip_videos(combined_video_path: str,
-                        video_paths: List[str],
-                        video_ost_list: List[int],
-                        list_script: list,
-                        video_aspect: VideoAspect = VideoAspect.portrait,
-                        threads: int = 2,
-                        ) -> str:
+                       video_paths: List[str],
+                       video_ost_list: List[int],
+                       list_script: list,
+                       video_aspect: VideoAspect = VideoAspect.portrait,
+                       threads: int = 2,
+                       ) -> str:
     """
     合并子视频
     Args:
@@ -469,9 +431,18 @@ def combine_clip_videos(combined_video_path: str,
     Returns:
         str: 合并后的视频路径
     """
-    from app.utils.utils import calculate_total_duration
-    audio_duration = calculate_total_duration(list_script)
-    logger.info(f"音频的最大持续时间: {audio_duration} s")
+    # 计算总时长时需要考虑毫秒精度
+    total_duration = 0.0
+    for item in list_script:
+        timestamp = item.get('new_timestamp', '')
+        if timestamp:
+            start_str, end_str = timestamp.split('-')
+            start_time = utils.time_to_seconds(start_str)
+            end_time = utils.time_to_seconds(end_str)
+            duration = end_time - start_time
+            total_duration += duration
+            
+    logger.info(f"音频的最大持续时间: {total_duration:.3f} s")
     
     output_dir = os.path.dirname(combined_video_path)
     aspect = VideoAspect(video_aspect)
@@ -480,11 +451,17 @@ def combine_clip_videos(combined_video_path: str,
     clips = []
     for video_path, video_ost in zip(video_paths, video_ost_list):
         try:
+            # 加载视频片段
             clip = VideoFileClip(video_path)
             
+            # 根据OST设置处理音频
             if video_ost == 0:  # 不保留原声
                 clip = clip.without_audio()
-            # video_ost 为 1 或 2 时都保留原声，不需要特殊处理
+            elif video_ost == 1:  # 只保留原声
+                # 保持原声，但可能需要调整音量
+                if clip.audio:
+                    clip = clip.set_audio(clip.audio.volumex(1.0))  # 可以调整音量系数
+            # OST == 2 的情况会在后续处理中混合音频
                 
             clip = clip.set_fps(30)
 
@@ -498,6 +475,16 @@ def combine_clip_videos(combined_video_path: str,
                 )
                 logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")
 
+            # 精确控制视频时长
+            filename = os.path.basename(video_path)
+            timestamp = extract_timestamp_from_filename(filename)
+            if timestamp:
+                start_time, end_time = timestamp
+                clip_duration = end_time - start_time
+                if abs(clip.duration - clip_duration) > 0.1:  # 允许0.1秒的误差
+                    logger.warning(f"视频 {video_path} 时长与时间戳不匹配，进行调整")
+                    clip = clip.set_duration(clip_duration)
+
             clips.append(clip)
             
         except Exception as e:
@@ -508,6 +495,7 @@ def combine_clip_videos(combined_video_path: str,
         raise ValueError("没有有效的视频片段可以合并")
 
     try:
+        # 合并所有视频片段
         video_clip = concatenate_videoclips(clips)
         video_clip = video_clip.set_fps(30)
         
@@ -521,7 +509,7 @@ def combine_clip_videos(combined_video_path: str,
             temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
         )
     finally:
-        # 确保资源被正确���放
+        # 确保资源被正确释放
         video_clip.close()
         for clip in clips:
             clip.close()
@@ -530,6 +518,59 @@ def combine_clip_videos(combined_video_path: str,
     return combined_video_path
 
 
+def extract_timestamp_from_filename(filename: str) -> tuple:
+    """
+    从文件名中提取时间戳，支持多种格式：
+    - "vid-00_06,500-00_24,800.mp4" -> (6.5, 24.8)
+    - "vid-00_00_00-020-00_00_10-400.mp4" -> (0.02, 10.4)
+    """
+    try:
+        # 提取时间戳部分
+        match = re.search(r'vid-(.+?)\.mp4$', filename)
+        if not match:
+            logger.warning(f"文件名格式不正确: {filename}")
+            return None
+            
+        timestamp = match.group(1)
+        
+        # 处理包含毫秒的格式 (00_00_00-020-00_00_10-400)
+        if timestamp.count('-') == 3:
+            parts = timestamp.split('-')
+            start_time = f"{parts[0]}-{parts[1]}"  # 组合开始时间和毫秒
+            end_time = f"{parts[2]}-{parts[3]}"    # 组合结束时间和毫秒
+            
+            # 转换开始时间
+            start_time_str = start_time.replace('_', ':')
+            if start_time_str.count(':') == 2:  # 如果是 00:00:00-020 格式
+                start_base = utils.time_to_seconds(start_time_str.split('-')[0])
+                start_ms = int(start_time_str.split('-')[1]) / 1000
+                start_seconds = start_base + start_ms
+            else:
+                start_seconds = utils.time_to_seconds(start_time_str)
+            
+            # 转换结束时间
+            end_time_str = end_time.replace('_', ':')
+            if end_time_str.count(':') == 2:  # 如果是 00:00:10-400 格式
+                end_base = utils.time_to_seconds(end_time_str.split('-')[0])
+                end_ms = int(end_time_str.split('-')[1]) / 1000
+                end_seconds = end_base + end_ms
+            else:
+                end_seconds = utils.time_to_seconds(end_time_str)
+                
+        # 处理简单格式 (00_06-00_24)
+        else:
+            start_str, end_str = timestamp.split('-')
+            start_seconds = utils.time_to_seconds(start_str.replace('_', ':'))
+            end_seconds = utils.time_to_seconds(end_str.replace('_', ':'))
+        
+        logger.debug(f"从文件名 {filename} 提取时间戳: {start_seconds:.3f} - {end_seconds:.3f}")
+        return start_seconds, end_seconds
+        
+    except Exception as e:
+        logger.error(f"从文件名提取时间戳失败 {filename}: {str(e)}\n{traceback.format_exc()}")
+        return None
+
+
 def resize_video_with_padding(clip, target_width: int, target_height: int):
     """辅助函数：调整视频尺寸并添加黑边"""
     clip_ratio = clip.w / clip.h
@@ -574,6 +615,71 @@ def validate_params(video_path, audio_path, output_file, params):
         raise ValueError("params 缺少必要参数 video_aspect")
 
 
+def add_subtitles(video_clip, subtitle_path, font_size, font_name, font_color, position, shadow_color, shadow_offset):
+    """
+    为视频添加字幕
+
+    Args:
+        video_clip: 视频剪辑对象
+        subtitle_path: 字幕文件路径
+        font_size: 字体大小
+        font_name: 字体名称
+        font_color: 字体颜色
+        position: 字幕位置 ('top', 'center', 'bottom')
+        shadow_color: 阴影颜色
+        shadow_offset: 阴影偏移
+
+    Returns:
+        带有字幕的视频剪辑对象
+    """
+    try:
+        # 确保字体文件存在
+        font_path = os.path.join(utils.font_dir(), font_name)
+        if not os.path.exists(font_path):
+            logger.error(f"字体文件不存在: {font_path}")
+            # 尝试使用系统默认字体
+            font_path = "Arial" if os.name == 'nt' else "/System/Library/Fonts/STHeiti Light.ttc"
+            logger.info(f"使用默认字体: {font_path}")
+
+        # 设置字幕位置
+        if position == "top":
+            pos = ("center", 50)
+        elif position == "center":
+            pos = "center"
+        else:  # bottom
+            pos = ("center", -50)
+
+        def subtitle_generator(txt):
+            return TextClip(
+                txt, 
+                fontsize=font_size,
+                font=font_path,
+                color=font_color,
+                stroke_color=shadow_color,
+                stroke_width=shadow_offset,
+                method='caption',  # 使用 caption 方法可能更稳定
+                size=(video_clip.w * 0.9, None)  # 限制字幕宽度
+            )
+
+        subtitles = SubtitlesClip(
+            subtitle_path,
+            subtitle_generator
+        )
+        
+        # 添加字幕到视频
+        video_with_subtitles = CompositeVideoClip([
+            video_clip,
+            subtitles.set_position(pos)
+        ])
+        
+        return video_with_subtitles
+
+    except Exception as e:
+        logger.error(f"添加字幕时出错: {str(e)}\n{traceback.format_exc()}")
+        # 如果添加字幕失败，返回原始视频
+        return video_clip
+
+
 if __name__ == "__main__":
     # combined_video_path = "../../storage/tasks/12312312/com123.mp4"
     #
@@ -586,7 +692,7 @@ if __name__ == "__main__":
     #     {
     #         "picture": "夜晚，一个小孩在树林里奔跑，后面有人拿着火把在追赶",
     #         "timestamp": "00:00-00:03",
-    #         "narration": "夜黑风高的树林，一个小孩在拼命奔跑，后面的人穷追不舍！",
+    #         "narration": "夜黑风高的树林，一个小孩在拼命奔跑，后面的人穷追不舍！",
     #         "OST": False,
     #         "new_timestamp": "00:00-00:03"
     #     },
diff --git a/app/test/test_moviepy.py b/app/test/test_moviepy.py
index 5b24ebf..79d93c2 100644
--- a/app/test/test_moviepy.py
+++ b/app/test/test_moviepy.py
@@ -1,5 +1,5 @@
 """
-使用 moviepy 库剪辑指定时间戳视频
+使用 moviepy 库剪辑指定时间戳视频，支持时分秒毫秒精度
 """
 
 from moviepy.editor import VideoFileClip
@@ -11,12 +11,22 @@ def time_str_to_seconds(time_str: str) -> float:
     """
     将时间字符串转换为秒数
     参数:
-        time_str: 格式为"MM:SS"的时间字符串
+        time_str: 格式为"HH:MM:SS,mmm"的时间字符串，例如"00:01:23,456"
     返回:
-        转换后的秒数
+        转换后的秒数(float)
     """
-    time_obj = datetime.strptime(time_str, "%M:%S")
-    return time_obj.minute * 60 + time_obj.second
+    try:
+        # 分离时间和毫秒
+        time_part, ms_part = time_str.split(',')
+        # 转换时分秒
+        time_obj = datetime.strptime(time_part, "%H:%M:%S")
+        # 计算总秒数
+        total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second
+        # 添加毫秒部分
+        total_seconds += int(ms_part) / 1000
+        return total_seconds
+    except ValueError as e:
+        raise ValueError("时间格式错误，请使用 HH:MM:SS,mmm 格式，例如 00:01:23,456") from e
 
 
 def format_duration(seconds: float) -> str:
@@ -25,11 +35,15 @@ def format_duration(seconds: float) -> str:
     参数:
         seconds: 秒数
     返回:
-        格式化的时间字符串 (MM:SS)
+        格式化的时间字符串 (HH:MM:SS,mmm)
     """
-    minutes = int(seconds // 60)
-    remaining_seconds = int(seconds % 60)
-    return f"{minutes:02d}:{remaining_seconds:02d}"
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    seconds_remain = seconds % 60
+    whole_seconds = int(seconds_remain)
+    milliseconds = int((seconds_remain - whole_seconds) * 1000)
+    
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
 
 
 def cut_video(video_path: str, start_time: str, end_time: str, output_path: str) -> None:
@@ -37,8 +51,8 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
     剪辑视频
     参数:
         video_path: 视频文件路径
-        start_time: 开始时间 (格式: "MM:SS")
-        end_time: 结束时间 (格式: "MM:SS")
+        start_time: 开始时间 (格式: "HH:MM:SS,mmm")
+        end_time: 结束时间 (格式: "HH:MM:SS,mmm")
         output_path: 输出文件路径
     """
     try:
@@ -62,10 +76,18 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
         # 加载视频文件
         video = VideoFileClip(video_path)
         
+        # 验证时间范围
+        if start_seconds >= video.duration or end_seconds > video.duration:
+            raise ValueError(f"剪辑时间超出视频长度！视频总长度为: {format_duration(video.duration)}")
+        
+        if start_seconds >= end_seconds:
+            raise ValueError("结束时间必须大于开始时间！")
+        
         # 计算剪辑时长
         clip_duration = end_seconds - start_seconds
         print(f"原视频总长度: {format_duration(video.duration)}")
         print(f"剪辑时长: {format_duration(clip_duration)}")
+        print(f"剪辑区间: {start_time} -> {end_time}")
         
         # 剪辑视频
         video = video.subclip(start_seconds, end_seconds)
@@ -92,6 +114,9 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
 
 
 if __name__ == "__main__":
-    # cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "00:00", "07:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-1")
-    # cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "07:00", "14:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-2")
-    cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "14:00", "22:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-3")
+    cut_video(
+        video_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp.mp4",
+        start_time="00:00:00,789",
+        end_time="00:02:00,123",
+        output_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp_cut3.mp4"
+    )
diff --git a/app/test/test_qwen.py b/app/test/test_qwen.py
index 77bca56..2a69225 100644
--- a/app/test/test_qwen.py
+++ b/app/test/test_qwen.py
@@ -2,11 +2,23 @@ import os
 import traceback
 import json
 from openai import OpenAI
-from test_moviepy import cut_video
+from pydantic import BaseModel
+from typing import List
 from app.utils import utils
 from app.services.subtitle import extract_audio_and_create_subtitle
 
 
+class Step(BaseModel):
+    timestamp: str
+    picture: str
+    narration: str
+    OST: int
+    new_timestamp: str
+
+class MathReasoning(BaseModel):
+    result: List[Step]
+
+
 def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
     """
     与通义千问AI模型进行对话
@@ -23,7 +35,7 @@ def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
     """
     try:
         client = OpenAI(
-            api_key="sk-",
+            api_key="sk-",
             base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
         )
 
@@ -50,25 +62,25 @@ def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
 # 使用示例
 if __name__ == "__main__":
     try:
-        # video_path = utils.video_dir("duanju_yuansp.mp4")
+        video_path = utils.video_dir("duanju_yuansp.mp4")
         # # 判断视频是否存在
         # if not os.path.exists(video_path):
         #     print(f"视频文件不存在：{video_path}")
         #     exit(1)
         # 提取字幕
         subtitle_path = os.path.join(utils.video_dir(""), f"duanju_yuan.srt")
-        # extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
+        extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
         # 分析字幕
         system_message = """
         你是一个视频srt字幕分析剪辑器, 输入视频的srt字幕, 分析其中的精彩且尽可能连续的片段并裁剪出来, 注意确保文字与时间戳的正确匹配。
-        输出需严格按照如下 json 格式: 
+        输出需严格按照如下 json 格式:
         [
             {
-                "timestamp": "00:50-01:44",
+                "timestamp": "00:00:50,020-00:01:44,000",
                 "picture": "画面1",
                 "narration": "播放原声",
                 "OST": 0,
-                "new_timestamp": "00:00-00:54"
+                "new_timestamp": "00:00:00,000-00:00:53,980"
             },
             {
                 "timestamp": "01:49-02:30",
diff --git a/app/utils/utils.py b/app/utils/utils.py
index 307823c..e864341 100644
--- a/app/utils/utils.py
+++ b/app/utils/utils.py
@@ -40,7 +40,7 @@ def to_json(obj):
             # 如果对象是二进制数据，转换为base64编码的字符串
             elif isinstance(o, bytes):
                 return "*** binary data ***"
-            # 如果对象是字典，递归处理每个键值对
+            # 如果对象是字典，递归处理每个键值对
             elif isinstance(o, dict):
                 return {k: serialize(v) for k, v in o.items()}
             # 如果对象是列表或元组，递归处理每个元素
@@ -302,15 +302,49 @@ def get_current_country():
 
 
 def time_to_seconds(time_str: str) -> float:
-    parts = time_str.split(':')
-    if len(parts) == 2:
-        m, s = map(float, parts)
-        return m * 60 + s
-    elif len(parts) == 3:
-        h, m, s = map(float, parts)
-        return h * 3600 + m * 60 + s
-    else:
-        raise ValueError(f"Invalid time format: {time_str}")
+    """
+    将时间字符串转换为秒数，支持多种格式：
+    - "HH:MM:SS,mmm" -> 小时:分钟:秒,毫秒
+    - "MM:SS,mmm" -> 分钟:秒,毫秒
+    - "SS,mmm" -> 秒,毫秒
+    - "SS-mmm" -> 秒-毫秒
+    
+    Args:
+        time_str: 时间字符串
+        
+    Returns:
+        float: 转换后的秒数(包含毫秒)
+    """
+    try:
+        # 处理带有'-'的毫秒格式
+        if '-' in time_str:
+            time_part, ms_part = time_str.split('-')
+            ms = float(ms_part) / 1000
+        # 处理带有','的毫秒格式
+        elif ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 分割时间部分
+        parts = time_part.split(':')
+        
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(float, parts)
+            seconds = h * 3600 + m * 60 + s
+        elif len(parts) == 2:  # MM:SS
+            m, s = map(float, parts)
+            seconds = m * 60 + s
+        else:  # SS
+            seconds = float(parts[0])
+
+        return seconds + ms
+        
+    except (ValueError, IndexError) as e:
+        logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
+        return 0.0
 
 
 def seconds_to_time(seconds: float) -> str:
@@ -520,3 +554,21 @@ def download_font(url: str, font_path: str):
     except Exception as e:
         logger.error(f"下载字体文件失败: {e}")
         raise
+
+def init_imagemagick():
+    """初始化 ImageMagick 配置"""
+    try:
+        # 检查 ImageMagick 是否已安装
+        import subprocess
+        result = subprocess.run(['magick', '-version'], capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error("ImageMagick 未安装或配置不正确")
+            return False
+            
+        # 设置 IMAGEMAGICK_BINARY 环境变量
+        os.environ['IMAGEMAGICK_BINARY'] = 'magick'
+        
+        return True
+    except Exception as e:
+        logger.error(f"初始化 ImageMagick 失败: {str(e)}")
+        return False
diff --git a/video_pipeline.py b/video_pipeline.py
index 5dca576..dc7fa26 100644
--- a/video_pipeline.py
+++ b/video_pipeline.py
@@ -93,10 +93,8 @@ class VideoPipeline:
         response.raise_for_status()
         return response.json()
     
-    def save_script_to_json(self, script: list, script_name: str) -> str:
-        """保存脚本到json文件"""
-        script_path = f"E:\\projects\\NarratoAI\\resource\\scripts\\{script_name}.json"
-        
+    def save_script_to_json(self, script: list, script_path: str) -> str:
+        """保存脚本到json文件"""        
         try:
             with open(script_path, 'w', encoding='utf-8') as f:
                 json.dump(script, f, ensure_ascii=False, indent=2)
@@ -133,8 +131,7 @@ class VideoPipeline:
             
             # 2.2 保存脚本到json文件
             print("保存脚本到json文件...")
-            script_path = self.save_script_to_json(script, script_name)
-            script_result["script_path"] = script_path
+            self.save_script_to_json(script=script, script_path=script_path)
             
             # 3. 剪辑视频
             print("开始剪辑视频...")
@@ -143,7 +140,7 @@ class VideoPipeline:
             
             # 4. 生成最终视频
             print("开始生成最终视频...")
-            final_result = self.generate_final_video(
+            self.generate_final_video(
                 task_id=task_id,
                 video_path=video_path,
                 script_path=script_path,
diff --git a/webui.txt b/webui.txt
index b64b320..c8d66c9 100644
--- a/webui.txt
+++ b/webui.txt
@@ -369,4 +369,6 @@ output_path和script参数需要传递给请求3
   }
 }
 subclip_videos和 output_path和script参数需要传递给请求4
-最后完成工作流
\ No newline at end of file
+最后完成工作流
+
+0代表只播放文案音频，禁用视频原声；1代表只播放视频原声，不需要播放文案音频和字幕；2代表既播放文案音频也要播放视频原声；
\ No newline at end of file

From 73729dcb7b7116ae858ee023f6d86d786a269c38 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Wed, 20 Nov 2024 18:32:34 +0800
Subject: [PATCH 9/9] =?UTF-8?q?feat(utils):=20=E4=BC=98=E5=8C=96=E6=97=B6?=
 =?UTF-8?q?=E9=97=B4=E6=88=B3=E5=A4=84=E7=90=86=E5=B9=B6=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E6=AF=AB=E7=A7=92=E7=BA=A7=E7=B2=BE=E5=BA=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 重构了时间戳转换函数，支持 HH:MM:SS,mmm 格式
- 优化了时间戳计算逻辑，提高了精度
- 更新了相关服务和工具类中的时间戳处理方法
---
 app/services/script_service.py | 59 +++++++++++++++++++++++++---------
 app/utils/script_generator.py  | 49 +++++++++++++++++++++-------
 app/utils/vision_analyzer.py   | 29 ++++++++++++++---
 3 files changed, 105 insertions(+), 32 deletions(-)

diff --git a/app/services/script_service.py b/app/services/script_service.py
index 1693cbc..d42a232 100644
--- a/app/services/script_service.py
+++ b/app/services/script_service.py
@@ -35,7 +35,7 @@ class ScriptGenerator:
             video_theme: 视频主题
             custom_prompt: 自定义提示词
             skip_seconds: 跳过开始的秒数
-            threshold: 差异阈值
+            threshold: 差异阈值
             vision_batch_size: 视觉处理批次大小
             vision_llm_provider: 视觉模型提供商
             progress_callback: 进度回调函数
@@ -177,7 +177,7 @@ class ScriptGenerator:
             batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size)
             first_timestamp, last_timestamp, _ = self._get_batch_timestamps(batch_files, prev_batch_files)
             
-            # 添加带时间戳的分析结果
+            # 添加带时间戳的分析结果
             frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
             frame_analysis += result['response']
             frame_analysis += "\n"
@@ -214,7 +214,7 @@ class ScriptGenerator:
 
         progress_callback(90, "正在生成文案...")
         
-        # 获取文本生成配置
+        # 获取文本生成配置
         text_provider = config.app.get('text_llm_provider', 'gemini').lower()
         text_api_key = config.app.get(f'text_{text_provider}_api_key')
         text_model = config.app.get(f'text_{text_provider}_model_name')
@@ -286,7 +286,7 @@ class ScriptGenerator:
             task_data = response.json()
             task_id = task_data["data"].get('task_id')
             if not task_id:
-                raise Exception(f"无效的API响应: {response.text}")
+                raise Exception(f"无效的API响应: {response.text}")
             
             progress_callback(50, "正在等待分析结果...")
             retry_count = 0
@@ -342,10 +342,10 @@ class ScriptGenerator:
         batch_files: List[str], 
         prev_batch_files: List[str] = None
     ) -> tuple[str, str, str]:
-        """获取一批文件的时间戳范围"""
+        """获取一批文件的时间戳范围，支持毫秒级精度"""
         if not batch_files:
             logger.warning("Empty batch files")
-            return "00:00", "00:00", "00:00-00:00"
+            return "00:00:00,000", "00:00:00,000", "00:00:00,000-00:00:00,000"
             
         if len(batch_files) == 1 and prev_batch_files and len(prev_batch_files) > 0:
             first_frame = os.path.basename(prev_batch_files[-1])
@@ -358,18 +358,45 @@ class ScriptGenerator:
         last_time = last_frame.split('_')[2].replace('.jpg', '')
         
         def format_timestamp(time_str: str) -> str:
-            if len(time_str) < 4:
-                logger.warning(f"Invalid timestamp format: {time_str}")
-                return "00:00"
+            """将时间字符串转换为 HH:MM:SS,mmm 格式"""
+            try:
+                if len(time_str) < 4:
+                    logger.warning(f"Invalid timestamp format: {time_str}")
+                    return "00:00:00,000"
                 
-            minutes = int(time_str[-4:-2])
-            seconds = int(time_str[-2:])
-            
-            if seconds >= 60:
-                minutes += seconds // 60
-                seconds = seconds % 60
+                # 处理毫秒部分
+                if ',' in time_str:
+                    time_part, ms_part = time_str.split(',')
+                    ms = int(ms_part)
+                else:
+                    time_part = time_str
+                    ms = 0
                 
-            return f"{minutes:02d}:{seconds:02d}"
+                # 处理时分秒
+                parts = time_part.split(':')
+                if len(parts) == 3:  # HH:MM:SS
+                    h, m, s = map(int, parts)
+                elif len(parts) == 2:  # MM:SS
+                    h = 0
+                    m, s = map(int, parts)
+                else:  # SS
+                    h = 0
+                    m = 0
+                    s = int(parts[0])
+                    
+                # 处理进位
+                if s >= 60:
+                    m += s // 60
+                    s = s % 60
+                if m >= 60:
+                    h += m // 60
+                    m = m % 60
+                    
+                return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+                
+            except Exception as e:
+                logger.error(f"时间戳格式转换错误 {time_str}: {str(e)}")
+                return "00:00:00,000"
         
         first_timestamp = format_timestamp(first_time)
         last_timestamp = format_timestamp(last_time)
diff --git a/app/utils/script_generator.py b/app/utils/script_generator.py
index e36064a..9005e32 100644
--- a/app/utils/script_generator.py
+++ b/app/utils/script_generator.py
@@ -406,22 +406,47 @@ class ScriptProcessor:
     def _save_results(self, frame_content_list: List[Dict]):
         """保存处理结果，并添加新的时间戳"""
         try:
-            # 转换秒数为 MM:SS 格式
-            def seconds_to_time(seconds):
-                minutes = seconds // 60
-                remaining_seconds = seconds % 60
-                return f"{minutes:02d}:{remaining_seconds:02d}"
+            def format_timestamp(seconds: float) -> str:
+                """将秒数转换为 HH:MM:SS,mmm 格式"""
+                hours = int(seconds // 3600)
+                minutes = int((seconds % 3600) // 60)
+                seconds_remainder = seconds % 60
+                whole_seconds = int(seconds_remainder)
+                milliseconds = int((seconds_remainder - whole_seconds) * 1000)
+                
+                return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
 
             # 计算新的时间戳
-            current_time = 0  # 当前时间点（秒）
+            current_time = 0.0  # 当前时间点（秒，包含毫秒）
 
             for frame in frame_content_list:
                 # 获取原始时间戳的持续时间
                 start_str, end_str = frame['timestamp'].split('-')
 
-                def time_to_seconds(time_str):
-                    minutes, seconds = map(int, time_str.split(':'))
-                    return minutes * 60 + seconds
+                def time_to_seconds(time_str: str) -> float:
+                    """将时间字符串转换为秒数（包含毫秒）"""
+                    try:
+                        if ',' in time_str:
+                            time_part, ms_part = time_str.split(',')
+                            ms = float(ms_part) / 1000
+                        else:
+                            time_part = time_str
+                            ms = 0
+
+                        parts = time_part.split(':')
+                        if len(parts) == 3:  # HH:MM:SS
+                            h, m, s = map(float, parts)
+                            seconds = h * 3600 + m * 60 + s
+                        elif len(parts) == 2:  # MM:SS
+                            m, s = map(float, parts)
+                            seconds = m * 60 + s
+                        else:  # SS
+                            seconds = float(parts[0])
+
+                        return seconds + ms
+                    except Exception as e:
+                        logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
+                        return 0.0
 
                 # 计算当前片段的持续时间
                 start_seconds = time_to_seconds(start_str)
@@ -429,8 +454,8 @@ class ScriptProcessor:
                 duration = end_seconds - start_seconds
 
                 # 设置新的时间戳
-                new_start = seconds_to_time(current_time)
-                new_end = seconds_to_time(current_time + duration)
+                new_start = format_timestamp(current_time)
+                new_end = format_timestamp(current_time + duration)
                 frame['new_timestamp'] = f"{new_start}-{new_end}"
 
                 # 更新当前时间点
@@ -443,7 +468,7 @@ class ScriptProcessor:
             with open(file_name, 'w', encoding='utf-8') as file:
                 json.dump(frame_content_list, file, ensure_ascii=False, indent=4)
 
-            logger.info(f"保存脚本成功，总时长: {seconds_to_time(current_time)}")
+            logger.info(f"保存脚本成功，总时长: {format_timestamp(current_time)}")
 
         except Exception as e:
             logger.error(f"保存结果时发生错误: {str(e)}\n{traceback.format_exc()}")
diff --git a/app/utils/vision_analyzer.py b/app/utils/vision_analyzer.py
index 06342d7..07306c5 100644
--- a/app/utils/vision_analyzer.py
+++ b/app/utils/vision_analyzer.py
@@ -10,6 +10,7 @@ from google.api_core import exceptions
 import google.generativeai as genai
 import PIL.Image
 import traceback
+from app.utils import utils
 
 
 class VisionAnalyzer:
@@ -146,14 +147,34 @@ class VisionAnalyzer:
             response_text = result['response']
             image_paths = result['image_paths']
 
-            img_name_start = Path(image_paths[0]).stem.split('_')[-1]
-            img_name_end = Path(image_paths[-1]).stem.split('_')[-1]
-            txt_path = os.path.join(output_dir, f"frame_{img_name_start}_{img_name_end}.txt")
+            # 从文件名中提取时间戳并转换为标准格式
+            def format_timestamp(img_path):
+                # 从文件名中提取时间部分
+                timestamp = Path(img_path).stem.split('_')[-1]
+                try:
+                    # 将时间转换为秒
+                    seconds = utils.time_to_seconds(timestamp.replace('_', ':'))
+                    # 转换为 HH:MM:SS,mmm 格式
+                    hours = int(seconds // 3600)
+                    minutes = int((seconds % 3600) // 60)
+                    seconds_remainder = seconds % 60
+                    whole_seconds = int(seconds_remainder)
+                    milliseconds = int((seconds_remainder - whole_seconds) * 1000)
+                    
+                    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
+                except Exception as e:
+                    logger.error(f"时间戳格式转换错误: {timestamp}, {str(e)}")
+                    return timestamp
+
+            start_timestamp = format_timestamp(image_paths[0])
+            end_timestamp = format_timestamp(image_paths[-1])
+            
+            txt_path = os.path.join(output_dir, f"frame_{start_timestamp}_{end_timestamp}.txt")
 
             # 保存结果到txt文件
             with open(txt_path, 'w', encoding='utf-8') as f:
                 f.write(response_text.strip())
-            print(f"已保存分析结果到: {txt_path}")
+            logger.info(f"已保存分析结果到: {txt_path}")
 
     def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]:
         """