From 401eb92fa33bcc1ef3d2a998c55112e878606881 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Wed, 20 Nov 2024 18:12:45 +0800
Subject: [PATCH] =?UTF-8?q?feat(audio):=20=E6=94=B9=E8=BF=9B=E9=9F=B3?=
 =?UTF-8?q?=E9=A2=91=E5=90=88=E5=B9=B6=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=20OST=20=E8=AE=BE=E7=BD=AE=EF=BC=8C=E6=8F=90=E5=8D=87?=
 =?UTF-8?q?=E6=97=B6=E9=97=B4=E6=88=B3=E7=B2=BE=E5=BA=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 重构了 merge_audio_files 函数，增加了对 OST 设置的支持
- 新增 time_to_seconds 函数，支持多种时间格式的转换
- 修改了 audio_merger 模块的逻辑，根据 OST 设置处理音频
- 更新了 task 模块中的 start_subclip 函数，传入 OST 信息
- 优化了 subtitle 和 video 模块的逻辑，适应新的音频处理方式
---
 app/models/schema.py         |   2 +
 app/services/audio_merger.py | 174 +++++++++--------
 app/services/material.py     |  73 +++++--
 app/services/task.py         | 203 +++++---------------
 app/services/video.py        | 360 +++++++++++++++++++++++------------
 app/test/test_moviepy.py     |  53 ++++--
 app/test/test_qwen.py        |  26 ++-
 app/utils/utils.py           |  72 ++++++-
 video_pipeline.py            |  11 +-
 webui.txt                    |   4 +-
 10 files changed, 566 insertions(+), 412 deletions(-)

diff --git a/app/models/schema.py b/app/models/schema.py
index 9d0c5d4..6621772 100644
--- a/app/models/schema.py
+++ b/app/models/schema.py
@@ -366,6 +366,8 @@ class VideoClipParams(BaseModel):
     custom_position: float = Field(default=70.0, description="自定义位置")
 
     n_threads: Optional[int] = 8    # 线程数，有助于提升视频处理速度
+    tts_volume: float = 1.0  # TTS音频音量
+    video_volume: float = 0.1  # 视频原声音量
 
 class VideoTranscriptionRequest(BaseModel):
     video_name: str
diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py
index f0face0..c7edc77 100644
--- a/app/services/audio_merger.py
+++ b/app/services/audio_merger.py
@@ -18,95 +18,119 @@ def check_ffmpeg():
         return False
 
 
-def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list):
+def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list):
     """
-    合并多个音频文件到一个指定总时长的音频文件中，并生成相应的字幕
-    :param task_id: 任务ID
-    :param audio_file_paths: 音频文件路径列表
-    :param total_duration: 最终音频文件的总时长（秒）
-    :param video_script: JSON格式的视频脚本
+    合并音频文件，根据OST设置处理不同的音频轨道
+    
+    Args:
+        task_id: 任务ID
+        audio_files: TTS生成的音频文件列表
+        total_duration: 总时长
+        list_script: 完整脚本信息，包含OST设置
+    
+    Returns:
+        str: 合并后的音频文件路径
     """
-    output_dir = utils.task_dir(task_id)
-
+    # 检查FFmpeg是否安装
     if not check_ffmpeg():
-        logger.error("错误：FFmpeg未安装。请安装FFmpeg后再运行此脚本。")
-        return None, None
+        logger.error("FFmpeg未安装，无法合并音频文件")
+        return None
 
-    # 创建一个总时长为total_duration的空白音频
-    blank_audio = AudioSegment.silent(duration=total_duration * 1000)  # pydub使用毫秒
+    # 创建一个空的音频片段
+    final_audio = AudioSegment.silent(duration=total_duration * 1000)  # 总时长以毫秒为单位
 
-    for audio_path in audio_file_paths:
-        if not os.path.exists(audio_path):
-            logger.info(f"警告：文件 {audio_path} 不存在，已跳过。")
+    # 遍历脚本中的每个片段
+    for segment, audio_file in zip(list_script, audio_files):
+        try:
+            # 加载TTS音频文件
+            tts_audio = AudioSegment.from_file(audio_file)
+
+            # 获取片段的开始和结束时间
+            start_time, end_time = segment['new_timestamp'].split('-')
+            start_seconds = utils.time_to_seconds(start_time)
+            end_seconds = utils.time_to_seconds(end_time)
+
+            # 根据OST设置处理音频
+            if segment['OST'] == 0:
+                # 只使用TTS音频
+                final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000)
+            elif segment['OST'] == 1:
+                # 只使用原声（假设原声已经在视频中）
+                continue
+            elif segment['OST'] == 2:
+                # 混合TTS音频和原声
+                original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000)
+                mixed_audio = original_audio.overlay(tts_audio)
+                final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000)
+
+        except Exception as e:
+            logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}")
             continue
 
-        # 从文件名中提取时间戳
-        filename = os.path.basename(audio_path)
-        start_time, end_time = extract_timestamp(filename)
+    # 保存合并后的音频文件
+    output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3")
+    final_audio.export(output_audio_path, format="mp3")
+    logger.info(f"合并后的音频文件已保存: {output_audio_path}")
 
-        # 读取音频文件
-        try:
-            audio = AudioSegment.from_mp3(audio_path)
-        except Exception as e:
-            logger.error(f"错误：无法读取文件 {audio_path}。错误信息：{str(e)}")
-            continue
-        
-        # 将音频插入到空白音频的指定位置
-        blank_audio = blank_audio.overlay(audio, position=start_time * 1000)
-
-    # 尝试导出为WAV格式
-    try:
-        output_file = os.path.join(output_dir, "audio.wav")
-        blank_audio.export(output_file, format="wav")
-        logger.info(f"音频合并完成，已保存为 {output_file}")
-    except Exception as e:
-        logger.info(f"导出为WAV格式失败，尝试使用MP3格式：{str(e)}")
-        try:
-            output_file = os.path.join(output_dir, "audio.mp3")
-            blank_audio.export(output_file, format="mp3", codec="libmp3lame")
-            logger.info(f"音频合并完成，已保存为 {output_file}")
-        except Exception as e:
-            logger.error(f"导出音频失败：{str(e)}")
-            return None, None
-
-    return output_file
-
-def parse_timestamp(timestamp: str):
-    """解析时间戳字符串为秒数"""
-    # 确保使用冒号作为分隔符
-    timestamp = timestamp.replace('_', ':')
-    return time_to_seconds(timestamp)
-
-def extract_timestamp(filename):
-    """从文件名中提取开始和结束时间戳"""
-    # 从 "audio_00_06-00_24.mp3" 这样的格式中提取时间
-    time_part = filename.split('_', 1)[1].split('.')[0]  # 获取 "00_06-00_24" 部分
-    start_time, end_time = time_part.split('-')  # 分割成 "00_06" 和 "00_24"
-    
-    # 将下划线格式转换回冒号格式
-    start_time = start_time.replace('_', ':')
-    end_time = end_time.replace('_', ':')
-    
-    # 将时间戳转换为秒
-    start_seconds = time_to_seconds(start_time)
-    end_seconds = time_to_seconds(end_time)
-
-    return start_seconds, end_seconds
+    return output_audio_path
 
 
 def time_to_seconds(time_str):
-    """将 "00:06" 或 "00_06" 格式转换为总秒数"""
-    # 确保使用冒号作为分隔符
-    time_str = time_str.replace('_', ':')
+    """
+    将时间字符串转换为秒数，支持多种格式：
+    1. 'HH:MM:SS,mmm' (时:分:秒,毫秒)
+    2. 'MM:SS,mmm' (分:秒,毫秒)
+    3. 'SS,mmm' (秒,毫秒)
+    """
     try:
-        parts = time_str.split(':')
-        if len(parts) != 2:
-            logger.error(f"Invalid time format: {time_str}")
-            return 0
-        return int(parts[0]) * 60 + int(parts[1])
+        # 处理毫秒部分
+        if ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 分割时间部分
+        parts = time_part.split(':')
+        
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(int, parts)
+            seconds = h * 3600 + m * 60 + s
+        elif len(parts) == 2:  # MM:SS
+            m, s = map(int, parts)
+            seconds = m * 60 + s
+        else:  # SS
+            seconds = int(parts[0])
+
+        return seconds + ms
     except (ValueError, IndexError) as e:
         logger.error(f"Error parsing time {time_str}: {str(e)}")
-        return 0
+        return 0.0
+
+
+def extract_timestamp(filename):
+    """
+    从文件名中提取开始和结束时间戳
+    例如: "audio_00_06,500-00_24,800.mp3" -> (6.5, 24.8)
+    """
+    try:
+        # 从文件名中提取时间部分
+        time_part = filename.split('_', 1)[1].split('.')[0]  # 获取 "00_06,500-00_24,800" 部分
+        start_time, end_time = time_part.split('-')  # 分割成开始和结束时间
+        
+        # 将下划线格式转换回冒号格式
+        start_time = start_time.replace('_', ':')
+        end_time = end_time.replace('_', ':')
+        
+        # 将时间戳转换为秒
+        start_seconds = time_to_seconds(start_time)
+        end_seconds = time_to_seconds(end_time)
+
+        return start_seconds, end_seconds
+    except Exception as e:
+        logger.error(f"Error extracting timestamp from {filename}: {str(e)}")
+        return 0.0, 0.0
 
 
 if __name__ == "__main__":
diff --git a/app/services/material.py b/app/services/material.py
index 696eda8..5ec6ee4 100644
--- a/app/services/material.py
+++ b/app/services/material.py
@@ -3,6 +3,7 @@ import subprocess
 import random
 import traceback
 from urllib.parse import urlencode
+from datetime import datetime
 
 import requests
 from typing import List
@@ -253,34 +254,58 @@ def download_videos(
 
 def time_to_seconds(time_str: str) -> float:
     """
-    将时间字符串转换为秒数
-    支持格式：
-    1. "MM:SS" (分:秒)
-    2. "SS" (纯秒数)
+    将时间字符串转换为秒数，支持多种格式：
+    1. 'HH:MM:SS,mmm' (时:分:秒,毫秒)
+    2. 'MM:SS' (分:秒)
+    3. 'SS' (秒)
     """
-    parts = time_str.split(':')
-    if len(parts) == 2:
-        minutes, seconds = map(float, parts)
-        return minutes * 60 + seconds
-    return float(time_str)
+    try:
+        # 处理毫秒部分
+        if ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = int(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 根据格式分别处理
+        parts = time_part.split(':')
+        if len(parts) == 3:  # HH:MM:SS
+            time_obj = datetime.strptime(time_part, "%H:%M:%S")
+            seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second
+        elif len(parts) == 2:  # MM:SS
+            time_obj = datetime.strptime(time_part, "%M:%S")
+            seconds = time_obj.minute * 60 + time_obj.second
+        else:  # SS
+            seconds = float(time_part)
+
+        return seconds + ms
+    except ValueError as e:
+        logger.error(f"时间格式错误: {time_str}")
+        raise ValueError(f"时间格式错误，支持的格式：HH:MM:SS,mmm 或 MM:SS 或 SS") from e
 
 
 def format_timestamp(seconds: float) -> str:
     """
-    将秒数转换为 "MM:SS" 格式的时间字符串
+    将秒数转换为可读的时间格式 (HH:MM:SS,mmm)
     """
-    minutes = int(seconds) // 60
-    secs = int(seconds) % 60
-    return f"{minutes:02d}:{secs:02d}"
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    seconds_remain = seconds % 60
+    whole_seconds = int(seconds_remain)
+    milliseconds = int((seconds_remain - whole_seconds) * 1000)
+    
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
 
 
 def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
     """
     保存剪辑后的视频
     Args:
-        timestamp: 需要裁剪的单个时间戳，支持两种格式：
-                  1. '00:36-00:40' (分:秒-分:秒)
-                  2. 'SS-SS' (秒-秒)
+        timestamp: 需要裁剪的单个时间戳，支持格式：
+                  1. 'HH:MM:SS,mmm-HH:MM:SS,mmm' (时:分:秒,毫秒)
+                  2. 'MM:SS-MM:SS' (分:秒-分:秒)
+                  3. 'SS-SS' (秒-秒)
         origin_video: 原视频路径
         save_dir: 存储目录
 
@@ -293,7 +318,7 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
     if not os.path.exists(save_dir):
         os.makedirs(save_dir)
 
-    video_id = f"vid-{timestamp.replace(':', '_')}"
+    video_id = f"vid-{timestamp.replace(':', '_').replace(',', '-')}"
     video_path = f"{save_dir}/{video_id}.mp4"
 
     if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
@@ -312,12 +337,12 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
         
         # 验证时间段是否有效
         if start >= total_duration:
-            logger.warning(f"起始时间 {format_timestamp(start)} ({start:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)")
+            logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
             video.close()
             return {}
             
         if end > total_duration:
-            logger.warning(f"结束时间 {format_timestamp(end)} ({end:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)，将自动调整为视频结尾")
+            logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)，将自动调整为视频结尾")
             end = total_duration
             
         if end <= start:
@@ -332,7 +357,15 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
         
         try:
             # 检查视频是否有音频轨道并写入文件
-            subclip.write_videofile(video_path, audio=(subclip.audio is not None), logger=None)
+            subclip.write_videofile(
+                video_path,
+                codec='libx264',
+                audio_codec='aac',
+                temp_audiofile='temp-audio.m4a',
+                remove_temp=True,
+                audio=(subclip.audio is not None),
+                logger=None
+            )
             
             # 验证生成的视频文件
             if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
diff --git a/app/services/task.py b/app/services/task.py
index c030574..5cd31ed 100644
--- a/app/services/task.py
+++ b/app/services/task.py
@@ -206,134 +206,14 @@ def generate_final_videos(
     return final_video_paths, combined_video_paths
 
 
-def start(task_id, params: VideoParams, stop_at: str = "video"):
-    logger.info(f"start task: {task_id}, stop_at: {stop_at}")
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
-
-    if type(params.video_concat_mode) is str:
-        params.video_concat_mode = VideoConcatMode(params.video_concat_mode)
-
-    # 1. Generate script
-    video_script = generate_script(task_id, params)
-    if not video_script:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
-
-    if stop_at == "script":
-        sm.state.update_task(
-            task_id, state=const.TASK_STATE_COMPLETE, progress=100, script=video_script
-        )
-        return {"script": video_script}
-
-    # 2. Generate terms
-    video_terms = ""
-    if params.video_source != "local":
-        video_terms = generate_terms(task_id, params, video_script)
-        if not video_terms:
-            sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-            return
-
-    save_script_data(task_id, video_script, video_terms, params)
-
-    if stop_at == "terms":
-        sm.state.update_task(
-            task_id, state=const.TASK_STATE_COMPLETE, progress=100, terms=video_terms
-        )
-        return {"script": video_script, "terms": video_terms}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
-
-    # 3. Generate audio
-    audio_file, audio_duration, sub_maker = generate_audio(task_id, params, video_script)
-    if not audio_file:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
-
-    if stop_at == "audio":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            audio_file=audio_file,
-        )
-        return {"audio_file": audio_file, "audio_duration": audio_duration}
-
-    # 4. Generate subtitle
-    subtitle_path = generate_subtitle(task_id, params, video_script, sub_maker, audio_file)
-
-    if stop_at == "subtitle":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            subtitle_path=subtitle_path,
-        )
-        return {"subtitle_path": subtitle_path}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
-
-    # 5. Get video materials
-    downloaded_videos = get_video_materials(
-        task_id, params, video_terms, audio_duration
-    )
-    if not downloaded_videos:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    if stop_at == "materials":
-        sm.state.update_task(
-            task_id,
-            state=const.TASK_STATE_COMPLETE,
-            progress=100,
-            materials=downloaded_videos,
-        )
-        return {"materials": downloaded_videos}
-
-    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
-
-    # 6. Generate final videos
-    final_video_paths, combined_video_paths = generate_final_videos(
-        task_id, params, downloaded_videos, audio_file, subtitle_path
-    )
-
-    if not final_video_paths:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        return
-
-    logger.success(
-        f"task {task_id} finished, generated {len(final_video_paths)} videos."
-    )
-
-    kwargs = {
-        "videos": final_video_paths,
-        "combined_videos": combined_video_paths,
-        "script": video_script,
-        "terms": video_terms,
-        "audio_file": audio_file,
-        "audio_duration": audio_duration,
-        "subtitle_path": subtitle_path,
-        "materials": downloaded_videos,
-    }
-    sm.state.update_task(
-        task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs
-    )
-    return kwargs
-
-
 def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
-    """
-    后台任务（自动剪辑视频进行剪辑）
-
-        task_id: 任务ID
-        params: 剪辑参数
-        subclip_path_videos: 视频文件路径
-
-    """
+    """后台任务（自动剪辑视频进行剪辑）"""
     logger.info(f"\n\n## 开始任务: {task_id}")
+    
+    # 初始化 ImageMagick
+    if not utils.init_imagemagick():
+        logger.warning("ImageMagick 初始化失败，字幕可能无法正常显示")
+    
     sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
 
     # tts 角色名称
@@ -341,8 +221,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
 
     logger.info("\n\n## 1. 加载视频脚本")
     video_script_path = path.join(params.video_clip_json_path)
-    # video_script_path = video_clip_json_path
-    # 判断json文件是否存在
+    
     if path.exists(video_script_path):
         try:
             with open(video_script_path, "r", encoding="utf-8") as f:
@@ -355,10 +234,12 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
                 logger.debug(f"解说完整脚本: \n{video_script}")
                 logger.debug(f"解说 OST 列表: \n{video_ost}")
                 logger.debug(f"解说时间戳列表: \n{time_list}")
+                
                 # 获取视频总时长(单位 s)
-                total_duration = list_script[-1]['new_timestamp']
-                total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int(
-                    total_duration.split("-")[1].split(":")[1])
+                last_timestamp = list_script[-1]['new_timestamp']
+                end_time = last_timestamp.split("-")[1]
+                total_duration = utils.time_to_seconds(end_time)
+                
         except Exception as e:
             logger.error(f"无法读取视频json脚本，请检查配置是否正确。{e}")
             raise ValueError("无法读取视频json脚本，请检查配置是否正确")
@@ -366,32 +247,51 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
         logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
         raise ValueError("解说脚本不存在！请检查配置是否正确。")
 
-    logger.info("\n\n## 2. 生成音频列表")
-    audio_files, sub_maker_list = voice.tts_multiple(
-        task_id=task_id,
-        list_script=list_script,
-        voice_name=voice_name,
-        voice_rate=params.voice_rate,
-        voice_pitch=params.voice_pitch,
-        force_regenerate=True
+    logger.info("\n\n## 2. 根据OST设置生成音频列表")
+    # 只为OST=0或2的片段生成TTS音频
+    tts_segments = [
+        segment for segment in list_script 
+        if segment['OST'] in [0, 2]
+    ]
+    logger.debug(f"tts_segments: {tts_segments}")
+    if tts_segments:
+        audio_files, sub_maker_list = voice.tts_multiple(
+            task_id=task_id,
+            list_script=tts_segments,  # 只传入需要TTS的片段
+            voice_name=voice_name,
+            voice_rate=params.voice_rate,
+            voice_pitch=params.voice_pitch,
+            force_regenerate=True
+        )
+        if audio_files is None:
+            sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
+            logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
+            return
+    else:
+        audio_files = []
+        
+    logger.info(f"合并音频文件:\n{audio_files}")
+    # 传入OST信息以便正确处理音频
+    final_audio = audio_merger.merge_audio_files(
+        task_id=task_id, 
+        audio_files=audio_files, 
+        total_duration=total_duration, 
+        list_script=list_script  # 传入完整脚本以便处理OST
     )
-    if audio_files is None:
-        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
-        logger.error(
-            "TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
-        return
-    logger.info(f"合并音频:\n\n {audio_files}")
-    audio_file = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script)
 
     sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
 
+    # 只为OST=0或2的片段生成字幕
     subtitle_path = ""
     if params.subtitle_enabled:
         subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
         subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
         logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
-        # 使用 faster-whisper-large-v2 模型生成字幕
-        subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
+         
+        subtitle.create(
+            audio_file=final_audio,
+            subtitle_file=subtitle_path,
+        )
 
         subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
         if not subtitle_lines:
@@ -434,14 +334,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
 
     final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
 
-    logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}")
-    # 把所有东西合到在一起
+    logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
+    # 传入OST信息以便正确处理音频和视频
     video.generate_video_v2(
         video_path=combined_video_path,
-        audio_path=audio_file,
+        audio_path=final_audio,
         subtitle_path=subtitle_path,
         output_file=final_video_path,
         params=params,
+        list_script=list_script  # 传入完整脚本以便处理OST
     )
 
     _progress += 50 / 2
diff --git a/app/services/video.py b/app/services/video.py
index 1d270fa..8e6e32d 100644
--- a/app/services/video.py
+++ b/app/services/video.py
@@ -173,7 +173,7 @@ def wrap_text(text, max_width, font, fontsize=60):
     if width <= max_width:
         return text, height
 
-    logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 文本: {text}")
+    logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 文本: {text}")
 
     processed = True
 
@@ -228,131 +228,93 @@ def manage_clip(clip):
 
 
 def generate_video_v2(
-        video_path: str,
-        audio_path: str,
-        subtitle_path: str,
-        output_file: str,
-        params: Union[VideoParams, VideoClipParams],
-        progress_callback=None,
+    video_path: str,
+    audio_path: str,
+    subtitle_path: str,
+    output_file: str,
+    params: VideoClipParams,
+    list_script: list = None
 ):
     """
-    合并所有素材
+    生成最终视频，处理音频和字幕
+
     Args:
-        video_path: 视频路径
-        audio_path: 单个音频文件路径
+        video_path: 视频文件路径
+        audio_path: 音频文件路径
         subtitle_path: 字幕文件路径
         output_file: 输出文件路径
         params: 视频参数
-        progress_callback: 进度回调函数，接收 0-100 的进度值
-
-    Returns:
-
+        list_script: 视频脚本列表，包含OST设置
     """
-    total_steps = 4
-    current_step = 0
-    
-    def update_progress(step_name):
-        nonlocal current_step
-        current_step += 1
-        if progress_callback:
-            progress_callback(int(current_step * 100 / total_steps))
-        logger.info(f"完成步骤: {step_name}")
-
     try:
-        validate_params(video_path, audio_path, output_file, params)
+        video_clip = VideoFileClip(video_path)
         
-        with manage_clip(VideoFileClip(video_path)) as video_clip:
-            aspect = VideoAspect(params.video_aspect)
-            video_width, video_height = aspect.to_resolution()
-
-            logger.info(f"开始，视频尺寸: {video_width} x {video_height}")
-            logger.info(f"  ① 视频: {video_path}")
-            logger.info(f"  ② 音频: {audio_path}")
-            logger.info(f"  ③ 字幕: {subtitle_path}")
-            logger.info(f"  ④ 输出: {output_file}")
-
-            output_dir = os.path.dirname(output_file)
-            update_progress("初始化完成")
-
-            # 字体设置
-            font_path = ""
-            if params.subtitle_enabled:
-                if not params.font_name:
-                    params.font_name = "STHeitiMedium.ttc"
-                font_path = os.path.join(utils.font_dir(), params.font_name)
-                if os.name == "nt":
-                    font_path = font_path.replace("\\", "/")
-                logger.info(f"使用字体: {font_path}")
-
-            def create_text_clip(subtitle_item):
-                phrase = subtitle_item[1]
-                max_width = video_width * 0.9
-                wrapped_txt, txt_height = wrap_text(
-                    phrase, max_width=max_width, font=font_path, fontsize=params.font_size
-                )
-                _clip = TextClip(
-                    wrapped_txt,
-                    font=font_path,
-                    fontsize=params.font_size,
-                    color=params.text_fore_color,
-                    bg_color=params.text_background_color,
-                    stroke_color=params.stroke_color,
-                    stroke_width=params.stroke_width,
-                    print_cmd=False,
-                )
-                duration = subtitle_item[0][1] - subtitle_item[0][0]
-                _clip = _clip.set_start(subtitle_item[0][0])
-                _clip = _clip.set_end(subtitle_item[0][1])
-                _clip = _clip.set_duration(duration)
-                
-                if params.subtitle_position == "bottom":
-                    _clip = _clip.set_position(("center", video_height * 0.95 - _clip.h))
-                elif params.subtitle_position == "top":
-                    _clip = _clip.set_position(("center", video_height * 0.05))
-                elif params.subtitle_position == "custom":
-                    margin = 10
-                    max_y = video_height - _clip.h - margin
-                    min_y = margin
-                    custom_y = (video_height - _clip.h) * (params.custom_position / 100)
-                    custom_y = max(min_y, min(custom_y, max_y))
-                    _clip = _clip.set_position(("center", custom_y))
-                else:  # center
-                    _clip = _clip.set_position(("center", "center"))
-                return _clip
-
-            update_progress("字体设置完成")
-
-            # 处理音频
-            original_audio = video_clip.audio
-            video_duration = video_clip.duration
-            new_audio = AudioFileClip(audio_path)
-            final_audio = process_audio_tracks(original_audio, new_audio, params, video_duration)
-            update_progress("音频处理完成")
-
-            # 处理字幕
-            if subtitle_path and os.path.exists(subtitle_path):
-                video_clip = process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip)
-            update_progress("字幕处理完成")
-
-            # 合并音频和导出
-            video_clip = video_clip.set_audio(final_audio)
-            video_clip.write_videofile(
-                output_file,
-                audio_codec="aac",
-                temp_audiofile=os.path.join(output_dir, "temp-audio.m4a"),
-                threads=params.n_threads,
-                logger=None,
-                fps=30,
-            )
+        # 处理音频
+        if audio_path and os.path.exists(audio_path):
+            audio_clip = AudioFileClip(audio_path)
             
-    except FileNotFoundError as e:
-        logger.error(f"文件不存在: {str(e)}")
-        raise
+            if list_script:
+                # 根据OST设置处理音频
+                # OST=0: 只使用TTS音频
+                # OST=1: 只使用视频原声
+                # OST=2: 混合TTS音频和视频原声
+                original_audio = video_clip.audio
+                
+                # 设置音频音量
+                tts_volume = params.tts_volume if hasattr(params, 'tts_volume') else 1.0
+                video_volume = params.video_volume if hasattr(params, 'video_volume') else 0.1
+                
+                # 创建最终音频
+                if original_audio:
+                    # 有些片段需要原声，有些需要TTS
+                    final_audio = CompositeAudioClip([
+                        audio_clip.volumex(tts_volume),  # TTS音频
+                        original_audio.volumex(video_volume)  # 原声音频
+                    ])
+                else:
+                    final_audio = audio_clip.volumex(tts_volume)
+            else:
+                # 如果没有OST设置，使用默认行为
+                final_audio = audio_clip
+                
+            video_clip = video_clip.set_audio(final_audio)
+
+        # 处理字幕
+        if subtitle_path and os.path.exists(subtitle_path):
+            # 添加字幕
+            video_clip = add_subtitles(
+                video_clip,
+                subtitle_path,
+                params.font_size,
+                params.font_name,
+                params.text_fore_color,
+                params.subtitle_position,
+                params.stroke_color,
+                params.stroke_width
+            )
+
+        # 写入最终视频文件
+        video_clip.write_videofile(
+            output_file,
+            codec="libx264",
+            audio_codec="aac",
+            temp_audiofile="temp-audio.m4a",
+            remove_temp=True,
+            threads=params.n_threads
+        )
+
     except Exception as e:
-        logger.error(f"视频生成失败: {str(e)}")
-        raise
+        logger.error(f"生成视频时发生错误: {str(e)}")
+        raise e
+
     finally:
-        logger.success("完成")
+        # 清理资源
+        if 'video_clip' in locals():
+            video_clip.close()
+        if 'audio_clip' in locals():
+            audio_clip.close()
+        if 'final_audio' in locals():
+            final_audio.close()
 
 
 def process_audio_tracks(original_audio, new_audio, params, video_duration):
@@ -389,7 +351,7 @@ def process_subtitles(subtitle_path, video_clip, video_duration, create_text_cli
     for item in sub.subtitles:
         clip = create_text_clip(subtitle_item=item)
         
-        # 时间范围调整
+        # 时间范围调整
         start_time = max(clip.start, 0)
         if start_time >= video_duration:
             continue
@@ -450,12 +412,12 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
 
 
 def combine_clip_videos(combined_video_path: str,
-                        video_paths: List[str],
-                        video_ost_list: List[int],
-                        list_script: list,
-                        video_aspect: VideoAspect = VideoAspect.portrait,
-                        threads: int = 2,
-                        ) -> str:
+                       video_paths: List[str],
+                       video_ost_list: List[int],
+                       list_script: list,
+                       video_aspect: VideoAspect = VideoAspect.portrait,
+                       threads: int = 2,
+                       ) -> str:
     """
     合并子视频
     Args:
@@ -469,9 +431,18 @@ def combine_clip_videos(combined_video_path: str,
     Returns:
         str: 合并后的视频路径
     """
-    from app.utils.utils import calculate_total_duration
-    audio_duration = calculate_total_duration(list_script)
-    logger.info(f"音频的最大持续时间: {audio_duration} s")
+    # 计算总时长时需要考虑毫秒精度
+    total_duration = 0.0
+    for item in list_script:
+        timestamp = item.get('new_timestamp', '')
+        if timestamp:
+            start_str, end_str = timestamp.split('-')
+            start_time = utils.time_to_seconds(start_str)
+            end_time = utils.time_to_seconds(end_str)
+            duration = end_time - start_time
+            total_duration += duration
+            
+    logger.info(f"音频的最大持续时间: {total_duration:.3f} s")
     
     output_dir = os.path.dirname(combined_video_path)
     aspect = VideoAspect(video_aspect)
@@ -480,11 +451,17 @@ def combine_clip_videos(combined_video_path: str,
     clips = []
     for video_path, video_ost in zip(video_paths, video_ost_list):
         try:
+            # 加载视频片段
             clip = VideoFileClip(video_path)
             
+            # 根据OST设置处理音频
             if video_ost == 0:  # 不保留原声
                 clip = clip.without_audio()
-            # video_ost 为 1 或 2 时都保留原声，不需要特殊处理
+            elif video_ost == 1:  # 只保留原声
+                # 保持原声，但可能需要调整音量
+                if clip.audio:
+                    clip = clip.set_audio(clip.audio.volumex(1.0))  # 可以调整音量系数
+            # OST == 2 的情况会在后续处理中混合音频
                 
             clip = clip.set_fps(30)
 
@@ -498,6 +475,16 @@ def combine_clip_videos(combined_video_path: str,
                 )
                 logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")
 
+            # 精确控制视频时长
+            filename = os.path.basename(video_path)
+            timestamp = extract_timestamp_from_filename(filename)
+            if timestamp:
+                start_time, end_time = timestamp
+                clip_duration = end_time - start_time
+                if abs(clip.duration - clip_duration) > 0.1:  # 允许0.1秒的误差
+                    logger.warning(f"视频 {video_path} 时长与时间戳不匹配，进行调整")
+                    clip = clip.set_duration(clip_duration)
+
             clips.append(clip)
             
         except Exception as e:
@@ -508,6 +495,7 @@ def combine_clip_videos(combined_video_path: str,
         raise ValueError("没有有效的视频片段可以合并")
 
     try:
+        # 合并所有视频片段
         video_clip = concatenate_videoclips(clips)
         video_clip = video_clip.set_fps(30)
         
@@ -521,7 +509,7 @@ def combine_clip_videos(combined_video_path: str,
             temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
         )
     finally:
-        # 确保资源被正确���放
+        # 确保资源被正确释放
         video_clip.close()
         for clip in clips:
             clip.close()
@@ -530,6 +518,59 @@ def combine_clip_videos(combined_video_path: str,
     return combined_video_path
 
 
+def extract_timestamp_from_filename(filename: str) -> tuple:
+    """
+    从文件名中提取时间戳，支持多种格式：
+    - "vid-00_06,500-00_24,800.mp4" -> (6.5, 24.8)
+    - "vid-00_00_00-020-00_00_10-400.mp4" -> (0.02, 10.4)
+    """
+    try:
+        # 提取时间戳部分
+        match = re.search(r'vid-(.+?)\.mp4$', filename)
+        if not match:
+            logger.warning(f"文件名格式不正确: {filename}")
+            return None
+            
+        timestamp = match.group(1)
+        
+        # 处理包含毫秒的格式 (00_00_00-020-00_00_10-400)
+        if timestamp.count('-') == 3:
+            parts = timestamp.split('-')
+            start_time = f"{parts[0]}-{parts[1]}"  # 组合开始时间和毫秒
+            end_time = f"{parts[2]}-{parts[3]}"    # 组合结束时间和毫秒
+            
+            # 转换开始时间
+            start_time_str = start_time.replace('_', ':')
+            if start_time_str.count(':') == 2:  # 如果是 00:00:00-020 格式
+                start_base = utils.time_to_seconds(start_time_str.split('-')[0])
+                start_ms = int(start_time_str.split('-')[1]) / 1000
+                start_seconds = start_base + start_ms
+            else:
+                start_seconds = utils.time_to_seconds(start_time_str)
+            
+            # 转换结束时间
+            end_time_str = end_time.replace('_', ':')
+            if end_time_str.count(':') == 2:  # 如果是 00:00:10-400 格式
+                end_base = utils.time_to_seconds(end_time_str.split('-')[0])
+                end_ms = int(end_time_str.split('-')[1]) / 1000
+                end_seconds = end_base + end_ms
+            else:
+                end_seconds = utils.time_to_seconds(end_time_str)
+                
+        # 处理简单格式 (00_06-00_24)
+        else:
+            start_str, end_str = timestamp.split('-')
+            start_seconds = utils.time_to_seconds(start_str.replace('_', ':'))
+            end_seconds = utils.time_to_seconds(end_str.replace('_', ':'))
+        
+        logger.debug(f"从文件名 {filename} 提取时间戳: {start_seconds:.3f} - {end_seconds:.3f}")
+        return start_seconds, end_seconds
+        
+    except Exception as e:
+        logger.error(f"从文件名提取时间戳失败 {filename}: {str(e)}\n{traceback.format_exc()}")
+        return None
+
+
 def resize_video_with_padding(clip, target_width: int, target_height: int):
     """辅助函数：调整视频尺寸并添加黑边"""
     clip_ratio = clip.w / clip.h
@@ -574,6 +615,71 @@ def validate_params(video_path, audio_path, output_file, params):
         raise ValueError("params 缺少必要参数 video_aspect")
 
 
+def add_subtitles(video_clip, subtitle_path, font_size, font_name, font_color, position, shadow_color, shadow_offset):
+    """
+    为视频添加字幕
+
+    Args:
+        video_clip: 视频剪辑对象
+        subtitle_path: 字幕文件路径
+        font_size: 字体大小
+        font_name: 字体名称
+        font_color: 字体颜色
+        position: 字幕位置 ('top', 'center', 'bottom')
+        shadow_color: 阴影颜色
+        shadow_offset: 阴影偏移
+
+    Returns:
+        带有字幕的视频剪辑对象
+    """
+    try:
+        # 确保字体文件存在
+        font_path = os.path.join(utils.font_dir(), font_name)
+        if not os.path.exists(font_path):
+            logger.error(f"字体文件不存在: {font_path}")
+            # 尝试使用系统默认字体
+            font_path = "Arial" if os.name == 'nt' else "/System/Library/Fonts/STHeiti Light.ttc"
+            logger.info(f"使用默认字体: {font_path}")
+
+        # 设置字幕位置
+        if position == "top":
+            pos = ("center", 50)
+        elif position == "center":
+            pos = "center"
+        else:  # bottom
+            pos = ("center", -50)
+
+        def subtitle_generator(txt):
+            return TextClip(
+                txt, 
+                fontsize=font_size,
+                font=font_path,
+                color=font_color,
+                stroke_color=shadow_color,
+                stroke_width=shadow_offset,
+                method='caption',  # 使用 caption 方法可能更稳定
+                size=(video_clip.w * 0.9, None)  # 限制字幕宽度
+            )
+
+        subtitles = SubtitlesClip(
+            subtitle_path,
+            subtitle_generator
+        )
+        
+        # 添加字幕到视频
+        video_with_subtitles = CompositeVideoClip([
+            video_clip,
+            subtitles.set_position(pos)
+        ])
+        
+        return video_with_subtitles
+
+    except Exception as e:
+        logger.error(f"添加字幕时出错: {str(e)}\n{traceback.format_exc()}")
+        # 如果添加字幕失败，返回原始视频
+        return video_clip
+
+
 if __name__ == "__main__":
     # combined_video_path = "../../storage/tasks/12312312/com123.mp4"
     #
@@ -586,7 +692,7 @@ if __name__ == "__main__":
     #     {
     #         "picture": "夜晚，一个小孩在树林里奔跑，后面有人拿着火把在追赶",
     #         "timestamp": "00:00-00:03",
-    #         "narration": "夜黑风高的树林，一个小孩在拼命奔跑，后面的人穷追不舍！",
+    #         "narration": "夜黑风高的树林，一个小孩在拼命奔跑，后面的人穷追不舍！",
     #         "OST": False,
     #         "new_timestamp": "00:00-00:03"
     #     },
diff --git a/app/test/test_moviepy.py b/app/test/test_moviepy.py
index 5b24ebf..79d93c2 100644
--- a/app/test/test_moviepy.py
+++ b/app/test/test_moviepy.py
@@ -1,5 +1,5 @@
 """
-使用 moviepy 库剪辑指定时间戳视频
+使用 moviepy 库剪辑指定时间戳视频，支持时分秒毫秒精度
 """
 
 from moviepy.editor import VideoFileClip
@@ -11,12 +11,22 @@ def time_str_to_seconds(time_str: str) -> float:
     """
     将时间字符串转换为秒数
     参数:
-        time_str: 格式为"MM:SS"的时间字符串
+        time_str: 格式为"HH:MM:SS,mmm"的时间字符串，例如"00:01:23,456"
     返回:
-        转换后的秒数
+        转换后的秒数(float)
     """
-    time_obj = datetime.strptime(time_str, "%M:%S")
-    return time_obj.minute * 60 + time_obj.second
+    try:
+        # 分离时间和毫秒
+        time_part, ms_part = time_str.split(',')
+        # 转换时分秒
+        time_obj = datetime.strptime(time_part, "%H:%M:%S")
+        # 计算总秒数
+        total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second
+        # 添加毫秒部分
+        total_seconds += int(ms_part) / 1000
+        return total_seconds
+    except ValueError as e:
+        raise ValueError("时间格式错误，请使用 HH:MM:SS,mmm 格式，例如 00:01:23,456") from e
 
 
 def format_duration(seconds: float) -> str:
@@ -25,11 +35,15 @@ def format_duration(seconds: float) -> str:
     参数:
         seconds: 秒数
     返回:
-        格式化的时间字符串 (MM:SS)
+        格式化的时间字符串 (HH:MM:SS,mmm)
     """
-    minutes = int(seconds // 60)
-    remaining_seconds = int(seconds % 60)
-    return f"{minutes:02d}:{remaining_seconds:02d}"
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    seconds_remain = seconds % 60
+    whole_seconds = int(seconds_remain)
+    milliseconds = int((seconds_remain - whole_seconds) * 1000)
+    
+    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
 
 
 def cut_video(video_path: str, start_time: str, end_time: str, output_path: str) -> None:
@@ -37,8 +51,8 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
     剪辑视频
     参数:
         video_path: 视频文件路径
-        start_time: 开始时间 (格式: "MM:SS")
-        end_time: 结束时间 (格式: "MM:SS")
+        start_time: 开始时间 (格式: "HH:MM:SS,mmm")
+        end_time: 结束时间 (格式: "HH:MM:SS,mmm")
         output_path: 输出文件路径
     """
     try:
@@ -62,10 +76,18 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
         # 加载视频文件
         video = VideoFileClip(video_path)
         
+        # 验证时间范围
+        if start_seconds >= video.duration or end_seconds > video.duration:
+            raise ValueError(f"剪辑时间超出视频长度！视频总长度为: {format_duration(video.duration)}")
+        
+        if start_seconds >= end_seconds:
+            raise ValueError("结束时间必须大于开始时间！")
+        
         # 计算剪辑时长
         clip_duration = end_seconds - start_seconds
         print(f"原视频总长度: {format_duration(video.duration)}")
         print(f"剪辑时长: {format_duration(clip_duration)}")
+        print(f"剪辑区间: {start_time} -> {end_time}")
         
         # 剪辑视频
         video = video.subclip(start_seconds, end_seconds)
@@ -92,6 +114,9 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
 
 
 if __name__ == "__main__":
-    # cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "00:00", "07:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-1")
-    # cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "07:00", "14:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-2")
-    cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "14:00", "22:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-3")
+    cut_video(
+        video_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp.mp4",
+        start_time="00:00:00,789",
+        end_time="00:02:00,123",
+        output_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp_cut3.mp4"
+    )
diff --git a/app/test/test_qwen.py b/app/test/test_qwen.py
index 77bca56..2a69225 100644
--- a/app/test/test_qwen.py
+++ b/app/test/test_qwen.py
@@ -2,11 +2,23 @@ import os
 import traceback
 import json
 from openai import OpenAI
-from test_moviepy import cut_video
+from pydantic import BaseModel
+from typing import List
 from app.utils import utils
 from app.services.subtitle import extract_audio_and_create_subtitle
 
 
+class Step(BaseModel):
+    timestamp: str
+    picture: str
+    narration: str
+    OST: int
+    new_timestamp: str
+
+class MathReasoning(BaseModel):
+    result: List[Step]
+
+
 def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
     """
     与通义千问AI模型进行对话
@@ -23,7 +35,7 @@ def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
     """
     try:
         client = OpenAI(
-            api_key="sk-",
+            api_key="sk-a1acd853d88d41d3ae92777d7bfa2612",
             base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
         )
 
@@ -50,25 +62,25 @@ def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
 # 使用示例
 if __name__ == "__main__":
     try:
-        # video_path = utils.video_dir("duanju_yuansp.mp4")
+        video_path = utils.video_dir("duanju_yuansp.mp4")
         # # 判断视频是否存在
         # if not os.path.exists(video_path):
         #     print(f"视频文件不存在：{video_path}")
         #     exit(1)
         # 提取字幕
         subtitle_path = os.path.join(utils.video_dir(""), f"duanju_yuan.srt")
-        # extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
+        extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
         # 分析字幕
         system_message = """
         你是一个视频srt字幕分析剪辑器, 输入视频的srt字幕, 分析其中的精彩且尽可能连续的片段并裁剪出来, 注意确保文字与时间戳的正确匹配。
-        输出需严格按照如下 json 格式: 
+        输出需严格按照如下 json 格式:
         [
             {
-                "timestamp": "00:50-01:44",
+                "timestamp": "00:00:50,020-00:01:44,000",
                 "picture": "画面1",
                 "narration": "播放原声",
                 "OST": 0,
-                "new_timestamp": "00:00-00:54"
+                "new_timestamp": "00:00:00,000-00:00:53,980"
             },
             {
                 "timestamp": "01:49-02:30",
diff --git a/app/utils/utils.py b/app/utils/utils.py
index 307823c..e864341 100644
--- a/app/utils/utils.py
+++ b/app/utils/utils.py
@@ -40,7 +40,7 @@ def to_json(obj):
             # 如果对象是二进制数据，转换为base64编码的字符串
             elif isinstance(o, bytes):
                 return "*** binary data ***"
-            # 如果对象是字典，递归处理每个键值对
+            # 如果对象是字典，递归处理每个键值对
             elif isinstance(o, dict):
                 return {k: serialize(v) for k, v in o.items()}
             # 如果对象是列表或元组，递归处理每个元素
@@ -302,15 +302,49 @@ def get_current_country():
 
 
 def time_to_seconds(time_str: str) -> float:
-    parts = time_str.split(':')
-    if len(parts) == 2:
-        m, s = map(float, parts)
-        return m * 60 + s
-    elif len(parts) == 3:
-        h, m, s = map(float, parts)
-        return h * 3600 + m * 60 + s
-    else:
-        raise ValueError(f"Invalid time format: {time_str}")
+    """
+    将时间字符串转换为秒数，支持多种格式：
+    - "HH:MM:SS,mmm" -> 小时:分钟:秒,毫秒
+    - "MM:SS,mmm" -> 分钟:秒,毫秒
+    - "SS,mmm" -> 秒,毫秒
+    - "SS-mmm" -> 秒-毫秒
+    
+    Args:
+        time_str: 时间字符串
+        
+    Returns:
+        float: 转换后的秒数(包含毫秒)
+    """
+    try:
+        # 处理带有'-'的毫秒格式
+        if '-' in time_str:
+            time_part, ms_part = time_str.split('-')
+            ms = float(ms_part) / 1000
+        # 处理带有','的毫秒格式
+        elif ',' in time_str:
+            time_part, ms_part = time_str.split(',')
+            ms = float(ms_part) / 1000
+        else:
+            time_part = time_str
+            ms = 0
+
+        # 分割时间部分
+        parts = time_part.split(':')
+        
+        if len(parts) == 3:  # HH:MM:SS
+            h, m, s = map(float, parts)
+            seconds = h * 3600 + m * 60 + s
+        elif len(parts) == 2:  # MM:SS
+            m, s = map(float, parts)
+            seconds = m * 60 + s
+        else:  # SS
+            seconds = float(parts[0])
+
+        return seconds + ms
+        
+    except (ValueError, IndexError) as e:
+        logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
+        return 0.0
 
 
 def seconds_to_time(seconds: float) -> str:
@@ -520,3 +554,21 @@ def download_font(url: str, font_path: str):
     except Exception as e:
         logger.error(f"下载字体文件失败: {e}")
         raise
+
+def init_imagemagick():
+    """初始化 ImageMagick 配置"""
+    try:
+        # 检查 ImageMagick 是否已安装
+        import subprocess
+        result = subprocess.run(['magick', '-version'], capture_output=True, text=True)
+        if result.returncode != 0:
+            logger.error("ImageMagick 未安装或配置不正确")
+            return False
+            
+        # 设置 IMAGEMAGICK_BINARY 环境变量
+        os.environ['IMAGEMAGICK_BINARY'] = 'magick'
+        
+        return True
+    except Exception as e:
+        logger.error(f"初始化 ImageMagick 失败: {str(e)}")
+        return False
diff --git a/video_pipeline.py b/video_pipeline.py
index 5dca576..dc7fa26 100644
--- a/video_pipeline.py
+++ b/video_pipeline.py
@@ -93,10 +93,8 @@ class VideoPipeline:
         response.raise_for_status()
         return response.json()
     
-    def save_script_to_json(self, script: list, script_name: str) -> str:
-        """保存脚本到json文件"""
-        script_path = f"E:\\projects\\NarratoAI\\resource\\scripts\\{script_name}.json"
-        
+    def save_script_to_json(self, script: list, script_path: str) -> str:
+        """保存脚本到json文件"""        
         try:
             with open(script_path, 'w', encoding='utf-8') as f:
                 json.dump(script, f, ensure_ascii=False, indent=2)
@@ -133,8 +131,7 @@ class VideoPipeline:
             
             # 2.2 保存脚本到json文件
             print("保存脚本到json文件...")
-            script_path = self.save_script_to_json(script, script_name)
-            script_result["script_path"] = script_path
+            self.save_script_to_json(script=script, script_path=script_path)
             
             # 3. 剪辑视频
             print("开始剪辑视频...")
@@ -143,7 +140,7 @@ class VideoPipeline:
             
             # 4. 生成最终视频
             print("开始生成最终视频...")
-            final_result = self.generate_final_video(
+            self.generate_final_video(
                 task_id=task_id,
                 video_path=video_path,
                 script_path=script_path,
diff --git a/webui.txt b/webui.txt
index b64b320..c8d66c9 100644
--- a/webui.txt
+++ b/webui.txt
@@ -369,4 +369,6 @@ output_path和script参数需要传递给请求3
   }
 }
 subclip_videos和 output_path和script参数需要传递给请求4
-最后完成工作流
\ No newline at end of file
+最后完成工作流
+
+0代表只播放文案音频，禁用视频原声；1代表只播放视频原声，不需要播放文案音频和字幕；2代表既播放文案音频也要播放视频原声。
\ No newline at end of file