feat(webui): 大改动标记1

- 重构音频设置面板,增加语音音量、背景音乐等设置
- 添加背景音乐文件选择功能
- 优化字幕设置,支持自定义字体和样式
- 调整视频生成流程,支持新音频设置
- 更新文档示例,反映新功能
This commit is contained in:
linyq 2024-12-10 18:33:44 +08:00
parent 67bee9d567
commit c065800072
32 changed files with 623 additions and 803 deletions

3
.gitignore vendored
View File

@ -31,4 +31,5 @@ resource/fonts/*.ttc
resource/fonts/*.ttf
resource/fonts/*.otf
resource/srt/*.srt
app/models/faster-whisper-large-v2/*
app/models/faster-whisper-large-v2/*
app/models/bert/*

View File

@ -345,29 +345,29 @@ class VideoClipParams(BaseModel):
# video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
voice_volume: Optional[float] = Field(default=1.0, description="语音音量")
voice_volume: Optional[float] = Field(default=1.0, description="解说语音音量")
voice_rate: Optional[float] = Field(default=1.0, description="语速")
voice_pitch: Optional[float] = Field(default=1.0, description="语调")
bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量")
subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕")
subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center
font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称")
text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色")
text_background_color: Optional[str] = Field(default="transparent", description="文字背景色")
subtitle_enabled: bool = True
font_name: str = "SimHei" # 默认使用黑体
font_size: int = 36
text_fore_color: str = "white" # 文本前景色
text_back_color: Optional[str] = None # 文本背景色
stroke_color: str = "black" # 描边颜色
stroke_width: float = 1.5 # 描边宽度
subtitle_position: str = "bottom" # top, bottom, center, custom
font_size: int = Field(default=60, description="文字大小")
stroke_color: Optional[str] = Field(default="#000000", description="文字描边颜色")
stroke_width: float = Field(default=1.5, description="文字描边宽度")
custom_position: float = Field(default=70.0, description="自定义位置")
n_threads: Optional[int] = Field(default=16, description="解说语音音量") # 线程数,有助于提升视频处理速度
tts_volume: Optional[float] = Field(default=1.0, description="解说语音音量(后处理)")
original_volume: Optional[float] = Field(default=1.0, description="视频原声音量")
bgm_volume: Optional[float] = Field(default=0.6, description="背景音乐音量")
n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度
tts_volume: float = 1.0 # TTS音频音量
video_volume: float = 0.1 # 视频原声音量
class VideoTranscriptionRequest(BaseModel):
video_name: str
@ -376,5 +376,6 @@ class VideoTranscriptionRequest(BaseModel):
class Config:
arbitrary_types_allowed = True
class VideoTranscriptionResponse(BaseModel):
transcription: str

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
app/services/SDP/utils/utils.so Executable file

Binary file not shown.

View File

@ -157,55 +157,6 @@ def get_video_materials(task_id, params, video_terms, audio_duration):
return downloaded_videos
def generate_final_videos(
    task_id, params, downloaded_videos, audio_file, subtitle_path
):
    """
    Combine downloaded clips and render the final video(s) for a task.

    For each of ``params.video_count`` outputs, the downloaded clips are first
    concatenated into ``combined-<i>.mp4``, then the final ``final-<i>.mp4`` is
    rendered with audio and subtitles. Task progress is advanced from 50 to 100
    across both phases.

    Args:
        task_id: task identifier, used to locate the task directory.
        params: video parameters (count, aspect, concat mode, clip duration,
            thread count).
        downloaded_videos: paths of the source video clips to combine.
        audio_file: path of the narration audio file.
        subtitle_path: path of the subtitle file (may be empty).

    Returns:
        tuple: (list of final video paths, list of combined video paths).
    """
    final_video_paths = []
    combined_video_paths = []
    # When rendering more than one video the concat mode is forced to random;
    # the caller-selected mode is honored only for a single output.
    video_concat_mode = (
        params.video_concat_mode if params.video_count == 1 else VideoConcatMode.random
    )
    _progress = 50  # earlier pipeline stages account for the first 50%
    for i in range(params.video_count):
        index = i + 1
        combined_video_path = path.join(
            utils.task_dir(task_id), f"combined-{index}.mp4"
        )
        logger.info(f"\n\n## combining video: {index} => {combined_video_path}")
        video.combine_videos(
            combined_video_path=combined_video_path,
            video_paths=downloaded_videos,
            audio_file=audio_file,
            video_aspect=params.video_aspect,
            video_concat_mode=video_concat_mode,
            max_clip_duration=params.video_clip_duration,
            threads=params.n_threads,
        )
        # Each output contributes two equal progress steps: combine + render.
        _progress += 50 / params.video_count / 2
        sm.state.update_task(task_id, progress=_progress)
        final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
        logger.info(f"\n\n## generating video: {index} => {final_video_path}")
        video.generate_video(
            video_path=combined_video_path,
            audio_path=audio_file,
            subtitle_path=subtitle_path,
            output_file=final_video_path,
            params=params,
        )
        _progress += 50 / params.video_count / 2
        sm.state.update_task(task_id, progress=_progress)
        final_video_paths.append(final_video_path)
        combined_video_paths.append(combined_video_path)
    return final_video_paths, combined_video_paths
def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
"""后台任务(自动剪辑视频进行剪辑)"""
logger.info(f"\n\n## 开始任务: {task_id}")
@ -253,7 +204,12 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
segment for segment in list_script
if segment['OST'] in [0, 2]
]
# logger.debug(f"tts_segments: {tts_segments}")
logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}")
# 初始化音频文件路径
audio_files = []
final_audio = ""
if tts_segments:
audio_files, sub_maker_list = voice.tts_multiple(
task_id=task_id,
@ -267,36 +223,54 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
return
if audio_files:
logger.info(f"合并音频文件: {audio_files}")
try:
# 传入OST信息以便正确处理音频
final_audio = audio_merger.merge_audio_files(
task_id=task_id,
audio_files=audio_files,
total_duration=total_duration,
list_script=list_script # 传入完整脚本以便处理OST
)
logger.info("音频文件合并成功")
except Exception as e:
logger.error(f"合并音频文件失败: {str(e)}")
final_audio = ""
else:
audio_files = []
logger.info(f"合并音频文件:\n{audio_files}")
# 传入OST信息以便正确处理音频
final_audio = audio_merger.merge_audio_files(
task_id=task_id,
audio_files=audio_files,
total_duration=total_duration,
list_script=list_script # 传入完整脚本以便处理OST
)
# 如果没有需要生成TTS的片段创建一个空白音频文件
# 这样可以确保后续的音频处理能正确进行
logger.info("没有需要生成TTS的片段将保留原声和背景音乐")
final_audio = path.join(utils.task_dir(task_id), "empty.mp3")
try:
from moviepy.editor import AudioClip
# 创建一个与视频等长的空白音频
empty_audio = AudioClip(make_frame=lambda t: 0, duration=total_duration)
empty_audio.write_audiofile(final_audio, fps=44100)
logger.info(f"已创建空白音频文件: {final_audio}")
except Exception as e:
logger.error(f"创建空白音频文件失败: {str(e)}")
final_audio = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
# 只为OST=0或2的片段生成字幕
subtitle_path = ""
if params.subtitle_enabled:
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
subtitle.create(
audio_file=final_audio,
subtitle_file=subtitle_path,
)
if audio_files:
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"字幕文件无效: {subtitle_path}")
subtitle_path = ""
subtitle.create(
audio_file=final_audio,
subtitle_file=subtitle_path,
)
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"字幕文件无效: {subtitle_path}")
subtitle_path = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
@ -335,14 +309,44 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
# 传入OST信息以便正确处理音频和视频
video.generate_video_v2(
# 获取背景音乐
bgm_path = None
if params.bgm_type or params.bgm_file:
try:
bgm_path = utils.get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_path:
logger.info(f"使用背景音乐: {bgm_path}")
except Exception as e:
logger.error(f"获取背景音乐失败: {str(e)}")
# 示例:自定义字幕样式
subtitle_style = {
'fontsize': params.font_size, # 字体大小
'color': params.text_fore_color, # 字体颜色
'stroke_color': params.stroke_color, # 描边颜色
'stroke_width': params.stroke_width, # 描边宽度, 范围0-10
'bg_color': params.text_back_color, # 半透明黑色背景
'position': ('center', 0.2), # 距离顶部60%的位置
'method': 'caption' # 渲染方法
}
# 示例:自定义音量配置
volume_config = {
'original': params.original_volume, # 原声音量80%
'bgm': params.bgm_volume, # BGM音量20%
'narration': params.tts_volume # 解说音量100%
}
font_path = utils.font_dir(params.font_name)
video.generate_video_v3(
video_path=combined_video_path,
audio_path=final_audio,
subtitle_path=subtitle_path,
output_file=final_video_path,
params=params,
list_script=list_script # 传入完整脚本以便处理OST
bgm_path=bgm_path,
narration_path=final_audio,
output_path=final_video_path,
volume_config=volume_config, # 添加音量配置
subtitle_style=subtitle_style,
font_path=font_path
)
_progress += 50 / 2
@ -361,6 +365,40 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
return kwargs
def validate_params(video_path, audio_path, output_file, params):
    """
    Validate the inputs for video generation.

    Args:
        video_path: path of the input video file (required).
        audio_path: path of the audio file; may be an empty string to skip audio.
        output_file: path of the output file; its parent directory is created
            when missing.
        params: video parameter object; must be truthy.

    Raises:
        FileNotFoundError: when the video (or a provided audio) file does not exist.
        ValueError: when a required argument is empty/missing.
    """
    if not video_path:
        raise ValueError("视频路径不能为空")

    if not os.path.exists(video_path):
        raise FileNotFoundError(f"视频文件不存在: {video_path}")

    # Audio is optional: only verify the file when a path was supplied.
    if audio_path and not os.path.exists(audio_path):
        raise FileNotFoundError(f"音频文件不存在: {audio_path}")

    if not output_file:
        raise ValueError("输出文件路径不能为空")

    # Ensure the output directory exists. dirname() is "" for a bare file
    # name (current directory) — os.makedirs("") would raise, so skip that
    # case. exist_ok avoids a race when another worker creates it first.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if not params:
        raise ValueError("视频参数不能为空")
if __name__ == "__main__":
# task_id = "test123"
# subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4',

View File

@ -1,186 +1,22 @@
import re
import os
import glob
import random
from typing import List
from typing import Union
import traceback
import pysrt
from typing import Optional
from typing import List
from loguru import logger
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from contextlib import contextmanager
from app.models import const
from app.models.schema import MaterialInfo, VideoAspect, VideoConcatMode, VideoParams, VideoClipParams
from app.utils import utils
from moviepy.editor import (
VideoFileClip,
AudioFileClip,
TextClip,
CompositeVideoClip,
CompositeAudioClip
)
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
    """
    Resolve the background-music file to use.

    Args:
        bgm_type: background-music type; "random" picks a random song,
            "" disables background music.
        bgm_file: explicit path to a music file; returned when it exists.

    Returns:
        str: path of the music file to use, or "" when none applies.
    """
    if not bgm_type:
        return ""

    # An explicit, existing file always wins over random selection.
    if bgm_file and os.path.exists(bgm_file):
        return bgm_file

    if bgm_type != "random":
        return ""

    music_dir = utils.song_dir()
    if not os.path.exists(music_dir):
        logger.warning(f"背景音乐目录不存在: {music_dir}")
        return ""

    # Both MP3 and FLAC files are eligible candidates.
    candidates = [
        f
        for pattern in ("*.mp3", "*.flac")
        for f in glob.glob(os.path.join(music_dir, pattern))
    ]
    if not candidates:
        logger.warning(f"在目录 {music_dir} 中没有找到 MP3 或 FLAC 文件")
        return ""
    return random.choice(candidates)
def combine_videos(
    combined_video_path: str,
    video_paths: List[str],
    audio_file: str,
    video_aspect: VideoAspect = VideoAspect.portrait,
    video_concat_mode: VideoConcatMode = VideoConcatMode.random,
    max_clip_duration: int = 5,
    threads: int = 2,
) -> str:
    """
    Merge multiple video clips into one video matching the audio duration.

    Args:
        combined_video_path: output path of the merged video.
        video_paths: paths of the source videos to merge.
        audio_file: audio file whose duration the merged video must cover.
        video_aspect: target aspect ratio.
        video_concat_mode: clip concatenation mode (random / sequential).
        max_clip_duration: maximum duration of each sub-clip, in seconds.
        threads: number of encoding threads.

    Returns:
        str: the merged video path (same as ``combined_video_path``).
    """
    audio_clip = AudioFileClip(audio_file)
    audio_duration = audio_clip.duration
    logger.info(f"音频时长: {audio_duration}")
    # Required duration of each clip.
    # NOTE(review): the computed per-clip duration is immediately overwritten
    # by max_clip_duration, so the division has no effect — confirm intent.
    req_dur = audio_duration / len(video_paths)
    req_dur = max_clip_duration
    logger.info(f"每个片段最大时长: {req_dur}")
    output_dir = os.path.dirname(combined_video_path)
    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()
    clips = []
    video_duration = 0
    raw_clips = []
    # Split every source video (audio stripped) into sub-clips of at most
    # max_clip_duration seconds. In sequential mode only the first sub-clip
    # of each source is kept.
    for video_path in video_paths:
        clip = VideoFileClip(video_path).without_audio()
        clip_duration = clip.duration
        start_time = 0
        while start_time < clip_duration:
            end_time = min(start_time + max_clip_duration, clip_duration)
            split_clip = clip.subclip(start_time, end_time)
            raw_clips.append(split_clip)
            start_time = end_time
            if video_concat_mode.value == VideoConcatMode.sequential.value:
                break
    # Randomize the clip order.
    if video_concat_mode.value == VideoConcatMode.random.value:
        random.shuffle(raw_clips)
    # Keep appending clips (cycling through raw_clips) until the accumulated
    # video duration reaches the audio duration.
    # NOTE(review): if raw_clips is empty this loop never terminates — confirm
    # callers always pass at least one usable video.
    while video_duration < audio_duration:
        for clip in raw_clips:
            # Trim the clip if it is longer than the remaining audio time.
            if (audio_duration - video_duration) < clip.duration:
                clip = clip.subclip(0, (audio_duration - video_duration))
            # Shorten the clip only when the requested duration (req_dur) is
            # smaller than the actual clip duration.
            elif req_dur < clip.duration:
                clip = clip.subclip(0, req_dur)
            clip = clip.set_fps(30)
            # Not all videos are same size, so we need to resize them
            clip_w, clip_h = clip.size
            if clip_w != video_width or clip_h != video_height:
                clip_ratio = clip.w / clip.h
                video_ratio = video_width / video_height
                if clip_ratio == video_ratio:
                    # Same aspect ratio: a plain resize suffices.
                    clip = clip.resize((video_width, video_height))
                else:
                    # Different aspect ratio: scale proportionally and
                    # letterbox on a black background.
                    if clip_ratio > video_ratio:
                        # Scale to the target width.
                        scale_factor = video_width / clip_w
                    else:
                        # Scale to the target height.
                        scale_factor = video_height / clip_h
                    new_width = int(clip_w * scale_factor)
                    new_height = int(clip_h * scale_factor)
                    clip_resized = clip.resize(newsize=(new_width, new_height))
                    background = ColorClip(
                        size=(video_width, video_height), color=(0, 0, 0)
                    )
                    clip = CompositeVideoClip(
                        [
                            background.set_duration(clip.duration),
                            clip_resized.set_position("center"),
                        ]
                    )
                logger.info(
                    f"调整视频尺寸为 {video_width} x {video_height}, 片段尺寸: {clip_w} x {clip_h}"
                )
            if clip.duration > max_clip_duration:
                clip = clip.subclip(0, max_clip_duration)
            clips.append(clip)
            video_duration += clip.duration
    video_clip = concatenate_videoclips(clips)
    video_clip = video_clip.set_fps(30)
    logger.info("writing")
    video_clip.write_videofile(
        filename=combined_video_path,
        threads=threads,
        logger=None,
        temp_audiofile_path=output_dir,
        audio_codec="aac",
        fps=30,
    )
    video_clip.close()
    logger.success("completed")
    return combined_video_path
from app.models.schema import VideoAspect
def wrap_text(text, max_width, font, fontsize=60):
@ -269,259 +105,6 @@ def manage_clip(clip):
del clip
def generate_video_v2(
    video_path: str,
    audio_path: str,
    subtitle_path: str,
    output_file: str,
    list_script: list,
    params: Union[VideoParams, VideoClipParams],
    progress_callback=None,
):
    """
    Merge all materials (video, audio, subtitles) into the final video.

    Args:
        video_path: path of the input video.
        audio_path: path of the single audio file to mix in.
        subtitle_path: path of the subtitle file (may not exist).
        output_file: path of the rendered output file.
        list_script: full script list (accepted but not read in this function).
        params: video parameters (aspect, fonts, volumes, threads, ...).
        progress_callback: optional callback receiving a 0-100 progress value.

    Raises:
        FileNotFoundError: when an input file is missing (re-raised).
        Exception: any other rendering failure (logged and re-raised).
    """
    total_steps = 4
    current_step = 0

    def update_progress(step_name):
        # Advance one step and report the overall percentage to the caller.
        nonlocal current_step
        current_step += 1
        if progress_callback:
            progress_callback(int(current_step * 100 / total_steps))
        logger.info(f"完成步骤: {step_name}")

    try:
        validate_params(video_path, audio_path, output_file, params)
        with manage_clip(VideoFileClip(video_path)) as video_clip:
            aspect = VideoAspect(params.video_aspect)
            video_width, video_height = aspect.to_resolution()
            logger.info(f"开始,视频尺寸: {video_width} x {video_height}")
            logger.info(f" ① 视频: {video_path}")
            logger.info(f" ② 音频: {audio_path}")
            logger.info(f" ③ 字幕: {subtitle_path}")
            logger.info(f" ④ 输出: {output_file}")
            output_dir = os.path.dirname(output_file)
            update_progress("初始化完成")
            # Font setup: resolve the subtitle font path (only when subtitles
            # are enabled); normalize path separators for ImageMagick on Windows.
            font_path = ""
            if params.subtitle_enabled:
                if not params.font_name:
                    params.font_name = "STHeitiMedium.ttc"
                font_path = os.path.join(utils.font_dir(), params.font_name)
                if os.name == "nt":
                    font_path = font_path.replace("\\", "/")
                logger.info(f"使用字体: {font_path}")

            def create_text_clip(subtitle_item):
                # Build one positioned TextClip from a ((start, end), text) item.
                phrase = subtitle_item[1]
                max_width = video_width * 0.9
                wrapped_txt, txt_height = wrap_text(
                    phrase, max_width=max_width, font=font_path, fontsize=params.font_size
                )
                _clip = TextClip(
                    wrapped_txt,
                    font=font_path,
                    fontsize=params.font_size,
                    color=params.text_fore_color,
                    bg_color=params.text_background_color,
                    stroke_color=params.stroke_color,
                    stroke_width=params.stroke_width,
                    print_cmd=False,
                )
                duration = subtitle_item[0][1] - subtitle_item[0][0]
                _clip = _clip.set_start(subtitle_item[0][0])
                _clip = _clip.set_end(subtitle_item[0][1])
                _clip = _clip.set_duration(duration)
                if params.subtitle_position == "bottom":
                    _clip = _clip.set_position(("center", video_height * 0.95 - _clip.h))
                elif params.subtitle_position == "top":
                    _clip = _clip.set_position(("center", video_height * 0.05))
                elif params.subtitle_position == "custom":
                    # custom_position is a percentage of the usable height,
                    # clamped so the text stays fully on screen.
                    margin = 10
                    max_y = video_height - _clip.h - margin
                    min_y = margin
                    custom_y = (video_height - _clip.h) * (params.custom_position / 100)
                    custom_y = max(min_y, min(custom_y, max_y))
                    _clip = _clip.set_position(("center", custom_y))
                else:  # center
                    _clip = _clip.set_position(("center", "center"))
                return _clip

            update_progress("字体设置完成")
            # Audio: mix the video's own track with the narration (and BGM).
            original_audio = video_clip.audio
            video_duration = video_clip.duration
            new_audio = AudioFileClip(audio_path)
            final_audio = process_audio_tracks(original_audio, new_audio, params, video_duration)
            update_progress("音频处理完成")
            # Subtitles: overlay only when the subtitle file exists.
            if subtitle_path and os.path.exists(subtitle_path):
                video_clip = process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip)
            update_progress("字幕处理完成")
            # Attach the mixed audio and export (slow step).
            logger.info("开始导出视频 (此步骤耗时较长请耐心等待)")
            video_clip = video_clip.set_audio(final_audio)
            video_clip.write_videofile(
                output_file,
                audio_codec="aac",
                temp_audiofile=os.path.join(output_dir, "temp-audio.m4a"),
                threads=params.n_threads,
                logger=None,
                fps=30,
            )
    except FileNotFoundError as e:
        logger.error(f"文件不存在: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"视频生成失败: {str(e)}")
        raise
    finally:
        logger.success("完成")
def process_audio_tracks(original_audio, new_audio, params, video_duration):
    """
    Mix all audio tracks (original sound, narration, background music).

    Args:
        original_audio: the video's own audio track, or None.
        new_audio: the narration audio clip.
        params: video parameters (volumes, BGM selection).
        video_duration: duration of the video in seconds.

    Returns:
        CompositeAudioClip: the mixed audio.
    """
    narration = new_audio.volumex(params.voice_volume)
    tracks = [] if original_audio is None else [original_audio]
    tracks.append(narration)

    # Background music is best-effort: a failure must not abort the render.
    bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
    if bgm_file:
        try:
            bgm_clip = AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3)
            tracks.append(afx.audio_loop(bgm_clip, duration=video_duration))
        except Exception as e:
            logger.error(f"添加背景音乐失败: {str(e)}")

    return CompositeAudioClip(tracks) if tracks else narration
def process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip):
    """
    Overlay subtitles onto a video clip.

    Args:
        subtitle_path: path of the subtitle file.
        video_clip: the base video clip.
        video_duration: duration of the video in seconds.
        create_text_clip: callback building a TextClip from one subtitle item.

    Returns:
        CompositeVideoClip: the video with subtitles layered on top, or the
        unchanged clip when no subtitle file is available.
    """
    # Nothing to do without a readable subtitle file.
    if not subtitle_path or not os.path.exists(subtitle_path):
        return video_clip

    srt_clip = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8")
    rendered = []
    for entry in srt_clip.subtitles:
        text_clip = create_text_clip(subtitle_item=entry)
        # Clamp each subtitle to [0, video_duration]; drop subtitles that
        # would start after the video ends.
        begin = max(text_clip.start, 0)
        if begin >= video_duration:
            continue
        finish = min(text_clip.end, video_duration)
        rendered.append(text_clip.set_start(begin).set_end(finish))

    logger.info(f"处理了 {len(rendered)} 段字幕")
    return CompositeVideoClip([video_clip, *rendered])
def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
    """
    Preprocess raw materials: convert still images into short zooming videos.

    Args:
        materials: list of material descriptors; each ``material.url`` is a
            local file path and may be rewritten to the generated video path.
        clip_duration: duration of the generated clip, in seconds.

    Returns:
        List[MaterialInfo]: the (mutated) material list.
    """
    for material in materials:
        if not material.url:
            continue
        ext = utils.parse_extension(material.url)
        # Probe the file: if it is not a video, treat it as an image.
        try:
            clip = VideoFileClip(material.url)
        except Exception:
            clip = ImageClip(material.url)
        width = clip.size[0]
        height = clip.size[1]
        # Skip materials that are too small to look acceptable.
        if width < 480 or height < 480:
            logger.warning(f"video is too small, width: {width}, height: {height}")
            continue
        if ext in const.FILE_TYPE_IMAGES:
            logger.info(f"processing image: {material.url}")
            # Build an image clip of clip_duration seconds, centered.
            clip = (
                ImageClip(material.url)
                .set_duration(clip_duration)
                .set_position("center")
            )
            # Add a gradual zoom via resize(): the scale factor grows with t
            # (1.0 = 100%), so the image slowly zooms in over the clip.
            zoom_clip = clip.resize(
                lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
            )
            # Wrap in a composite clip (useful if more elements are added later).
            final_clip = CompositeVideoClip([zoom_clip])
            # Render the image as an .mp4 next to the original file and point
            # the material at the generated video.
            video_file = f"{material.url}.mp4"
            final_clip.write_videofile(video_file, fps=30, logger=None)
            final_clip.close()
            del final_clip
            material.url = video_file
            logger.success(f"completed: {video_file}")
    return materials
def combine_clip_videos(combined_video_path: str,
video_paths: List[str],
video_ost_list: List[int],
@ -640,101 +223,220 @@ def resize_video_with_padding(clip, target_width: int, target_height: int):
])
def validate_params(video_path, audio_path, output_file, params):
def loop_audio_clip(audio_clip: AudioFileClip, target_duration: float) -> AudioFileClip:
"""
验证输入参数
Args:
video_path: 视频文件路径
audio_path: 音频文件路径
output_file: 输出文件路径
params: 视频参数
循环音频片段直到达到目标时长
Raises:
FileNotFoundError: 文件不存在时抛出
ValueError: 参数无效时抛出
参数:
audio_clip: 原始音频片段
target_duration: 目标时长
返回:
循环后的音频片段
"""
# 计算需要循环的次数
loops_needed = int(target_duration / audio_clip.duration) + 1
# 创建足够长的音频
extended_audio = audio_clip
for _ in range(loops_needed - 1):
extended_audio = CompositeAudioClip([
extended_audio,
audio_clip.set_start(extended_audio.duration)
])
# 裁剪到目标时长
return extended_audio.subclip(0, target_duration)
def generate_video_v3(
video_path: str,
subtitle_path: Optional[str] = None,
bgm_path: Optional[str] = None,
narration_path: Optional[str] = None,
output_path: str = "output.mp4",
# 音量相关参数
volume_config: dict = None,
# 字幕相关参数
subtitle_style: dict = None,
font_path: Optional[str] = None
) -> None:
"""
合并视频素材包括视频字幕BGM和解说音频
参数:
video_path: 原视频文件路径
subtitle_path: SRT字幕文件路径可选
bgm_path: 背景音乐文件路径可选
narration_path: 解说音频文件路径可选
output_path: 输出文件路径
volume_config: 音量配置字典可包含以下键
- original: 原声音量0-1默认1.0
- bgm: BGM音量0-1默认0.3
- narration: 解说音量0-1默认1.0
subtitle_style: 字幕样式配置字典可包含以下键
- font: 字体名称
- fontsize: 字体大小
- color: 字体颜色
- stroke_color: 描边颜色
- stroke_width: 描边宽度
- bg_color: 背景色
- position: 位置支持 'top'/'center'/'bottom' (x,y) 坐标
- method: 文字渲染方法
font_path: 字体文件路径.ttf/.otf 等格式
"""
# 检查视频文件是否存在
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
if not os.path.exists(audio_path):
raise FileNotFoundError(f"音频文件不存在: {audio_path}")
# 设置默认音量配置
default_volume = {
'original': 1.0, # 原声音量
'bgm': 0.3, # BGM音量
'narration': 1.0 # 解说音量
}
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
raise FileNotFoundError(f"输出目录不存在: {output_dir}")
# 更新音量配置
if volume_config:
default_volume.update(volume_config)
if not hasattr(params, 'video_aspect'):
raise ValueError("params 缺少必要参数 video_aspect")
# 加载视频
video = VideoFileClip(video_path)
subtitle_clips = []
# 处理字幕(如果提供)
if subtitle_path:
if os.path.exists(subtitle_path):
# 检查字体文件
if font_path and not os.path.exists(font_path):
logger.info(f"警告:字体文件不存在: {font_path},将使用系统默认字体")
font_path = 'Arial'
if __name__ == "__main__":
combined_video_path = "../../storage/tasks/123/combined.mp4"
# 设置默认字幕样式
default_style = {
'font': font_path if font_path else 'Arial',
'fontsize': 24,
'color': 'white',
'stroke_color': 'black',
'stroke_width': 1,
'bg_color': None,
'position': ('center', 'bottom'),
'method': 'label'
}
video_paths = ['../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-00-10_000-00-00-43_039.mp4',
'../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-00-45_439-00-01-01_600.mp4',
'../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-01-07_920-00-01-25_719.mp4',
'../../storage/temp/clip_video/0b545e689a182a91af2163c7c0ca7ca3/vid-00-01-36_959-00-01-53_719.mp4']
video_ost_list = [2, 2, 2, 2]
list_script = [
{
"timestamp": "00:10-00:43",
"picture": "好的,以下是视频画面的客观描述:\n\n视频显示一个男人在一个树木繁茂的地区靠近一个泥土斜坡他穿着一件深色T恤、卡其色长裤和登山靴。他背着一个军绿色背包里面似乎装有头和其他工具。\n\n第一个镜头显示该男子从远处走近斜坡,背对着镜头。下一个镜头特写显示了的背包,一个镐头从背包中伸出来。下一个镜头显示该男子用镐头敲打斜坡。下一个镜头是该男子脚上的特写镜头,他穿着登山靴,正站在泥土斜坡上。最后一个镜显示该男子在斜坡上,仔细地拨开树根和泥土。周围的环境是树木繁茂的,阳光透过树叶照射下来。土壤是浅棕色的,斜坡上有许多树根和植被。",
"narration": "(接上文)好吧,今天我们的男主角,背着一个看似随时要发射军绿色背包,竟然化身“泥土探险家”,在斜坡上挥舞着镐头!他这是准备挖宝还是给树根做个“美容”?阳光洒下来,简直是自然界的聚光灯,仿佛在说:“快来看看,这位勇士要挑战泥土极限!”我只能默默想,如果树根能说话,它们一定会喊:“别打我,我还有家人!”这就是生活,总有些搞笑的瞬间等着我们去发现!",
"OST": 2,
"new_timestamp": "00:00:00,000-00:00:33,000"
},
{
"timestamp": "00:45-01:01",
"picture": "好的以下是视频画面的客观描述:\n\n视频显示了一个人在森林里挖掘。\n\n第一个镜头是地面特写显示出松<EFBFBD><EFBFBD>的泥土、碎石和落叶。光线照在部分区域。\n\n第二个镜头中,一模糊不清的蹲一个树根旁挖掘,一个橄榄绿色的背包放在地上。树根缠绕着常春藤。\n\n第三个镜头显示该人在一个更开阔的区域挖掘,那里有一些树根,以及部分倒的树干。他起来像是在挖掘一个较大的坑。\n\n第四个镜头是特写镜头,显示该人用工具清理土坑的墙壁。\n\n第五个镜头是土坑内部的特写镜头,可以看到土质的纹理,有一些小树根和它植被的残留物。",
"narration": "现在,这位勇敢的挖掘者就像个“现代版的土豆农夫”,在林里开辟新天地。的目标是什么?挖一个宝藏还块“树根披萨”?小心哦,别让树根追着你喊:“不要挖我,我也是有故事的!”",
"OST": 2,
"new_timestamp": "00:00:33,000-00:00:49,000"
},
{
"timestamp": "01:07-01:25",
"picture": "好,以下是视频画面的客观描述:\n\n画面1特写镜头显示出一丛带有水珠的深绿色灌木叶片。叶片呈椭圆形边缘光滑。背景是树根和泥土。\n\n画面2一个留着胡子的男人正在一个森林中土坑里挖掘。他穿着黑色T恤和卡其色裤子跪在地用具挖掘泥土。周围环绕着树木、树根和灌木。一个倒下的树干横跨土坑上方。\n\n画面3同一个男人坐在他刚才挖的坑的边缘看着前方。他的表情似乎略带沉思。背景与画面2相同。\n\n画面4一个广角镜头显示出他挖出的坑。这是一个不规则形状的土坑在树木繁茂的斜坡上。土壤呈深棕色可见树根。\n\n画面5同一个男人跪在地上用一把小斧头砍一根木头。他穿着与前几个画面相同的衣服。地面上覆盖着落叶。周围是树木和灌木。",
"narration": "“哎呀,这片灌木叶子滴水如雨,感觉像是大自然的洗发水广告!但我这位‘挖宝达人’似乎更适合拍个‘森林里的单身狗’真人秀。等会儿,我要给树根唱首歌,听说它们爱音乐!”",
"OST": 2,
"new_timestamp": "00:00:49,000-00:01:07,000"
},
{
"timestamp": "01:36-01:53",
"picture": "好的,以下是视频画面内容的客观描述:\n\n视频包含三个镜头:\n\n**镜头一:**个小型、浅水池塘,位于树林中。池塘的水看起来浑浊,呈绿褐色。池塘周围遍布泥土和落叶。多根树枝和树干横跨池塘,部分浸没在水中。周围的植被茂密主要是深色树木和灌木。\n\n**镜头二:**距拍摄树深处,阳光透过树叶洒落在植被上。镜头中可见粗大的树干、树枝和各种绿叶植物。部分树枝似乎被砍断,切口可见。\n\n**镜头三:**近距离特写镜头,聚焦在树枝和绿叶上。叶片呈圆形,颜色为鲜绿色,有些叶片上有缺损。树枝颜色较深,呈现深褐色。背景是模糊的树林。\n",
"narration": "“好吧,看来我们的‘挖宝达人’终于找到了一‘宝藏’——一个色泽如同绿豆汤的池塘!我敢打赌,这里不仅是小鱼儿的游乐场更是树枝们的‘水疗中心’!下次来这里,我得带上浮潜装备!”",
"OST": 2,
"new_timestamp": "00:01:07,000-00:01:24,000"
}
]
# 合并子视频
# combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script)
if subtitle_style:
if font_path and 'font' not in subtitle_style:
subtitle_style['font'] = font_path
default_style.update(subtitle_style)
cfg = VideoClipParams()
cfg.video_aspect = VideoAspect.portrait
cfg.font_name = "STHeitiMedium.ttc"
cfg.font_size = 60
cfg.stroke_color = "#000000"
cfg.stroke_width = 1.5
cfg.text_fore_color = "#FFFFFF"
cfg.text_background_color = "transparent"
cfg.bgm_type = "random"
cfg.bgm_file = ""
cfg.bgm_volume = 1.0
cfg.subtitle_enabled = True
cfg.subtitle_position = "bottom"
cfg.n_threads = 2
cfg.video_volume = 1
try:
subs = pysrt.open(subtitle_path)
logger.info(f"读取到 {len(subs)} 条字幕")
cfg.voice_volume = 1.0
for index, sub in enumerate(subs):
start_time = sub.start.ordinal / 1000
end_time = sub.end.ordinal / 1000
video_path = "../../storage/tasks/123/combined.mp4"
audio_path = "../../storage/tasks/123/final_audio.mp3"
subtitle_path = "../../storage/tasks/123/subtitle.srt"
output_file = "../../storage/tasks/123/final-123.mp4"
try:
# 检查字幕文本是否为空
if not sub.text or sub.text.strip() == '':
logger.info(f"警告:第 {index + 1} 条字幕内容为空,已跳过")
continue
# 处理字幕文本:确保是字符串,并处理可能的列表情况
if isinstance(sub.text, (list, tuple)):
subtitle_text = ' '.join(str(item) for item in sub.text if item is not None)
else:
subtitle_text = str(sub.text)
subtitle_text = subtitle_text.strip()
if not subtitle_text:
logger.info(f"警告:第 {index + 1} 条字幕处理后为空,已跳过")
continue
# 计算位置
if isinstance(default_style['position'], tuple):
pos_x, pos_y = default_style['position']
if isinstance(pos_y, float):
y_pos = int(video.h * pos_y)
position = (pos_x, y_pos)
else:
position = default_style['position']
else:
position = default_style['position']
# 创建基本的 TextClip
text_clip = (TextClip(
subtitle_text,
font=default_style['font'],
fontsize=default_style['fontsize'],
color=default_style['color']
)
.set_position(position)
.set_duration(end_time - start_time)
.set_start(start_time))
subtitle_clips.append(text_clip)
except Exception as e:
logger.info(f"警告:创建第 {index + 1} 条字幕时出错: {str(e)}")
logger.info(f"成功创建 {len(subtitle_clips)} 条字幕剪辑")
except Exception as e:
logger.info(f"警告:处理字幕文件时出错: {str(e)}")
else:
logger.info(f"提示:字幕文件不存在: {subtitle_path}")
# 合并音频
audio_clips = []
# 添加原声(设置音量)
if video.audio is not None:
original_audio = video.audio.volumex(default_volume['original'])
audio_clips.append(original_audio)
# 添加BGM如果提供
if bgm_path:
bgm = AudioFileClip(bgm_path)
if bgm.duration < video.duration:
bgm = loop_audio_clip(bgm, video.duration)
else:
bgm = bgm.subclip(0, video.duration)
bgm = bgm.volumex(default_volume['bgm'])
audio_clips.append(bgm)
# 添加解说音频(如果提供)
if narration_path:
narration = AudioFileClip(narration_path).volumex(default_volume['narration'])
audio_clips.append(narration)
# 合成最终视频(包含字幕)
if subtitle_clips:
final_video = CompositeVideoClip([video] + subtitle_clips, size=video.size)
else:
logger.info("警告:没有字幕被添加到视频中")
final_video = video
if audio_clips:
final_audio = CompositeAudioClip(audio_clips)
final_video = final_video.set_audio(final_audio)
# 导出视频
logger.info("开始导出视频...") # 调试信息
final_video.write_videofile(
output_path,
codec='libx264',
audio_codec='aac',
fps=video.fps
)
logger.info(f"视频已导出到: {output_path}") # 调试信息
# 清理资源
video.close()
for clip in subtitle_clips:
clip.close()
if bgm_path:
bgm.close()
if narration_path:
narration.close()
generate_video_v2(video_path=video_path,
audio_path=audio_path,
subtitle_path=subtitle_path,
output_file=output_file,
params=cfg,
list_script=list_script,
)

View File

@ -7,7 +7,6 @@ import asyncio
from loguru import logger
from typing import List
from datetime import datetime
from edge_tts.submaker import mktimestamp
from xml.sax.saxutils import unescape
from edge_tts import submaker, SubMaker
from moviepy.video.tools import subtitles
@ -1199,7 +1198,7 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
except Exception as e:
logger.error(f"failed, error: {str(e)}")
if i < 2: # 如果不是最后一次重试则等待1秒
time.sleep(1)
time.sleep(3)
return None
@ -1318,96 +1317,6 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis
traceback.print_exc()
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
    """
    Build an optimized SRT subtitle file from TTS timing data.

    1. Split the script text into lines at punctuation marks.
    2. Match the TTS word fragments against the script lines one by one.
    3. Write a new subtitle file only when every script line was matched.
    """
    text = _format_text(text)

    def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
        """
        Format one SRT entry, e.g.:

        1
        00:00:00,000 --> 00:00:02,360
        跑步是一项简单易行的运动
        """
        start_t = mktimestamp(start_time).replace(".", ",")
        end_t = mktimestamp(end_time).replace(".", ",")
        return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n"

    start_time = -1.0  # -1 marks "start of next subtitle not yet seen"
    sub_items = []
    sub_index = 0
    script_lines = utils.split_string_by_punctuations(text)

    def match_line(_sub_line: str, _sub_index: int):
        # Compare the accumulated TTS text with the expected script line,
        # progressively relaxing the comparison (exact, punctuation-stripped,
        # then all non-word characters stripped). Returns the matched line or "".
        if len(script_lines) <= _sub_index:
            return ""
        _line = script_lines[_sub_index]
        if _sub_line == _line:
            return script_lines[_sub_index].strip()
        _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line)
        _line_ = re.sub(r"[^\w\s]", "", _line)
        if _sub_line_ == _line_:
            return _line_.strip()
        _sub_line_ = re.sub(r"\W+", "", _sub_line)
        _line_ = re.sub(r"\W+", "", _line)
        if _sub_line_ == _line_:
            return _line.strip()
        return ""

    sub_line = ""
    try:
        # Accumulate TTS fragments until they match the next script line,
        # then emit one SRT entry spanning the accumulated time range.
        for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)):
            _start_time, end_time = offset
            if start_time < 0:
                start_time = _start_time
            sub = unescape(sub)
            sub_line += sub
            sub_text = match_line(sub_line, sub_index)
            if sub_text:
                sub_index += 1
                line = formatter(
                    idx=sub_index,
                    start_time=start_time,
                    end_time=end_time,
                    sub_text=sub_text,
                )
                sub_items.append(line)
                start_time = -1.0
                sub_line = ""
        # Only write the file when every script line found a match;
        # otherwise the timing would be unreliable.
        if len(sub_items) == len(script_lines):
            with open(subtitle_file, "w", encoding="utf-8") as file:
                file.write("\n".join(sub_items) + "\n")
            try:
                # Sanity-check the written file by parsing it back.
                sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8")
                duration = max([tb for ((ta, tb), txt) in sbs])
                logger.info(
                    f"completed, subtitle file created: {subtitle_file}, duration: {duration}"
                )
            except Exception as e:
                logger.error(f"failed, error: {str(e)}")
                os.remove(subtitle_file)
        else:
            logger.warning(
                f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}"
            )
    except Exception as e:
        logger.error(f"failed, error: {str(e)}")
def get_audio_duration(sub_maker: submaker.SubMaker):
"""
获取音频时长
@ -1466,20 +1375,3 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
logger.info(f"已生成音频文件: {audio_file}")
return audio_files, sub_maker_list
if __name__ == "__main__":
    # Manual smoke test: synthesize audio for each script segment and build a
    # merged subtitle file. Paths and the task id are hard-coded for local runs.
    voice_name = "zh-CN-YunyangNeural"
    # voice_name = "af-ZA-AdriNeural"
    voice_name = parse_voice_name(voice_name)
    print(voice_name)

    # Load the clip script: a list of segments carrying at least the
    # 'narration' and 'OST' fields used below.
    with open("../../resource/scripts/2024-1203-205442.json", 'r', encoding='utf-8') as f:
        data = json.load(f)

    audio_files, sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1, voice_pitch=1)
    # Only narrated segments (falsy 'OST') contribute subtitle text.
    full_text = " ".join([item['narration'] for item in data if not item['OST']])
    subtitle_file = os.path.join(utils.task_dir("12312312"), "subtitle_multiple.srt")
    create_subtitle_from_multiple(full_text, sub_maker_list, data, subtitle_file)
    print(f"生成的音频文件列表: {audio_files}")

View File

@ -117,6 +117,47 @@ def song_dir(sub_dir: str = ""):
return d
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
    """Resolve which background-music file to use.

    Args:
        bgm_type: "random" picks a track from the song directory; an empty
            string disables background music entirely.
        bgm_file: explicit path to a music file; used as-is when it exists.

    Returns:
        str: path of the chosen music file, or "" when none applies.
    """
    import glob
    import random

    # No background music requested at all.
    if not bgm_type:
        return ""

    # An explicit, existing file always wins over random selection.
    if bgm_file and os.path.exists(bgm_file):
        return bgm_file

    # Any type other than "random" (with no usable file) yields no music.
    if bgm_type != "random":
        return ""

    song_dir_path = song_dir()
    if not os.path.exists(song_dir_path):
        logger.warning(f"背景音乐目录不存在: {song_dir_path}")
        return ""

    # Collect both supported formats (mp3 + flac) from the directory.
    candidates = []
    for pattern in ("*.mp3", "*.flac"):
        candidates.extend(glob.glob(os.path.join(song_dir_path, pattern)))

    if not candidates:
        logger.warning(f"在目录 {song_dir_path} 中没有找到 MP3 或 FLAC 文件")
        return ""
    return random.choice(candidates)
def public_dir(sub_dir: str = ""):
d = resource_dir(f"public")
if sub_dir:
@ -339,7 +380,7 @@ def time_to_seconds(time_str: str) -> float:
# 分割时间部分
parts = time_part.split(':')
if len(parts) == 3: # HH:MM:SS
h, m, s = map(float, parts)
seconds = h * 3600 + m * 60 + s
@ -350,7 +391,7 @@ def time_to_seconds(time_str: str) -> float:
seconds = float(parts[0])
return seconds + ms
except (ValueError, IndexError) as e:
logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
return 0.0
@ -373,16 +414,16 @@ def calculate_total_duration(scenes):
float: 总时长
"""
total_seconds = 0
for scene in scenes:
start, end = scene['timestamp'].split('-')
# 使用 time_to_seconds 函数处理更精确的时间格式
start_seconds = time_to_seconds(start)
end_seconds = time_to_seconds(end)
duration = end_seconds - start_seconds
total_seconds += duration
return total_seconds
@ -502,7 +543,7 @@ def clear_keyframes_cache(video_path: str = None):
keyframes_dir = os.path.join(temp_dir(), "keyframes")
if not os.path.exists(keyframes_dir):
return
if video_path:
# 理指定视频的缓存
video_hash = md5(video_path + str(os.path.getmtime(video_path)))
@ -516,7 +557,7 @@ def clear_keyframes_cache(video_path: str = None):
import shutil
shutil.rmtree(keyframes_dir)
logger.info("已清理所有关键帧缓存")
except Exception as e:
logger.error(f"清理关键帧缓存失败: {e}")
@ -527,15 +568,16 @@ def init_resources():
# 创建字体目录
font_dir = os.path.join(root_dir(), "resource", "fonts")
os.makedirs(font_dir, exist_ok=True)
# 检查字体文件
font_files = [
("SourceHanSansCN-Regular.otf", "https://github.com/adobe-fonts/source-han-sans/raw/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf"),
("SourceHanSansCN-Regular.otf",
"https://github.com/adobe-fonts/source-han-sans/raw/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf"),
("simhei.ttf", "C:/Windows/Fonts/simhei.ttf"), # Windows 黑体
("simkai.ttf", "C:/Windows/Fonts/simkai.ttf"), # Windows 楷体
("simsun.ttc", "C:/Windows/Fonts/simsun.ttc"), # Windows 宋体
]
# 优先使用系统字体
system_font_found = False
for font_name, source in font_files:
@ -547,16 +589,17 @@ def init_resources():
logger.info(f"已复制系统字体: {font_name}")
system_font_found = True
break
# 如果没有找到系统字体,则下载思源黑体
if not system_font_found:
source_han_path = os.path.join(font_dir, "SourceHanSansCN-Regular.otf")
if not os.path.exists(source_han_path):
download_font(font_files[0][1], source_han_path)
except Exception as e:
logger.error(f"初始化资源文件失败: {e}")
def download_font(url: str, font_path: str):
"""下载字体文件"""
try:
@ -564,16 +607,17 @@ def download_font(url: str, font_path: str):
import requests
response = requests.get(url)
response.raise_for_status()
with open(font_path, 'wb') as f:
f.write(response.content)
logger.info(f"字体文件下载成功: {font_path}")
except Exception as e:
logger.error(f"下载字体文件失败: {e}")
raise
def init_imagemagick():
"""初始化 ImageMagick 配置"""
try:
@ -583,10 +627,10 @@ def init_imagemagick():
if result.returncode != 0:
logger.error("ImageMagick 未安装或配置不正确")
return False
# 设置 IMAGEMAGICK_BINARY 环境变量
os.environ['IMAGEMAGICK_BINARY'] = 'magick'
return True
except Exception as e:
logger.error(f"初始化 ImageMagick 失败: {str(e)}")

View File

@ -11,8 +11,13 @@
vision_gemini_api_key = ""
vision_gemini_model_name = "gemini-1.5-flash"
########## Vision Qwen API Key
vision_qwenvl_api_key = ""
vision_qwenvl_model_name = "qwen-vl-max-latest"
vision_qwenvl_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
########### Vision NarratoAPI Key
narrato_api_key = ""
narrato_api_key = "" # SECURITY: never commit a real API key; set this locally
narrato_api_url = "https://narratoinsight.scsmtech.cn/api/v1"
narrato_vision_model = "gemini-1.5-flash"
narrato_vision_key = ""
@ -32,9 +37,7 @@
########## OpenAI API Key
# Get your API key at https://platform.openai.com/api-keys
text_openai_api_key = ""
# No need to set it unless you want to use your own proxy
text_openai_base_url = ""
# Check your available models at https://platform.openai.com/account/limits
text_openai_base_url = "https://api.openai.com/v1"
text_openai_model_name = "gpt-4o-mini"
########## Moonshot API Key
@ -66,7 +69,8 @@
# https://tongyi.aliyun.com/qianwen/
# https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction
text_qwen_api_key = ""
text_qwen_model_name = "qwen-max"
text_qwen_model_name = "qwen-plus-1127"
text_qwen_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
########## DeepSeek API Key
# Visit https://platform.deepseek.com/api_keys to get your API key

View File

@ -1,7 +1,6 @@
requests~=2.31.0
moviepy==2.0.0.dev2
faster-whisper~=1.0.1
edge_tts~=6.1.15
uvicorn~=0.27.1
fastapi~=0.115.4
tomli~=2.0.1
@ -35,3 +34,5 @@ tiktoken==0.8.0
yt-dlp==2024.11.18
pysrt==1.1.2
httpx==0.27.2
transformers==4.47.0
edge-tts==6.1.19

View File

@ -6,23 +6,25 @@ from app.services import voice
from app.utils import utils
from webui.utils.cache import get_songs_cache
def render_audio_panel(tr):
    """Render the audio settings panel.

    Args:
        tr: translation function mapping UI label keys to localized text.
    """
    with st.container(border=True):
        st.write(tr("Audio Settings"))

        # TTS (text-to-speech) settings.
        render_tts_settings(tr)

        # Background music settings.
        render_bgm_settings(tr)
def render_tts_settings(tr):
"""渲染TTS(文本转语音)设置"""
# 获取支持的语音列表
support_locales = ["zh-CN"]
voices = voice.get_all_azure_voices(filter_locals=support_locales)
# 创建友好的显示名称
friendly_names = {
v: v.replace("Female", tr("Female"))
@ -30,11 +32,11 @@ def render_tts_settings(tr):
.replace("Neural", "")
for v in voices
}
# 获取保存的语音设置
saved_voice_name = config.ui.get("voice_name", "")
saved_voice_name_index = 0
if saved_voice_name in friendly_names:
saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
else:
@ -56,7 +58,7 @@ def render_tts_settings(tr):
voice_name = list(friendly_names.keys())[
list(friendly_names.values()).index(selected_friendly_name)
]
# 保存设置
config.ui["voice_name"] = voice_name
@ -70,34 +72,40 @@ def render_tts_settings(tr):
# 试听按钮
render_voice_preview(tr, voice_name)
def render_azure_v2_settings(tr):
"""渲染Azure V2语音设置"""
saved_azure_speech_region = config.azure.get("speech_region", "")
saved_azure_speech_key = config.azure.get("speech_key", "")
azure_speech_region = st.text_input(
tr("Speech Region"),
tr("Speech Region"),
value=saved_azure_speech_region
)
azure_speech_key = st.text_input(
tr("Speech Key"),
value=saved_azure_speech_key,
tr("Speech Key"),
value=saved_azure_speech_key,
type="password"
)
config.azure["speech_region"] = azure_speech_region
config.azure["speech_key"] = azure_speech_key
def render_voice_parameters(tr):
"""渲染语音参数设置"""
# 音量
voice_volume = st.selectbox(
voice_volume = st.slider(
tr("Speech Volume"),
options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0],
index=2,
min_value=0.0,
max_value=2.0,
value=1.0,
step=0.1,
help=tr("Adjust the volume of the original audio")
)
st.session_state['voice_volume'] = voice_volume
# 语速
voice_rate = st.selectbox(
tr("Speech Rate"),
@ -114,6 +122,7 @@ def render_voice_parameters(tr):
)
st.session_state['voice_pitch'] = voice_pitch
def render_voice_preview(tr, voice_name):
"""渲染语音试听功能"""
if st.button(tr("Play Voice")):
@ -122,11 +131,11 @@ def render_voice_preview(tr, voice_name):
play_content = st.session_state.get('video_script', '')
if not play_content:
play_content = tr("Voice Example")
with st.spinner(tr("Synthesizing Voice")):
temp_dir = utils.storage_dir("temp", create=True)
audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
sub_maker = voice.tts(
text=play_content,
voice_name=voice_name,
@ -134,7 +143,7 @@ def render_voice_preview(tr, voice_name):
voice_pitch=st.session_state.get('voice_pitch', 1.0),
voice_file=audio_file,
)
# 如果语音文件生成失败,使用默认内容重试
if not sub_maker:
play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
@ -151,6 +160,7 @@ def render_voice_preview(tr, voice_name):
if os.path.exists(audio_file):
os.remove(audio_file)
def render_bgm_settings(tr):
"""渲染背景音乐设置"""
# 背景音乐选项
@ -159,14 +169,14 @@ def render_bgm_settings(tr):
(tr("Random Background Music"), "random"),
(tr("Custom Background Music"), "custom"),
]
selected_index = st.selectbox(
tr("Background Music"),
index=1,
options=range(len(bgm_options)),
format_func=lambda x: bgm_options[x][0],
)
# 获取选择的背景音乐类型
bgm_type = bgm_options[selected_index][1]
st.session_state['bgm_type'] = bgm_type
@ -176,15 +186,19 @@ def render_bgm_settings(tr):
custom_bgm_file = st.text_input(tr("Custom Background Music File"))
if custom_bgm_file and os.path.exists(custom_bgm_file):
st.session_state['bgm_file'] = custom_bgm_file
# 背景音乐音量
bgm_volume = st.selectbox(
bgm_volume = st.slider(
tr("Background Music Volume"),
options=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
index=2,
min_value=0.0,
max_value=2.0,
value=1.0,
step=0.1,
help=tr("Adjust the volume of the original audio")
)
st.session_state['bgm_volume'] = bgm_volume
def get_audio_params():
"""获取音频参数"""
return {
@ -195,4 +209,4 @@ def get_audio_params():
'bgm_type': st.session_state.get('bgm_type', 'random'),
'bgm_file': st.session_state.get('bgm_file', ''),
'bgm_volume': st.session_state.get('bgm_volume', 0.2),
}
}

View File

@ -149,6 +149,7 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
else:
return False, f"{tr('Unsupported provider')}: {provider}"
def render_vision_llm_settings(tr):
"""渲染视频分析模型设置"""
st.subheader(tr("Vision Model Settings"))
@ -196,7 +197,7 @@ def render_vision_llm_settings(tr):
elif vision_provider == 'qwenvl':
st_vision_base_url = st.text_input(
tr("Vision Base URL"),
value=vision_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1",
value=vision_base_url,
help=tr("Default: https://dashscope.aliyuncs.com/compatible-mode/v1")
)
st_vision_model_name = st.text_input(

View File

@ -2,12 +2,15 @@ import os
import glob
import json
import time
import traceback
import streamlit as st
from loguru import logger
from app.config import config
from app.models.schema import VideoClipParams
from app.utils import utils, check_script
from webui.tools.generate_script_docu import generate_script_docu
from webui.tools.generate_script_short import generate_script_short
def render_script_panel(tr):
@ -34,6 +37,7 @@ def render_script_file(tr, params):
script_list = [
(tr("None"), ""),
(tr("Auto Generate"), "auto"),
(tr("Short Generate"), "short"),
(tr("Upload Script"), "upload_script") # 新增上传脚本选项
]
@ -216,7 +220,9 @@ def render_script_buttons(tr, params):
script_path = st.session_state.get('video_clip_json_path', '')
if script_path == "auto":
button_name = tr("Generate Video Script")
elif script_path:
elif script_path == "short":
button_name = tr("Generate Short Video Script")
elif script_path.endswith("json"):
button_name = tr("Load Video Script")
else:
button_name = tr("Please Select Script File")
@ -224,6 +230,8 @@ def render_script_buttons(tr, params):
if st.button(button_name, key="script_action", disabled=not script_path):
if script_path == "auto":
generate_script_docu(tr, params)
elif script_path == "short":
generate_script_short(tr, params)
else:
load_script(tr, script_path)
@ -275,6 +283,7 @@ def load_script(tr, script_path):
st.success(tr("Script loaded successfully"))
st.rerun()
except Exception as e:
logger.error(f"加载脚本文件时发生错误\n{traceback.format_exc()}")
st.error(f"{tr('Failed to load script')}: {str(e)}")
@ -332,3 +341,14 @@ def crop_video(tr, params):
time.sleep(2)
progress_bar.empty()
status_text.empty()
def get_script_params():
    """Collect the script-related settings from the Streamlit session state."""
    keys = (
        'video_language',
        'video_clip_json_path',
        'video_origin_path',
        'video_name',
        'video_plot',
    )
    # Every setting falls back to an empty string when the UI has not set it.
    return {key: st.session_state.get(key, '') for key in keys}

View File

@ -1,6 +1,7 @@
import streamlit as st
from app.models.schema import VideoClipParams, VideoAspect
def render_video_panel(tr):
"""渲染视频配置面板"""
with st.container(border=True):
@ -8,6 +9,7 @@ def render_video_panel(tr):
params = VideoClipParams()
render_video_config(tr, params)
def render_video_config(tr, params):
"""渲染视频配置"""
# 视频比例
@ -39,9 +41,20 @@ def render_video_config(tr, params):
)
st.session_state['video_quality'] = video_qualities[quality_index][1]
# 原声音量
params.original_volume = st.slider(
tr("Original Volume"),
min_value=0.0,
max_value=2.0,
value=1.0,
step=0.1,
help=tr("Adjust the volume of the original audio")
)
def get_video_params():
    """Collect the video-related settings from the Streamlit session state."""
    aspect = st.session_state.get('video_aspect', VideoAspect.portrait.value)
    quality = st.session_state.get('video_quality', '1080p')
    return {
        'video_aspect': aspect,
        'video_quality': quality,
    }

View File

@ -2,13 +2,12 @@
"Language": "简体中文",
"Translation": {
"Video Script Configuration": "**视频脚本配置**",
"Generate Video Script": "生成视频脚本",
"Generate Video Script": "AI生成画面解说脚本",
"Video Subject": "视频主题(给定一个关键词,:red[AI自动生成]视频文案)",
"Script Language": "生成视频脚本的语言一般情况AI会自动根据你输入的主题语言输出",
"Script Files": "脚本文件",
"Generate Video Script and Keywords": "点击使用AI根据**主题**生成 【视频文案】 和 【视频关键词】",
"Auto Detect": "自动检测",
"Auto Generate": "自动生成",
"Video Theme": "视频主题",
"Generation Prompt": "自定义提示词",
"Save Script": "保存脚本",
@ -188,6 +187,11 @@
"Transcription Failed": "转录失败",
"Mergeable Files": "可合并文件数",
"Subtitle Content": "字幕内容",
"Merge Result Preview": "合并结果预览"
"Merge Result Preview": "合并结果预览",
"Short Generate": "短剧混剪 (高燃剪辑, 当前只支持 gpt-4o 模型)",
"Generate Short Video Script": "AI生成短剧混剪脚本",
"Adjust the volume of the original audio": "调整原始音频的音量",
"Original Volume": "视频音量",
"Auto Generate": "纪录片解说 (画面解说)"
}
}

View File

@ -1,7 +1,11 @@
import os
import requests
import streamlit as st
from loguru import logger
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from app.config import config
from app.utils import gemini_analyzer, qwenvl_analyzer
@ -31,17 +35,6 @@ def create_vision_analyzer(provider, api_key, model, base_url):
raise ValueError(f"不支持的视觉分析提供商: {provider}")
def get_script_params():
"""获取脚本参数"""
return {
'video_language': st.session_state.get('video_language', ''),
'video_clip_json_path': st.session_state.get('video_clip_json_path', ''),
'video_origin_path': st.session_state.get('video_origin_path', ''),
'video_name': st.session_state.get('video_name', ''),
'video_plot': st.session_state.get('video_plot', '')
}
def get_batch_timestamps(batch_files, prev_batch_files=None):
"""
解析一批文件的时间戳范围,支持毫秒级精度
@ -139,3 +132,32 @@ def get_batch_files(keyframe_files, result, batch_size=5):
batch_start = result['batch_index'] * batch_size
batch_end = min(batch_start + batch_size, len(keyframe_files))
return keyframe_files[batch_start:batch_end]
def chekc_video_config(video_params):
    """
    Report the video analysis configuration to the NarratoAPI service.

    NOTE(review): the name contains a typo ("chekc" -> "check"); it is kept
    unchanged because other modules already import it under this name.

    Args:
        video_params: dict with the vision/text model configuration to report.

    Returns:
        bool: True when the endpoint accepted the payload, False on any
        network error or non-2xx HTTP status.
    """
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json'
    }

    # Retry transient server-side failures (5xx) up to 3 times with backoff.
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)

    try:
        response = session.post(
            f"{config.app.get('narrato_api_url')}/video/config",
            headers=headers,
            json=video_params,
            timeout=30,
            verify=True
        )
        # Bug fix: the response status was never inspected, so a 4xx/5xx
        # reply used to be reported as success.
        response.raise_for_status()
        return True
    except Exception as e:
        # Best-effort reporting: never break the caller, but leave a trace.
        logger.warning(f"上报视频分析配置失败: {e}")
        return False

View File

@ -13,7 +13,7 @@ from urllib3.util.retry import Retry
from app.config import config
from app.utils.script_generator import ScriptProcessor
from app.utils import utils, video_processor, video_processor_v2, qwenvl_analyzer
from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps
from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config
def generate_script_docu(tr, params):
@ -117,8 +117,7 @@ def generate_script_docu(tr, params):
elif vision_llm_provider == 'qwenvl':
vision_api_key = st.session_state.get('vision_qwenvl_api_key')
vision_model = st.session_state.get('vision_qwenvl_model_name', 'qwen-vl-max-latest')
vision_base_url = st.session_state.get('vision_qwenvl_base_url',
'https://dashscope.aliyuncs.com/compatible-mode/v1')
vision_base_url = st.session_state.get('vision_qwenvl_base_url')
else:
raise ValueError(f"不支持的视觉分析提供商: {vision_llm_provider}")
@ -228,28 +227,7 @@ def generate_script_docu(tr, params):
"text_model_name": text_model,
"text_base_url": text_base_url or ""
}
headers = {
'accept': 'application/json',
'Content-Type': 'application/json'
}
session = requests.Session()
retry_strategy = Retry(
total=3,
backoff_factor=1,
status_forcelist=[500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
try:
response = session.post(
f"{config.app.get('narrato_api_url')}/video/config",
headers=headers,
json=api_params,
timeout=30,
verify=True
)
except Exception as e:
pass
chekc_video_config(api_params)
custom_prompt = st.session_state.get('custom_prompt', '')
processor = ScriptProcessor(
model_name=text_model,

View File

@ -0,0 +1,85 @@
import os
import json
import time
import asyncio
import traceback
import requests
import streamlit as st
from loguru import logger
from app.config import config
from webui.tools.base import chekc_video_config
from app.services.SDP.generate_script_short import generate_script
def generate_script_short(tr, params):
    """
    Generate a short-drama remix ("短剧混剪") video script from the video's
    subtitle (.srt) file.

    Args:
        tr: translation function for UI labels (currently unused here but kept
            for a uniform panel-callback signature).
        params: clip parameters object; only `video_origin_path` is read.

    Side effects: drives Streamlit progress widgets and stores the generated
    script into st.session_state['video_clip_json'].
    """
    progress_bar = st.progress(0)
    status_text = st.empty()

    def update_progress(progress: float, message: str = ""):
        # `progress` is on a 0-100 scale.
        progress_bar.progress(progress)
        if message:
            status_text.text(f"{progress}% - {message}")
        else:
            status_text.text(f"进度: {progress}%")

    try:
        with st.spinner("正在生成脚本..."):
            # Resolve the configured text-LLM provider and its credentials.
            text_provider = config.app.get('text_llm_provider', 'gemini').lower()
            text_api_key = config.app.get(f'text_{text_provider}_api_key')
            text_model = config.app.get(f'text_{text_provider}_model_name')
            text_base_url = config.app.get(f'text_{text_provider}_base_url')
            # NOTE(review): vision settings are looked up under the *text*
            # provider's name — confirm this is intentional.
            vision_api_key = st.session_state.get(f'vision_{text_provider}_api_key', "")
            vision_model = st.session_state.get(f'vision_{text_provider}_model_name', "")
            vision_base_url = st.session_state.get(f'vision_{text_provider}_base_url', "")
            narrato_api_key = config.app.get('narrato_api_key')

            update_progress(20, "开始准备生成脚本")

            # Derive the subtitle path from the video path by naive string
            # substitution; assumes the subtitle was transcribed beforehand.
            srt_path = params.video_origin_path.replace(".mp4", ".srt").replace("videos", "srt").replace("video", "subtitle")
            if not os.path.exists(srt_path):
                logger.error(f"{srt_path} 文件不存在请检查或重新转录")
                st.error(f"{srt_path} 文件不存在请检查或重新转录")
                st.stop()

            api_params = {
                "vision_api_key": vision_api_key,
                "vision_model_name": vision_model,
                "vision_base_url": vision_base_url or "",
                "text_api_key": text_api_key,
                "text_model_name": text_model,
                "text_base_url": text_base_url or ""
            }
            # Best-effort configuration reporting; the result is ignored.
            chekc_video_config(api_params)

            script = generate_script(
                srt_path=srt_path,
                output_path="resource/scripts/merged_subtitle.json",
                api_key=text_api_key,
                model_name=text_model,
                base_url=text_base_url,
                narrato_api_key=narrato_api_key,
                bert_path="app/models/bert/",
            )
            if script is None:
                st.error("生成脚本失败,请检查日志")
                st.stop()
            logger.info(f"脚本生成完成 {json.dumps(script, ensure_ascii=False, indent=4)}")

            # The generator may return either a parsed list or a JSON string.
            if isinstance(script, list):
                st.session_state['video_clip_json'] = script
            elif isinstance(script, str):
                st.session_state['video_clip_json'] = json.loads(script)

            update_progress(80, "脚本生成完成")

            time.sleep(0.1)
            progress_bar.progress(100)
            status_text.text("脚本生成完成!")
            st.success("视频脚本生成成功!")
    except Exception as err:
        progress_bar.progress(100)
        st.error(f"生成过程中发生错误: {str(err)}")
        logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")