Merge pull request #167 from linyqh/dev0.7

Dev0.7
Committed by viccy on 2025-08-03 21:34:21 +08:00, via GitHub
commit cb02f2c897
30 changed files with 1795 additions and 1186 deletions


@@ -48,6 +48,7 @@ def save_config():
    with open(config_file, "w", encoding="utf-8") as f:
        _cfg["app"] = app
        _cfg["azure"] = azure
+       _cfg["soulvoice"] = soulvoice
        _cfg["ui"] = ui
        f.write(toml.dumps(_cfg))
@@ -57,6 +58,7 @@ app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
proxy = _cfg.get("proxy", {})
azure = _cfg.get("azure", {})
+soulvoice = _cfg.get("soulvoice", {})
ui = _cfg.get("ui", {})
frames = _cfg.get("frames", {})


@@ -546,6 +546,359 @@ def try_fallback_encoding(
        return execute_simple_command(fallback_cmd, timestamp, "generic fallback")
def _process_narration_only_segment(
    video_origin_path: str,
    script_item: Dict,
    tts_map: Dict,
    output_dir: str,
    encoder_config: Dict,
    hwaccel_args: List[str]
) -> Optional[str]:
    """
    Process a narration-only (OST=0) segment:
    - clip dynamically based on the TTS audio duration
    - strip the original audio, producing a silent video
    """
    _id = script_item["_id"]
    timestamp = script_item["timestamp"]

    # Look up the TTS result for this segment
    tts_item = tts_map.get(_id)
    if not tts_item:
        logger.error(f"No TTS result found for segment {_id}")
        return None

    # Parse the start time; compute the end time from the TTS audio duration
    start_time, _ = parse_timestamp(timestamp)
    duration = tts_item["duration"]
    calculated_end_time = calculate_end_time(start_time, duration, extra_seconds=0)

    # Convert to an FFmpeg-compatible time format
    ffmpeg_start_time = start_time.replace(',', '.')
    ffmpeg_end_time = calculated_end_time.replace(',', '.')

    # Build the output file name
    safe_start_time = start_time.replace(':', '-').replace(',', '-')
    safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
    output_filename = f"ost0_vid_{safe_start_time}@{safe_end_time}.mp4"
    output_path = os.path.join(output_dir, output_filename)

    # Build the FFmpeg command - strip the audio
    cmd = _build_ffmpeg_command_with_audio_control(
        video_origin_path, output_path, ffmpeg_start_time, ffmpeg_end_time,
        encoder_config, hwaccel_args, remove_audio=True
    )

    # Run the command
    success = execute_ffmpeg_with_fallback(
        cmd, timestamp, video_origin_path, output_path,
        ffmpeg_start_time, ffmpeg_end_time
    )

    return output_path if success else None
def _process_original_audio_segment(
    video_origin_path: str,
    script_item: Dict,
    output_dir: str,
    encoder_config: Dict,
    hwaccel_args: List[str]
) -> Optional[str]:
    """
    Process an original-audio-only (OST=1) segment:
    - clip exactly to the script timestamp
    - keep the original audio unchanged
    """
    _id = script_item["_id"]
    timestamp = script_item["timestamp"]

    # Clip strictly according to the timestamp
    start_time, end_time = parse_timestamp(timestamp)

    # Convert to an FFmpeg-compatible time format
    ffmpeg_start_time = start_time.replace(',', '.')
    ffmpeg_end_time = end_time.replace(',', '.')

    # Build the output file name
    safe_start_time = start_time.replace(':', '-').replace(',', '-')
    safe_end_time = end_time.replace(':', '-').replace(',', '-')
    output_filename = f"ost1_vid_{safe_start_time}@{safe_end_time}.mp4"
    output_path = os.path.join(output_dir, output_filename)

    # Build the FFmpeg command - keep the original audio
    cmd = _build_ffmpeg_command_with_audio_control(
        video_origin_path, output_path, ffmpeg_start_time, ffmpeg_end_time,
        encoder_config, hwaccel_args, remove_audio=False
    )

    # Run the command
    success = execute_ffmpeg_with_fallback(
        cmd, timestamp, video_origin_path, output_path,
        ffmpeg_start_time, ffmpeg_end_time
    )

    return output_path if success else None
def _process_mixed_segment(
    video_origin_path: str,
    script_item: Dict,
    tts_map: Dict,
    output_dir: str,
    encoder_config: Dict,
    hwaccel_args: List[str]
) -> Optional[str]:
    """
    Process a mixed narration + original-audio (OST=2) segment:
    - clip dynamically based on the TTS audio duration
    - keep the original audio; ensure the video duration equals the TTS audio duration
    """
    _id = script_item["_id"]
    timestamp = script_item["timestamp"]

    # Look up the TTS result for this segment
    tts_item = tts_map.get(_id)
    if not tts_item:
        logger.error(f"No TTS result found for segment {_id}")
        return None

    # Parse the start time; compute the end time from the TTS audio duration
    start_time, _ = parse_timestamp(timestamp)
    duration = tts_item["duration"]
    calculated_end_time = calculate_end_time(start_time, duration, extra_seconds=0)

    # Convert to an FFmpeg-compatible time format
    ffmpeg_start_time = start_time.replace(',', '.')
    ffmpeg_end_time = calculated_end_time.replace(',', '.')

    # Build the output file name
    safe_start_time = start_time.replace(':', '-').replace(',', '-')
    safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
    output_filename = f"ost2_vid_{safe_start_time}@{safe_end_time}.mp4"
    output_path = os.path.join(output_dir, output_filename)

    # Build the FFmpeg command - keep the original audio
    cmd = _build_ffmpeg_command_with_audio_control(
        video_origin_path, output_path, ffmpeg_start_time, ffmpeg_end_time,
        encoder_config, hwaccel_args, remove_audio=False
    )

    # Run the command
    success = execute_ffmpeg_with_fallback(
        cmd, timestamp, video_origin_path, output_path,
        ffmpeg_start_time, ffmpeg_end_time
    )

    return output_path if success else None
def _build_ffmpeg_command_with_audio_control(
    input_path: str,
    output_path: str,
    start_time: str,
    end_time: str,
    encoder_config: Dict[str, str],
    hwaccel_args: List[str] = None,
    remove_audio: bool = False
) -> List[str]:
    """
    Build an FFmpeg command with audio control

    Args:
        input_path: input video path
        output_path: output video path
        start_time: start time
        end_time: end time
        encoder_config: encoder configuration
        hwaccel_args: hardware-acceleration arguments
        remove_audio: whether to strip the audio (True for OST=0)

    Returns:
        List[str]: the ffmpeg command as an argument list
    """
    cmd = ["ffmpeg", "-y"]

    # Hardware-acceleration setup (mirrors the existing logic)
    if encoder_config["video_codec"] == "h264_nvenc":
        # With NVENC, skip hardware decoding to avoid filter-chain problems
        pass
    elif hwaccel_args:
        cmd.extend(hwaccel_args)

    # Input file
    cmd.extend(["-i", input_path])

    # Time range
    cmd.extend(["-ss", start_time, "-to", end_time])

    # Video encoder settings
    cmd.extend(["-c:v", encoder_config["video_codec"]])

    # Audio handling
    if remove_audio:
        # OST=0: strip the audio
        cmd.extend(["-an"])  # -an means no audio stream
        logger.debug("OST=0: stripping the audio stream")
    else:
        # OST=1,2: keep the original audio
        cmd.extend(["-c:a", encoder_config["audio_codec"]])
        cmd.extend(["-ar", "44100", "-ac", "2"])
        logger.debug("OST=1/2: keeping the original audio")

    # Pixel format
    cmd.extend(["-pix_fmt", encoder_config["pixel_format"]])

    # Quality and preset parameters (mirrors the existing logic)
    if encoder_config["video_codec"] == "h264_nvenc":
        cmd.extend(["-preset", encoder_config["preset"]])
        cmd.extend(["-cq", encoder_config["quality_value"]])
        cmd.extend(["-profile:v", "main"])
    elif encoder_config["video_codec"] == "h264_amf":
        cmd.extend(["-quality", encoder_config["preset"]])
        cmd.extend(["-qp_i", encoder_config["quality_value"]])
    elif encoder_config["video_codec"] == "h264_qsv":
        cmd.extend(["-preset", encoder_config["preset"]])
        cmd.extend(["-global_quality", encoder_config["quality_value"]])
    elif encoder_config["video_codec"] == "h264_videotoolbox":
        cmd.extend(["-profile:v", "high"])
        cmd.extend(["-b:v", encoder_config["quality_value"]])
    else:
        # Software encoder (libx264)
        cmd.extend(["-preset", encoder_config["preset"]])
        cmd.extend(["-crf", encoder_config["quality_value"]])

    # Optimization parameters
    cmd.extend(["-avoid_negative_ts", "make_zero"])
    cmd.extend(["-movflags", "+faststart"])

    # Output file
    cmd.append(output_path)

    return cmd
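For illustration, a minimal sketch of what this builder returns for an OST=0 segment, assuming a software (libx264) encoder config and hypothetical file names:

# Sketch only; the encoder_config values are assumed, not taken from get_safe_encoder_config().
example_config = {"video_codec": "libx264", "audio_codec": "aac",
                  "pixel_format": "yuv420p", "preset": "medium", "quality_value": "23"}
cmd = _build_ffmpeg_command_with_audio_control(
    "input.mp4", "ost0_vid_00-00-01-000@00-00-05-500.mp4",
    "00:00:01.000", "00:00:05.500", example_config, remove_audio=True
)
# cmd expands to:
# ffmpeg -y -i input.mp4 -ss 00:00:01.000 -to 00:00:05.500 -c:v libx264 -an
#        -pix_fmt yuv420p -preset medium -crf 23 -avoid_negative_ts make_zero
#        -movflags +faststart ost0_vid_00-00-01-000@00-00-05-500.mp4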
def clip_video_unified(
    video_origin_path: str,
    script_list: List[Dict],
    tts_results: List[Dict],
    output_dir: Optional[str] = None,
    task_id: Optional[str] = None
) -> Dict[str, str]:
    """
    Unified, OST-type-based video clipping strategy - eliminates the double-clipping problem

    Args:
        video_origin_path: path to the original video
        script_list: the full script list, with the info for every segment
        tts_results: list of TTS results (only OST=0 and OST=2 segments)
        output_dir: output directory; auto-generated when None
        task_id: task ID used to build a unique output directory; auto-generated when None

    Returns:
        Dict[str, str]: mapping from segment ID to the clipped video path
    """
    # Check that the video file exists
    if not os.path.exists(video_origin_path):
        raise FileNotFoundError(f"Video file does not exist: {video_origin_path}")

    # If no task_id was given, derive a unique ID from the inputs
    if task_id is None:
        content_for_hash = f"{video_origin_path}_{json.dumps(script_list)}"
        task_id = hashlib.md5(content_for_hash.encode()).hexdigest()

    # Set up the output directory
    if output_dir is None:
        output_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "storage", "temp", "clip_video_unified", task_id
        )

    # Make sure the output directory exists
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Build a fast lookup map for the TTS results
    tts_map = {item['_id']: item for item in tts_results}

    # Detect hardware-acceleration support
    hwaccel_type = check_hardware_acceleration()
    hwaccel_args = []
    if hwaccel_type:
        hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args()
        hwaccel_info = ffmpeg_utils.get_ffmpeg_hwaccel_info()
        logger.info(f"🚀 Using hardware acceleration: {hwaccel_type} ({hwaccel_info.get('message', '')})")
    else:
        logger.info("🔧 Using software encoding")

    # Get the encoder configuration
    encoder_config = get_safe_encoder_config(hwaccel_type)
    logger.debug(f"Encoder configuration: {encoder_config}")

    # Statistics
    total_clips = len(script_list)
    result = {}
    failed_clips = []
    success_count = 0

    logger.info(f"📹 Starting unified video clipping, {total_clips} segments in total")

    for i, script_item in enumerate(script_list, 1):
        _id = script_item.get("_id")
        ost = script_item.get("OST", 0)
        timestamp = script_item["timestamp"]

        logger.info(f"📹 [{i}/{total_clips}] Processing segment ID:{_id}, OST:{ost}, timestamp:{timestamp}")

        try:
            if ost == 0:  # narration-only segment
                output_path = _process_narration_only_segment(
                    video_origin_path, script_item, tts_map, output_dir,
                    encoder_config, hwaccel_args
                )
            elif ost == 1:  # original-audio-only segment
                output_path = _process_original_audio_segment(
                    video_origin_path, script_item, output_dir,
                    encoder_config, hwaccel_args
                )
            elif ost == 2:  # mixed narration + original-audio segment
                output_path = _process_mixed_segment(
                    video_origin_path, script_item, tts_map, output_dir,
                    encoder_config, hwaccel_args
                )
            else:
                logger.warning(f"Unknown OST type: {ost}; skipping segment {_id}")
                continue

            if output_path and os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                result[_id] = output_path
                success_count += 1
                logger.info(f"✅ [{i}/{total_clips}] Segment processed successfully: OST={ost}, ID={_id}")
            else:
                failed_clips.append(f"ID:{_id}, OST:{ost}")
                logger.error(f"❌ [{i}/{total_clips}] Segment processing failed: OST={ost}, ID={_id}")

        except Exception as e:
            failed_clips.append(f"ID:{_id}, OST:{ost}")
            logger.error(f"❌ [{i}/{total_clips}] Segment processing raised an exception: OST={ost}, ID={_id}, error: {str(e)}")

    # Final statistics
    logger.info(f"📊 Unified video clipping finished: {success_count}/{total_clips} succeeded, {len(failed_clips)} failed")

    # Check for failed segments
    if failed_clips:
        logger.warning(f"⚠️ The following segments failed: {failed_clips}")
        if len(failed_clips) == total_clips:
            raise RuntimeError("All video segments failed to process; check the video file and the ffmpeg configuration")
        elif len(failed_clips) > total_clips / 2:
            logger.warning(f"⚠️ More than half of the segments failed ({len(failed_clips)}/{total_clips}); check the hardware-acceleration configuration")

    if success_count > 0:
        logger.info(f"🎉 Unified video clipping task finished! Output directory: {output_dir}")

    return result
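A minimal usage sketch (paths and IDs are hypothetical) showing how the OST types flow through clip_video_unified:

# Sketch only; OST=1 segments need no TTS entry, OST=0/2 segments do.
script_list = [
    {"_id": 1, "timestamp": "00:00:00,600-00:00:07,559", "OST": 0, "narration": "...", "picture": "..."},
    {"_id": 2, "timestamp": "00:00:08,000-00:00:12,000", "OST": 1, "narration": "", "picture": "..."},
]
tts_results = [
    {"_id": 1, "audio_file": "storage/tasks/demo/audio_1.mp3", "subtitle_file": "", "duration": 6.9},
]
clips = clip_video_unified("resource/videos/demo.mp4", script_list, tts_results)
# clips -> {1: ".../ost0_vid_...mp4", 2: ".../ost1_vid_...mp4"}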
def clip_video(
    video_origin_path: str,
    tts_result: List[Dict],
@@ -613,6 +966,49 @@ def clip_video(
        # Compute the true end time from the duration (plus one second of headroom)
        duration = item["duration"]
        # Sanity-check and correct the duration
        if duration <= 0 or duration > 300:  # anything over 5 minutes is treated as invalid
            logger.warning(f"Detected an abnormal duration of {duration}s for segment: {timestamp}")
            # Try to compute the actual duration from the timestamp
            try:
                start_time_str, end_time_str = timestamp.split('-')
                # Parse the start time
                if ',' in start_time_str:
                    time_part, ms_part = start_time_str.split(',')
                    h1, m1, s1 = map(int, time_part.split(':'))
                    ms1 = int(ms_part)
                else:
                    h1, m1, s1 = map(int, start_time_str.split(':'))
                    ms1 = 0
                # Parse the end time
                if ',' in end_time_str:
                    time_part, ms_part = end_time_str.split(',')
                    h2, m2, s2 = map(int, time_part.split(':'))
                    ms2 = int(ms_part)
                else:
                    h2, m2, s2 = map(int, end_time_str.split(':'))
                    ms2 = 0
                # Compute the actual duration
                start_total_ms = (h1 * 3600 + m1 * 60 + s1) * 1000 + ms1
                end_total_ms = (h2 * 3600 + m2 * 60 + s2) * 1000 + ms2
                actual_duration = (end_total_ms - start_total_ms) / 1000.0
                if actual_duration > 0 and actual_duration <= 300:
                    duration = actual_duration
                    logger.info(f"Using the actual duration computed from the timestamp: {duration:.3f}s")
                else:
                    duration = 5.0  # default to 5 seconds
                    logger.warning(f"Timestamp-based duration is also abnormal; using the default duration: {duration}s")
            except Exception as e:
                duration = 5.0  # default to 5 seconds
                logger.warning(f"Duration correction failed; using the default duration: {duration}s, error: {str(e)}")
        calculated_end_time = calculate_end_time(start_time, duration)
        # Convert to an FFmpeg-compatible time format (commas replaced with dots)
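As a worked example of the correction above (values hypothetical):

# "00:00:01,000-00:00:05,500" -> start = 1_000 ms, end = 5_500 ms
actual_duration = (5_500 - 1_000) / 1000.0  # 4.5 s, inside (0, 300], so it replaces the abnormal value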


@@ -57,14 +57,33 @@ class BaseLLMProvider(ABC):
        """Validate the configuration parameters"""
        if not self.api_key:
            raise ConfigurationError("The API key must not be empty", "api_key")

        if not self.model_name:
            raise ConfigurationError("The model name must not be empty", "model_name")

-       if self.model_name not in self.supported_models:
-           from .exceptions import ModelNotSupportedError
-           raise ModelNotSupportedError(self.model_name, self.provider_name)
+       # Check model support
+       self._validate_model_support()
+
+   def _validate_model_support(self):
+       """Validate that the configured model is supported"""
+       from app.config import config
+       from .exceptions import ModelNotSupportedError
+       from loguru import logger
+
+       # Read the model-validation mode from the config
+       strict_model_validation = config.app.get('strict_model_validation', True)
+
+       if self.model_name not in self.supported_models:
+           if strict_model_validation:
+               # Strict mode: raise an exception
+               raise ModelNotSupportedError(self.model_name, self.provider_name)
+           else:
+               # Lenient mode: only log a warning
+               logger.warning(
+                   f"Model {self.model_name} is not in the predefined support list of provider {self.provider_name}, "
+                   f"but lenient validation mode is enabled. Supported models: {self.supported_models}"
+               )

    def _initialize(self):
        """Initialize provider-specific settings; subclasses may override"""
        pass
@@ -77,11 +96,15 @@ class BaseLLMProvider(ABC):
    def _handle_api_error(self, status_code: int, response_text: str) -> LLMServiceError:
        """Handle an API error and return an appropriate exception"""
        from .exceptions import APICallError, RateLimitError, AuthenticationError

        if status_code == 401:
            return AuthenticationError()
        elif status_code == 429:
            return RateLimitError()
+       elif status_code in [502, 503, 504]:
+           return APICallError(f"Server error HTTP {status_code}", status_code, response_text)
+       elif status_code == 524:
+           return APICallError(f"Server processing timeout HTTP {status_code}", status_code, response_text)
        else:
            return APICallError(f"HTTP {status_code}", status_code, response_text)
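The resulting status-code-to-exception mapping, called on any provider instance (provider and body are placeholders):

provider._handle_api_error(401, body)  # -> AuthenticationError
provider._handle_api_error(429, body)  # -> RateLimitError
provider._handle_api_error(503, body)  # -> APICallError("Server error HTTP 503", 503, body)
provider._handle_api_error(524, body)  # -> APICallError("Server processing timeout HTTP 524", 524, body)
provider._handle_api_error(400, body)  # -> APICallError("HTTP 400", 400, body)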


@@ -213,7 +213,8 @@ class LLMConfigValidator:
            "Make sure every API key is configured correctly",
            "Configuring a base_url for each provider is recommended for better stability",
            "Periodically check that model names are up to date",
-           "Configuring multiple providers as fallbacks is recommended"
+           "Configuring multiple providers as fallbacks is recommended",
+           "If a newly released model triggers a MODEL_NOT_SUPPORTED error, set strict_model_validation = false to enable lenient validation mode"
        ]
    }
@@ -252,8 +253,8 @@ class LLMConfigValidator:
        """Get example model names"""
        examples = {
            "gemini": {
-               "vision": ["gemini-2.0-flash-lite", "gemini-2.0-flash"],
-               "text": ["gemini-2.0-flash", "gemini-1.5-pro"]
+               "vision": ["gemini-2.5-flash", "gemini-2.0-flash-lite", "gemini-2.0-flash"],
+               "text": ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-1.5-pro"]
            },
            "openai": {
                "vision": [],


@@ -27,6 +27,7 @@ class GeminiOpenAIVisionProvider(VisionModelProvider):
    @property
    def supported_models(self) -> List[str]:
        return [
+           "gemini-2.5-flash",
            "gemini-2.0-flash-lite",
            "gemini-2.0-flash",
            "gemini-1.5-pro",
@@ -137,6 +138,7 @@ class GeminiOpenAITextProvider(TextModelProvider):
    @property
    def supported_models(self) -> List[str]:
        return [
+           "gemini-2.5-flash",
            "gemini-2.0-flash-lite",
            "gemini-2.0-flash",
            "gemini-1.5-pro",


@@ -27,6 +27,7 @@ class GeminiVisionProvider(VisionModelProvider):
    @property
    def supported_models(self) -> List[str]:
        return [
+           "gemini-2.5-flash",
            "gemini-2.0-flash-lite",
            "gemini-2.0-flash",
            "gemini-1.5-pro",
@@ -136,25 +137,72 @@ class GeminiVisionProvider(VisionModelProvider):
        return base64.b64encode(img_bytes).decode('utf-8')

    async def _make_api_call(self, payload: Dict[str, Any]) -> Dict[str, Any]:
-       """Execute a native Gemini API call"""
+       """Execute a native Gemini API call, with retry logic"""
+       from app.config import config
+
        url = f"{self.base_url}/models/{self.model_name}:generateContent?key={self.api_key}"

-       response = await asyncio.to_thread(
-           requests.post,
-           url,
-           json=payload,
-           headers={
-               "Content-Type": "application/json",
-               "User-Agent": "NarratoAI/1.0"
-           },
-           timeout=120
-       )
-
-       if response.status_code != 200:
-           error = self._handle_api_error(response.status_code, response.text)
-           raise error
-
-       return response.json()
+       max_retries = config.app.get('llm_max_retries', 3)
+       base_timeout = config.app.get('llm_vision_timeout', 120)
+
+       for attempt in range(max_retries):
+           try:
+               # Scale the timeout with the attempt number
+               timeout = base_timeout * (attempt + 1)
+               logger.debug(f"Gemini API call attempt {attempt + 1}/{max_retries}, timeout: {timeout}s")
+
+               response = await asyncio.to_thread(
+                   requests.post,
+                   url,
+                   json=payload,
+                   headers={
+                       "Content-Type": "application/json",
+                       "User-Agent": "NarratoAI/1.0"
+                   },
+                   timeout=timeout
+               )
+
+               if response.status_code == 200:
+                   return response.json()
+
+               # Handle specific error status codes
+               if response.status_code == 429:
+                   # Rate limited: wait, then retry
+                   wait_time = 30 * (attempt + 1)
+                   logger.warning(f"Gemini API rate limited; retrying in {wait_time}s")
+                   await asyncio.sleep(wait_time)
+                   continue
+               elif response.status_code in [502, 503, 504, 524]:
+                   # Server error or timeout: retryable
+                   if attempt < max_retries - 1:
+                       wait_time = 10 * (attempt + 1)
+                       logger.warning(f"Gemini API server error {response.status_code}; retrying in {wait_time}s")
+                       await asyncio.sleep(wait_time)
+                       continue
+
+               # Any other error: raise immediately
+               error = self._handle_api_error(response.status_code, response.text)
+               raise error
+
+           except requests.exceptions.Timeout:
+               if attempt < max_retries - 1:
+                   wait_time = 15 * (attempt + 1)
+                   logger.warning(f"Gemini API request timed out; retrying in {wait_time}s")
+                   await asyncio.sleep(wait_time)
+                   continue
+               else:
+                   raise APICallError("Gemini API request timed out; maximum retries reached")
+           except requests.exceptions.RequestException as e:
+               if attempt < max_retries - 1:
+                   wait_time = 10 * (attempt + 1)
+                   logger.warning(f"Gemini API network error: {str(e)}; retrying in {wait_time}s")
+                   await asyncio.sleep(wait_time)
+                   continue
+               else:
+                   raise APICallError(f"Gemini API network error: {str(e)}")
+
+       # All retries failed
+       raise APICallError("Gemini API call failed; maximum retries reached")

    def _parse_vision_response(self, response_data: Dict[str, Any]) -> str:
        """Parse the vision-analysis response"""
@@ -192,6 +240,7 @@ class GeminiTextProvider(TextModelProvider):
    @property
    def supported_models(self) -> List[str]:
        return [
+           "gemini-2.5-flash",
            "gemini-2.0-flash-lite",
            "gemini-2.0-flash",
            "gemini-1.5-pro",
@@ -278,25 +327,72 @@ class GeminiTextProvider(TextModelProvider):
        return self._parse_text_response(response_data)

    async def _make_api_call(self, payload: Dict[str, Any]) -> Dict[str, Any]:
-       """Execute a native Gemini API call"""
+       """Execute a native Gemini API call, with retry logic"""
+       from app.config import config
+
        url = f"{self.base_url}/models/{self.model_name}:generateContent?key={self.api_key}"

-       response = await asyncio.to_thread(
-           requests.post,
-           url,
-           json=payload,
-           headers={
-               "Content-Type": "application/json",
-               "User-Agent": "NarratoAI/1.0"
-           },
-           timeout=120
-       )
-
-       if response.status_code != 200:
-           error = self._handle_api_error(response.status_code, response.text)
-           raise error
-
-       return response.json()
+       max_retries = config.app.get('llm_max_retries', 3)
+       base_timeout = config.app.get('llm_text_timeout', 180)  # text-generation tasks get a longer base timeout
+
+       for attempt in range(max_retries):
+           try:
+               # Scale the timeout with the attempt number
+               timeout = base_timeout * (attempt + 1)
+               logger.debug(f"Gemini text API call attempt {attempt + 1}/{max_retries}, timeout: {timeout}s")
+
+               response = await asyncio.to_thread(
+                   requests.post,
+                   url,
+                   json=payload,
+                   headers={
+                       "Content-Type": "application/json",
+                       "User-Agent": "NarratoAI/1.0"
+                   },
+                   timeout=timeout
+               )
+
+               if response.status_code == 200:
+                   return response.json()
+
+               # Handle specific error status codes
+               if response.status_code == 429:
+                   # Rate limited: wait, then retry
+                   wait_time = 30 * (attempt + 1)
+                   logger.warning(f"Gemini API rate limited; retrying in {wait_time}s")
+                   await asyncio.sleep(wait_time)
+                   continue
+               elif response.status_code in [502, 503, 504, 524]:
+                   # Server error or timeout: retryable
+                   if attempt < max_retries - 1:
+                       wait_time = 15 * (attempt + 1)
+                       logger.warning(f"Gemini API server error {response.status_code}; retrying in {wait_time}s")
+                       await asyncio.sleep(wait_time)
+                       continue
+
+               # Any other error: raise immediately
+               error = self._handle_api_error(response.status_code, response.text)
+               raise error
+
+           except requests.exceptions.Timeout:
+               if attempt < max_retries - 1:
+                   wait_time = 20 * (attempt + 1)
+                   logger.warning(f"Gemini text API request timed out; retrying in {wait_time}s")
+                   await asyncio.sleep(wait_time)
+                   continue
+               else:
+                   raise APICallError("Gemini text API request timed out; maximum retries reached")
+           except requests.exceptions.RequestException as e:
+               if attempt < max_retries - 1:
+                   wait_time = 15 * (attempt + 1)
+                   logger.warning(f"Gemini text API network error: {str(e)}; retrying in {wait_time}s")
+                   await asyncio.sleep(wait_time)
+                   continue
+               else:
+                   raise APICallError(f"Gemini text API network error: {str(e)}")
+
+       # All retries failed
+       raise APICallError("Gemini text API call failed; maximum retries reached")
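Both retry loops scale their waits and timeouts linearly with the attempt number; a sketch of the schedule under the default config values (llm_max_retries = 3, llm_text_timeout = 180):

# Illustration only, mirroring the arithmetic in the loop above.
max_retries, base_timeout = 3, 180
for attempt in range(max_retries):
    timeout = base_timeout * (attempt + 1)   # 180 s, 360 s, 540 s
    rate_limit_wait = 30 * (attempt + 1)     # 30 s, 60 s, 90 s on HTTP 429
    server_error_wait = 15 * (attempt + 1)   # 15 s, 30 s, 45 s on 502/503/504/524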
    def _parse_text_response(self, response_data: Dict[str, Any]) -> str:
        """Parse the text-generation response"""


@@ -15,13 +15,19 @@ from app.services import state as sm
from app.utils import utils


-def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
+def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict = None):
    """
-   Background task: automatically clip the video
+   Background task (unified video clipping) - optimized version
+
+   Implements the unified, OST-type-based clipping strategy and eliminates the double-clipping problem:
+   - OST=0: clip dynamically to the TTS audio duration; strip the original audio
+   - OST=1: clip exactly to the script timestamp; keep the original audio
+   - OST=2: clip dynamically to the TTS audio duration; keep the original audio

    Args:
        task_id: task ID
        params: video parameters
-       subclip_path_videos: paths of the video sub-clips
+       subclip_path_videos: paths of the video sub-clips (optional; used only as a fallback)
    """
    global merged_audio_path, merged_subtitle_path
@@ -94,17 +100,26 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
    # sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)

    """
-   3. Clip the video - trim clips that run longer than their audio
+   3. Unified video clipping - a differentiated clipping strategy based on the OST type
    """
-   logger.info("\n\n## 3. Clip the video")
-   video_clip_result = clip_video.clip_video(params.video_origin_path, tts_results)
+   logger.info("\n\n## 3. Unified video clipping (based on OST type)")
+
+   # Use the new unified clipping strategy
+   video_clip_result = clip_video.clip_video_unified(
+       video_origin_path=params.video_origin_path,
+       script_list=list_script,
+       tts_results=tts_results
+   )

-   # Update the timestamps in list_script
+   # Update the timestamps and path info in list_script
    tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
    subclip_clip_result = {
        tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
    }
    new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result)
+   logger.info(f"Unified clipping finished; processed {len(video_clip_result)} video clips")

    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)

    """
@@ -139,8 +154,27 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
    combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4")
    logger.info(f"\n\n## 5. Merge the videos: => {combined_video_path}")

-   # If an entry in new_script_list has no video, use the one from subclip_path_videos
-   video_clips = [new_script['video'] if new_script.get('video') else subclip_path_videos.get(new_script.get('_id', '')) for new_script in new_script_list]
+   # Use the video clips produced by the unified clipping step
+   video_clips = []
+   for new_script in new_script_list:
+       video_path = new_script.get('video')
+       if video_path and os.path.exists(video_path):
+           video_clips.append(video_path)
+       else:
+           logger.warning(f"Video file for segment {new_script.get('_id')} is missing or was not generated: {video_path}")
+           # If unified clipping failed, try the fallback (when subclip_path_videos was provided)
+           if subclip_path_videos and new_script.get('_id') in subclip_path_videos:
+               backup_video = subclip_path_videos[new_script.get('_id')]
+               if os.path.exists(backup_video):
+                   video_clips.append(backup_video)
+                   logger.info(f"Using the fallback video: {backup_video}")
+               else:
+                   logger.error(f"The fallback video does not exist either: {backup_video}")
+           else:
+               logger.error(f"Could not find a video file for segment {new_script.get('_id')}")
+
+   logger.info(f"Preparing to merge {len(video_clips)} video clips")

    merger_video.combine_clip_videos(
        output_video_path=combined_video_path,
@@ -208,6 +242,199 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
    return kwargs
def start_subclip_unified(task_id: str, params: VideoClipParams):
    """
    Unified video clipping entry point - a new implementation based entirely on OST types

    This optimized version removes the dependency on pre-clipped videos
    and implements the truly unified clipping strategy.

    Args:
        task_id: task ID
        params: video parameters
    """
    global merged_audio_path, merged_subtitle_path

    logger.info(f"\n\n## Starting unified video processing task: {task_id}")
    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0)

    """
    1. Load the clipping script
    """
    logger.info("\n\n## 1. Load the video script")
    video_script_path = path.join(params.video_clip_json_path)

    if path.exists(video_script_path):
        try:
            with open(video_script_path, "r", encoding="utf-8") as f:
                list_script = json.load(f)
                video_list = [i['narration'] for i in list_script]
                video_ost = [i['OST'] for i in list_script]
                time_list = [i['timestamp'] for i in list_script]

                video_script = " ".join(video_list)
                logger.debug(f"Full narration script: \n{video_script}")
                logger.debug(f"Narration OST list: \n{video_ost}")
                logger.debug(f"Narration timestamp list: \n{time_list}")
        except Exception as e:
            logger.error(f"Could not read the video JSON script; please check that its format is correct")
            raise ValueError("Could not read the video JSON script; please check that its format is correct")
    else:
        logger.error(f"video_script_path: {video_script_path}")
        raise ValueError("The narration script does not exist! Please check the configuration.")

    """
    2. Generate the audio material with TTS
    """
    logger.info("\n\n## 2. Generate the audio list according to the OST settings")
    # Generate audio only for OST=0 or 2 segments (OST=0 keeps only the narration; OST=2 keeps narration and original audio)
    tts_segments = [
        segment for segment in list_script
        if segment['OST'] in [0, 2]
    ]
    logger.debug(f"Number of segments that need TTS: {len(tts_segments)}")
    tts_results = voice.tts_multiple(
        task_id=task_id,
        list_script=tts_segments,  # pass in only the segments that need TTS
        voice_name=params.voice_name,
        voice_rate=params.voice_rate,
        voice_pitch=params.voice_pitch,
    )
    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)

    """
    3. Unified video clipping - a differentiated clipping strategy based on the OST type
    """
    logger.info("\n\n## 3. Unified video clipping (based on OST type)")

    # Use the new unified clipping strategy
    video_clip_result = clip_video.clip_video_unified(
        video_origin_path=params.video_origin_path,
        script_list=list_script,
        tts_results=tts_results
    )

    # Update the timestamps and path info in list_script
    tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
    subclip_clip_result = {
        tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
    }
    new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result)
    logger.info(f"Unified clipping finished; processed {len(video_clip_result)} video clips")

    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)

    """
    4. Merge the audio and subtitles
    """
    logger.info("\n\n## 4. Merge the audio and subtitles")
    total_duration = sum([script["duration"] for script in new_script_list])
    if tts_segments:
        try:
            # Merge the audio files
            merged_audio_path = audio_merger.merge_audio_files(
                task_id=task_id,
                total_duration=total_duration,
                list_script=new_script_list
            )
            logger.info(f"Audio files merged successfully -> {merged_audio_path}")

            # Merge the subtitle files
            merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
            logger.info(f"Subtitle files merged successfully -> {merged_subtitle_path}")
        except Exception as e:
            logger.error(f"Failed to merge the audio files: {str(e)}")
    else:
        logger.warning("No audio/subtitles to merge")
        merged_audio_path = ""
        merged_subtitle_path = ""

    """
    5. Merge the videos
    """
    final_video_paths = []
    combined_video_paths = []
    combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4")
    logger.info(f"\n\n## 5. Merge the videos: => {combined_video_path}")

    # Use the video clips produced by the unified clipping step
    video_clips = []
    for new_script in new_script_list:
        video_path = new_script.get('video')
        if video_path and os.path.exists(video_path):
            video_clips.append(video_path)
        else:
            logger.error(f"Video file for segment {new_script.get('_id')} does not exist: {video_path}")

    logger.info(f"Preparing to merge {len(video_clips)} video clips")

    merger_video.combine_clip_videos(
        output_video_path=combined_video_path,
        video_paths=video_clips,
        video_ost_list=video_ost,
        video_aspect=params.video_aspect,
        threads=params.n_threads
    )

    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80)

    """
    6. Merge the subtitles/BGM/voice-over/video
    """
    output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
    logger.info(f"\n\n## 6. Final step: merge subtitles/BGM/voice-over/video -> {output_video_path}")

    bgm_path = utils.get_bgm_file()

    # Get the optimized volume settings
    optimized_volumes = get_recommended_volumes_for_content('mixed')

    # Combine the user's settings with the recommended values
    final_tts_volume = params.tts_volume if hasattr(params, 'tts_volume') and params.tts_volume != 1.0 else optimized_volumes['tts_volume']
    final_original_volume = params.original_volume if hasattr(params, 'original_volume') and params.original_volume != 0.7 else optimized_volumes['original_volume']
    final_bgm_volume = params.bgm_volume if hasattr(params, 'bgm_volume') and params.bgm_volume != 0.3 else optimized_volumes['bgm_volume']

    logger.info(f"Volume settings - TTS: {final_tts_volume}, original: {final_original_volume}, BGM: {final_bgm_volume}")

    # Example invocation
    options = {
        'voice_volume': final_tts_volume,
        'bgm_volume': final_bgm_volume,
        'original_audio_volume': final_original_volume,
        'keep_original_audio': True,
        'subtitle_enabled': params.subtitle_enabled,
        'subtitle_font': params.font_name,
        'subtitle_font_size': params.font_size,
        'subtitle_color': params.text_fore_color,
        'subtitle_bg_color': None,
        'subtitle_position': params.subtitle_position,
        'custom_position': params.custom_position,
        'threads': params.n_threads
    }
    generate_video.merge_materials(
        video_path=combined_video_path,
        audio_path=merged_audio_path,
        subtitle_path=merged_subtitle_path,
        bgm_path=bgm_path,
        output_path=output_video_path,
        options=options
    )
    final_video_paths.append(output_video_path)
    combined_video_paths.append(combined_video_path)

    logger.success(f"Unified processing task {task_id} finished; generated {len(final_video_paths)} video(s).")

    kwargs = {
        "videos": final_video_paths,
        "combined_videos": combined_video_paths
    }
    sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs)
    return kwargs
def validate_params(video_path, audio_path, output_file, params):
    """
    Validate the input parameters


@@ -4,19 +4,42 @@ import json
import traceback
import edge_tts
import asyncio
+import requests
from loguru import logger
-from typing import List, Union
+from typing import List, Union, Tuple
from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts import submaker, SubMaker
-from edge_tts.submaker import mktimestamp
+# from edge_tts.submaker import mktimestamp  # the function may not exist, so we implement it ourselves
from moviepy.video.tools import subtitles
+
+try:
+    from moviepy import AudioFileClip
+    MOVIEPY_AVAILABLE = True
+except ImportError:
+    MOVIEPY_AVAILABLE = False
+    logger.warning("moviepy is not installed; audio durations will be estimated instead")
+
import time

from app.config import config
from app.utils import utils


def mktimestamp(time_seconds: float) -> str:
    """
    Convert a number of seconds to an SRT timestamp string

    Args:
        time_seconds: time in seconds

    Returns:
        str: the timestamp in SRT format, e.g. "00:01:23.456"
    """
    hours = int(time_seconds // 3600)
    minutes = int((time_seconds % 3600) // 60)
    seconds = time_seconds % 60
    return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}"
def get_all_azure_voices(filter_locals=None) -> list[str]:
    if filter_locals is None:
        filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW", "vi-VN"]
@@ -1035,11 +1058,39 @@ def is_azure_v2_voice(voice_name: str):
    return ""


def should_use_azure_speech_services(voice_name: str) -> bool:
    """Decide whether a voice name should use Azure Speech Services"""
    if not voice_name or is_soulvoice_voice(voice_name):
        return False

    voice_name = voice_name.strip()

    # A -V2 suffix always means Azure Speech Services
    if voice_name.endswith("-V2"):
        return True

    # Check for the official Azure voice format (e.g. zh-CN-YunzeNeural)
    # Azure voice names usually follow the pattern: [language]-[region]-[name]Neural
    import re
    pattern = r'^[a-z]{2}-[A-Z]{2}-\w+Neural$'
    if re.match(pattern, voice_name):
        return True

    return False
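Expected routing for a few sample voice names (illustrative calls):

should_use_azure_speech_services("zh-CN-YunzeNeural")     # True  (matches the official Azure pattern)
should_use_azure_speech_services("zh-CN-YunzeNeural-V2")  # True  (-V2 suffix)
should_use_azure_speech_services("soulvoice:speech:abc")  # False (SoulVoice prefix)
should_use_azure_speech_services("")                      # False (empty name)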
def tts(
    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> Union[SubMaker, None]:
-   if is_azure_v2_voice(voice_name):
+   # Check for the SoulVoice engine
+   if is_soulvoice_voice(voice_name):
+       return soulvoice_tts(text, voice_name, voice_file, speed=voice_rate)
+
+   # Check whether Azure Speech Services should be used
+   if should_use_azure_speech_services(voice_name):
        return azure_tts_v2(text, voice_name, voice_file)
+
+   # Default: Edge TTS (Azure V1)
    return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
@@ -1110,12 +1161,22 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
-   voice_name = is_azure_v2_voice(voice_name)
-   if not voice_name:
-       logger.error(f"invalid voice name: {voice_name}")
-       raise ValueError(f"invalid voice name: {voice_name}")
+   # Use the official voice name directly; no -V2 suffix validation needed
+   # Azure Speech Services voice names look like: zh-CN-YunzeNeural, en-US-AvaMultilingualNeural
+   processed_voice_name = voice_name.strip()
+   if not processed_voice_name:
+       logger.error(f"invalid voice name: {voice_name} (empty)")
+       raise ValueError(f"invalid voice name: {voice_name} (empty)")
    text = text.strip()
+
+   # Check that the Azure Speech SDK is available
+   try:
+       import azure.cognitiveservices.speech as speechsdk
+   except ImportError as e:
+       logger.error("The Azure Speech SDK is not installed. Run: pip install azure-cognitiveservices-speech")
+       logger.error("Or use the Edge TTS engine as an alternative")
+       return None

    def _format_duration_to_offset(duration) -> int:
        if isinstance(duration, str):
            time_obj = datetime.strptime(duration, "%H:%M:%S.%f")
@@ -1134,9 +1195,7 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
    for i in range(3):
        try:
-           logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
-
-           import azure.cognitiveservices.speech as speechsdk
+           logger.info(f"start, voice name: {processed_voice_name}, try: {i + 1}")

            sub_maker = SubMaker()
@@ -1155,7 +1214,7 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
            speech_config = speechsdk.SpeechConfig(
                subscription=speech_key, region=service_region
            )
-           speech_config.speech_synthesis_voice_name = voice_name
+           speech_config.speech_synthesis_voice_name = processed_voice_name
            # speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary,
            #                            value='true')
            speech_config.set_property(
@@ -1368,6 +1427,10 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
            if start_time < 0:
                start_time = _start_time

+           # Convert from 100-nanosecond units to seconds
+           start_time_seconds = start_time / 10000000
+           end_time_seconds = end_time / 10000000
+
            sub = unescape(sub)
            sub_line += sub
            sub_text = match_line(sub_line, sub_index)
@@ -1375,8 +1438,8 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
                sub_index += 1
                line = formatter(
                    idx=sub_index,
-                   start_time=start_time,
-                   end_time=end_time,
+                   start_time=start_time_seconds,
+                   end_time=end_time_seconds,
                    sub_text=sub_text,
                )
                sub_items.append(line)
@@ -1402,9 +1465,13 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
            f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}"
            f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}"
        )
+       # Return defaults to avoid None errors downstream
+       return subtitle_file, 3.0
    except Exception as e:
        logger.error(f"failed, error: {str(e)}")
+       # Return defaults to avoid None errors downstream
+       return subtitle_file, 3.0
def get_audio_duration(sub_maker: submaker.SubMaker): def get_audio_duration(sub_maker: submaker.SubMaker):
@@ -1453,8 +1520,21 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
                           f"or use another TTS engine")
                continue
            else:
-               # Generate the subtitle file for the current segment
-               _, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
+               # The SoulVoice engine does not generate subtitle files
+               if is_soulvoice_voice(voice_name):
+                   # Read the duration from the actual audio file
+                   duration = get_audio_duration_from_file(audio_file)
+                   if duration <= 0:
+                       # If the file duration is unavailable, try the SubMaker
+                       duration = get_audio_duration(sub_maker)
+                   if duration <= 0:
+                       # Last-resort fallback: estimate from the text length
+                       duration = max(1.0, len(text) / 3.0)
+                       logger.warning(f"Could not determine the audio duration; estimating from the text: {duration:.2f}s")
+                   # Do not create a subtitle file
+                   subtitle_file = ""
+               else:
+                   _, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)

            tts_results.append({
                "_id": item['_id'],
@@ -1467,3 +1547,168 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
        logger.info(f"Generated audio file: {audio_file}")
    return tts_results
def get_audio_duration_from_file(audio_file: str) -> float:
    """
    Get the duration of an audio file
    """
    if MOVIEPY_AVAILABLE:
        try:
            audio_clip = AudioFileClip(audio_file)
            duration = audio_clip.duration
            audio_clip.close()
            return duration
        except Exception as e:
            logger.error(f"Failed to read the audio duration with moviepy: {str(e)}")

    # Fallback: use a rough size-based estimate
    try:
        import os
        file_size = os.path.getsize(audio_file)
        # Rough MP3 duration estimate:
        # assume an average MP3 bitrate of 128 kbps = 16 KB/s,
        # but files also carry header data, so use an adjusted factor
        estimated_duration = max(1.0, file_size / 20000)  # deliberately conservative
        # For Chinese speech, a second correction by text length would apply
        # (typical speaking rate is about 3-4 characters per second)
        logger.warning(f"Estimating the audio duration from the file size: {estimated_duration:.2f}s")
        return estimated_duration
    except Exception as e:
        logger.error(f"Failed to get the audio duration: {str(e)}")
        # If every method fails, return a fixed default
        return 3.0  # default of 3 seconds; avoids returning 0
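Under the size-based fallback, for example, a hypothetical 200 KB MP3 is estimated at 200000 / 20000 = 10 s:

file_size = 200_000                      # bytes, assumed for illustration
estimated = max(1.0, file_size / 20000)  # -> 10.0 s under the ~20 KB/s assumption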
def is_soulvoice_voice(voice_name: str) -> bool:
    """
    Check whether a voice name refers to a SoulVoice voice
    """
    return voice_name.startswith("soulvoice:") or voice_name.startswith("speech:")


def parse_soulvoice_voice(voice_name: str) -> str:
    """
    Parse a SoulVoice voice name

    Supported formats:
    - soulvoice:speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
    - speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
    """
    if voice_name.startswith("soulvoice:"):
        return voice_name[10:]  # strip the "soulvoice:" prefix
    return voice_name


def soulvoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Convert text to speech with the SoulVoice API

    Args:
        text: the text to convert
        voice_name: voice name
        voice_file: output audio file path
        speed: speech speed

    Returns:
        SubMaker: a subtitle maker with timing info; None on failure
    """
    # Read the configuration
    api_key = config.soulvoice.get("api_key", "")
    api_url = config.soulvoice.get("api_url", "https://tts.scsmtech.cn/tts")
    default_model = config.soulvoice.get("model", "FunAudioLLM/CosyVoice2-0.5B")

    if not api_key:
        logger.error("The SoulVoice API key is not configured")
        return None

    # Parse the voice name
    parsed_voice = parse_soulvoice_voice(voice_name)

    # Prepare the request
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }

    data = {
        'text': text.strip(),
        'model': default_model,
        'voice': parsed_voice,
        'speed': speed
    }

    # Retry loop
    for attempt in range(3):
        try:
            logger.info(f"Calling the SoulVoice API, attempt {attempt + 1}")

            # Configure the proxy
            proxies = {}
            if config.proxy.get("http"):
                proxies = {
                    'http': config.proxy.get("http"),
                    'https': config.proxy.get("https", config.proxy.get("http"))
                }

            # Call the API
            response = requests.post(
                api_url,
                headers=headers,
                json=data,
                proxies=proxies,
                timeout=60
            )

            if response.status_code == 200:
                # Save the audio file
                with open(voice_file, 'wb') as f:
                    f.write(response.content)

                logger.info(f"SoulVoice TTS generated the audio successfully: {voice_file}")

                # SoulVoice does not support precise subtitle timing; return a minimal SubMaker
                sub_maker = SubMaker()
                sub_maker.subs = [text]  # the whole text as a single segment
                sub_maker.offset = [(0, 0)]  # placeholder timing
                return sub_maker
            else:
                logger.error(f"SoulVoice API call failed: {response.status_code} - {response.text}")

        except requests.exceptions.Timeout:
            logger.error(f"SoulVoice API call timed out (attempt {attempt + 1}/3)")
        except requests.exceptions.RequestException as e:
            logger.error(f"SoulVoice API network error: {str(e)} (attempt {attempt + 1}/3)")
        except Exception as e:
            logger.error(f"SoulVoice TTS processing error: {str(e)} (attempt {attempt + 1}/3)")

        if attempt < 2:  # not the last attempt
            time.sleep(2)  # wait 2 seconds before retrying

    logger.error("SoulVoice TTS generation failed; maximum retries reached")
    return None
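A hypothetical end-to-end call (requires soulvoice.api_key in config.toml); note that parse_soulvoice_voice strips the "soulvoice:" prefix, so the API receives the bare "speech:..." URI:

voice = "soulvoice:speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
sub_maker = soulvoice_tts("Hello, world", voice, "storage/temp/hello.mp3", speed=1.0)
if sub_maker is None:
    print("generation failed after 3 attempts")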


@@ -1,4 +1,5 @@
import json
+import re
from typing import Dict, Any


def check_format(script_content: str) -> Dict[str, Any]:
@@ -6,76 +7,104 @@ def check_format(script_content: str) -> Dict[str, Any]:
    Args:
        script_content: the script content

    Returns:
-       Dict: {'success': bool, 'message': str}
+       Dict: {'success': bool, 'message': str, 'details': str}
    """
    try:
        # Check that the content is valid JSON
        data = json.loads(script_content)

        # Check that it is a list
        if not isinstance(data, list):
            return {
                'success': False,
-               'message': 'The script must be a JSON array'
+               'message': 'The script must be a JSON array',
+               'details': 'The correct format is: [{"_id": 1, "timestamp": "...", ...}, ...]'
            }

+       # The array must not be empty
+       if len(data) == 0:
+           return {
+               'success': False,
+               'message': 'The script array must not be empty',
+               'details': 'At least one script segment is required'
+           }
+
        # Check each segment
        for i, clip in enumerate(data):
+           # The element must be an object
+           if not isinstance(clip, dict):
+               return {
+                   'success': False,
+                   'message': f'Element {i+1} must be an object',
+                   'details': f'Current type: {type(clip).__name__}'
+               }
+
            # Check required fields
-           required_fields = ['narration', 'picture', 'timestamp']
+           required_fields = ['_id', 'timestamp', 'picture', 'narration', 'OST']
            for field in required_fields:
                if field not in clip:
                    return {
                        'success': False,
-                       'message': f'Segment {i+1} is missing required field: {field}'
+                       'message': f'Segment {i+1} is missing required field: {field}',
+                       'details': f'Required fields: {", ".join(required_fields)}'
                    }

-           # Check field types
-           if not isinstance(clip['narration'], str):
-               return {
-                   'success': False,
-                   'message': f'narration in segment {i+1} must be a string'
-               }
-           if not isinstance(clip['picture'], str):
-               return {
-                   'success': False,
-                   'message': f'picture in segment {i+1} must be a string'
-               }
-           if not isinstance(clip['timestamp'], str):
-               return {
-                   'success': False,
-                   'message': f'timestamp in segment {i+1} must be a string'
-               }
-           # Check that the fields are not empty
-           if not clip['narration'].strip():
-               return {
-                   'success': False,
-                   'message': f'narration in segment {i+1} must not be empty'
-               }
-           if not clip['picture'].strip():
-               return {
-                   'success': False,
-                   'message': f'picture in segment {i+1} must not be empty'
-               }
-           if not clip['timestamp'].strip():
-               return {
-                   'success': False,
-                   'message': f'timestamp in segment {i+1} must not be empty'
-               }
+           # Validate the _id field
+           if not isinstance(clip['_id'], int) or clip['_id'] <= 0:
+               return {
+                   'success': False,
+                   'message': f'_id in segment {i+1} must be a positive integer',
+                   'details': f'Current value: {clip["_id"]} (type: {type(clip["_id"]).__name__})'
+               }
+
+           # Validate the timestamp field format
+           timestamp_pattern = r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$'
+           if not isinstance(clip['timestamp'], str) or not re.match(timestamp_pattern, clip['timestamp']):
+               return {
+                   'success': False,
+                   'message': f'timestamp in segment {i+1} has an invalid format',
+                   'details': f'Correct format: "HH:MM:SS,mmm-HH:MM:SS,mmm", e.g. "00:00:00,600-00:00:07,559"'
+               }
+
+           # Validate the picture field
+           if not isinstance(clip['picture'], str) or not clip['picture'].strip():
+               return {
+                   'success': False,
+                   'message': f'picture in segment {i+1} must be a non-empty string',
+                   'details': f'Current value: {clip.get("picture", "undefined")}'
+               }
+
+           # Validate the narration field
+           if not isinstance(clip['narration'], str) or not clip['narration'].strip():
+               return {
+                   'success': False,
+                   'message': f'narration in segment {i+1} must be a non-empty string',
+                   'details': f'Current value: {clip.get("narration", "undefined")}'
+               }
+
+           # Validate the OST field
+           if not isinstance(clip['OST'], int):
+               return {
+                   'success': False,
+                   'message': f'OST in segment {i+1} must be an integer',
+                   'details': f'Current value: {clip["OST"]} (type: {type(clip["OST"]).__name__}); common values: 0, 1, 2'
+               }

        return {
            'success': True,
-           'message': 'Script format check passed'
+           'message': 'Script format check passed',
+           'details': f'Validated {len(data)} script segments; format is correct'
        }

    except json.JSONDecodeError as e:
        return {
            'success': False,
-           'message': f'JSON format error: {str(e)}'
+           'message': f'JSON format error: {str(e)}',
+           'details': 'Check the JSON syntax; make sure all brackets, quotes, and commas are correct'
        }
    except Exception as e:
        return {
            'success': False,
-           'message': f'An error occurred during the check: {str(e)}'
+           'message': f'An error occurred during the check: {str(e)}',
+           'details': 'Please contact technical support'
        }
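A minimal script that passes every check above (all five required fields present, a well-formed timestamp, and an integer OST):

sample = json.dumps([{
    "_id": 1,
    "timestamp": "00:00:00,600-00:00:07,559",
    "picture": "A wide shot of the harbor at dawn",
    "narration": "Our story begins by the sea.",
    "OST": 0
}])
result = check_format(sample)
# -> {'success': True, 'message': 'Script format check passed', 'details': 'Validated 1 script segments; format is correct'}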


@@ -509,6 +509,12 @@ def clean_model_output(output):

def cut_video(params, progress_callback=None):
+   """
+   Legacy video clipping function - deprecated
+
+   Note: this function has been superseded by the unified clipping strategy and is no longer recommended.
+   For the new implementation, use task.start_subclip_unified().
+   """
    try:
        task_id = str(uuid4())
        st.session_state['task_id'] = task_id


@@ -1,17 +0,0 @@
from git_changelog.cli import build_and_render

# Run this script to generate the CHANGELOG.md file automatically
build_and_render(
    repository=".",
    output="CHANGELOG.md",
    convention="angular",
    provider="github",
    template="keepachangelog",
    parse_trailers=True,
    parse_refs=False,
    sections=["build", "deps", "feat", "fix", "refactor"],
    versioning="pep440",
    bump="1.1.2",  # pin the bumped version
    in_place=True,
)



@@ -1,5 +1,19 @@
[app]
-project_version="0.6.8"
+project_version="0.7.0"
+
+# Model validation mode
+# true: strict mode; only models on the predefined support list are allowed (default)
+# false: lenient mode; any model name is allowed, and only a warning is logged
+strict_model_validation = true
+
+# LLM API timeouts (seconds)
+# Base timeout for vision models
+llm_vision_timeout = 120
+# Base timeout for text models (complex tasks such as narration generation need longer)
+llm_text_timeout = 180
+# Number of API retries
+llm_max_retries = 3

# LLM providers that support video understanding
# gemini (Google; requires a VPN)
# siliconflow (SiliconFlow)
@@ -77,6 +91,37 @@
# Whether to show the configuration items in the WebUI
hide_config = true

+[azure]
+# Azure TTS configuration
+speech_key = ""
+speech_region = ""
+
+[soulvoice]
+# SoulVoice TTS API key
+api_key = ""
+# Voice URI (required)
+voice_uri = "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
+# API endpoint (optional; the default is shown below)
+api_url = "https://tts.scsmtech.cn/tts"
+# Default model (optional)
+model = "FunAudioLLM/CosyVoice2-0.5B"
+
+[ui]
+# TTS engine selection (edge_tts, azure_speech, soulvoice)
+tts_engine = "edge_tts"
+
+# Edge TTS configuration
+edge_voice_name = "zh-CN-XiaoyiNeural-Female"
+edge_volume = 80
+edge_rate = 1.0
+edge_pitch = 0
+
+# Azure Speech Services configuration
+azure_voice_name = "zh-CN-XiaoyiNeural-Female"
+azure_volume = 80
+azure_rate = 1.0
+azure_pitch = 0

[proxy]
# clash default address: http://127.0.0.1:7890
http = ""

main.py

@@ -1,19 +0,0 @@
import os
import uvicorn
from loguru import logger
from app.config import config
if __name__ == "__main__":
logger.info(
"start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs"
)
os.environ["HTTP_PROXY"] = config.proxy.get("http")
os.environ["HTTPS_PROXY"] = config.proxy.get("https")
uvicorn.run(
app="app.asgi:app",
host=config.listen_host,
port=config.listen_port,
reload=config.reload_debug,
log_level="warning",
)


@@ -1 +1 @@
-0.6.8
+0.7.0


@@ -1,17 +0,0 @@
# Release Notes

## Latest Changes

* docs(README): update README. PR [#138](https://github.com/linyqh/NarratoAI/pull/138) by [@linyqh](https://github.com/linyqh).
* Dev 0.6.0. PR [#137](https://github.com/linyqh/NarratoAI/pull/137) by [@linyqh](https://github.com/linyqh).
* Dev 0.6.0. PR [#134](https://github.com/linyqh/NarratoAI/pull/134) by [@linyqh](https://github.com/linyqh).
* Dev-0.3.9. PR [#73](https://github.com/linyqh/NarratoAI/pull/73) by [@linyqh](https://github.com/linyqh).
* 0.3.9 release. PR [#71](https://github.com/linyqh/NarratoAI/pull/71) by [@linyqh](https://github.com/linyqh).
* docs: add Japanese README. PR [#66](https://github.com/linyqh/NarratoAI/pull/66) by [@eltociear](https://github.com/eltociear).
* docs: test release 2. PR [#62](https://github.com/linyqh/NarratoAI/pull/62) by [@linyqh](https://github.com/linyqh).
* docs: test release. PR [#61](https://github.com/linyqh/NarratoAI/pull/61) by [@linyqh](https://github.com/linyqh).
* docs: test commit. PR [#60](https://github.com/linyqh/NarratoAI/pull/60) by [@linyqh](https://github.com/linyqh).
* Dev. PR [#59](https://github.com/linyqh/NarratoAI/pull/59) by [@linyqh](https://github.com/linyqh).
* 0.2.0 pre-release. PR [#37](https://github.com/linyqh/NarratoAI/pull/37) by [@linyqh](https://github.com/linyqh).
* v0.3.6. PR [#58](https://github.com/linyqh/NarratoAI/pull/58) by [@linyqh](https://github.com/linyqh).
* 0.3.4: assorted bug fixes. PR [#49](https://github.com/linyqh/NarratoAI/pull/49) by [@linyqh](https://github.com/linyqh).


@@ -11,6 +11,7 @@ pysrt==1.1.2
openai~=1.77.0
google-generativeai>=0.8.5
+azure-cognitiveservices-speech~=1.37.0

# Items pending optimization
# opencv-python==4.11.0.86
@@ -29,7 +30,6 @@ google-generativeai>=0.8.5
# python-multipart~=0.0.9
# redis==5.0.3
# opencv-python~=4.10.0.84
-# azure-cognitiveservices-speech~=1.37.0
# git-changelog~=2.5.2
# watchdog==5.0.2
# pydub==0.25.1


@@ -1,88 +0,0 @@
@echo off
:: Set the console code page to UTF-8 to fix Chinese character display
chcp 65001 >nul
:: Turn off command echoing so the script output stays tidy

:: Store the directory containing this script in a variable
set "CURRENT_DIR=%~dp0"
echo ***** Current working directory: %CURRENT_DIR% *****

:: ==================== FFmpeg configuration ====================
:: Set the full path to the FFmpeg executable
set "FFMPEG_BINARY=%CURRENT_DIR%lib\ffmpeg\ffmpeg-7.0-essentials_build\ffmpeg.exe"
set "FFMPEG_PATH=%CURRENT_DIR%lib\ffmpeg\ffmpeg-7.0-essentials_build"
echo ***** FFmpeg executable path: %FFMPEG_BINARY% *****

:: Add the FFmpeg directory to PATH so it can be invoked directly from the command line
set "PATH=%FFMPEG_PATH%;%PATH%"

:: ==================== ImageMagick configuration ====================
:: Set the full path to the ImageMagick executable (used for image processing)
set "IMAGEMAGICK_BINARY=%CURRENT_DIR%lib\imagemagic\ImageMagick-7.1.1-29-portable-Q16-x64\magick.exe"
set "IMAGEMAGICK_PATH=%CURRENT_DIR%lib\imagemagic\ImageMagick-7.1.1-29-portable-Q16-x64"
echo ***** ImageMagick executable path: %IMAGEMAGICK_BINARY% *****

:: Add the ImageMagick directory to PATH
set "PATH=%IMAGEMAGICK_PATH%;%PATH%"

:: ==================== Python environment configuration ====================
:: Set the Python module search path so the project modules import correctly
set "PYTHONPATH=%CURRENT_DIR%NarratoAI;%PYTHONPATH%"
echo ***** Python module search path: %PYTHONPATH% *****

:: ==================== Project-specific environment variables ====================
:: Set the project root and tool paths for internal use by the application
set "NARRATO_ROOT=%CURRENT_DIR%NarratoAI"
set "NARRATO_FFMPEG=%FFMPEG_BINARY%"
set "NARRATO_IMAGEMAGICK=%IMAGEMAGICK_BINARY%"

:: ==================== Streamlit configuration ====================
:: Set the config file path for Streamlit (the Python web app framework)
set "USER_HOME=%USERPROFILE%"
set "STREAMLIT_DIR=%USER_HOME%\.streamlit"
set "CREDENTIAL_FILE=%STREAMLIT_DIR%\credentials.toml"
echo ***** Streamlit credentials file path: %CREDENTIAL_FILE% *****

:: Create the Streamlit config directory and credentials file if they do not exist
if not exist "%STREAMLIT_DIR%" (
    echo Creating the Streamlit config directory...
    mkdir "%STREAMLIT_DIR%"
    (
        echo [general]
        echo email=""
    ) > "%CREDENTIAL_FILE%"
    echo Streamlit config file created!
)

:: ==================== Dependency checks ====================
:: Verify that the required external tools exist so the app can run
if not exist "%FFMPEG_BINARY%" (
    echo Error: FFmpeg executable not found at: %FFMPEG_BINARY%
    echo Please make sure FFmpeg is installed correctly or check the path configuration
    pause
    exit /b 1
)

if not exist "%IMAGEMAGICK_BINARY%" (
    echo Error: ImageMagick executable not found at: %IMAGEMAGICK_BINARY%
    echo Please make sure ImageMagick is installed correctly or check the path configuration
    pause
    exit /b 1
)

:: ==================== Launch the application ====================
:: Switch to the project directory and start the app
echo ***** Switching working directory to: %CURRENT_DIR%NarratoAI *****
cd /d "%CURRENT_DIR%NarratoAI"

echo ***** Starting the NarratoAI application... *****
:: Start the Streamlit app with the bundled Python interpreter
"%CURRENT_DIR%lib\python\python.exe" -m streamlit run webui.py --browser.serverAddress="127.0.0.1" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False

:: Parameter notes:
:: --browser.serverAddress="127.0.0.1" - bind the server to the local address
:: --server.enableCORS=True - enable cross-origin resource sharing
:: --server.maxUploadSize=2048 - set the maximum upload size to 2048 MB
:: --browser.gatherUsageStats=False - disable usage-statistics collection

:: Pause after the app closes so the user can see the final output
pause


@@ -1,112 +0,0 @@
@echo off
chcp 65001 >nul
setlocal EnableDelayedExpansion

set "CURRENT_DIR=%~dp0"
echo ***** Current directory: %CURRENT_DIR% *****

REM Clear environment variables that might interfere
set PYTHONPATH=
set PYTHONHOME=

REM Initialize the proxy settings to empty
set "HTTP_PROXY="
set "HTTPS_PROXY="

:git_pull
echo Updating the code, please wait...
REM Pull the latest code with git and check whether it succeeded
"%CURRENT_DIR%lib\git\bin\git.exe" -C "%CURRENT_DIR%NarratoAI" pull > "%TEMP%\git_output.txt" 2>&1
set GIT_EXIT_CODE=%ERRORLEVEL%

if %GIT_EXIT_CODE% NEQ 0 (
    echo [Error] Code update failed! Error code: %GIT_EXIT_CODE%
    type "%TEMP%\git_output.txt"

    findstr /C:"error: 403" /C:"fatal: unable to access" /C:"The requested URL returned error: 403" "%TEMP%\git_output.txt" >nul
    if !ERRORLEVEL! EQU 0 (
        echo.
        echo [Hint] A GitHub 403 error was detected; this is probably a network issue.

        if not defined HTTP_PROXY (
            echo.
            echo Please enter a proxy address (e.g. http://127.0.0.1:7890), or press Enter to skip:
            set /p PROXY_INPUT="> "

            if not "!PROXY_INPUT!"=="" (
                set "HTTP_PROXY=!PROXY_INPUT!"
                set "HTTPS_PROXY=!PROXY_INPUT!"
                echo.
                echo [Info] Proxy set to: !PROXY_INPUT!
                echo Retrying with the proxy...
                goto git_pull
            ) else (
                echo.
                echo [Warning] No proxy was set; suggestions:
                echo    - configure a system proxy manually
                echo    - use a VPN or another network tool
                echo    - rerun this script and enter a proxy address
            )
        ) else (
            echo.
            echo [Warning] Still failing with proxy !HTTP_PROXY!.
            echo You can:
            echo    1. enter a new proxy address (or press Enter to keep the current proxy: !HTTP_PROXY!)
            echo    2. type "clear" to remove the proxy settings
            set /p PROXY_INPUT="> "

            if "!PROXY_INPUT!"=="clear" (
                set "HTTP_PROXY="
                set "HTTPS_PROXY="
                echo [Info] Proxy settings cleared
                goto end
            ) else if not "!PROXY_INPUT!"=="" (
                set "HTTP_PROXY=!PROXY_INPUT!"
                set "HTTPS_PROXY=!PROXY_INPUT!"
                echo [Info] Proxy updated to: !PROXY_INPUT!
                echo Retrying with the new proxy...
                goto git_pull
            ) else (
                echo [Info] Keeping the current proxy: !HTTP_PROXY!
                echo You can try again later or resolve the network issue manually
            )
        )
    ) else (
        echo.
        echo [Warning] A different error occurred; check the output above for details.
    )
    goto end
) else (
    echo [Success] Code updated successfully!
)

echo Updating pip, please wait...
"%CURRENT_DIR%lib\python\python.exe" -m pip install --upgrade pip >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo [Warning] pip update failed; continuing with the current version.
) else (
    echo [Success] pip updated to the latest version!
)

echo Installing the dependencies, please wait...
REM Make sure the correct Python and pip are used
"%CURRENT_DIR%lib\python\python.exe" -m pip install -q -r "%CURRENT_DIR%NarratoAI\requirements.txt"
if %ERRORLEVEL% NEQ 0 (
    echo [Error] Dependency installation failed; check that requirements.txt exists.
    goto end
) else (
    echo [Success] Dependencies installed!
)

echo ===================================
echo ✓ Program update finished
echo ===================================

:end
if exist "%TEMP%\git_output.txt" del "%TEMP%\git_output.txt"
REM Clear the proxy environment variables set during this session
if defined HTTP_PROXY (
    echo [Info] The proxy settings for this session have been cleared
    set "HTTP_PROXY="
    set "HTTPS_PROXY="
)
pause


@ -1,178 +0,0 @@
import requests
import json
import os
import time
from typing import Dict, Any


class VideoPipeline:
    def __init__(self, base_url: str = "http://127.0.0.1:8080"):
        self.base_url = base_url

    def download_video(self, url: str, resolution: str = "1080p",
                       output_format: str = "mp4", rename: str = None) -> Dict[str, Any]:
        """下载视频的第一步"""
        endpoint = f"{self.base_url}/api/v2/youtube/download"
        payload = {
            "url": url,
            "resolution": resolution,
            "output_format": output_format,
            "rename": rename or time.strftime("%Y-%m-%d")
        }
        response = requests.post(endpoint, json=payload)
        response.raise_for_status()
        return response.json()

    def generate_script(self, video_path: str, skip_seconds: int = 0,
                        threshold: int = 30, vision_batch_size: int = 10,
                        vision_llm_provider: str = "gemini") -> Dict[str, Any]:
        """生成脚本的第二步"""
        endpoint = f"{self.base_url}/api/v2/scripts/generate"
        payload = {
            "video_path": video_path,
            "skip_seconds": skip_seconds,
            "threshold": threshold,
            "vision_batch_size": vision_batch_size,
            "vision_llm_provider": vision_llm_provider
        }
        response = requests.post(endpoint, json=payload)
        response.raise_for_status()
        return response.json()

    def crop_video(self, video_path: str, script: list) -> Dict[str, Any]:
        """剪辑视频的第三步"""
        endpoint = f"{self.base_url}/api/v2/scripts/crop"
        payload = {
            "video_origin_path": video_path,
            "video_script": script
        }
        response = requests.post(endpoint, json=payload)
        response.raise_for_status()
        return response.json()

    def generate_final_video(self, task_id: str, video_path: str,
                             script_path: str, script: list, subclip_videos: Dict[str, str],
                             voice_name: str) -> Dict[str, Any]:
        """生成最终视频的第四步"""
        endpoint = f"{self.base_url}/api/v2/scripts/start-subclip"
        request_data = {
            "video_clip_json": script,
            "video_clip_json_path": script_path,
            "video_origin_path": video_path,
            "video_aspect": "16:9",
            "video_language": "zh-CN",
            "voice_name": voice_name,
            "voice_volume": 1,
            "voice_rate": 1.2,
            "voice_pitch": 1,
            "bgm_name": "random",
            "bgm_type": "random",
            "bgm_file": "",
            "bgm_volume": 0.3,
            "subtitle_enabled": True,
            "subtitle_position": "bottom",
            "font_name": "STHeitiMedium.ttc",
            "text_fore_color": "#FFFFFF",
            "text_background_color": "transparent",
            "font_size": 75,
            "stroke_color": "#000000",
            "stroke_width": 1.5,
            "custom_position": 70,
            "n_threads": 8
        }
        payload = {
            "request": request_data,
            "subclip_videos": subclip_videos
        }
        params = {"task_id": task_id}
        response = requests.post(endpoint, params=params, json=payload)
        response.raise_for_status()
        return response.json()

    def save_script_to_json(self, script: list, script_path: str) -> str:
        """保存脚本到json文件"""
        try:
            with open(script_path, 'w', encoding='utf-8') as f:
                json.dump(script, f, ensure_ascii=False, indent=2)
            print(f"脚本已保存到: {script_path}")
            return script_path
        except Exception as e:
            print(f"保存脚本失败: {str(e)}")
            raise

    def run_pipeline(self, task_id: str, script_name: str, youtube_url: str, video_name: str = "null", skip_seconds: int = 0, threshold: int = 30, vision_batch_size: int = 10, vision_llm_provider: str = "gemini", voice_name: str = "zh-CN-YunjianNeural") -> Dict[str, Any]:
        """运行完整的pipeline"""
        try:
            current_path = os.path.dirname(os.path.abspath(__file__))
            video_path = os.path.join(current_path, "resource", "videos", f"{video_name}.mp4")
            # 判断视频是否存在
            if not os.path.exists(video_path):
                # 1. 下载视频
                print(f"视频不存在, 开始下载视频: {video_path}")
                download_result = self.download_video(url=youtube_url, resolution="1080p", output_format="mp4", rename=video_name)
                video_path = download_result["output_path"]
            else:
                print(f"视频已存在: {video_path}")

            # 2. 判断script_name是否存在
            # 2.1.1 拼接脚本路径 NarratoAI/resource/scripts
            script_path = os.path.join(current_path, "resource", "scripts", script_name)
            if os.path.exists(script_path):
                script = json.load(open(script_path, "r", encoding="utf-8"))
            else:
                # 2.1.2 生成脚本
                print("开始生成脚本...")
                script_result = self.generate_script(video_path=video_path, skip_seconds=skip_seconds, threshold=threshold, vision_batch_size=vision_batch_size, vision_llm_provider=vision_llm_provider)
                script = script_result["script"]
                # 2.2 保存脚本到json文件
                print("保存脚本到json文件...")
                self.save_script_to_json(script=script, script_path=script_path)

            # 3. 剪辑视频
            print("开始剪辑视频...")
            crop_result = self.crop_video(video_path=video_path, script=script)
            subclip_videos = crop_result["subclip_videos"]

            # 4. 生成最终视频
            print("开始生成最终视频...")
            self.generate_final_video(
                task_id=task_id,
                video_path=video_path,
                script_path=script_path,
                script=script,
                subclip_videos=subclip_videos,
                voice_name=voice_name
            )
            return {
                "status": "等待异步生成视频",
                "path": os.path.join(current_path, "storage", "tasks", task_id)
            }
        except Exception as e:
            return {
                "status": "error",
                "error": str(e)
            }


# 使用示例
if __name__ == "__main__":
    pipeline = VideoPipeline()
    result = pipeline.run_pipeline(
        task_id="test_111901",
        script_name="test.json",
        youtube_url="https://www.youtube.com/watch?v=vLJ7Yed6FQ4",
        video_name="2024-11-19-01",
        skip_seconds=50,
        threshold=35,
        vision_batch_size=10,
        vision_llm_provider="gemini",
        voice_name="zh-CN-YunjianNeural",
    )
    print(result)
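Note on the example above: start-subclip only enqueues the final render, so run_pipeline returns with status "等待异步生成视频" (waiting for async generation) while files are still being written. A minimal polling sketch, assuming finished clips eventually appear as .mp4 files inside the returned storage/tasks/<task_id> directory; wait_for_videos is a hypothetical helper, not part of this API:

    import glob
    import os
    import time

    def wait_for_videos(task_dir: str, timeout: float = 1800, interval: float = 10) -> list:
        """Poll the task directory until rendered .mp4 files appear or the timeout hits (assumed layout)."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            videos = glob.glob(os.path.join(task_dir, "*.mp4"))
            if videos:
                return videos
            time.sleep(interval)
        raise TimeoutError(f"no videos produced in {task_dir} within {timeout}s")

    # result = pipeline.run_pipeline(...)
    # print(wait_for_videos(result["path"]))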

View File

@@ -4,7 +4,7 @@ import sys
 from loguru import logger
 from app.config import config
 from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
-    review_settings, merge_settings, system_settings
+    system_settings
 # from webui.utils import cache, file_utils
 from app.utils import utils
 from app.utils import ffmpeg_utils
@@ -106,8 +106,7 @@ def init_global_state():
         st.session_state['video_plot'] = ''
     if 'ui_language' not in st.session_state:
         st.session_state['ui_language'] = config.ui.get("language", utils.get_system_locale())
-    if 'subclip_videos' not in st.session_state:
-        st.session_state['subclip_videos'] = {}
+    # 移除subclip_videos初始化 - 现在使用统一裁剪策略


 def tr(key):
@@ -136,11 +135,9 @@ def render_generate_button():
         logger.add(log_received)

         config.save_config()
-        task_id = st.session_state.get('task_id')
-        if not task_id:
-            st.error(tr("请先裁剪视频"))
-            return
+        # 移除task_id检查 - 现在使用统一裁剪策略,不再需要预裁剪
+        # 直接检查必要的文件是否存在
         if not st.session_state.get('video_clip_json_path'):
             st.error(tr("脚本文件不能为空"))
             return
@@ -168,10 +165,14 @@ def render_generate_button():
             # 创建参数对象
             params = VideoClipParams(**all_params)

-            result = tm.start_subclip(
+            # 使用新的统一裁剪策略,不再需要预裁剪的subclip_videos
+            # 生成一个新的task_id用于本次处理
+            import uuid
+            task_id = str(uuid.uuid4())
+            result = tm.start_subclip_unified(
                 task_id=task_id,
-                params=params,
-                subclip_path_videos=st.session_state['subclip_videos']
+                params=params
             )

             video_files = result.get("videos", [])
@@ -220,22 +221,17 @@ def main():
     # 首先渲染不依赖PyTorch的UI部分
     # 渲染基础设置面板
     basic_settings.render_basic_settings(tr)

-    # 渲染合并设置
-    merge_settings.render_merge_settings(tr)
-
     # 渲染主面板
     panel = st.columns(3)
     with panel[0]:
         script_settings.render_script_panel(tr)
     with panel[1]:
-        video_settings.render_video_panel(tr)
         audio_settings.render_audio_panel(tr)
     with panel[2]:
+        video_settings.render_video_panel(tr)
         subtitle_settings.render_subtitle_panel(tr)

-    # 渲染视频审查面板
-    review_settings.render_review_panel(tr)
-
     # 放到最后渲染可能使用PyTorch的部分
     # 渲染系统设置面板
     with panel[2]:

View File

@@ -3,13 +3,11 @@ from .script_settings import render_script_panel
 from .video_settings import render_video_panel
 from .audio_settings import render_audio_panel
 from .subtitle_settings import render_subtitle_panel
-from .review_settings import render_review_panel

 __all__ = [
     'render_basic_settings',
     'render_script_panel',
     'render_video_panel',
     'render_audio_panel',
-    'render_subtitle_panel',
-    'render_review_panel'
+    'render_subtitle_panel'
 ]

View File

@@ -8,6 +8,64 @@ from app.utils import utils
 from webui.utils.cache import get_songs_cache


+def get_soulvoice_voices():
+    """获取 SoulVoice 语音列表"""
+    # 检查是否配置了 SoulVoice API key
+    api_key = config.soulvoice.get("api_key", "")
+    if not api_key:
+        return []
+
+    # 只返回一个 SoulVoice 选项,音色通过输入框自定义
+    return ["soulvoice:custom"]
+
+
+def get_tts_engine_options():
+    """获取TTS引擎选项"""
+    return {
+        "edge_tts": "Edge TTS",
+        "azure_speech": "Azure Speech Services",
+        "soulvoice": "SoulVoice"
+    }
+
+
+def get_tts_engine_descriptions():
+    """获取TTS引擎详细描述"""
+    return {
+        "edge_tts": {
+            "title": "Edge TTS",
+            "features": "完全免费,但服务稳定性一般,不支持语音克隆功能",
+            "use_case": "测试和轻量级使用",
+            "registration": None
+        },
+        "azure_speech": {
+            "title": "Azure Speech Services",
+            "features": "提供一定免费额度,超出后按量付费,需要绑定海外信用卡",
+            "use_case": "企业级应用,需要稳定服务",
+            "registration": "https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices"
+        },
+        "soulvoice": {
+            "title": "SoulVoice",
+            "features": "提供免费额度,支持语音克隆,支持微信购买额度,无需信用卡,性价比极高",
+            "use_case": "个人用户和中小企业,需要语音克隆功能",
+            "registration": "https://soulvoice.scsmtech.cn/"
+        }
+    }
+
+
+def is_valid_azure_voice_name(voice_name: str) -> bool:
+    """检查是否为有效的Azure音色名称格式"""
+    if not voice_name or not isinstance(voice_name, str):
+        return False
+
+    voice_name = voice_name.strip()
+    # Azure音色名称通常格式为: [语言]-[地区]-[名称]Neural
+    # 例如: zh-CN-YunzeNeural, en-US-AvaMultilingualNeural
+    import re
+    pattern = r'^[a-z]{2}-[A-Z]{2}-\w+Neural$'
+    return bool(re.match(pattern, voice_name))
+
+
 def render_audio_panel(tr):
     """渲染音频设置面板"""
     with st.container(border=True):
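A quick sanity check of the is_valid_azure_voice_name pattern added above (restated so the snippet runs standalone): it accepts plain Azure voice names but rejects Edge-style names that carry a gender suffix.

    import re

    def is_valid_azure_voice_name(voice_name) -> bool:
        # Same pattern as in the diff: [lang]-[REGION]-<Name>Neural
        return bool(isinstance(voice_name, str)
                    and re.match(r'^[a-z]{2}-[A-Z]{2}-\w+Neural$', voice_name.strip()))

    assert is_valid_azure_voice_name("zh-CN-YunzeNeural")
    assert is_valid_azure_voice_name("en-US-AvaMultilingualNeural")
    assert not is_valid_azure_voice_name("zh-CN-XiaoxiaoNeural-Female")  # Edge-style suffix fails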
@@ -22,37 +80,91 @@ def render_audio_panel(tr):

 def render_tts_settings(tr):
     """渲染TTS(文本转语音)设置"""
+    # 1. TTS引擎选择器
+    # st.subheader("🎤 TTS引擎选择")
+    engine_options = get_tts_engine_options()
+    engine_descriptions = get_tts_engine_descriptions()
+
+    # 获取保存的TTS引擎设置
+    saved_tts_engine = config.ui.get("tts_engine", "edge_tts")
+
+    # 确保保存的引擎在可用选项中
+    if saved_tts_engine not in engine_options:
+        saved_tts_engine = "edge_tts"
+
+    # TTS引擎选择下拉框
+    selected_engine = st.selectbox(
+        "选择TTS引擎",
+        options=list(engine_options.keys()),
+        format_func=lambda x: engine_options[x],
+        index=list(engine_options.keys()).index(saved_tts_engine),
+        help="选择您要使用的文本转语音引擎"
+    )
+
+    # 保存TTS引擎选择
+    config.ui["tts_engine"] = selected_engine
+
+    # 2. 显示引擎详细说明
+    if selected_engine in engine_descriptions:
+        desc = engine_descriptions[selected_engine]
+        with st.expander(f"📋 {desc['title']} 详细说明", expanded=True):
+            st.markdown(f"**特点:** {desc['features']}")
+            st.markdown(f"**适用场景:** {desc['use_case']}")
+            if desc['registration']:
+                st.markdown(f"**注册地址:** [{desc['registration']}]({desc['registration']})")
+
+    # 3. 根据选择的引擎渲染对应的配置界面
+    # st.subheader("⚙️ 引擎配置")
+    if selected_engine == "edge_tts":
+        render_edge_tts_settings(tr)
+    elif selected_engine == "azure_speech":
+        render_azure_speech_settings(tr)
+    elif selected_engine == "soulvoice":
+        render_soulvoice_engine_settings(tr)
+
+    # 4. 试听功能
+    render_voice_preview_new(tr, selected_engine)
+
+
+def render_edge_tts_settings(tr):
+    """渲染 Edge TTS 引擎设置"""
     # 获取支持的语音列表
     support_locales = ["zh-CN", "en-US"]
-    voices = voice.get_all_azure_voices(filter_locals=support_locales)
+    all_voices = voice.get_all_azure_voices(filter_locals=support_locales)
+    # 只保留标准版本的语音(Edge TTS专用,不包含V2)
+    edge_voices = [v for v in all_voices if "-V2" not in v]

     # 创建友好的显示名称
-    friendly_names = {
-        v: v.replace("Female", tr("Female"))
-        .replace("Male", tr("Male"))
-        .replace("Neural", "")
-        for v in voices
-    }
+    friendly_names = {}
+    for v in edge_voices:
+        friendly_names[v] = v.replace("Female", tr("Female")).replace("Male", tr("Male")).replace("Neural", "")

     # 获取保存的语音设置
-    saved_voice_name = config.ui.get("voice_name", "")
-    saved_voice_name_index = 0
-    if saved_voice_name in friendly_names:
-        saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
-    else:
-        # 如果没有保存的设置,选择与UI语言匹配的第一个语音
-        for i, v in enumerate(voices):
-            if (v.lower().startswith(st.session_state["ui_language"].lower())
-                    and "V2" not in v):
-                saved_voice_name_index = i
-                break
+    saved_voice_name = config.ui.get("edge_voice_name", "zh-CN-XiaoxiaoNeural-Female")
+
+    # 确保保存的音色在可用列表中
+    if saved_voice_name not in friendly_names:
+        # 选择与UI语言匹配的第一个语音
+        for v in edge_voices:
+            if v.lower().startswith(st.session_state.get("ui_language", "zh-CN").lower()):
+                saved_voice_name = v
+                break
+        else:
+            # 如果没找到匹配的,使用第一个
+            saved_voice_name = edge_voices[0] if edge_voices else ""

-    # 语音选择下拉框
+    # 选择下拉框(Edge TTS音色相对较少,保留下拉框)
     selected_friendly_name = st.selectbox(
-        tr("Speech Synthesis"),
+        "音色选择",
         options=list(friendly_names.values()),
-        index=saved_voice_name_index,
+        index=list(friendly_names.keys()).index(saved_voice_name) if saved_voice_name in friendly_names else 0,
+        help="选择Edge TTS音色"
     )

     # 获取实际的语音名称
@@ -60,22 +172,323 @@ def render_tts_settings(tr):
         list(friendly_names.values()).index(selected_friendly_name)
     ]

-    # 保存设置
-    config.ui["voice_name"] = voice_name
+    # 显示音色信息
+    with st.expander("💡 Edge TTS 音色说明", expanded=False):
+        st.write("**中文音色:**")
+        zh_voices = [v for v in edge_voices if v.startswith("zh-CN")]
+        for v in zh_voices:
+            gender = "女声" if "Female" in v else "男声"
+            name = v.replace("-Female", "").replace("-Male", "").replace("zh-CN-", "").replace("Neural", "")
+            st.write(f"{name} ({gender})")

-    # Azure V2语音特殊处理
-    if voice.is_azure_v2_voice(voice_name):
-        render_azure_v2_settings(tr)
+        st.write("")
+        st.write("**英文音色:**")
+        en_voices = [v for v in edge_voices if v.startswith("en-US")][:5]  # 只显示前5个
+        for v in en_voices:
+            gender = "女声" if "Female" in v else "男声"
+            name = v.replace("-Female", "").replace("-Male", "").replace("en-US-", "").replace("Neural", "")
+            st.write(f"{name} ({gender})")

-    # 语音参数设置
-    render_voice_parameters(tr)
+        if len([v for v in edge_voices if v.startswith("en-US")]) > 5:
+            st.write("• ... 更多英文音色")

-    # 试听按钮
-    render_voice_preview(tr, voice_name)
+    config.ui["edge_voice_name"] = voice_name
+    config.ui["voice_name"] = voice_name  # 兼容性
+    # 音量调节
+    voice_volume = st.slider(
+        "音量调节",
+        min_value=0,
+        max_value=100,
+        value=int(config.ui.get("edge_volume", 80)),
+        step=1,
+        help="调节语音音量 (0-100)"
+    )
+    config.ui["edge_volume"] = voice_volume
+    st.session_state['voice_volume'] = voice_volume / 100.0
+
+    # 语速调节
+    voice_rate = st.slider(
+        "语速调节",
+        min_value=0.5,
+        max_value=2.0,
+        value=config.ui.get("edge_rate", 1.0),
+        step=0.1,
+        help="调节语音速度 (0.5-2.0倍速)"
+    )
+    config.ui["edge_rate"] = voice_rate
+    st.session_state['voice_rate'] = voice_rate
+
+    # 语调调节
+    voice_pitch = st.slider(
+        "语调调节",
+        min_value=-50,
+        max_value=50,
+        value=int(config.ui.get("edge_pitch", 0)),
+        step=5,
+        help="调节语音音调 (-50%到+50%)"
+    )
+    config.ui["edge_pitch"] = voice_pitch
+    # 转换为比例值
+    st.session_state['voice_pitch'] = 1.0 + (voice_pitch / 100.0)
+
+
+def render_azure_speech_settings(tr):
+    """渲染 Azure Speech Services 引擎设置"""
+    # 服务区域配置
+    azure_speech_region = st.text_input(
+        "服务区域",
+        value=config.azure.get("speech_region", ""),
+        placeholder="例如eastus",
+        help="Azure Speech Services 服务区域eastus, westus2, eastasia 等"
+    )
+
+    # API Key配置
+    azure_speech_key = st.text_input(
+        "API Key",
+        value=config.azure.get("speech_key", ""),
+        type="password",
+        help="Azure Speech Services API 密钥"
+    )
+
+    # 保存Azure配置
+    config.azure["speech_region"] = azure_speech_region
+    config.azure["speech_key"] = azure_speech_key
+
+    # 音色名称输入框
+    saved_voice_name = config.ui.get("azure_voice_name", "zh-CN-XiaoxiaoMultilingualNeural")
+
+    # 音色名称输入
+    voice_name = st.text_input(
+        "音色名称",
+        value=saved_voice_name,
+        help="输入Azure Speech Services音色名称,直接使用官方音色名称即可。例如zh-CN-YunzeNeural",
+        placeholder="zh-CN-YunzeNeural"
+    )
+
+    # 显示常用音色示例
+    with st.expander("💡 常用音色参考", expanded=False):
+        st.write("**中文音色:**")
+        st.write("• zh-CN-XiaoxiaoMultilingualNeural (女声,多语言)")
+        st.write("• zh-CN-YunzeNeural (男声)")
+        st.write("• zh-CN-YunxiNeural (男声)")
+        st.write("• zh-CN-XiaochenNeural (女声)")
+        st.write("")
+        st.write("**英文音色:**")
+        st.write("• en-US-AndrewMultilingualNeural (男声,多语言)")
+        st.write("• en-US-AvaMultilingualNeural (女声,多语言)")
+        st.write("• en-US-BrianMultilingualNeural (男声,多语言)")
+        st.write("• en-US-EmmaMultilingualNeural (女声,多语言)")
+        st.write("")
+        st.info("💡 更多音色请参考 [Azure Speech Services 官方文档](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support)")
+
+    # 快速选择按钮
+    st.write("**快速选择:**")
+    cols = st.columns(3)
+    with cols[0]:
+        if st.button("中文女声", help="zh-CN-XiaoxiaoMultilingualNeural"):
+            voice_name = "zh-CN-XiaoxiaoMultilingualNeural"
+            st.rerun()
+    with cols[1]:
+        if st.button("中文男声", help="zh-CN-YunzeNeural"):
+            voice_name = "zh-CN-YunzeNeural"
+            st.rerun()
+    with cols[2]:
+        if st.button("英文女声", help="en-US-AvaMultilingualNeural"):
+            voice_name = "en-US-AvaMultilingualNeural"
+            st.rerun()
+
+    # 验证音色名称并显示状态
+    if voice_name.strip():
+        # 检查是否为有效的Azure音色格式
+        if is_valid_azure_voice_name(voice_name):
+            st.success(f"✅ 音色名称有效: {voice_name}")
+        else:
+            st.warning(f"⚠️ 音色名称格式可能不正确: {voice_name}")
+            st.info("💡 Azure音色名称通常格式为: [语言]-[地区]-[名称]Neural")
+
+    # 保存配置
+    config.ui["azure_voice_name"] = voice_name
+    config.ui["voice_name"] = voice_name  # 兼容性
+
+    # 音量调节
+    voice_volume = st.slider(
+        "音量调节",
+        min_value=0,
+        max_value=100,
+        value=int(config.ui.get("azure_volume", 80)),
+        step=1,
+        help="调节语音音量 (0-100)"
+    )
+    config.ui["azure_volume"] = voice_volume
+    st.session_state['voice_volume'] = voice_volume / 100.0
+
+    # 语速调节
+    voice_rate = st.slider(
+        "语速调节",
+        min_value=0.5,
+        max_value=2.0,
+        value=config.ui.get("azure_rate", 1.0),
+        step=0.1,
+        help="调节语音速度 (0.5-2.0倍速)"
+    )
+    config.ui["azure_rate"] = voice_rate
+    st.session_state['voice_rate'] = voice_rate
+
+    # 语调调节
+    voice_pitch = st.slider(
+        "语调调节",
+        min_value=-50,
+        max_value=50,
+        value=int(config.ui.get("azure_pitch", 0)),
+        step=5,
+        help="调节语音音调 (-50%到+50%)"
+    )
+    config.ui["azure_pitch"] = voice_pitch
+    # 转换为比例值
+    st.session_state['voice_pitch'] = 1.0 + (voice_pitch / 100.0)
+
+    # 显示配置状态
+    if azure_speech_region and azure_speech_key:
+        st.success("✅ Azure Speech Services 配置已设置")
+    elif not azure_speech_region:
+        st.warning("⚠️ 请配置服务区域")
+    elif not azure_speech_key:
+        st.warning("⚠️ 请配置 API Key")
+
+
+def render_soulvoice_engine_settings(tr):
+    """渲染 SoulVoice 引擎设置"""
+    # API Key 输入
+    api_key = st.text_input(
+        "API Key",
+        value=config.soulvoice.get("api_key", ""),
+        type="password",
+        help="请输入您的 SoulVoice API 密钥"
+    )
+
+    # 音色 URI 输入
+    voice_uri = st.text_input(
+        "音色URI",
+        value=config.soulvoice.get("voice_uri", "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"),
+        help="请输入 SoulVoice 音色标识符",
+        placeholder="speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
+    )
+
+    # 模型名称选择
+    model_options = [
+        "FunAudioLLM/CosyVoice2-0.5B",
+        "FunAudioLLM/CosyVoice-300M",
+        "FunAudioLLM/CosyVoice-300M-SFT",
+        "FunAudioLLM/CosyVoice-300M-Instruct"
+    ]
+    saved_model = config.soulvoice.get("model", "FunAudioLLM/CosyVoice2-0.5B")
+    if saved_model not in model_options:
+        model_options.append(saved_model)
+
+    model = st.selectbox(
+        "模型名称",
+        options=model_options,
+        index=model_options.index(saved_model),
+        help="选择使用的 TTS 模型"
+    )
+
+    # 高级设置
+    with st.expander("高级设置", expanded=False):
+        api_url = st.text_input(
+            "API 地址",
+            value=config.soulvoice.get("api_url", "https://tts.scsmtech.cn/tts"),
+            help="SoulVoice API 接口地址"
+        )
+
+    # 保存配置
+    config.soulvoice["api_key"] = api_key
+    config.soulvoice["voice_uri"] = voice_uri
+    config.soulvoice["model"] = model
+    config.soulvoice["api_url"] = api_url
+
+    # 设置兼容性配置
+    if voice_uri:
+        # 确保音色 URI 有正确的前缀
+        if not voice_uri.startswith("soulvoice:") and not voice_uri.startswith("speech:"):
+            voice_name = f"soulvoice:{voice_uri}"
+        else:
+            voice_name = voice_uri if voice_uri.startswith("soulvoice:") else f"soulvoice:{voice_uri}"
+        config.ui["voice_name"] = voice_name
+
+    # 显示配置状态
+    if api_key and voice_uri:
+        st.success("✅ SoulVoice 配置已设置")
+    elif not api_key:
+        st.warning("⚠️ 请配置 SoulVoice API Key")
+    elif not voice_uri:
+        st.warning("⚠️ 请配置音色 URI")
+
+
+def render_voice_preview_new(tr, selected_engine):
+    """渲染新的语音试听功能"""
+    if st.button("🎵 试听语音合成", use_container_width=True):
+        play_content = "感谢关注 NarratoAI有任何问题或建议可以关注微信公众号求助或讨论"
+
+        # 根据选择的引擎获取对应的语音配置
+        voice_name = ""
+        voice_rate = 1.0
+        voice_pitch = 1.0
+
+        if selected_engine == "edge_tts":
+            voice_name = config.ui.get("edge_voice_name", "zh-CN-XiaoyiNeural-Female")
+            voice_rate = config.ui.get("edge_rate", 1.0)
+            voice_pitch = 1.0 + (config.ui.get("edge_pitch", 0) / 100.0)
+        elif selected_engine == "azure_speech":
+            voice_name = config.ui.get("azure_voice_name", "zh-CN-XiaoxiaoMultilingualNeural")
+            voice_rate = config.ui.get("azure_rate", 1.0)
+            voice_pitch = 1.0 + (config.ui.get("azure_pitch", 0) / 100.0)
+        elif selected_engine == "soulvoice":
+            voice_uri = config.soulvoice.get("voice_uri", "")
+            if voice_uri:
+                if not voice_uri.startswith("soulvoice:") and not voice_uri.startswith("speech:"):
+                    voice_name = f"soulvoice:{voice_uri}"
+                else:
+                    voice_name = voice_uri if voice_uri.startswith("soulvoice:") else f"soulvoice:{voice_uri}"
+            voice_rate = 1.0  # SoulVoice 使用默认语速
+            voice_pitch = 1.0  # SoulVoice 不支持音调调节
+
+        if not voice_name:
+            st.error("请先配置语音设置")
+            return
+
+        with st.spinner("正在合成语音..."):
+            temp_dir = utils.storage_dir("temp", create=True)
+            audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
+
+            sub_maker = voice.tts(
+                text=play_content,
+                voice_name=voice_name,
+                voice_rate=voice_rate,
+                voice_pitch=voice_pitch,
+                voice_file=audio_file,
+            )
+
+            if sub_maker and os.path.exists(audio_file):
+                st.success("✅ 语音合成成功!")
+                # 播放音频
+                with open(audio_file, 'rb') as audio_file_obj:
+                    audio_bytes = audio_file_obj.read()
+                st.audio(audio_bytes, format='audio/mp3')
+
+                # 清理临时文件
+                try:
+                    os.remove(audio_file)
+                except:
+                    pass
+            else:
+                st.error("❌ 语音合成失败,请检查配置")
 def render_azure_v2_settings(tr):
-    """渲染Azure V2语音设置"""
+    """渲染Azure V2语音设置(保留兼容性)"""
     saved_azure_speech_region = config.azure.get("speech_region", "")
     saved_azure_speech_key = config.azure.get("speech_key", "")
@@ -93,8 +506,60 @@ def render_azure_v2_settings(tr):
     config.azure["speech_key"] = azure_speech_key


-def render_voice_parameters(tr):
-    """渲染语音参数设置"""
+def render_soulvoice_settings(tr):
+    """渲染 SoulVoice 语音设置(保留兼容性)"""
+    saved_api_key = config.soulvoice.get("api_key", "")
+    saved_api_url = config.soulvoice.get("api_url", "https://tts.scsmtech.cn/tts")
+    saved_model = config.soulvoice.get("model", "FunAudioLLM/CosyVoice2-0.5B")
+    saved_voice_uri = config.soulvoice.get("voice_uri", "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr")
+
+    # API Key 输入
+    api_key = st.text_input(
+        "SoulVoice API Key",
+        value=saved_api_key,
+        type="password",
+        help="请输入您的 SoulVoice API 密钥"
+    )
+
+    # 音色 URI 输入
+    voice_uri = st.text_input(
+        "音色 URI",
+        value=saved_voice_uri,
+        help="请输入 SoulVoice 音色标识符格式如speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr",
+        placeholder="speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
+    )
+
+    # API URL 输入(可选)
+    with st.expander("高级设置", expanded=False):
+        api_url = st.text_input(
+            "API 地址",
+            value=saved_api_url,
+            help="SoulVoice API 接口地址"
+        )
+        model = st.text_input(
+            "模型名称",
+            value=saved_model,
+            help="使用的 TTS 模型"
+        )
+
+    # 保存配置
+    config.soulvoice["api_key"] = api_key
+    config.soulvoice["voice_uri"] = voice_uri
+    config.soulvoice["api_url"] = api_url
+    config.soulvoice["model"] = model
+
+    # 显示配置状态
+    if api_key and voice_uri:
+        st.success("✅ SoulVoice 配置已设置")
+    elif not api_key:
+        st.warning("⚠️ 请配置 SoulVoice API Key")
+    elif not voice_uri:
+        st.warning("⚠️ 请配置音色 URI")
+
+
+def render_voice_parameters(tr, voice_name):
+    """渲染语音参数设置(保留兼容性)"""
     # 音量 - 使用统一的默认值
     voice_volume = st.slider(
         tr("Speech Volume"),
@@ -106,22 +571,41 @@
     )
     st.session_state['voice_volume'] = voice_volume

+    # 检查是否为 SoulVoice 引擎
+    is_soulvoice = voice.is_soulvoice_voice(voice_name)
+
     # 语速
-    voice_rate = st.selectbox(
-        tr("Speech Rate"),
-        options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
-        index=2,
-    )
+    if is_soulvoice:
+        # SoulVoice 支持更精细的语速控制
+        voice_rate = st.slider(
+            tr("Speech Rate"),
+            min_value=0.5,
+            max_value=2.0,
+            value=1.0,
+            step=0.1,
+            help="SoulVoice 语音速度控制"
+        )
+    else:
+        # Azure TTS 使用预设选项
+        voice_rate = st.selectbox(
+            tr("Speech Rate"),
+            options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+            index=2,
+        )
     st.session_state['voice_rate'] = voice_rate

-    # 音调
-    voice_pitch = st.selectbox(
-        tr("Speech Pitch"),
-        options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
-        index=2,
-    )
-    st.session_state['voice_pitch'] = voice_pitch
+    # 音调 - SoulVoice 不支持音调调节
+    if not is_soulvoice:
+        voice_pitch = st.selectbox(
+            tr("Speech Pitch"),
+            options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+            index=2,
+        )
+        st.session_state['voice_pitch'] = voice_pitch
+    else:
+        # SoulVoice 不支持音调调节,设置默认值
+        st.session_state['voice_pitch'] = 1.0
+        st.info(" SoulVoice 引擎不支持音调调节")


 def render_voice_preview(tr, voice_name):
@@ -157,9 +641,12 @@ def render_voice_preview(tr, voice_name):
             )

             if sub_maker and os.path.exists(audio_file):
+                st.success(tr("Voice synthesis successful"))
                 st.audio(audio_file, format="audio/mp3")
                 if os.path.exists(audio_file):
                     os.remove(audio_file)
+            else:
+                st.error(tr("Voice synthesis failed"))


 def render_bgm_settings(tr):
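All three engine panels in this file store slider values in UI units and convert them to the ratios the synthesis layer expects: volume 0-100 maps to 0.0-1.0, and pitch -50..+50 maps to 0.5..1.5 via 1.0 + pitch / 100. A small reference snippet (ui_to_ratios is illustrative, not a function in the codebase):

    def ui_to_ratios(volume: int, pitch: int):
        # Mirrors the conversions applied before writing to st.session_state.
        return volume / 100.0, 1.0 + (pitch / 100.0)

    assert ui_to_ratios(80, 0) == (0.8, 1.0)
    assert ui_to_ratios(100, -50) == (1.0, 0.5)
    assert ui_to_ratios(50, 50) == (0.5, 1.5)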

View File

@@ -1,296 +0,0 @@
import os
import time
import streamlit as st
from loguru import logger
from typing import List, Dict
from dataclasses import dataclass
from streamlit.runtime.uploaded_file_manager import UploadedFile

from webui.utils.merge_video import merge_videos_and_subtitles
from app.utils.utils import video_dir, srt_dir

# 定义临时目录路径
TEMP_MERGE_DIR = os.path.join("storage", "temp", "merge")

# 确保临时目录存在
os.makedirs(TEMP_MERGE_DIR, exist_ok=True)


@dataclass
class VideoSubtitlePair:
    video_file: UploadedFile | None
    subtitle_file: str | None
    base_name: str
    order: int = 0


def save_uploaded_file(uploaded_file: UploadedFile, target_dir: str) -> str:
    """Save uploaded file to target directory and return the file path"""
    file_path = os.path.join(target_dir, uploaded_file.name)
    # 如果文件已存在,先删除它
    if os.path.exists(file_path):
        os.remove(file_path)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getvalue())
    return file_path


def clean_temp_dir():
    """清空临时目录"""
    if os.path.exists(TEMP_MERGE_DIR):
        for file in os.listdir(TEMP_MERGE_DIR):
            file_path = os.path.join(TEMP_MERGE_DIR, file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                logger.error(f"清理临时文件失败: {str(e)}")


def group_files(files: List[UploadedFile]) -> Dict[str, VideoSubtitlePair]:
    """Group uploaded files by their base names"""
    pairs = {}
    order_counter = 0

    # 首先处理所有视频文件
    for file in files:
        base_name = os.path.splitext(file.name)[0]
        ext = os.path.splitext(file.name)[1].lower()
        if ext == ".mp4":
            if base_name not in pairs:
                pairs[base_name] = VideoSubtitlePair(None, None, base_name, order_counter)
                order_counter += 1
            pairs[base_name].video_file = file
            # 保存视频文件到临时目录
            video_path = save_uploaded_file(file, TEMP_MERGE_DIR)

    # 然后处理所有字幕文件
    for file in files:
        base_name = os.path.splitext(file.name)[0]
        ext = os.path.splitext(file.name)[1].lower()
        if ext == ".srt":
            # 即使没有对应视频也保存字幕文件
            subtitle_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt")
            save_uploaded_file(file, TEMP_MERGE_DIR)
            if base_name in pairs:  # 如果有对应的视频
                pairs[base_name].subtitle_file = subtitle_path

    return pairs


def render_merge_settings(tr):
    """Render the merge settings section"""
    with st.expander(tr("Video Subtitle Merge"), expanded=False):
        # 上传文件区域
        uploaded_files = st.file_uploader(
            tr("Upload Video and Subtitle Files"),
            type=["mp4", "srt"],
            accept_multiple_files=True,
            key="merge_files"
        )

        if uploaded_files:
            all_pairs = group_files(uploaded_files)
            if all_pairs:
                st.write(tr("All Uploaded Files"))

                # 初始化或更新session state中的排序信息
                if 'file_orders' not in st.session_state:
                    st.session_state.file_orders = {
                        name: pair.order for name, pair in all_pairs.items()
                    }
                    st.session_state.needs_reorder = False

                # 确保所有新文件都有排序值
                for name, pair in all_pairs.items():
                    if name not in st.session_state.file_orders:
                        st.session_state.file_orders[name] = pair.order

                # 移除不存在的文件的排序值
                st.session_state.file_orders = {
                    k: v for k, v in st.session_state.file_orders.items()
                    if k in all_pairs
                }

                # 按照排序值对文件对进行排序
                sorted_pairs = sorted(
                    all_pairs.items(),
                    key=lambda x: st.session_state.file_orders[x[0]]
                )

                # 计算需要多少行来显示所有视频(每行5个)
                num_pairs = len(sorted_pairs)
                num_rows = (num_pairs + 4) // 5  # 向上取整,每行5个

                # 遍历每一行
                for row in range(num_rows):
                    # 创建5列
                    cols = st.columns(5)
                    # 在这一行中填充视频(最多5个)
                    for col_idx in range(5):
                        pair_idx = row * 5 + col_idx
                        if pair_idx < num_pairs:
                            base_name, pair = sorted_pairs[pair_idx]
                            with cols[col_idx]:
                                st.caption(base_name)

                                # 显示视频预览(如果存在)
                                video_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4")
                                if os.path.exists(video_path):
                                    st.video(video_path)
                                else:
                                    st.warning(tr("Missing Video"))

                                # 显示字幕预览(如果存在)
                                subtitle_path = os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt")
                                if os.path.exists(subtitle_path):
                                    with open(subtitle_path, 'r', encoding='utf-8') as f:
                                        subtitle_content = f.read()
                                    st.markdown(tr("Subtitle Preview"))
                                    st.text_area(
                                        "Subtitle Content",
                                        value=subtitle_content,
                                        height=100,  # 减高度以适应5列布局
                                        label_visibility="collapsed",
                                        key=f"subtitle_preview_{base_name}"
                                    )
                                else:
                                    st.warning(tr("Missing Subtitle"))

                                # 如果有视频但没有字幕,显示一键转录按钮
                                # if os.path.exists(video_path):
                                #     if st.button(tr("One-Click Transcribe"), key=f"transcribe_{base_name}"):
                                #         with st.spinner(tr("Transcribing...")):
                                #             try:
                                #                 # 生成字幕文件
                                #                 result = extract_audio_and_create_subtitle(video_path, subtitle_path)
                                #                 if result:
                                #                     # 读取生成的字幕文件内容并显示预览
                                #                     with open(subtitle_path, 'r', encoding='utf-8') as f:
                                #                         subtitle_content = f.read()
                                #                     st.markdown(tr("Subtitle Preview"))
                                #                     st.text_area(
                                #                         "Subtitle Content",
                                #                         value=subtitle_content,
                                #                         height=150,
                                #                         label_visibility="collapsed",
                                #                         key=f"subtitle_preview_transcribed_{base_name}"
                                #                     )
                                #                     st.success(tr("Transcription Complete!"))
                                #                     # 更新pair的字幕文件路径
                                #                     pair.subtitle_file = subtitle_path
                                #                 else:
                                #                     st.error(tr("Transcription Failed. Please try again."))
                                #             except Exception as e:
                                #                 error_message = str(e)
                                #                 logger.error(traceback.format_exc())
                                #                 if "rate limit exceeded" in error_message.lower():
                                #                     st.error(tr("API rate limit exceeded. Please wait about an hour and try again."))
                                #                 elif "resource_exhausted" in error_message.lower():
                                #                     st.error(tr("Resources exhausted. Please try again later."))
                                #                 else:
                                #                     st.error(f"{tr('Transcription Failed')}: {str(e)}")

                                # 排序输入框
                                order = st.number_input(
                                    tr("Order"),
                                    min_value=0,
                                    value=st.session_state.file_orders[base_name],
                                    key=f"order_{base_name}",
                                    on_change=lambda: setattr(st.session_state, 'needs_reorder', True)
                                )
                                if order != st.session_state.file_orders[base_name]:
                                    st.session_state.file_orders[base_name] = order
                                    st.session_state.needs_reorder = True

                # 如果需要重新排序,重新加载页面
                if st.session_state.needs_reorder:
                    st.session_state.needs_reorder = False
                    st.rerun()

                # 找出有完整视频和字幕的文件对
                complete_pairs = {
                    k: v for k, v in all_pairs.items()
                    if os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.mp4")) and
                    os.path.exists(os.path.join(TEMP_MERGE_DIR, f"{k}.srt"))
                }

                # 合并按钮和结果显示
                cols = st.columns([1, 2, 1])
                with cols[0]:
                    st.write(f"{tr('Mergeable Files')}: {len(complete_pairs)}")

                merge_videos_result = None
                with cols[1]:
                    if st.button(tr("Merge All Files"), type="primary", use_container_width=True):
                        try:
                            # 获取排序后的完整文件对
                            sorted_complete_pairs = sorted(
                                [(k, v) for k, v in complete_pairs.items()],
                                key=lambda x: st.session_state.file_orders[x[0]]
                            )
                            video_paths = []
                            subtitle_paths = []
                            for base_name, _ in sorted_complete_pairs:
                                video_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.mp4"))
                                subtitle_paths.append(os.path.join(TEMP_MERGE_DIR, f"{base_name}.srt"))

                            # 获取输出文件路径
                            output_video = os.path.join(video_dir(), f"merged_video_{time.strftime('%M%S')}.mp4")
                            output_subtitle = os.path.join(srt_dir(), f"merged_subtitle_{time.strftime('%M%S')}.srt")

                            with st.spinner(tr("Merging files...")):
                                # 合并文件
                                merge_videos_and_subtitles(
                                    video_paths,
                                    subtitle_paths,
                                    output_video,
                                    output_subtitle
                                )

                                success = True
                                error_msg = ""
                                # 检查输出文件是否成功生成
                                if not os.path.exists(output_video):
                                    success = False
                                    error_msg += tr("Failed to generate merged video. ")
                                if not os.path.exists(output_subtitle):
                                    success = False
                                    error_msg += tr("Failed to generate merged subtitle. ")

                                if success:
                                    # 显示成功消息
                                    st.success(tr("Merge completed!"))
                                    merge_videos_result = (output_video, output_subtitle)
                                    # 清理临时目录
                                    clean_temp_dir()
                                else:
                                    st.error(error_msg)
                        except Exception as e:
                            error_message = str(e)
                            if "moviepy" in error_message.lower():
                                st.error(tr("Error processing video files. Please check if the videos are valid MP4 files."))
                            # elif "pysrt" in error_message.lower():
                            #     st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
                            else:
                                st.error(f"{tr('Error during merge')}: {error_message}")

                # 合并结果预览放在合并按钮下方
                if merge_videos_result:
                    st.markdown(f"<h3 style='text-align: center'>{tr('Merge Result Preview')}</h3>", unsafe_allow_html=True)
                    # 使用列布局使视频居中
                    col1, col2, col3 = st.columns([1, 2, 1])
                    with col2:
                        st.video(merge_videos_result[0])
                        st.code(f"{tr('Video Path')}: {merge_videos_result[0]}")
                        st.code(f"{tr('Subtitle Path')}: {merge_videos_result[1]}")
            else:
                st.warning(tr("No Files Found"))

View File

@@ -1,88 +0,0 @@
import streamlit as st
import os
from loguru import logger


def render_review_panel(tr):
    """渲染视频审查面板"""
    with st.expander(tr("Video Check"), expanded=False):
        try:
            video_list = st.session_state.get('video_clip_json', [])
            subclip_videos = st.session_state.get('subclip_videos', {})
        except KeyError:
            video_list = []
            subclip_videos = {}

        # 计算列数和行数
        num_videos = len(video_list)
        cols_per_row = 3
        rows = (num_videos + cols_per_row - 1) // cols_per_row  # 向上取整计算行数

        # 使用容器展示视频
        for row in range(rows):
            cols = st.columns(cols_per_row)
            for col in range(cols_per_row):
                index = row * cols_per_row + col
                if index < num_videos:
                    with cols[col]:
                        render_video_item(tr, video_list, subclip_videos, index)


def render_video_item(tr, video_list, subclip_videos, index):
    """渲染单个视频项"""
    video_script = video_list[index]

    # 显示时间戳
    timestamp = video_script.get('_id', '')
    st.text_area(
        tr("Timestamp"),
        value=timestamp,
        height=70,
        disabled=True,
        key=f"timestamp_{index}"
    )

    # 显示视频播放器
    video_path = subclip_videos.get(timestamp)
    if video_path and os.path.exists(video_path):
        try:
            st.video(video_path)
        except Exception as e:
            logger.error(f"加载视频失败 {video_path}: {e}")
            st.error(f"无法加载视频: {os.path.basename(video_path)}")
    else:
        st.warning(tr("视频文件未找到"))

    # 显示画面描述
    st.text_area(
        tr("Picture Description"),
        value=video_script.get('picture', ''),
        height=150,
        disabled=True,
        key=f"picture_{index}"
    )

    # 显示旁白文本
    narration = st.text_area(
        tr("Narration"),
        value=video_script.get('narration', ''),
        height=150,
        key=f"narration_{index}"
    )
    # 保存修改后的旁白文本
    if narration != video_script.get('narration', ''):
        video_script['narration'] = narration
        st.session_state['video_clip_json'] = video_list

    # 显示剪辑模式
    ost = st.selectbox(
        tr("Clip Mode"),
        options=range(0, 3),
        index=video_script.get('OST', 0),
        key=f"ost_{index}",
        help=tr("0: Keep the audio only, 1: Keep the original sound only, 2: Keep the original sound and audio")
    )
    # 保存修改后的剪辑模式
    if ost != video_script.get('OST', 0):
        video_script['OST'] = ost
        st.session_state['video_clip_json'] = video_list

View File

@@ -333,38 +333,12 @@ def render_script_buttons(tr, params):
     video_clip_json_details = st.text_area(
         tr("Video Script"),
         value=json.dumps(st.session_state.get('video_clip_json', []), indent=2, ensure_ascii=False),
-        height=180
+        height=500
     )

-    # 操作按钮行
-    button_cols = st.columns(3)
-    with button_cols[0]:
-        if st.button(tr("Check Format"), key="check_format", use_container_width=True):
-            check_script_format(tr, video_clip_json_details)
-    with button_cols[1]:
-        if st.button(tr("Save Script"), key="save_script", use_container_width=True):
-            save_script(tr, video_clip_json_details)
-    with button_cols[2]:
-        script_valid = st.session_state.get('script_format_valid', False)
-        if st.button(tr("Crop Video"), key="crop_video", disabled=not script_valid, use_container_width=True):
-            crop_video(tr, params)
-
-
-def check_script_format(tr, script_content):
-    """检查脚本格式"""
-    try:
-        result = check_script.check_format(script_content)
-        if result.get('success'):
-            st.success(tr("Script format check passed"))
-            st.session_state['script_format_valid'] = True
-        else:
-            st.error(f"{tr('Script format check failed')}: {result.get('message')}")
-            st.session_state['script_format_valid'] = False
-    except Exception as e:
-        st.error(f"{tr('Script format check error')}: {str(e)}")
-        st.session_state['script_format_valid'] = False
+    # 操作按钮行 - 合并格式检查和保存功能
+    if st.button(tr("Save Script"), key="save_script", use_container_width=True):
+        save_script_with_validation(tr, video_clip_json_details)


 def load_script(tr, script_path):
@@ -381,12 +355,52 @@ def load_script(tr, script_path):
         st.error(f"{tr('Failed to load script')}: {str(e)}")


-def save_script(tr, video_clip_json_details):
-    """保存视频脚本"""
+def save_script_with_validation(tr, video_clip_json_details):
+    """保存视频脚本(包含格式验证)"""
     if not video_clip_json_details:
         st.error(tr("请输入视频脚本"))
         st.stop()

+    # 第一步:格式验证
+    with st.spinner("正在验证脚本格式..."):
+        try:
+            result = check_script.check_format(video_clip_json_details)
+            if not result.get('success'):
+                # 格式验证失败,显示详细错误信息
+                error_message = result.get('message', '未知错误')
+                error_details = result.get('details', '')
+
+                st.error(f"**脚本格式验证失败**")
+                st.error(f"**错误信息:** {error_message}")
+                if error_details:
+                    st.error(f"**详细说明:** {error_details}")
+
+                # 显示正确格式示例
+                st.info("**正确的脚本格式示例:**")
+                example_script = [
+                    {
+                        "_id": 1,
+                        "timestamp": "00:00:00,600-00:00:07,559",
+                        "picture": "工地上,蔡晓艳奋力救人,场面混乱",
+                        "narration": "灾后重建,工地上险象环生!泼辣女工蔡晓艳挺身而出,救人第一!",
+                        "OST": 0
+                    },
+                    {
+                        "_id": 2,
+                        "timestamp": "00:00:08,240-00:00:12,359",
+                        "picture": "领导视察,蔡晓艳不屑一顾",
+                        "narration": "播放原片4",
+                        "OST": 1
+                    }
+                ]
+                st.code(json.dumps(example_script, ensure_ascii=False, indent=2), language='json')
+                st.stop()
+        except Exception as e:
+            st.error(f"格式验证过程中发生错误: {str(e)}")
+            st.stop()
+
+    # 第二步:保存脚本
     with st.spinner(tr("Save Script")):
         script_dir = utils.script_dir()
         timestamp = time.strftime("%Y-%m%d-%H%M%S")
@@ -403,7 +417,7 @@ def save_script(tr, video_clip_json_details):
         config.app["video_clip_json_path"] = save_path

         # 显示成功消息
-        st.success(tr("Script saved successfully"))
+        st.success("✅ 脚本格式验证通过,保存成功!")

         # 强制重新加载页面更新选择框
         time.sleep(0.5)  # 给一点时间让用户看到成功消息
@@ -414,26 +428,7 @@ def save_script(tr, video_clip_json_details):
         st.stop()


-def crop_video(tr, params):
-    """裁剪视频"""
-    progress_bar = st.progress(0)
-    status_text = st.empty()
-
-    def update_progress(progress):
-        progress_bar.progress(progress)
-        status_text.text(f"剪辑进度: {progress}%")
-
-    try:
-        utils.cut_video(params, update_progress)
-        time.sleep(0.5)
-        progress_bar.progress(100)
-        st.success("视频剪辑成功完成!")
-    except Exception as e:
-        st.error(f"剪辑过程中发生错误: {str(e)}")
-    finally:
-        time.sleep(1)
-        progress_bar.empty()
-        status_text.empty()
+# crop_video函数已移除 - 现在使用统一裁剪策略,不再需要预裁剪步骤


 def get_script_params():
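The shape that the validation step expects can also be checked without the webui: each entry needs _id, timestamp (HH:MM:SS,mmm-HH:MM:SS,mmm, as in the example above), picture, narration, and an OST flag in 0-2 (the range the old review panel offered). A rough standalone approximation of check_script.check_format, for illustration only, not the project's actual implementation:

    import json
    import re

    TS = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$')

    def rough_check_format(script_text: str) -> dict:
        """Hypothetical validator mirroring the fields shown in the example script."""
        try:
            items = json.loads(script_text)
        except json.JSONDecodeError as e:
            return {"success": False, "message": f"invalid JSON: {e}"}
        if not isinstance(items, list):
            return {"success": False, "message": "script must be a JSON array"}
        for i, item in enumerate(items):
            missing = {"_id", "timestamp", "picture", "narration", "OST"} - set(item)
            if missing:
                return {"success": False, "message": f"item {i} missing {missing}"}
            if not TS.match(item["timestamp"]):
                return {"success": False, "message": f"item {i} has a bad timestamp"}
            if item["OST"] not in (0, 1, 2):
                return {"success": False, "message": f"item {i} has OST outside 0-2"}
        return {"success": True, "message": "ok"}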

View File

@@ -9,14 +9,35 @@ def render_subtitle_panel(tr):
     with st.container(border=True):
         st.write(tr("Subtitle Settings"))

-        # 启用字幕选项
-        enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
-        st.session_state['subtitle_enabled'] = enable_subtitles
+        # 检查是否选择了 SoulVoice 引擎
+        from app.services import voice
+        current_voice = st.session_state.get('voice_name', '')
+        is_soulvoice = voice.is_soulvoice_voice(current_voice)

-        if enable_subtitles:
-            render_font_settings(tr)
-            render_position_settings(tr)
-            render_style_settings(tr)
+        if is_soulvoice:
+            # SoulVoice 引擎时显示禁用提示
+            st.warning("⚠️ SoulVoice TTS 不支持精确字幕生成")
+            st.info("💡 建议使用专业剪辑工具如剪映、PR等手动添加字幕")
+
+            # 强制禁用字幕
+            st.session_state['subtitle_enabled'] = False
+
+            # 显示禁用状态的复选框
+            st.checkbox(
+                tr("Enable Subtitles"),
+                value=False,
+                disabled=True,
+                help="SoulVoice 引擎不支持字幕生成,请使用其他 TTS 引擎"
+            )
+        else:
+            # 其他引擎正常显示字幕选项
+            enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
+            st.session_state['subtitle_enabled'] = enable_subtitles
+
+            if enable_subtitles:
+                render_font_settings(tr)
+                render_position_settings(tr)
+                render_style_settings(tr)


 def render_font_settings(tr):

View File

@@ -29,7 +29,7 @@
   "Clip Duration": "Maximum Clip Duration (Seconds) (**Not the total length of the video**, refers to the length of each **composite segment**)",
   "Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
   "Audio Settings": "**Audio Settings**",
-  "Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. Note: V2 version performs better, but requires an API KEY])",
+  "Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. Note: V2 version performs better, but requires an API KEY; SoulVoice provides high-quality Chinese voices])",
   "Speech Region": "Service Region (:red[Required, [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
   "Speech Key": "API Key (:red[Required, either Key 1 or Key 2 is acceptable [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
   "Speech Volume": "Speech Volume (1.0 represents 100%)",

View File

@@ -11,7 +11,6 @@
   "Video Theme": "视频主题",
   "Generation Prompt": "自定义提示词",
   "Save Script": "保存脚本",
-  "Crop Video": "裁剪视频",
   "Video File": "视频文件(:blue[1⃣支持上传视频文件(限制2G) 2⃣大文件建议直接导入 ./resource/videos 目录]",
   "Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])",
   "Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键】",
@@ -29,7 +28,7 @@
   "Clip Duration": "视频片段最大时长(秒)**不是视频总长度**,是指每个**合成片段**的长度)",
   "Number of Videos Generated Simultaneously": "同时生成视频数量",
   "Audio Settings": "**音频设置**",
-  "Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意V2版效果更好但是需要API KEY]",
+  "Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意V2版效果更好但是需要API KEYSoulVoice 提供高质量中文语音]",
   "Speech Region": "服务区域 (:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
   "Speech Key": "API Key (:red[必填密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
   "Speech Volume": "朗读音量1.0表示100%",
@@ -82,7 +81,6 @@
   "TTS Provider": "语音合成提供商",
   "Hide Log": "隐藏日志",
   "Upload Local Files": "上传本地文件",
-  "Video Check": "视频审查",
   "File Uploaded Successfully": "文件上传成功",
   "timestamp": "时间戳",
   "Picture description": "图片描述",
@@ -137,31 +135,6 @@
   "Script Uploaded Successfully": "脚本上传成功",
   "Invalid JSON format": "无效的JSON格式",
   "Upload failed": "上传失败",
-  "Video Subtitle Merge": "**合并视频与字幕**",
-  "Upload Video and Subtitle Files": "上传视频和字幕文件",
-  "Matched File Pairs": "已匹配的文件对",
-  "Merge All Files": "合并所有文件",
-  "Merge Function Not Implemented": "合并功能待实现",
-  "No Matched Pairs Found": "未找到匹配的文件对",
-  "Missing Subtitle": "缺少对应的字幕文件, 请使用其他软件完成字幕转录,比如剪映等",
-  "Missing Video": "缺少对应的视频文件",
-  "All Uploaded Files": "所有上传的文件",
-  "Order": "排序序号",
-  "Reorder": "重新排序",
-  "Merging files...": "正在合并文件...",
-  "Merge completed!": "合并完成!",
-  "Download Merged Video": "下载合并后的视频",
-  "Download Merged Subtitle": "下载合并后的字幕",
-  "Error during merge": "合并过程中出错",
-  "Failed to generate merged video.": "生成合并视频失败。",
-  "Failed to generate merged subtitle.": "生成合并字幕失败。",
-  "Error reading merged video file": "读取合并后的视频文件时出错",
-  "Error reading merged subtitle file": "读取合并后的字幕文件时出错",
-  "Error processing video files. Please check if the videos are valid MP4 files.": "处理视频文件时出错。请检查视频是否为有效的MP4文件。",
-  "Error processing subtitle files. Please check if the subtitles are valid SRT files.": "处理字幕文件时出错。请检查字幕是否为有效的SRT文件。",
-  "Preview Merged Video": "预览合并后的视频",
-  "Video Path": "视频路径",
-  "Subtitle Path": "字幕路径",
   "Enable Proxy": "启用代理",
   "QwenVL model is available": "QwenVL 模型可用",
   "QwenVL model is not available": "QwenVL 模型不可用",
@@ -184,9 +157,6 @@
   "API rate limit exceeded. Please wait about an hour and try again.": "API 调用次数已达到限制,请等待约一小时后再试。",
   "Resources exhausted. Please try again later.": "资源已耗尽,请稍后再试。",
   "Transcription Failed": "转录失败",
-  "Mergeable Files": "可合并文件数",
-  "Subtitle Content": "字幕内容",
-  "Merge Result Preview": "合并结果预览",
   "Short Generate": "短剧混剪 (高燃剪辑)",
   "Generate Short Video Script": "AI生成短剧混剪脚本",
   "Adjust the volume of the original audio": "调整原始音频的音量",

View File

@@ -1,115 +0,0 @@
"""
合并视频和字幕文件
"""
import os
import pysrt
from moviepy import VideoFileClip, concatenate_videoclips


def get_video_duration(video_path):
    """获取视频时长(秒)"""
    video = VideoFileClip(video_path)
    duration = video.duration
    video.close()
    return duration


def adjust_subtitle_timing(subtitle_path, time_offset):
    """调整字幕时间戳"""
    subs = pysrt.open(subtitle_path)
    # 为每个字幕项添加时间偏移
    for sub in subs:
        sub.start.hours += int(time_offset / 3600)
        sub.start.minutes += int((time_offset % 3600) / 60)
        sub.start.seconds += int(time_offset % 60)
        sub.start.milliseconds += int((time_offset * 1000) % 1000)

        sub.end.hours += int(time_offset / 3600)
        sub.end.minutes += int((time_offset % 3600) / 60)
        sub.end.seconds += int(time_offset % 60)
        sub.end.milliseconds += int((time_offset * 1000) % 1000)
    return subs


def merge_videos_and_subtitles(video_paths, subtitle_paths, output_video_path, output_subtitle_path):
    """合并视频和字幕文件"""
    if len(video_paths) != len(subtitle_paths):
        raise ValueError("视频文件数量与字幕文件数量不匹配")

    # 1. 合并视频
    video_clips = []
    accumulated_duration = 0
    merged_subs = pysrt.SubRipFile()

    try:
        # 处理所有视频和字幕
        for i, (video_path, subtitle_path) in enumerate(zip(video_paths, subtitle_paths)):
            # 添加视频
            print(f"处理视频 {i + 1}/{len(video_paths)}: {video_path}")
            video_clip = VideoFileClip(video_path)
            video_clips.append(video_clip)

            # 处理字幕
            print(f"处理字幕 {i + 1}/{len(subtitle_paths)}: {subtitle_path}")
            if i == 0:
                # 第一个字幕文件直接读取
                current_subs = pysrt.open(subtitle_path)
            else:
                # 后续字幕文件需要调整时间戳
                current_subs = adjust_subtitle_timing(subtitle_path, accumulated_duration)

            # 合并字幕
            merged_subs.extend(current_subs)
            # 更新累计时长
            accumulated_duration += video_clip.duration

        # 判断视频是否存在,若已经存在不重复合并
        if not os.path.exists(output_video_path):
            print("合并视频中...")
            final_video = concatenate_videoclips(video_clips)
            # 保存合并后的视频
            print("保存合并后的视频...")
            final_video.write_videofile(output_video_path, audio_codec='aac')

        # 保存合并后的字幕
        print("保存合并后的字幕...")
        merged_subs.save(output_subtitle_path, encoding='utf-8')
        print("合并完成")
    finally:
        # 清理资源
        for clip in video_clips:
            clip.close()


def main():
    # 示例用法
    video_paths = [
        "temp/1.mp4",
        "temp/2.mp4",
        "temp/3.mp4",
        "temp/4.mp4",
        "temp/5.mp4",
    ]
    subtitle_paths = [
        "temp/1.srt",
        "temp/2.srt",
        "temp/3.srt",
        "temp/4.srt",
        "temp/5.srt",
    ]
    output_video_path = "temp/merged_video.mp4"
    output_subtitle_path = "temp/merged_subtitle.srt"

    merge_videos_and_subtitles(video_paths, subtitle_paths, output_video_path, output_subtitle_path)


if __name__ == "__main__":
    main()
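One caveat about adjust_subtitle_timing in the deleted file above: it adds the offset field by field, which can leave components out of range (e.g. seconds ≥ 60) depending on how pysrt stores assigned values, whereas pysrt's own shift() performs the carrying. A leaner sketch, assuming only that pysrt.SubRipFile.shift accepts seconds/milliseconds keywords:

    import pysrt

    def adjust_subtitle_timing_shifted(subtitle_path: str, time_offset: float) -> pysrt.SubRipFile:
        """Shift every cue by time_offset seconds using pysrt's built-in carrying."""
        subs = pysrt.open(subtitle_path)
        whole = int(time_offset)
        subs.shift(seconds=whole, milliseconds=int(round((time_offset - whole) * 1000)))
        return subs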