From e1f45db95a330549bedd1e1d0dd6c537b9aab847 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Sun, 3 Aug 2025 04:26:42 +0800
Subject: [PATCH] =?UTF-8?q?feat(tts):=20=E6=B7=BB=E5=8A=A0=20SoulVoice=20T?=
 =?UTF-8?q?TS=20=E5=BC=95=E6=93=8E=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

实现 SoulVoice TTS 引擎集成，包括配置管理、语音选择、API 调用和字幕处理
新增 SoulVoice 配置项和示例配置
修改音频设置面板以支持 SoulVoice 选项
优化音频时长计算和异常处理
更新多语言文案以反映 SoulVoice 支持
---
 app/config/config.py                  |   2 +
 app/services/clip_video.py            |  43 +++++
 app/services/voice.py                 | 228 +++++++++++++++++++++++++-
 config.example.toml                   |  15 ++
 webui/components/audio_settings.py    | 153 ++++++++++++++---
 webui/components/subtitle_settings.py |  35 +++-
 webui/i18n/en.json                    |   2 +-
 webui/i18n/zh.json                    |   2 +-
 8 files changed, 440 insertions(+), 40 deletions(-)

diff --git a/app/config/config.py b/app/config/config.py
index 4b2b0b4..ceb8f11 100644
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -48,6 +48,7 @@ def save_config():
     with open(config_file, "w", encoding="utf-8") as f:
         _cfg["app"] = app
         _cfg["azure"] = azure
+        _cfg["soulvoice"] = soulvoice
         _cfg["ui"] = ui
         f.write(toml.dumps(_cfg))
 
@@ -57,6 +58,7 @@ app = _cfg.get("app", {})
 whisper = _cfg.get("whisper", {})
 proxy = _cfg.get("proxy", {})
 azure = _cfg.get("azure", {})
+soulvoice = _cfg.get("soulvoice", {})
 ui = _cfg.get("ui", {})
 frames = _cfg.get("frames", {})
 
diff --git a/app/services/clip_video.py b/app/services/clip_video.py
index 1a0e8e1..1c5fddf 100644
--- a/app/services/clip_video.py
+++ b/app/services/clip_video.py
@@ -613,6 +613,49 @@ def clip_video(
 
         # 根据持续时间计算真正的结束时间（加上1秒余量）
         duration = item["duration"]
+
+        # 时长合理性检查和修正
+        if duration <= 0 or duration > 300:  # 超过5分钟认为不合理
+            logger.warning(f"检测到异常时长 {duration}秒，片段: {timestamp}")
+
+            # 尝试从时间戳计算实际时长
+            try:
+                start_time_str, end_time_str = timestamp.split('-')
+
+                # 解析开始时间
+                if ',' in start_time_str:
+                    time_part, ms_part = start_time_str.split(',')
+                    h1, m1, s1 = map(int, time_part.split(':'))
+                    ms1 = int(ms_part)
+                else:
+                    h1, m1, s1 = map(int, start_time_str.split(':'))
+                    ms1 = 0
+
+                # 解析结束时间
+                if ',' in end_time_str:
+                    time_part, ms_part = end_time_str.split(',')
+                    h2, m2, s2 = map(int, time_part.split(':'))
+                    ms2 = int(ms_part)
+                else:
+                    h2, m2, s2 = map(int, end_time_str.split(':'))
+                    ms2 = 0
+
+                # 计算实际时长
+                start_total_ms = (h1 * 3600 + m1 * 60 + s1) * 1000 + ms1
+                end_total_ms = (h2 * 3600 + m2 * 60 + s2) * 1000 + ms2
+                actual_duration = (end_total_ms - start_total_ms) / 1000.0
+
+                if actual_duration > 0 and actual_duration <= 300:
+                    duration = actual_duration
+                    logger.info(f"使用时间戳计算的实际时长: {duration:.3f}秒")
+                else:
+                    duration = 5.0  # 默认5秒
+                    logger.warning(f"时间戳计算也异常，使用默认时长: {duration}秒")
+
+            except Exception as e:
+                duration = 5.0  # 默认5秒
+                logger.warning(f"时长修正失败，使用默认时长: {duration}秒, 错误: {str(e)}")
+
         calculated_end_time = calculate_end_time(start_time, duration)
 
         # 转换为FFmpeg兼容的时间格式（逗号替换为点）
diff --git a/app/services/voice.py b/app/services/voice.py
index 31f6d66..d45db75 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -4,19 +4,42 @@ import json
 import traceback
 import edge_tts
 import asyncio
+import requests
 from loguru import logger
-from typing import List, Union
+from typing import List, Union, Tuple
 from datetime import datetime
 from xml.sax.saxutils import unescape
 from edge_tts import submaker, SubMaker
-from edge_tts.submaker import mktimestamp
+# from edge_tts.submaker import mktimestamp  # 函数可能不存在，我们自己实现
 from moviepy.video.tools import subtitles
+try:
+    from moviepy import AudioFileClip
+    MOVIEPY_AVAILABLE = True
+except ImportError:
+    MOVIEPY_AVAILABLE = False
+    logger.warning("moviepy 未安装，将使用估算方法计算音频时长")
 import time
 
 from app.config import config
 from app.utils import utils
 
 
+def mktimestamp(time_seconds: float) -> str:
+    """
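+    Convert a number of seconds to an "HH:MM:SS.mmm" timestamp string.
+
+    Args:
+        time_seconds: Time offset in seconds.
+
+    Returns:
+        str: Dot-separated timestamp such as "00:01:23.456" (SRT itself uses a comma).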
+    """
+    hours = int(time_seconds // 3600)
+    minutes = int((time_seconds % 3600) // 60)
+    seconds = time_seconds % 60
+    return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}"
+
+
 def get_all_azure_voices(filter_locals=None) -> list[str]:
     if filter_locals is None:
         filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW", "vi-VN"]
@@ -1038,8 +1061,15 @@ def is_azure_v2_voice(voice_name: str):
 def tts(
     text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> Union[SubMaker, None]:
+    # 检查是否为 SoulVoice 引擎
+    if is_soulvoice_voice(voice_name):
+        return soulvoice_tts(text, voice_name, voice_file, speed=voice_rate)
+
+    # 检查是否为 Azure V2 引擎
     if is_azure_v2_voice(voice_name):
         return azure_tts_v2(text, voice_name, voice_file)
+
+    # 默认使用 Azure V1 引擎
     return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
 
 
@@ -1368,6 +1398,10 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
             if start_time < 0:
                 start_time = _start_time
 
+            # 将 100纳秒单位转换为秒
+            start_time_seconds = start_time / 10000000
+            end_time_seconds = end_time / 10000000
+
             sub = unescape(sub)
             sub_line += sub
             sub_text = match_line(sub_line, sub_index)
@@ -1375,8 +1409,8 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
                 sub_index += 1
                 line = formatter(
                     idx=sub_index,
-                    start_time=start_time,
-                    end_time=end_time,
+                    start_time=start_time_seconds,
+                    end_time=end_time_seconds,
                     sub_text=sub_text,
                 )
                 sub_items.append(line)
@@ -1402,9 +1436,13 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
                 f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}"
                 f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}"
             )
+            # 返回默认值，避免 None 错误
+            return subtitle_file, 3.0
 
     except Exception as e:
         logger.error(f"failed, error: {str(e)}")
+        # 返回默认值，避免 None 错误
+        return subtitle_file, 3.0
 
 
 def get_audio_duration(sub_maker: submaker.SubMaker):
@@ -1453,8 +1491,21 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
                              f"或者使用其他 tts 引擎")
                 continue
             else:
-                # 为当前片段生成字幕文件
-                _, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
+                # SoulVoice 引擎不生成字幕文件
+                if is_soulvoice_voice(voice_name):
+                    # 获取实际音频文件的时长
+                    duration = get_audio_duration_from_file(audio_file)
+                    if duration <= 0:
+                        # 如果无法获取文件时长，尝试从 SubMaker 获取
+                        duration = get_audio_duration(sub_maker)
+                        if duration <= 0:
+                            # 最后的 fallback，基于文本长度估算
+                            duration = max(1.0, len(text) / 3.0)
+                            logger.warning(f"无法获取音频时长，使用文本估算: {duration:.2f}秒")
+                    # 不创建字幕文件
+                    subtitle_file = ""
+                else:
+                    _, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
 
             tts_results.append({
                 "_id": item['_id'],
@@ -1467,3 +1518,168 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
             logger.info(f"已生成音频文件: {audio_file}")
 
     return tts_results
+
+
+def get_audio_duration_from_file(audio_file: str) -> float:
+    """
+    获取音频文件的时长（秒）
+    """
+    if MOVIEPY_AVAILABLE:
+        try:
+            audio_clip = AudioFileClip(audio_file)
+            duration = audio_clip.duration
+            audio_clip.close()
+            return duration
+        except Exception as e:
+            logger.error(f"使用 moviepy 获取音频时长失败: {str(e)}")
+
+    # Fallback: 使用更准确的估算方法
+    try:
+        import os
+        file_size = os.path.getsize(audio_file)
+
+        # 更准确的 MP3 时长估算
+        # 假设 MP3 平均比特率为 128kbps = 16KB/s
+        # 但实际文件还包含头部信息，所以调整系数
+        estimated_duration = max(1.0, file_size / 20000)  # 调整为更保守的估算
+
+        # NOTE: no text-length correction is applied here; this function only
+        # receives the file path, so the size-based estimate is returned as is.
+        logger.warning(f"使用文件大小估算音频时长: {estimated_duration:.2f}秒")
+        return estimated_duration
+    except Exception as e:
+        logger.error(f"获取音频时长失败: {str(e)}")
+        # All methods failed: fall back to a fixed default (no text is available here)
+        return 3.0  # 默认3秒，避免返回0
+
+
+def is_soulvoice_voice(voice_name: str) -> bool:
+    """
+    检查是否为 SoulVoice 语音
+    """
+    return voice_name.startswith("soulvoice:") or voice_name.startswith("speech:")
+
+
+def parse_soulvoice_voice(voice_name: str) -> str:
+    """
+    解析 SoulVoice 语音名称
+    支持格式：
+    - soulvoice:speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
+    - speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
+    """
+    if voice_name.startswith("soulvoice:"):
+        return voice_name[10:]  # 移除 "soulvoice:" 前缀
+    return voice_name
+
+
+def soulvoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
+    """
+    使用 SoulVoice API 进行文本转语音
+
+    Args:
+        text: 要转换的文本
+        voice_name: 语音名称
+        voice_file: 输出音频文件路径
+        speed: 语音速度
+
+    Returns:
+        SubMaker: placeholder object holding the whole text with a dummy (0, 0) offset (no real timing); None on failure
+    """
+    # 获取配置
+    api_key = config.soulvoice.get("api_key", "")
+    api_url = config.soulvoice.get("api_url", "https://tts.scsmtech.cn/tts")
+    default_model = config.soulvoice.get("model", "FunAudioLLM/CosyVoice2-0.5B")
+
+    if not api_key:
+        logger.error("SoulVoice API key 未配置")
+        return None
+
+    # 解析语音名称
+    parsed_voice = parse_soulvoice_voice(voice_name)
+
+    # 准备请求数据
+    headers = {
+        'Authorization': f'Bearer {api_key}',
+        'Content-Type': 'application/json'
+    }
+
+    data = {
+        'text': text.strip(),
+        'model': default_model,
+        'voice': parsed_voice,
+        'speed': speed
+    }
+
+    # 重试机制
+    for attempt in range(3):
+        try:
+            logger.info(f"第 {attempt + 1} 次调用 SoulVoice API")
+
+            # 设置代理
+            proxies = {}
+            if config.proxy.get("http"):
+                proxies = {
+                    'http': config.proxy.get("http"),
+                    'https': config.proxy.get("https", config.proxy.get("http"))
+                }
+
+            # 调用 API
+            response = requests.post(
+                api_url,
+                headers=headers,
+                json=data,
+                proxies=proxies,
+                timeout=60
+            )
+
+            if response.status_code == 200:
+                # 保存音频文件
+                with open(voice_file, 'wb') as f:
+                    f.write(response.content)
+
+                logger.info(f"SoulVoice TTS 成功生成音频: {voice_file}")
+
+                # SoulVoice 不支持精确字幕生成，返回简单的 SubMaker 对象
+                sub_maker = SubMaker()
+                sub_maker.subs = [text]  # 整个文本作为一个段落
+                sub_maker.offset = [(0, 0)]  # 占位时间戳
+
+                return sub_maker
+
+            else:
+                logger.error(f"SoulVoice API 调用失败: {response.status_code} - {response.text}")
+
+        except requests.exceptions.Timeout:
+            logger.error(f"SoulVoice API 调用超时 (尝试 {attempt + 1}/3)")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"SoulVoice API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
+        except Exception as e:
+            logger.error(f"SoulVoice TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")
+
+        if attempt < 2:  # 不是最后一次尝试
+            time.sleep(2)  # 等待2秒后重试
+
+    logger.error("SoulVoice TTS 生成失败，已达到最大重试次数")
+    return None
+
+
+def is_soulvoice_voice(voice_name: str) -> bool:
+    """
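+    Check whether voice_name is a SoulVoice voice. NOTE(review): exact duplicate of the definition earlier in this hunk; drop one copy.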
+    """
+    return voice_name.startswith("soulvoice:") or voice_name.startswith("speech:")
+
+
+def parse_soulvoice_voice(voice_name: str) -> str:
+    """
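+    Strip the optional "soulvoice:" prefix from a voice name. NOTE(review): exact duplicate of the earlier definition in this hunk; drop one copy.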
+    支持格式：
+    - soulvoice:speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
+    - speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
+    """
+    if voice_name.startswith("soulvoice:"):
+        return voice_name[10:]  # 移除 "soulvoice:" 前缀
+    return voice_name
+
+
+
diff --git a/config.example.toml b/config.example.toml
index c9ca75f..ddf529a 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -77,6 +77,21 @@
     # webui界面是否显示配置项
     hide_config = true
 
+[azure]
+    # Azure TTS 配置
+    speech_key = ""
+    speech_region = ""
+
+[soulvoice]
+    # SoulVoice TTS API 密钥
+    api_key = ""
+    # 音色 URI（必需）
+    voice_uri = "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
+    # API 接口地址（可选，默认值如下）
+    api_url = "https://tts.scsmtech.cn/tts"
+    # 默认模型（可选）
+    model = "FunAudioLLM/CosyVoice2-0.5B"
+
 [proxy]
     # clash 默认地址：http://127.0.0.1:7890
     http = ""
diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py
index e422d48..b194e81 100644
--- a/webui/components/audio_settings.py
+++ b/webui/components/audio_settings.py
@@ -8,6 +8,17 @@ from app.utils import utils
 from webui.utils.cache import get_songs_cache
 
 
+def get_soulvoice_voices():
+    """获取 SoulVoice 语音列表"""
+    # 检查是否配置了 SoulVoice API key
+    api_key = config.soulvoice.get("api_key", "")
+    if not api_key:
+        return []
+
+    # 只返回一个 SoulVoice 选项，音色通过输入框自定义
+    return ["soulvoice:custom"]
+
+
 def render_audio_panel(tr):
     """渲染音频设置面板"""
     with st.container(border=True):
@@ -24,15 +35,24 @@ def render_tts_settings(tr):
     """渲染TTS(文本转语音)设置"""
     # 获取支持的语音列表
     support_locales = ["zh-CN", "en-US"]
-    voices = voice.get_all_azure_voices(filter_locals=support_locales)
+    azure_voices = voice.get_all_azure_voices(filter_locals=support_locales)
+
+    # 添加 SoulVoice 语音选项
+    soulvoice_voices = get_soulvoice_voices()
+
+    # 合并所有语音选项
+    all_voices = azure_voices + soulvoice_voices
 
     # 创建友好的显示名称
-    friendly_names = {
-        v: v.replace("Female", tr("Female"))
-        .replace("Male", tr("Male"))
-        .replace("Neural", "")
-        for v in voices
-    }
+    friendly_names = {}
+
+    # Azure 语音的友好名称
+    for v in azure_voices:
+        friendly_names[v] = v.replace("Female", tr("Female")).replace("Male", tr("Male")).replace("Neural", "")
+
+    # SoulVoice 语音的友好名称
+    for v in soulvoice_voices:
+        friendly_names[v] = "SoulVoice (自定义音色)"
 
     # 获取保存的语音设置
     saved_voice_name = config.ui.get("voice_name", "")
@@ -42,9 +62,9 @@ def render_tts_settings(tr):
         saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
     else:
         # 如果没有保存的设置，选择与UI语言匹配的第一个语音
-        for i, v in enumerate(voices):
+        for i, v in enumerate(all_voices):
             if (v.lower().startswith(st.session_state["ui_language"].lower())
-                    and "V2" not in v):
+                    and "V2" not in v and not v.startswith("soulvoice:")):
                 saved_voice_name_index = i
                 break
 
@@ -60,20 +80,84 @@ def render_tts_settings(tr):
         list(friendly_names.values()).index(selected_friendly_name)
     ]
 
+    # 如果选择的是 SoulVoice 自定义选项，使用配置的音色 URI
+    if voice_name == "soulvoice:custom":
+        custom_voice_uri = config.soulvoice.get("voice_uri", "")
+        if custom_voice_uri:
+            # 确保音色 URI 有正确的前缀
+            if not custom_voice_uri.startswith("soulvoice:") and not custom_voice_uri.startswith("speech:"):
+                voice_name = f"soulvoice:{custom_voice_uri}"
+            else:
+                voice_name = custom_voice_uri if custom_voice_uri.startswith("soulvoice:") else f"soulvoice:{custom_voice_uri}"
+
     # 保存设置
     config.ui["voice_name"] = voice_name
 
-    # Azure V2语音特殊处理
-    if voice.is_azure_v2_voice(voice_name):
+    # 根据语音类型渲染不同的设置
+    if voice.is_soulvoice_voice(voice_name):
+        render_soulvoice_settings(tr)
+    elif voice.is_azure_v2_voice(voice_name):
         render_azure_v2_settings(tr)
 
     # 语音参数设置
-    render_voice_parameters(tr)
+    render_voice_parameters(tr, voice_name)
 
     # 试听按钮
     render_voice_preview(tr, voice_name)
 
 
+def render_soulvoice_settings(tr):
+    """渲染 SoulVoice 语音设置"""
+    saved_api_key = config.soulvoice.get("api_key", "")
+    saved_api_url = config.soulvoice.get("api_url", "https://tts.scsmtech.cn/tts")
+    saved_model = config.soulvoice.get("model", "FunAudioLLM/CosyVoice2-0.5B")
+    saved_voice_uri = config.soulvoice.get("voice_uri", "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr")
+
+    # API Key 输入
+    api_key = st.text_input(
+        "SoulVoice API Key",
+        value=saved_api_key,
+        type="password",
+        help="请输入您的 SoulVoice API 密钥"
+    )
+
+    # 音色 URI 输入
+    voice_uri = st.text_input(
+        "音色 URI",
+        value=saved_voice_uri,
+        help="请输入 SoulVoice 音色标识符，格式如：speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr",
+        placeholder="speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
+    )
+
+    # API URL 输入（可选）
+    with st.expander("高级设置", expanded=False):
+        api_url = st.text_input(
+            "API 地址",
+            value=saved_api_url,
+            help="SoulVoice API 接口地址"
+        )
+
+        model = st.text_input(
+            "模型名称",
+            value=saved_model,
+            help="使用的 TTS 模型"
+        )
+
+    # 保存配置
+    config.soulvoice["api_key"] = api_key
+    config.soulvoice["voice_uri"] = voice_uri
+    config.soulvoice["api_url"] = api_url
+    config.soulvoice["model"] = model
+
+    # 显示配置状态
+    if api_key and voice_uri:
+        st.success("✅ SoulVoice 配置已设置")
+    elif not api_key:
+        st.warning("⚠️ 请配置 SoulVoice API Key")
+    elif not voice_uri:
+        st.warning("⚠️ 请配置音色 URI")
+
+
 def render_azure_v2_settings(tr):
     """渲染Azure V2语音设置"""
     saved_azure_speech_region = config.azure.get("speech_region", "")
@@ -93,7 +177,7 @@ def render_azure_v2_settings(tr):
     config.azure["speech_key"] = azure_speech_key
 
 
-def render_voice_parameters(tr):
+def render_voice_parameters(tr, voice_name):
     """渲染语音参数设置"""
     # 音量 - 使用统一的默认值
     voice_volume = st.slider(
@@ -106,22 +190,41 @@ def render_voice_parameters(tr):
     )
     st.session_state['voice_volume'] = voice_volume
 
+    # 检查是否为 SoulVoice 引擎
+    is_soulvoice = voice.is_soulvoice_voice(voice_name)
 
     # 语速
-    voice_rate = st.selectbox(
-        tr("Speech Rate"),
-        options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
-        index=2,
-    )
+    if is_soulvoice:
+        # SoulVoice 支持更精细的语速控制
+        voice_rate = st.slider(
+            tr("Speech Rate"),
+            min_value=0.5,
+            max_value=2.0,
+            value=1.0,
+            step=0.1,
+            help="SoulVoice 语音速度控制"
+        )
+    else:
+        # Azure TTS 使用预设选项
+        voice_rate = st.selectbox(
+            tr("Speech Rate"),
+            options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+            index=2,
+        )
     st.session_state['voice_rate'] = voice_rate
 
-    # 音调
-    voice_pitch = st.selectbox(
-        tr("Speech Pitch"),
-        options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
-        index=2,
-    )
-    st.session_state['voice_pitch'] = voice_pitch
+    # 音调 - SoulVoice 不支持音调调节
+    if not is_soulvoice:
+        voice_pitch = st.selectbox(
+            tr("Speech Pitch"),
+            options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
+            index=2,
+        )
+        st.session_state['voice_pitch'] = voice_pitch
+    else:
+        # SoulVoice 不支持音调调节，设置默认值
+        st.session_state['voice_pitch'] = 1.0
+        st.info("ℹ️ SoulVoice 引擎不支持音调调节")
 
 
 def render_voice_preview(tr, voice_name):
diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py
index ee27985..53b98c7 100644
--- a/webui/components/subtitle_settings.py
+++ b/webui/components/subtitle_settings.py
@@ -9,14 +9,35 @@ def render_subtitle_panel(tr):
     with st.container(border=True):
         st.write(tr("Subtitle Settings"))
 
-        # 启用字幕选项
-        enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
-        st.session_state['subtitle_enabled'] = enable_subtitles
+        # 检查是否选择了 SoulVoice 引擎
+        from app.services import voice
+        current_voice = st.session_state.get('voice_name', '')
+        is_soulvoice = voice.is_soulvoice_voice(current_voice)
 
-        if enable_subtitles:
-            render_font_settings(tr)
-            render_position_settings(tr)
-            render_style_settings(tr)
+        if is_soulvoice:
+            # SoulVoice 引擎时显示禁用提示
+            st.warning("⚠️ SoulVoice TTS 不支持精确字幕生成")
+            st.info("💡 建议使用专业剪辑工具（如剪映、PR等）手动添加字幕")
+
+            # 强制禁用字幕
+            st.session_state['subtitle_enabled'] = False
+
+            # 显示禁用状态的复选框
+            st.checkbox(
+                tr("Enable Subtitles"),
+                value=False,
+                disabled=True,
+                help="SoulVoice 引擎不支持字幕生成，请使用其他 TTS 引擎"
+            )
+        else:
+            # 其他引擎正常显示字幕选项
+            enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
+            st.session_state['subtitle_enabled'] = enable_subtitles
+
+            if enable_subtitles:
+                render_font_settings(tr)
+                render_position_settings(tr)
+                render_style_settings(tr)
 
 
 def render_font_settings(tr):
diff --git a/webui/i18n/en.json b/webui/i18n/en.json
index 63a2c36..3a69807 100644
--- a/webui/i18n/en.json
+++ b/webui/i18n/en.json
@@ -29,7 +29,7 @@
     "Clip Duration": "Maximum Clip Duration (Seconds) (**Not the total length of the video**, refers to the length of each **composite segment**)",
     "Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
     "Audio Settings": "**Audio Settings**",
-    "Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. Note: V2 version performs better, but requires an API KEY])",
+    "Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. Note: V2 version performs better, but requires an API KEY; SoulVoice provides high-quality Chinese voices])",
     "Speech Region": "Service Region (:red[Required, [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Key": "API Key (:red[Required, either Key 1 or Key 2 is acceptable [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "Speech Volume (1.0 represents 100%)",
diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json
index e028c9e..aad77e8 100644
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@@ -29,7 +29,7 @@
     "Clip Duration": "视频片段最大时长(秒)（**不是视频总长度**，是指每个**合成片段**的长度）",
     "Number of Videos Generated Simultaneously": "同时生成视频数量",
     "Audio Settings": "**音频设置**",
-    "Speech Synthesis": "朗读声音（:red[**与文案语言保持一致**。注意：V2版效果更好，但是需要API KEY]）",
+    "Speech Synthesis": "朗读声音（:red[**与文案语言保持一致**。注意：V2版效果更好，但是需要API KEY；SoulVoice 提供高质量中文语音]）",
     "Speech Region": "服务区域 (:red[必填，[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Key": "API Key (:red[必填，密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
     "Speech Volume": "朗读音量（1.0表示100%）",