feat(voice): 添加代理支持和改进 edge-tts 字幕处理

添加代理配置支持，包括代理开关和地址设置重构 edge-tts 字幕处理逻辑，兼容新旧 API 版本改进错误处理和日志记录，增加连接超时设置更新 edge-tts 依赖至最新版本
2026-05-08 09:38:04 +00:00 · 2026-03-10 21:46:03 +08:00 · 2026-03-10 21:46:03 +08:00 · 7ca594a788
commit 7ca594a788
parent ca4bfd1b44
4 changed files with 86 additions and 27 deletions
--- a/app/config/config.py
+++ b/app/config/config.py
@ -47,6 +47,7 @@ def load_config():
 def save_config():
    with open(config_file, "w", encoding="utf-8") as f:
        _cfg["app"] = app
+        _cfg["proxy"] = proxy
        _cfg["azure"] = azure
        _cfg["tencent"] = tencent
        _cfg["soulvoice"] = soulvoice
--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -41,6 +41,42 @@ def mktimestamp(time_seconds: float) -> str:
    return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}"


+def new_sub_maker() -> SubMaker:
+    """创建兼容新旧 edge-tts API 的 SubMaker。"""
+    sub_maker = SubMaker()
+    if not hasattr(sub_maker, "subs"):
+        sub_maker.subs = []
+    if not hasattr(sub_maker, "offset"):
+        sub_maker.offset = []
+    return sub_maker
+
+
+def add_subtitle_event(
+    sub_maker: SubMaker,
+    start_offset: int,
+    end_offset: int,
+    text: str,
+    boundary_type: str = "WordBoundary",
+) -> None:
+    """向 SubMaker 写入项目兼容的字幕事件。"""
+    if hasattr(sub_maker, "feed"):
+        duration = max(0, end_offset - start_offset)
+        try:
+            sub_maker.feed(
+                {
+                    "type": boundary_type,
+                    "offset": start_offset,
+                    "duration": duration,
+                    "text": text,
+                }
+            )
+        except Exception:
+            pass
+
+    sub_maker.subs.append(text)
+    sub_maker.offset.append((start_offset, end_offset))
+
+
 def get_all_azure_voices(filter_locals=None) -> list[str]:
    if filter_locals is None:
        filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW", "vi-VN"]
@ -1137,6 +1173,16 @@ def convert_pitch_to_percent(rate: float) -> str:
        return f"{percent}Hz"


+def get_edge_tts_proxy() -> str | None:
+    """返回 Edge TTS 应使用的代理地址。"""
+    proxy_enabled = config.proxy.get("enabled")
+    if proxy_enabled is False:
+        return None
+
+    proxy_url = (config.proxy.get("https") or config.proxy.get("http") or "").strip()
+    return proxy_url or None
+
+
 def azure_tts_v1(
    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
 ) -> Union[SubMaker, None]:
@ -1149,16 +1195,29 @@ def azure_tts_v1(
            logger.info(f"第 {i+1} 次使用 edge_tts 生成音频")

            async def _do() -> tuple[SubMaker, bytes]:
-                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http"))
-                sub_maker = edge_tts.SubMaker()
+                communicate = edge_tts.Communicate(
+                    text,
+                    voice_name,
+                    rate=rate_str,
+                    pitch=pitch_str,
+                    boundary="WordBoundary",
+                    proxy=get_edge_tts_proxy(),
+                    connect_timeout=10,
+                    receive_timeout=60,
+                )
+                sub_maker = new_sub_maker()
                audio_data = bytes()  # 用于存储音频数据
                
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_data += chunk["data"]
-                    elif chunk["type"] == "WordBoundary":
-                        sub_maker.create_sub(
-                            (chunk["offset"], chunk["duration"]), chunk["text"]
+                    elif chunk["type"] in {"WordBoundary", "SentenceBoundary"}:
+                        add_subtitle_event(
+                            sub_maker,
+                            start_offset=chunk["offset"],
+                            end_offset=chunk["offset"] + chunk["duration"],
+                            text=chunk["text"],
+                            boundary_type=chunk["type"],
                        )
                return sub_maker, audio_data

@ -1166,18 +1225,21 @@ def azure_tts_v1(
            sub_maker, audio_data = asyncio.run(_do())
            
            # 验证数据是否有效
-            if not sub_maker or not sub_maker.subs or not audio_data:
-                logger.warning(f"failed, invalid data generated")
+            if not audio_data:
+                logger.warning("failed, no audio data generated")
                if i < 2:
                    time.sleep(1)
                continue

+            if not sub_maker.subs:
+                logger.warning("edge_tts returned audio without boundary events; subtitle timing may be unavailable")
+
            # 数据有效，写入文件
            with open(voice_file, "wb") as file:
                file.write(audio_data)
            return sub_maker
        except Exception as e:
-            logger.error(f"生成音频文件时出错: {str(e)}")
+            logger.exception(f"生成音频文件时出错: {type(e).__name__}: {str(e)}")
            if i < 2:
                time.sleep(1)
    return None
@ -1220,13 +1282,12 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
        try:
            logger.info(f"start, voice name: {processed_voice_name}, try: {i + 1}")

-            sub_maker = SubMaker()
+            sub_maker = new_sub_maker()

            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
                duration = _format_duration_to_offset(str(evt.duration))
                offset = _format_duration_to_offset(evt.audio_offset)
-                sub_maker.subs.append(evt.text)
-                sub_maker.offset.append((offset, offset + duration))
+                add_subtitle_event(sub_maker, offset, offset + duration, evt.text)

            # Creates an instance of a speech config with specified subscription key and service region.
            speech_key = config.azure.get("speech_key", "")
@ -1717,9 +1778,9 @@ def qwen3_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -
                f.write(audio_bytes)

            # 估算字幕
-            sub = SubMaker()
+            sub = new_sub_maker()
            est_ms = max(800, int(len(text) * 180))
-            sub.create_sub((0, est_ms), text)
+            add_subtitle_event(sub, 0, est_ms, text)
            
            logger.info(f"Qwen3 TTS 生成成功（DashScope SDK），文件大小: {len(audio_bytes)} 字节")
            return sub
@ -1811,18 +1872,18 @@ def tencent_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0)
                f.write(audio_data)

            # 创建字幕对象
-            sub_maker = SubMaker()
+            sub_maker = new_sub_maker()
            if resp.Subtitles:
                for sub in resp.Subtitles:
                    start_ms = sub.BeginTime
                    end_ms = sub.EndTime
                    text = sub.Text
                    # 转换为 100ns 单位
-                    sub_maker.create_sub((start_ms * 10000, end_ms * 10000), text)
+                    add_subtitle_event(sub_maker, start_ms * 10000, end_ms * 10000, text)
            else:
                # 如果没有字幕返回，则使用估算作为后备方案
                duration_ms = len(text) * 200
-                sub_maker.create_sub((0, duration_ms * 10000), text)
+                add_subtitle_event(sub_maker, 0, duration_ms * 10000, text)

            logger.info(f"腾讯云 TTS 生成成功，文件大小: {len(audio_data)} 字节")
            return sub_maker
@ -1903,7 +1964,7 @@ def soulvoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
                logger.info(f"SoulVoice TTS 成功生成音频: {voice_file}")

                # SoulVoice 不支持精确字幕生成，返回简单的 SubMaker 对象
-                sub_maker = SubMaker()
+                sub_maker = new_sub_maker()
                sub_maker.subs = [text]  # 整个文本作为一个段落
                sub_maker.offset = [(0, 0)]  # 占位时间戳

@ -2034,10 +2095,10 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
                logger.info(f"IndexTTS2 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节")

                # IndexTTS2 不支持精确字幕生成，返回简单的 SubMaker 对象
-                sub_maker = SubMaker()
+                sub_maker = new_sub_maker()
                # 估算音频时长（基于文本长度）
                estimated_duration_ms = max(1000, int(len(text) * 200))
-                sub_maker.create_sub((0, estimated_duration_ms * 10000), text)
+                add_subtitle_event(sub_maker, 0, estimated_duration_ms * 10000, text)

                return sub_maker

@ -2068,6 +2129,3 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.

    logger.error("IndexTTS2 TTS 生成失败，已达到最大重试次数")
    return None
-
-
-
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
 # 核心依赖
 requests>=2.32.0
 moviepy==2.1.1
-edge-tts==6.1.19
+edge-tts==7.2.7
 streamlit>=1.45.0
 watchdog==6.0.0
 loguru>=0.7.3
@ -35,4 +35,4 @@ tenacity>=9.0.0
 # 如果需要 CUDA 支持，取消注释下面的行
 # torch>=2.0.0
 # torchvision>=0.15.0
-# torchaudio>=2.0.0
+# torchaudio>=2.0.0
--- a/webui/components/basic_settings.py
+++ b/webui/components/basic_settings.py
@ -179,7 +179,7 @@ def render_proxy_settings(tr):
    proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled)
    
    # 保存代理开关状态
-    # config.proxy["enabled"] = proxy_enabled
+    config.proxy["enabled"] = proxy_enabled

    # 只有在代理启用时才显示代理设置输入框
    if proxy_enabled:
@ -196,8 +196,8 @@ def render_proxy_settings(tr):
        # 当代理被禁用时，清除环境变量和配置
        os.environ.pop("HTTP_PROXY", None)
        os.environ.pop("HTTPS_PROXY", None)
-        # config.proxy["http"] = ""
-        # config.proxy["https"] = ""
+        config.proxy["http"] = ""
+        config.proxy["https"] = ""


 def test_vision_model_connection(api_key, base_url, model_name, provider, tr):