feat(voice): 添加代理支持和改进 edge-tts 字幕处理

添加代理配置支持,包括代理开关和地址设置
重构 edge-tts 字幕处理逻辑,兼容新旧 API 版本
改进错误处理和日志记录,增加连接超时设置
更新 edge-tts 依赖至最新版本
This commit is contained in:
linyq 2026-03-10 21:46:03 +08:00
parent ca4bfd1b44
commit 7ca594a788
4 changed files with 86 additions and 27 deletions

View File

@ -47,6 +47,7 @@ def load_config():
def save_config():
with open(config_file, "w", encoding="utf-8") as f:
_cfg["app"] = app
_cfg["proxy"] = proxy
_cfg["azure"] = azure
_cfg["tencent"] = tencent
_cfg["soulvoice"] = soulvoice

View File

@ -41,6 +41,42 @@ def mktimestamp(time_seconds: float) -> str:
return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}"
def new_sub_maker() -> SubMaker:
"""创建兼容新旧 edge-tts API 的 SubMaker。"""
sub_maker = SubMaker()
if not hasattr(sub_maker, "subs"):
sub_maker.subs = []
if not hasattr(sub_maker, "offset"):
sub_maker.offset = []
return sub_maker
def add_subtitle_event(
sub_maker: SubMaker,
start_offset: int,
end_offset: int,
text: str,
boundary_type: str = "WordBoundary",
) -> None:
"""向 SubMaker 写入项目兼容的字幕事件。"""
if hasattr(sub_maker, "feed"):
duration = max(0, end_offset - start_offset)
try:
sub_maker.feed(
{
"type": boundary_type,
"offset": start_offset,
"duration": duration,
"text": text,
}
)
except Exception:
pass
sub_maker.subs.append(text)
sub_maker.offset.append((start_offset, end_offset))
def get_all_azure_voices(filter_locals=None) -> list[str]:
if filter_locals is None:
filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW", "vi-VN"]
@ -1137,6 +1173,16 @@ def convert_pitch_to_percent(rate: float) -> str:
return f"{percent}Hz"
def get_edge_tts_proxy() -> str | None:
"""返回 Edge TTS 应使用的代理地址。"""
proxy_enabled = config.proxy.get("enabled")
if proxy_enabled is False:
return None
proxy_url = (config.proxy.get("https") or config.proxy.get("http") or "").strip()
return proxy_url or None
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> Union[SubMaker, None]:
@ -1149,16 +1195,29 @@ def azure_tts_v1(
logger.info(f"{i+1} 次使用 edge_tts 生成音频")
async def _do() -> tuple[SubMaker, bytes]:
communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, pitch=pitch_str, proxy=config.proxy.get("http"))
sub_maker = edge_tts.SubMaker()
communicate = edge_tts.Communicate(
text,
voice_name,
rate=rate_str,
pitch=pitch_str,
boundary="WordBoundary",
proxy=get_edge_tts_proxy(),
connect_timeout=10,
receive_timeout=60,
)
sub_maker = new_sub_maker()
audio_data = bytes() # 用于存储音频数据
async for chunk in communicate.stream():
if chunk["type"] == "audio":
audio_data += chunk["data"]
elif chunk["type"] == "WordBoundary":
sub_maker.create_sub(
(chunk["offset"], chunk["duration"]), chunk["text"]
elif chunk["type"] in {"WordBoundary", "SentenceBoundary"}:
add_subtitle_event(
sub_maker,
start_offset=chunk["offset"],
end_offset=chunk["offset"] + chunk["duration"],
text=chunk["text"],
boundary_type=chunk["type"],
)
return sub_maker, audio_data
@ -1166,18 +1225,21 @@ def azure_tts_v1(
sub_maker, audio_data = asyncio.run(_do())
# 验证数据是否有效
if not sub_maker or not sub_maker.subs or not audio_data:
logger.warning(f"failed, invalid data generated")
if not audio_data:
logger.warning("failed, no audio data generated")
if i < 2:
time.sleep(1)
continue
if not sub_maker.subs:
logger.warning("edge_tts returned audio without boundary events; subtitle timing may be unavailable")
# 数据有效,写入文件
with open(voice_file, "wb") as file:
file.write(audio_data)
return sub_maker
except Exception as e:
logger.error(f"生成音频文件时出错: {str(e)}")
logger.exception(f"生成音频文件时出错: {type(e).__name__}: {str(e)}")
if i < 2:
time.sleep(1)
return None
@ -1220,13 +1282,12 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
try:
logger.info(f"start, voice name: {processed_voice_name}, try: {i + 1}")
sub_maker = SubMaker()
sub_maker = new_sub_maker()
def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
duration = _format_duration_to_offset(str(evt.duration))
offset = _format_duration_to_offset(evt.audio_offset)
sub_maker.subs.append(evt.text)
sub_maker.offset.append((offset, offset + duration))
add_subtitle_event(sub_maker, offset, offset + duration, evt.text)
# Creates an instance of a speech config with specified subscription key and service region.
speech_key = config.azure.get("speech_key", "")
@ -1717,9 +1778,9 @@ def qwen3_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -
f.write(audio_bytes)
# 估算字幕
sub = SubMaker()
sub = new_sub_maker()
est_ms = max(800, int(len(text) * 180))
sub.create_sub((0, est_ms), text)
add_subtitle_event(sub, 0, est_ms, text)
logger.info(f"Qwen3 TTS 生成成功DashScope SDK文件大小: {len(audio_bytes)} 字节")
return sub
@ -1811,18 +1872,18 @@ def tencent_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0)
f.write(audio_data)
# 创建字幕对象
sub_maker = SubMaker()
sub_maker = new_sub_maker()
if resp.Subtitles:
for sub in resp.Subtitles:
start_ms = sub.BeginTime
end_ms = sub.EndTime
text = sub.Text
# 转换为 100ns 单位
sub_maker.create_sub((start_ms * 10000, end_ms * 10000), text)
add_subtitle_event(sub_maker, start_ms * 10000, end_ms * 10000, text)
else:
# 如果没有字幕返回,则使用估算作为后备方案
duration_ms = len(text) * 200
sub_maker.create_sub((0, duration_ms * 10000), text)
add_subtitle_event(sub_maker, 0, duration_ms * 10000, text)
logger.info(f"腾讯云 TTS 生成成功,文件大小: {len(audio_data)} 字节")
return sub_maker
@ -1903,7 +1964,7 @@ def soulvoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
logger.info(f"SoulVoice TTS 成功生成音频: {voice_file}")
# SoulVoice 不支持精确字幕生成,返回简单的 SubMaker 对象
sub_maker = SubMaker()
sub_maker = new_sub_maker()
sub_maker.subs = [text] # 整个文本作为一个段落
sub_maker.offset = [(0, 0)] # 占位时间戳
@ -2034,10 +2095,10 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
logger.info(f"IndexTTS2 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节")
# IndexTTS2 不支持精确字幕生成,返回简单的 SubMaker 对象
sub_maker = SubMaker()
sub_maker = new_sub_maker()
# 估算音频时长(基于文本长度)
estimated_duration_ms = max(1000, int(len(text) * 200))
sub_maker.create_sub((0, estimated_duration_ms * 10000), text)
add_subtitle_event(sub_maker, 0, estimated_duration_ms * 10000, text)
return sub_maker
@ -2068,6 +2129,3 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
logger.error("IndexTTS2 TTS 生成失败,已达到最大重试次数")
return None

View File

@ -1,7 +1,7 @@
# 核心依赖
requests>=2.32.0
moviepy==2.1.1
edge-tts==6.1.19
edge-tts==7.2.7
streamlit>=1.45.0
watchdog==6.0.0
loguru>=0.7.3
@ -35,4 +35,4 @@ tenacity>=9.0.0
# 如果需要 CUDA 支持,取消注释下面的行
# torch>=2.0.0
# torchvision>=0.15.0
# torchaudio>=2.0.0
# torchaudio>=2.0.0

View File

@ -179,7 +179,7 @@ def render_proxy_settings(tr):
proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled)
# 保存代理开关状态
# config.proxy["enabled"] = proxy_enabled
config.proxy["enabled"] = proxy_enabled
# 只有在代理启用时才显示代理设置输入框
if proxy_enabled:
@ -196,8 +196,8 @@ def render_proxy_settings(tr):
# 当代理被禁用时,清除环境变量和配置
os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None)
# config.proxy["http"] = ""
# config.proxy["https"] = ""
config.proxy["http"] = ""
config.proxy["https"] = ""
def test_vision_model_connection(api_key, base_url, model_name, provider, tr):