feat(tts): 添加 SoulVoice TTS 引擎支持

实现 SoulVoice TTS 引擎集成,包括配置管理、语音选择、API 调用和字幕处理
新增 SoulVoice 配置项和示例配置
修改音频设置面板以支持 SoulVoice 选项
优化音频时长计算和异常处理
更新多语言文案以反映 SoulVoice 支持
This commit is contained in:
linyq 2025-08-03 04:26:42 +08:00
parent 06cbee0654
commit e1f45db95a
8 changed files with 440 additions and 40 deletions

View File

@ -48,6 +48,7 @@ def save_config():
with open(config_file, "w", encoding="utf-8") as f:
_cfg["app"] = app
_cfg["azure"] = azure
_cfg["soulvoice"] = soulvoice
_cfg["ui"] = ui
f.write(toml.dumps(_cfg))
@ -57,6 +58,7 @@ app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
proxy = _cfg.get("proxy", {})
azure = _cfg.get("azure", {})
soulvoice = _cfg.get("soulvoice", {})
ui = _cfg.get("ui", {})
frames = _cfg.get("frames", {})

View File

@ -613,6 +613,49 @@ def clip_video(
# 根据持续时间计算真正的结束时间加上1秒余量
duration = item["duration"]
# 时长合理性检查和修正
if duration <= 0 or duration > 300: # 超过5分钟认为不合理
logger.warning(f"检测到异常时长 {duration}秒,片段: {timestamp}")
# 尝试从时间戳计算实际时长
try:
start_time_str, end_time_str = timestamp.split('-')
# 解析开始时间
if ',' in start_time_str:
time_part, ms_part = start_time_str.split(',')
h1, m1, s1 = map(int, time_part.split(':'))
ms1 = int(ms_part)
else:
h1, m1, s1 = map(int, start_time_str.split(':'))
ms1 = 0
# 解析结束时间
if ',' in end_time_str:
time_part, ms_part = end_time_str.split(',')
h2, m2, s2 = map(int, time_part.split(':'))
ms2 = int(ms_part)
else:
h2, m2, s2 = map(int, end_time_str.split(':'))
ms2 = 0
# 计算实际时长
start_total_ms = (h1 * 3600 + m1 * 60 + s1) * 1000 + ms1
end_total_ms = (h2 * 3600 + m2 * 60 + s2) * 1000 + ms2
actual_duration = (end_total_ms - start_total_ms) / 1000.0
if actual_duration > 0 and actual_duration <= 300:
duration = actual_duration
logger.info(f"使用时间戳计算的实际时长: {duration:.3f}")
else:
duration = 5.0 # 默认5秒
logger.warning(f"时间戳计算也异常,使用默认时长: {duration}")
except Exception as e:
duration = 5.0 # 默认5秒
logger.warning(f"时长修正失败,使用默认时长: {duration}秒, 错误: {str(e)}")
calculated_end_time = calculate_end_time(start_time, duration)
# 转换为FFmpeg兼容的时间格式逗号替换为点

View File

@ -4,19 +4,42 @@ import json
import traceback
import edge_tts
import asyncio
import requests
from loguru import logger
from typing import List, Union
from typing import List, Union, Tuple
from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts import submaker, SubMaker
from edge_tts.submaker import mktimestamp
# from edge_tts.submaker import mktimestamp # 函数可能不存在,我们自己实现
from moviepy.video.tools import subtitles
try:
from moviepy import AudioFileClip
MOVIEPY_AVAILABLE = True
except ImportError:
MOVIEPY_AVAILABLE = False
logger.warning("moviepy 未安装,将使用估算方法计算音频时长")
import time
from app.config import config
from app.utils import utils
def mktimestamp(time_seconds: float) -> str:
    """
    Convert an offset in seconds to an ``HH:MM:SS.mmm`` timestamp string.

    The value is rounded to whole milliseconds *before* it is split into
    fields, so an input such as 59.9996 carries into the minutes field
    ("00:01:00.000") instead of producing an invalid "60.000" seconds field.
    Negative inputs are clamped to zero.

    Args:
        time_seconds: Offset in seconds; may be fractional.

    Returns:
        str: Timestamp with a dot before the milliseconds, e.g. "00:01:23.456".
    """
    # Work in integer milliseconds so rounding can never overflow a field.
    total_ms = max(0, round(time_seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"
def get_all_azure_voices(filter_locals=None) -> list[str]:
if filter_locals is None:
filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW", "vi-VN"]
@ -1038,8 +1061,15 @@ def is_azure_v2_voice(voice_name: str):
def tts(
    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> Union[SubMaker, None]:
    """
    Synthesize ``text`` into ``voice_file`` using the engine implied by ``voice_name``.

    Engines are tried in this order: SoulVoice, Azure V2, then Azure V1 as the
    default. SoulVoice receives ``voice_rate`` as its ``speed`` argument;
    ``voice_pitch`` is only forwarded to the Azure V1 engine.

    Returns:
        Whatever the selected engine function returns.
    """
    if is_soulvoice_voice(voice_name):
        result = soulvoice_tts(text, voice_name, voice_file, speed=voice_rate)
    elif is_azure_v2_voice(voice_name):
        result = azure_tts_v2(text, voice_name, voice_file)
    else:
        # Neither special engine matched: fall back to Azure V1.
        result = azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
    return result
@ -1368,6 +1398,10 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
if start_time < 0:
start_time = _start_time
# 将 100纳秒单位转换为秒
start_time_seconds = start_time / 10000000
end_time_seconds = end_time / 10000000
sub = unescape(sub)
sub_line += sub
sub_text = match_line(sub_line, sub_index)
@ -1375,8 +1409,8 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
sub_index += 1
line = formatter(
idx=sub_index,
start_time=start_time,
end_time=end_time,
start_time=start_time_seconds,
end_time=end_time_seconds,
sub_text=sub_text,
)
sub_items.append(line)
@ -1402,9 +1436,13 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}"
f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}"
)
# 返回默认值,避免 None 错误
return subtitle_file, 3.0
except Exception as e:
logger.error(f"failed, error: {str(e)}")
# 返回默认值,避免 None 错误
return subtitle_file, 3.0
def get_audio_duration(sub_maker: submaker.SubMaker):
@ -1453,8 +1491,21 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
f"或者使用其他 tts 引擎")
continue
else:
# 为当前片段生成字幕文件
_, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
# SoulVoice 引擎不生成字幕文件
if is_soulvoice_voice(voice_name):
# 获取实际音频文件的时长
duration = get_audio_duration_from_file(audio_file)
if duration <= 0:
# 如果无法获取文件时长,尝试从 SubMaker 获取
duration = get_audio_duration(sub_maker)
if duration <= 0:
# 最后的 fallback基于文本长度估算
duration = max(1.0, len(text) / 3.0)
logger.warning(f"无法获取音频时长,使用文本估算: {duration:.2f}")
# 不创建字幕文件
subtitle_file = ""
else:
_, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
tts_results.append({
"_id": item['_id'],
@ -1467,3 +1518,168 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
logger.info(f"已生成音频文件: {audio_file}")
return tts_results
def get_audio_duration_from_file(audio_file: str) -> float:
    """
    Return the duration of an audio file in seconds.

    Strategy, in order:
      1. If moviepy was imported successfully, open the file as a clip and
         return its ``duration``. The clip is always closed, even when reading
         the duration fails.
      2. Otherwise, or if step 1 raised, estimate from the file size at
         20000 bytes per second, with a floor of 1.0 second.
      3. If the file size cannot be read either, return 3.0.

    Args:
        audio_file: Path of the audio file to inspect.

    Returns:
        float: Measured or estimated duration in seconds.
    """
    if MOVIEPY_AVAILABLE:
        try:
            audio_clip = AudioFileClip(audio_file)
            try:
                return audio_clip.duration
            finally:
                # Release the underlying reader on every path.
                audio_clip.close()
        except Exception as e:
            logger.error(f"使用 moviepy 获取音频时长失败: {str(e)}")

    # Fallback: rough estimate from the file size.
    try:
        import os

        file_size = os.path.getsize(audio_file)
        # 128 kbps MP3 would be about 16 KB/s; a larger divisor is used
        # deliberately to keep the estimate conservative.
        estimated_duration = max(1.0, file_size / 20000)
        logger.warning(f"使用文件大小估算音频时长: {estimated_duration:.2f}")
        return estimated_duration
    except Exception as e:
        logger.error(f"获取音频时长失败: {str(e)}")
        # Last resort: a fixed default so callers never receive 0.
        return 3.0
def is_soulvoice_voice(voice_name: str) -> bool:
    """
    Report whether ``voice_name`` selects the SoulVoice engine.

    A name counts as SoulVoice when it starts with "soulvoice:" or "speech:".
    Non-string values (for example ``None`` when no voice has been chosen)
    yield False instead of raising.

    Note:
        This module defines ``is_soulvoice_voice`` a second time further down;
        the later definition is the one in effect after import. Keep both in
        sync or remove one of them.
    """
    if not isinstance(voice_name, str):
        return False
    return voice_name.startswith(("soulvoice:", "speech:"))
def parse_soulvoice_voice(voice_name: str) -> str:
    """
    Strip the optional "soulvoice:" prefix from a SoulVoice voice name.

    Accepted forms:
      - soulvoice:speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
      - speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr

    Returns:
        str: The name without a leading "soulvoice:"; names that do not carry
        that prefix are returned unchanged.
    """
    prefix = "soulvoice:"
    has_prefix = voice_name.startswith(prefix)
    return voice_name[len(prefix):] if has_prefix else voice_name
def soulvoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Convert text to speech through the SoulVoice HTTP API.

    The API key, endpoint URL and model are read from ``config.soulvoice``.
    The request is attempted up to three times with a two-second pause between
    attempts. On success the response body is written to ``voice_file``.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected
            without calling the API.
        voice_name: Voice identifier; a leading "soulvoice:" is removed before
            it is sent.
        voice_file: Path the returned audio bytes are written to.
        speed: Speech speed forwarded to the API.

    Returns:
        SubMaker: A placeholder object whose ``subs`` is ``[text]`` and whose
        ``offset`` is ``[(0, 0)]``; it carries no real timing information.
        ``None`` when the key is missing, the text is empty, or every attempt
        failed.
    """
    # Load configuration.
    api_key = config.soulvoice.get("api_key", "")
    api_url = config.soulvoice.get("api_url", "https://tts.scsmtech.cn/tts")
    default_model = config.soulvoice.get("model", "FunAudioLLM/CosyVoice2-0.5B")

    if not api_key:
        logger.error("SoulVoice API key 未配置")
        return None

    # Nothing to synthesize: skip the network round-trips entirely.
    clean_text = (text or "").strip()
    if not clean_text:
        logger.error("SoulVoice TTS 文本为空,跳过请求")
        return None

    parsed_voice = parse_soulvoice_voice(voice_name)

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    data = {
        'text': clean_text,
        'model': default_model,
        'voice': parsed_voice,
        'speed': speed
    }

    # Proxy settings do not change between attempts, so build them once.
    proxies = {}
    http_proxy = config.proxy.get("http")
    if http_proxy:
        proxies = {
            'http': http_proxy,
            'https': config.proxy.get("https", http_proxy)
        }

    # Retry loop.
    for attempt in range(3):
        try:
            logger.info(f"{attempt + 1} 次调用 SoulVoice API")

            response = requests.post(
                api_url,
                headers=headers,
                json=data,
                proxies=proxies,
                timeout=60
            )

            if response.status_code == 200 and response.content:
                with open(voice_file, 'wb') as f:
                    f.write(response.content)

                logger.info(f"SoulVoice TTS 成功生成音频: {voice_file}")

                # SoulVoice returns no timing data; hand back a placeholder
                # SubMaker holding the whole text as a single entry.
                sub_maker = SubMaker()
                sub_maker.subs = [text]
                sub_maker.offset = [(0, 0)]
                return sub_maker
            elif response.status_code == 200:
                # A 200 with no payload would produce a zero-byte audio file;
                # treat it as a failed attempt instead.
                logger.error("SoulVoice API 返回空音频内容")
            else:
                logger.error(f"SoulVoice API 调用失败: {response.status_code} - {response.text}")

        except requests.exceptions.Timeout:
            logger.error(f"SoulVoice API 调用超时 (尝试 {attempt + 1}/3)")
        except requests.exceptions.RequestException as e:
            logger.error(f"SoulVoice API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
        except Exception as e:
            logger.error(f"SoulVoice TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")

        if attempt < 2:  # not the final attempt
            time.sleep(2)  # wait before retrying

    logger.error("SoulVoice TTS 生成失败,已达到最大重试次数")
    return None
def is_soulvoice_voice(voice_name: str) -> bool:
    """
    Report whether ``voice_name`` selects the SoulVoice engine.

    A name counts as SoulVoice when it starts with "soulvoice:" or "speech:".
    Non-string values (for example ``None`` when no voice has been chosen)
    yield False instead of raising.

    Note:
        This repeats a definition of the same name that appears earlier in
        this module. Because it comes later, this is the definition in effect
        after import; the earlier copy can be removed.
    """
    if not isinstance(voice_name, str):
        return False
    return voice_name.startswith(("soulvoice:", "speech:"))
def parse_soulvoice_voice(voice_name: str) -> str:
    """
    Strip the optional "soulvoice:" prefix from a SoulVoice voice name.

    Accepted forms:
      - soulvoice:speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr
      - speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr

    Returns:
        str: The name without a leading "soulvoice:"; other names are returned
        unchanged.

    Note:
        A function with this name is also defined earlier in this module;
        this later definition is the one in effect after import.
    """
    marker = "soulvoice:"
    if not voice_name.startswith(marker):
        return voice_name
    return voice_name[len(marker):]

View File

@ -77,6 +77,21 @@
# webui界面是否显示配置项
hide_config = true
[azure]
# Azure TTS 配置
speech_key = ""
speech_region = ""
[soulvoice]
# SoulVoice TTS API 密钥
api_key = ""
# 音色 URI(必需)
voice_uri = "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
# API 接口地址(可选,默认值如下)
api_url = "https://tts.scsmtech.cn/tts"
# 默认模型(可选)
model = "FunAudioLLM/CosyVoice2-0.5B"
[proxy]
# clash 默认地址:http://127.0.0.1:7890
http = ""

View File

@ -8,6 +8,17 @@ from app.utils import utils
from webui.utils.cache import get_songs_cache
def get_soulvoice_voices():
    """
    Return the SoulVoice entries to offer in the voice selector.

    When ``config.soulvoice`` holds a non-empty ``api_key`` the list contains
    the single placeholder "soulvoice:custom" (the actual voice is supplied
    through a separate input); otherwise the list is empty.
    """
    has_api_key = bool(config.soulvoice.get("api_key", ""))
    return ["soulvoice:custom"] if has_api_key else []
def render_audio_panel(tr):
"""渲染音频设置面板"""
with st.container(border=True):
@ -24,15 +35,24 @@ def render_tts_settings(tr):
"""渲染TTS(文本转语音)设置"""
# 获取支持的语音列表
support_locales = ["zh-CN", "en-US"]
voices = voice.get_all_azure_voices(filter_locals=support_locales)
azure_voices = voice.get_all_azure_voices(filter_locals=support_locales)
# 添加 SoulVoice 语音选项
soulvoice_voices = get_soulvoice_voices()
# 合并所有语音选项
all_voices = azure_voices + soulvoice_voices
# 创建友好的显示名称
friendly_names = {
v: v.replace("Female", tr("Female"))
.replace("Male", tr("Male"))
.replace("Neural", "")
for v in voices
}
friendly_names = {}
# Azure 语音的友好名称
for v in azure_voices:
friendly_names[v] = v.replace("Female", tr("Female")).replace("Male", tr("Male")).replace("Neural", "")
# SoulVoice 语音的友好名称
for v in soulvoice_voices:
friendly_names[v] = "SoulVoice (自定义音色)"
# 获取保存的语音设置
saved_voice_name = config.ui.get("voice_name", "")
@ -42,9 +62,9 @@ def render_tts_settings(tr):
saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
else:
# 如果没有保存的设置选择与UI语言匹配的第一个语音
for i, v in enumerate(voices):
for i, v in enumerate(all_voices):
if (v.lower().startswith(st.session_state["ui_language"].lower())
and "V2" not in v):
and "V2" not in v and not v.startswith("soulvoice:")):
saved_voice_name_index = i
break
@ -60,20 +80,84 @@ def render_tts_settings(tr):
list(friendly_names.values()).index(selected_friendly_name)
]
# 如果选择的是 SoulVoice 自定义选项,使用配置的音色 URI
if voice_name == "soulvoice:custom":
custom_voice_uri = config.soulvoice.get("voice_uri", "")
if custom_voice_uri:
# 确保音色 URI 有正确的前缀
if not custom_voice_uri.startswith("soulvoice:") and not custom_voice_uri.startswith("speech:"):
voice_name = f"soulvoice:{custom_voice_uri}"
else:
voice_name = custom_voice_uri if custom_voice_uri.startswith("soulvoice:") else f"soulvoice:{custom_voice_uri}"
# 保存设置
config.ui["voice_name"] = voice_name
# Azure V2语音特殊处理
if voice.is_azure_v2_voice(voice_name):
# 根据语音类型渲染不同的设置
if voice.is_soulvoice_voice(voice_name):
render_soulvoice_settings(tr)
elif voice.is_azure_v2_voice(voice_name):
render_azure_v2_settings(tr)
# 语音参数设置
render_voice_parameters(tr)
render_voice_parameters(tr, voice_name)
# 试听按钮
render_voice_preview(tr, voice_name)
def render_soulvoice_settings(tr):
    """
    Render the SoulVoice settings inputs and store their values in config.

    Shows inputs for the API key and the voice URI, plus an "advanced"
    expander holding the API URL and model name. Every value is written back
    to ``config.soulvoice`` on each render, followed by a status message that
    reflects whether the key and voice URI are filled in.

    Args:
        tr: Translation callable passed by the caller; not used in this body.
    """
    soul_cfg = config.soulvoice

    # API key (masked input).
    api_key = st.text_input(
        "SoulVoice API Key",
        value=soul_cfg.get("api_key", ""),
        type="password",
        help="请输入您的 SoulVoice API 密钥"
    )

    # Voice URI.
    voice_uri = st.text_input(
        "音色 URI",
        value=soul_cfg.get("voice_uri", "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"),
        help="请输入 SoulVoice 音色标识符格式如speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr",
        placeholder="speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr"
    )

    # Optional endpoint and model, tucked into a collapsed expander.
    with st.expander("高级设置", expanded=False):
        api_url = st.text_input(
            "API 地址",
            value=soul_cfg.get("api_url", "https://tts.scsmtech.cn/tts"),
            help="SoulVoice API 接口地址"
        )
        model = st.text_input(
            "模型名称",
            value=soul_cfg.get("model", "FunAudioLLM/CosyVoice2-0.5B"),
            help="使用的 TTS 模型"
        )

    # Persist the current widget values.
    soul_cfg.update({
        "api_key": api_key,
        "voice_uri": voice_uri,
        "api_url": api_url,
        "model": model,
    })

    # Status message: a missing key takes priority over a missing voice URI.
    if not api_key:
        st.warning("⚠️ 请配置 SoulVoice API Key")
    elif not voice_uri:
        st.warning("⚠️ 请配置音色 URI")
    else:
        st.success("✅ SoulVoice 配置已设置")
def render_azure_v2_settings(tr):
"""渲染Azure V2语音设置"""
saved_azure_speech_region = config.azure.get("speech_region", "")
@ -93,7 +177,7 @@ def render_azure_v2_settings(tr):
config.azure["speech_key"] = azure_speech_key
def render_voice_parameters(tr):
def render_voice_parameters(tr, voice_name):
"""渲染语音参数设置"""
# 音量 - 使用统一的默认值
voice_volume = st.slider(
@ -106,22 +190,41 @@ def render_voice_parameters(tr):
)
st.session_state['voice_volume'] = voice_volume
# 检查是否为 SoulVoice 引擎
is_soulvoice = voice.is_soulvoice_voice(voice_name)
# 语速
voice_rate = st.selectbox(
tr("Speech Rate"),
options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
index=2,
)
if is_soulvoice:
# SoulVoice 支持更精细的语速控制
voice_rate = st.slider(
tr("Speech Rate"),
min_value=0.5,
max_value=2.0,
value=1.0,
step=0.1,
help="SoulVoice 语音速度控制"
)
else:
# Azure TTS 使用预设选项
voice_rate = st.selectbox(
tr("Speech Rate"),
options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
index=2,
)
st.session_state['voice_rate'] = voice_rate
# 音调
voice_pitch = st.selectbox(
tr("Speech Pitch"),
options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
index=2,
)
st.session_state['voice_pitch'] = voice_pitch
# 音调 - SoulVoice 不支持音调调节
if not is_soulvoice:
voice_pitch = st.selectbox(
tr("Speech Pitch"),
options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
index=2,
)
st.session_state['voice_pitch'] = voice_pitch
else:
# SoulVoice 不支持音调调节,设置默认值
st.session_state['voice_pitch'] = 1.0
st.info(" SoulVoice 引擎不支持音调调节")
def render_voice_preview(tr, voice_name):

View File

@ -9,14 +9,35 @@ def render_subtitle_panel(tr):
with st.container(border=True):
st.write(tr("Subtitle Settings"))
# 启用字幕选项
enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
st.session_state['subtitle_enabled'] = enable_subtitles
# 检查是否选择了 SoulVoice 引擎
from app.services import voice
current_voice = st.session_state.get('voice_name', '')
is_soulvoice = voice.is_soulvoice_voice(current_voice)
if enable_subtitles:
render_font_settings(tr)
render_position_settings(tr)
render_style_settings(tr)
if is_soulvoice:
# SoulVoice 引擎时显示禁用提示
st.warning("⚠️ SoulVoice TTS 不支持精确字幕生成")
st.info("💡 建议使用专业剪辑工具如剪映、PR等手动添加字幕")
# 强制禁用字幕
st.session_state['subtitle_enabled'] = False
# 显示禁用状态的复选框
st.checkbox(
tr("Enable Subtitles"),
value=False,
disabled=True,
help="SoulVoice 引擎不支持字幕生成,请使用其他 TTS 引擎"
)
else:
# 其他引擎正常显示字幕选项
enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
st.session_state['subtitle_enabled'] = enable_subtitles
if enable_subtitles:
render_font_settings(tr)
render_position_settings(tr)
render_style_settings(tr)
def render_font_settings(tr):

View File

@ -29,7 +29,7 @@
"Clip Duration": "Maximum Clip Duration (Seconds) (**Not the total length of the video**, refers to the length of each **composite segment**)",
"Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
"Audio Settings": "**Audio Settings**",
"Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. Note: V2 version performs better, but requires an API KEY])",
"Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. Note: V2 version performs better, but requires an API KEY; SoulVoice provides high-quality Chinese voices])",
"Speech Region": "Service Region (:red[Required, [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key (:red[Required, either Key 1 or Key 2 is acceptable [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Speech Volume (1.0 represents 100%)",

View File

@ -29,7 +29,7 @@
"Clip Duration": "视频片段最大时长(秒)**不是视频总长度**,是指每个**合成片段**的长度)",
"Number of Videos Generated Simultaneously": "同时生成视频数量",
"Audio Settings": "**音频设置**",
"Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意V2版效果更好但是需要API KEY]",
"Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意V2版效果更好但是需要API KEYSoulVoice 提供高质量中文语音]",
"Speech Region": "服务区域 (:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key (:red[必填密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "朗读音量1.0表示100%",