NarratoAI/webui/components/audio_settings.py
viccy 4ab29fd776 feat: 优化视频生成进度展示与UI细节
- 为视频生成任务的每个处理步骤添加详细的中文状态提示
- 重构WebUI的视频生成弹窗,使用Streamlit原生状态组件优化进度展示
- 清理多语言翻译文本中的冗余表情符号,统一UI文本风格
- 调整TTS设置面板的折叠面板默认展开状态为关闭,并移除标题中的表情前缀
2026-06-07 18:36:47 +08:00

2038 lines
74 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import streamlit as st
import os
import shutil
import json
from uuid import uuid4
from app.config import config
from app.services import voice
from app.models.schema import AudioVolumeDefaults
from app.utils import utils
# Directory scanned for the bundled IndexTTS reference voices.
# NOTE(review): absolute path on a developer machine; when it does not exist
# the resource list is simply empty. Consider making this configurable.
INDEXTTS_REFERENCE_AUDIO_SOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/mp3"
# Sub-directory of the project storage dir that receives copied/uploaded reference audio.
INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR = "indextts_refs"
# Known reference voices as (file name, Chinese display name, English display name).
# The list order is the order in which the voices appear in the selector.
INDEXTTS_REFERENCE_AUDIO_MAP = [
    ("yingshijieshuo-zh-male.mp3", "影视解说", "Film Narration"),
    # 麦克阿瑟 is the transliteration of "MacArthur" (it was mislabelled "Macintosh").
    ("maikeashe-zh-male.mp3", "麦克阿瑟", "MacArthur"),
    ("dong-yuhui-zh-male.mp3", "董宇辉", "Dong Yuhui"),
    ("fangzhenren-ad-fake-news-zh-male.mp3", "仿真人", "Realistic Human"),
    ("fengyin-jilupian-jieshuo-zh-male.mp3", "风吟纪录片解说", "Fengyin Documentary Narration"),
    ("guwo-dianying-jieshuo-zh-male.mp3", "顾我电影解说", "Guwo Film Narration"),
    ("jia-xiaojun-final-zh-male.mp3", "贾小军", "Jia Xiaojun"),
    ("junshi-zh-male.mp3", "军事解说", "Military Narration"),
    ("qi-tongwei-v2-zh-male.mp3", "祁同伟", "Qi Tongwei"),
    ("saima-niang-mambo-oye-zh-female.mp3", "赛马娘曼波欧耶版", "Uma Musume Mambo Oye Version"),
    ("shejian-shangde-zhongguo-zh-male.mp3", "舌尖上的中国", "A Bite of China"),
    ("xiaoming-jianmo-zh-male.mp3", "小明剑魔", "Xiaoming Sword Demon"),
    ("xin-youxi-jieshuo-zh-male.mp3", "新游戏解说", "New Game Narration"),
    ("xinzhong-zhicheng-zh-male.mp3", "心中之城", "City in the Heart"),
    ("alex-chikna-en-male.mp3", "亚历克斯", "Alex Chikna"),
    ("alle-en-unknown.mp3", "艾莉", "ALLE"),
    ("calm-normal-en-unknown.mp3", "沉稳男声", "Calm Normal"),
    ("donald-j-trump-noise-reduction-en-male.mp3", "唐纳德·特朗普", "Donald J. Trump"),
    ("elite-en-unknown.mp3", "精英男声", "ELITE"),
    ("horror-en-unknown.mp3", "惊悚男声", "Horror"),
    ("meiqu-kelong-en-unknown.mp3", "美式男声", "US Clone"),
    ("sarah-en-female.mp3", "莎拉", "Sarah"),
]
# File extensions accepted when scanning the reference-audio directory for unmapped files.
INDEXTTS_REFERENCE_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg")
# Directory scanned for background-music tracks.
# NOTE(review): same developer-machine path caveat as the reference-audio directory.
BGM_RESOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/bgms-safe"
# Optional JSON description of the BGM tracks, expected next to the audio files.
BGM_TRACKS_JSON = os.path.join(BGM_RESOURCE_DIR, "tracks.json")
BGM_UPLOAD_SUBDIR = "uploaded_bgms"
# File extensions accepted when scanning the BGM directory for undescribed files.
BGM_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg")
# Engines labelled "Local Deployment" in the engine drop-down; every engine
# not listed here is labelled "Cloud Service".
LOCAL_TTS_ENGINES = {
    config.INDEXTTS_ENGINE,
    config.INDEXTTS2_ENGINE,
    config.OMNIVOICE_ENGINE,
}
def get_soulvoice_voices():
    """Return the SoulVoice voice list.

    Returns:
        ``["soulvoice:custom"]`` when a SoulVoice API key is configured,
        otherwise an empty list.
    """
    has_api_key = bool(config.soulvoice.get("api_key", ""))
    # A single placeholder entry is exposed; the concrete voice is entered
    # by the user in a text field.
    return ["soulvoice:custom"] if has_api_key else []
def get_tts_engine_options(tr=lambda key: key):
    """Return the TTS engine choices for the engine drop-down.

    Args:
        tr: Translation callable mapping a message key to display text.

    Returns:
        Dict mapping engine id to its display label, where each label is the
        engine name followed by its deployment type in square brackets.
    """
    named_engines = [
        (config.INDEXTTS_ENGINE, config.INDEXTTS_DISPLAY_NAME),
        (config.INDEXTTS2_ENGINE, config.INDEXTTS2_DISPLAY_NAME),
        (config.OMNIVOICE_ENGINE, config.OMNIVOICE_DISPLAY_NAME),
        ("edge_tts", "Edge TTS"),
        ("qwen3_tts", tr("Tongyi Qwen3 TTS")),
        ("tencent_tts", tr("Tencent Cloud TTS")),
        ("doubaotts", tr("Doubao TTS")),
        ("azure_speech", "Azure Speech Services"),
    ]
    labelled = {}
    for engine_id, engine_name in named_engines:
        labelled[engine_id] = format_tts_engine_option(engine_id, engine_name, tr)
    return labelled
def get_tts_engine_deployment_label(tts_engine, tr=lambda key: key):
    """Return the translated deployment-type label for a TTS engine.

    Args:
        tts_engine: Engine id to classify.
        tr: Translation callable mapping a message key to display text.

    Returns:
        The translation of "Local Deployment" when the engine is listed in
        ``LOCAL_TTS_ENGINES``, otherwise the translation of "Cloud Service".
    """
    label_key = "Local Deployment" if tts_engine in LOCAL_TTS_ENGINES else "Cloud Service"
    return tr(label_key)
def format_tts_engine_option(tts_engine, display_name, tr=lambda key: key):
    """Build the drop-down label for a TTS engine.

    Args:
        tts_engine: Engine id, used to look up the deployment type.
        display_name: Human-readable engine name.
        tr: Translation callable mapping a message key to display text.

    Returns:
        ``"<display_name> [<deployment label>]"``.
    """
    return f"{display_name} [{get_tts_engine_deployment_label(tts_engine, tr)}]"
def get_tts_engine_descriptions(tr=lambda key: key):
    """Return the detail text shown for each TTS engine.

    Args:
        tr: Translation callable mapping a message key to display text.

    Returns:
        Dict mapping engine id to a dict with the keys ``title``,
        ``features``, ``use_case`` and ``registration`` (a URL string, or
        ``None`` when the engine has no registration page).
    """

    def describe(title, features_key, use_case_key, registration=None):
        # ``title`` arrives already translated where needed; the two text
        # fields are translated here.
        return {
            "title": title,
            "features": tr(features_key),
            "use_case": tr(use_case_key),
            "registration": registration,
        }

    return {
        "edge_tts": describe("Edge TTS", "Edge TTS features", "Edge TTS use case"),
        "azure_speech": describe(
            "Azure Speech Services",
            "Azure Speech Services features",
            "Azure Speech Services use case",
            "https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices",
        ),
        "tencent_tts": describe(
            tr("Tencent Cloud TTS"),
            "Tencent Cloud TTS features",
            "Tencent Cloud TTS use case",
            "https://console.cloud.tencent.com/tts",
        ),
        "qwen3_tts": describe(
            tr("Tongyi Qwen3 TTS"),
            "Tongyi Qwen3 TTS features",
            "High-quality Chinese speech synthesis use case",
            "https://dashscope.aliyuncs.com/",
        ),
        config.INDEXTTS_ENGINE: describe(
            config.INDEXTTS_DISPLAY_NAME, "IndexTTS features", "IndexTTS use case"
        ),
        config.INDEXTTS2_ENGINE: describe(
            config.INDEXTTS2_DISPLAY_NAME, "IndexTTS2 features", "IndexTTS2 use case"
        ),
        config.OMNIVOICE_ENGINE: describe(
            config.OMNIVOICE_DISPLAY_NAME, "OmniVoice features", "OmniVoice use case"
        ),
        "doubaotts": describe(
            tr("Doubao TTS"),
            "Doubao TTS features",
            "High-quality Chinese speech synthesis use case",
            "https://www.volcengine.com/product/voice-tech",
        ),
    }
def infer_indextts_reference_audio_language(filename):
    """Guess a reference audio's language from its file name.

    Args:
        filename: File name to inspect; matching is case-insensitive.

    Returns:
        ``"zh"`` if the name contains ``-zh-``, else ``"en"`` if it contains
        ``-en-``, else ``"unknown"``.
    """
    normalized = filename.lower()
    # "zh" is checked first, so it wins when both markers are present.
    for code in ("zh", "en"):
        if f"-{code}-" in normalized:
            return code
    return "unknown"
def get_indextts_reference_audio_options():
    """List the reference audio files available in the resource directory.

    Files named in ``INDEXTTS_REFERENCE_AUDIO_MAP`` come first, in map order
    and with their mapped display names. Any other audio file found in the
    directory follows in sorted order, using its file stem as both names.

    Returns:
        List of dicts with the keys ``filename``, ``path``, ``zh``, ``en``
        and ``language``. Only files that exist on disk are included.
    """
    source_dir = INDEXTTS_REFERENCE_AUDIO_SOURCE_DIR

    def make_option(name, zh_label, en_label):
        return {
            "filename": name,
            "path": os.path.join(source_dir, name),
            "zh": zh_label,
            "en": en_label,
            "language": infer_indextts_reference_audio_language(name),
        }

    options = [
        make_option(name, zh_label, en_label)
        for name, zh_label, en_label in INDEXTTS_REFERENCE_AUDIO_MAP
        if os.path.isfile(os.path.join(source_dir, name))
    ]
    if not os.path.isdir(source_dir):
        return options

    already_listed = {option["filename"] for option in options}
    for name in sorted(os.listdir(source_dir)):
        if name in already_listed:
            continue
        if not name.lower().endswith(INDEXTTS_REFERENCE_AUDIO_EXTENSIONS):
            continue
        if not os.path.isfile(os.path.join(source_dir, name)):
            continue
        stem = os.path.splitext(name)[0]
        options.append(make_option(name, stem, stem))
    return options
def format_indextts_reference_audio_option(option):
    """Build the drop-down label for a reference audio option.

    The display name follows the UI language stored in
    ``st.session_state["ui_language"]`` (default ``"zh-CN"``): the English
    name is preferred when it starts with "en", the Chinese name otherwise,
    each falling back to the other and then to the file name.

    Args:
        option: Option dict as produced by
            ``get_indextts_reference_audio_options``.

    Returns:
        The display name, followed by the audio language in parentheses when
        that language is ``"zh"`` or ``"en"``.
    """
    ui_language = str(st.session_state.get("ui_language", "zh-CN")).lower()
    english_ui = ui_language.startswith("en")

    preferred_key, fallback_key = ("en", "zh") if english_ui else ("zh", "en")
    display_name = (
        option.get(preferred_key, "")
        or option.get(fallback_key, "")
        or option.get("filename", "")
    )

    if english_ui:
        language_labels = {"zh": "Chinese", "en": "English"}
    else:
        language_labels = {"zh": "中文", "en": "英文"}
    language_label = language_labels.get(option.get("language", "unknown"))

    return f"{display_name} ({language_label})" if language_label else display_name
def get_indextts_reference_audio_index(options, saved_reference_audio):
    """Find the drop-down index of the saved reference audio.

    Args:
        options: Option dicts, each with a ``filename`` key.
        saved_reference_audio: Saved audio path (may be empty or ``None``);
            only its base name is compared.

    Returns:
        Index of the first option whose file name matches, or ``0`` when
        there are no options or nothing matches.
    """
    if not options:
        return 0
    wanted = os.path.basename(saved_reference_audio or "")
    matching_positions = (
        position for position, entry in enumerate(options) if entry["filename"] == wanted
    )
    return next(matching_positions, 0)
def copy_indextts_reference_audio(source_path):
    """Copy a reference audio into the project storage directory.

    Args:
        source_path: Path of the audio file to copy.

    Returns:
        Path of the copy inside the storage directory, or an empty string
        when ``source_path`` is empty or not an existing file.
    """
    if not (source_path and os.path.isfile(source_path)):
        return ""

    storage_root = utils.storage_dir(INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR, create=True)
    destination = os.path.join(storage_root, os.path.basename(source_path))
    # The source already lives in the storage directory: nothing to copy.
    if os.path.abspath(source_path) == os.path.abspath(destination):
        return destination

    # An existing copy is reused only when its size equals the source's size.
    reusable_copy = (
        os.path.exists(destination)
        and os.path.getsize(source_path) == os.path.getsize(destination)
    )
    if not reusable_copy:
        shutil.copy2(source_path, destination)
    return destination
def load_bgm_tracks_metadata():
    """Read the BGM track descriptions from ``BGM_TRACKS_JSON``.

    Loading is best-effort: a missing, unreadable, wrongly encoded or
    malformed file yields an empty result instead of an exception.

    Returns:
        Dict mapping each track's ``fileName`` to its track dict. Entries
        that are not dicts, or whose ``fileName`` is not a non-empty string,
        are skipped. Empty when the file does not contain a JSON list.
    """
    if not os.path.isfile(BGM_TRACKS_JSON):
        return {}
    try:
        with open(BGM_TRACKS_JSON, "r", encoding="utf-8") as f:
            tracks = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
        # which is raised when the file is not valid UTF-8.
        return {}
    if not isinstance(tracks, list):
        return {}
    metadata = {}
    for track in tracks:
        if not isinstance(track, dict):
            continue
        filename = track.get("fileName")
        # The file name is later joined into a path, so it must be a string.
        if isinstance(filename, str) and filename:
            metadata[filename] = track
    return metadata
def get_bgm_resource_options():
    """List the background-music files available in ``BGM_RESOURCE_DIR``.

    Tracks described in the metadata file come first, in metadata order.
    Any other audio file in the directory follows in sorted order, with its
    file stem as title.

    Returns:
        List of dicts with the keys ``filename``, ``path``, ``title``,
        ``style`` and ``category``. Only files that exist on disk are
        included.
    """
    options = []
    described = set()
    for name, track in load_bgm_tracks_metadata().items():
        path = os.path.join(BGM_RESOURCE_DIR, name)
        if os.path.isfile(path):
            options.append({
                "filename": name,
                "path": path,
                "title": track.get("title") or os.path.splitext(name)[0],
                "style": track.get("style", ""),
                "category": track.get("category", ""),
            })
            described.add(name)

    if not os.path.isdir(BGM_RESOURCE_DIR):
        return options

    for name in sorted(os.listdir(BGM_RESOURCE_DIR)):
        if name in described or not name.lower().endswith(BGM_AUDIO_EXTENSIONS):
            continue
        path = os.path.join(BGM_RESOURCE_DIR, name)
        if os.path.isfile(path):
            options.append({
                "filename": name,
                "path": path,
                "title": os.path.splitext(name)[0],
                "style": "",
                "category": "",
            })
    return options
def format_bgm_resource_option(option):
    """Build the drop-down label for a BGM option.

    Args:
        option: Option dict with optional ``title``, ``filename``, ``style``
            and ``category`` entries.

    Returns:
        The title (or the file stem when there is no title), followed in
        parentheses by the style or, when the style is empty, the category.
    """
    title = option.get("title") or os.path.splitext(option.get("filename", ""))[0]
    # Style takes precedence over category as the qualifier.
    qualifier = option.get("style", "") or option.get("category", "")
    return f"{title} ({qualifier})" if qualifier else title
def get_bgm_resource_index(options, saved_bgm_file):
    """Find the drop-down index of the saved BGM file.

    Args:
        options: Option dicts, each with a ``filename`` key.
        saved_bgm_file: Saved BGM path (may be empty or ``None``); only its
            base name is compared.

    Returns:
        Index of the first option whose file name matches, or ``0`` when
        there are no options or nothing matches.
    """
    if not options:
        return 0
    target_name = os.path.basename(saved_bgm_file or "")
    return next(
        (position for position, entry in enumerate(options) if entry["filename"] == target_name),
        0,
    )
def get_audio_mime_type(audio_path):
    """Return the MIME type matching an audio file's extension.

    Args:
        audio_path: Audio file path; may be empty or ``None``. The extension
            is matched case-insensitively.

    Returns:
        The MIME type for ``.wav``, ``.flac``, ``.ogg``, ``.m4a`` and
        ``.aac`` files, and ``"audio/mpeg"`` for everything else (MP3
        included).
    """
    mime_by_extension = {
        ".wav": "audio/wav",
        ".flac": "audio/flac",
        ".ogg": "audio/ogg",
        ".m4a": "audio/mp4",
        ".aac": "audio/aac",
    }
    extension = os.path.splitext(audio_path or "")[1].lower()
    # "audio/mpeg" is the registered media type for MP3; "audio/mp3" is not.
    return mime_by_extension.get(extension, "audio/mpeg")
def render_reference_audio_preview_button(reference_audio, key, tr, preview_state_key="indextts_reference_audio_preview_path"):
    """Render the play button used to preview a reference audio.

    The button is disabled unless ``reference_audio`` is an existing file.
    Clicking it stores the path in ``st.session_state[preview_state_key]``.

    Args:
        reference_audio: Path of the audio to preview.
        key: Streamlit widget key for the button.
        tr: Translation callable mapping a message key to display text.
        preview_state_key: Session-state key that receives the path.
    """
    previewable = bool(reference_audio) and os.path.isfile(reference_audio)
    clicked = st.button(
        " ",
        key=key,
        icon=":material/play_arrow:",
        help=tr("Preview Reference Audio Help"),
        disabled=not previewable,
        use_container_width=True,
    )
    if clicked:
        st.session_state[preview_state_key] = reference_audio
def render_indextts_reference_audio_selector(tr, tts_config, key_prefix):
    """Render the reference-audio picker shared by the IndexTTS engines.

    The user chooses between a file from the resource directory and an
    uploaded file, and can preview the chosen audio.

    Args:
        tr: Translation callable mapping a message key to display text.
        tts_config: Mapping holding the engine's saved settings; the
            ``reference_audio`` and ``reference_audio_source`` entries are
            read.
        key_prefix: Prefix for this selector's widget and session-state keys.

    Returns:
        A ``(reference_audio_source, reference_audio)`` tuple. The source is
        ``"resource"`` or ``"upload"``; the audio is the chosen file path, or
        an empty string when nothing is selected yet.
    """
    saved_reference_audio = tts_config.get("reference_audio", "")
    reference_audio_source_options = {
        tr("Select from Resource Directory"): "resource",
        tr("Upload Reference Audio"): "upload",
    }
    reference_audio_source_labels = list(reference_audio_source_options.keys())
    saved_reference_audio_source = tts_config.get("reference_audio_source", "resource")
    if saved_reference_audio_source not in reference_audio_source_options.values():
        saved_reference_audio_source = "resource"
    default_reference_audio_source_label = next(
        label
        for label, source_value in reference_audio_source_options.items()
        if source_value == saved_reference_audio_source
    )

    st.markdown(f"**{tr('Reference Audio Path')}**")
    reference_audio_source_label = st.pills(
        tr("Reference Audio Source"),
        options=reference_audio_source_labels,
        selection_mode="single",
        default=default_reference_audio_source_label,
        key=f"{key_prefix}_reference_audio_source_selection",
        help=tr("Reference Audio Source Help"),
        label_visibility="collapsed",
        width="stretch",
    )
    # The pills can be deselected; fall back to the saved source then.
    if not reference_audio_source_label:
        reference_audio_source_label = default_reference_audio_source_label
    reference_audio_source = reference_audio_source_options[reference_audio_source_label]

    reference_audio = saved_reference_audio
    preview_state_key = f"{key_prefix}_reference_audio_preview_path"
    reference_audio_options = get_indextts_reference_audio_options()

    if reference_audio_source == "resource" and reference_audio_options:
        selected_audio_index = get_indextts_reference_audio_index(reference_audio_options, saved_reference_audio)
        select_col, preview_col = st.columns([5, 1])
        with select_col:
            # The selectbox works on indices; the label comes from the option dict.
            selected_audio_option = reference_audio_options[st.selectbox(
                tr("Reference Audio Path"),
                options=range(len(reference_audio_options)),
                index=selected_audio_index,
                format_func=lambda x: format_indextts_reference_audio_option(reference_audio_options[x]),
                help=tr("Reference Audio Path Help"),
                label_visibility="collapsed",
                key=f"{key_prefix}_reference_audio_select",
            )]
            # The engine is given the copy in project storage, not the source file.
            reference_audio = copy_indextts_reference_audio(selected_audio_option["path"])
        with preview_col:
            render_reference_audio_preview_button(
                reference_audio,
                f"{key_prefix}_resource_reference_audio_preview",
                tr,
                preview_state_key=preview_state_key,
            )
    elif reference_audio_source == "resource":
        st.warning(tr("No Reference Audio Resources Found"))

    if reference_audio_source == "upload":
        # Switching from "resource" to "upload" drops the previously saved path.
        if saved_reference_audio_source != "upload":
            reference_audio = ""
        upload_col, preview_col = st.columns([5, 1])
        with upload_col:
            uploaded_file = st.file_uploader(
                tr("Upload Reference Audio File"),
                type=["wav", "mp3"],
                help=tr("Upload Reference Audio Help"),
                label_visibility="collapsed",
                key=f"{key_prefix}_reference_audio_upload",
            )
            if uploaded_file is not None:
                target_dir = utils.storage_dir(INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR, create=True)
                # The file name comes from the client and is untrusted: keep
                # only its last path component (for either separator style)
                # so the write cannot leave the storage directory.
                raw_name = str(uploaded_file.name or "")
                safe_name = os.path.basename(raw_name.replace("\\", "/")) or "reference_audio"
                audio_path = os.path.join(target_dir, f"uploaded_{safe_name}")
                with open(audio_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                reference_audio = audio_path
                st.success(tr("Audio uploaded").format(path=audio_path))
        with preview_col:
            render_reference_audio_preview_button(
                reference_audio,
                f"{key_prefix}_upload_reference_audio_preview",
                tr,
                preview_state_key=preview_state_key,
            )

    # Show the player only while the previewed path is still the current selection.
    preview_audio_path = st.session_state.get(preview_state_key, "")
    if preview_audio_path == reference_audio and os.path.isfile(preview_audio_path):
        with open(preview_audio_path, "rb") as audio_file:
            st.audio(audio_file.read(), format=get_audio_mime_type(preview_audio_path))
    return reference_audio_source, reference_audio
def render_bgm_preview_button(bgm_file, key, tr):
    """Render the play button used to preview a background-music file.

    The button is disabled unless ``bgm_file`` is an existing file. Clicking
    it stores the path in ``st.session_state["bgm_preview_path"]``.

    Args:
        bgm_file: Path of the BGM file to preview.
        key: Streamlit widget key for the button.
        tr: Translation callable mapping a message key to display text.
    """
    previewable = bool(bgm_file) and os.path.isfile(bgm_file)
    clicked = st.button(
        " ",
        key=key,
        icon=":material/play_arrow:",
        help=tr("Preview Background Music Help"),
        disabled=not previewable,
        use_container_width=True,
    )
    if clicked:
        st.session_state["bgm_preview_path"] = bgm_file
def is_valid_azure_voice_name(voice_name: str) -> bool:
    """Check whether a string looks like an Azure neural voice name.

    Accepted shape: ``<language>[-<Script>]-<REGION>[-<dialect>]-<Name>Neural``,
    for example ``zh-CN-YunzeNeural``, ``wuu-CN-XiaotongNeural``,
    ``sr-Latn-RS-NicholasNeural``, ``zh-CN-liaoning-XiaobeiNeural`` or
    ``en-US-Ava:DragonHDLatestNeural``. This is a format check only; it does
    not verify that the voice exists.

    Args:
        voice_name: Candidate voice name; surrounding whitespace is ignored.

    Returns:
        ``True`` when the name matches the expected format, ``False``
        otherwise (including for empty or non-string input).
    """
    import re

    if not voice_name or not isinstance(voice_name, str):
        return False
    pattern = (
        r"[a-z]{2,3}"            # language: two or three lowercase letters
        r"(?:-[A-Z][a-z]{3})?"   # optional script subtag, e.g. Latn
        r"-[A-Z]{2}"             # region: two uppercase letters
        r"(?:-[a-z]+)?"          # optional dialect, e.g. liaoning
        r"-[\w:]+Neural"         # voice name ending in "Neural"
    )
    return re.fullmatch(pattern, voice_name.strip()) is not None
def render_audio_panel(tr):
    """Render the audio settings panel.

    The TTS settings are drawn inside a bordered container; the
    background-music panel is drawn below it in its own box.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    settings_box = st.container(border=True)
    with settings_box:
        st.write(tr("Audio Settings"))
        render_tts_settings(tr)

    render_bgm_panel(tr)
def render_bgm_panel(tr):
    """Render the background-music settings inside a bordered container.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    bgm_box = st.container(border=True)
    with bgm_box:
        render_bgm_settings(tr)
def render_tts_settings(tr):
    """Render the TTS (text-to-speech) settings.

    Shows the engine selector, the selected engine's description, the
    engine-specific settings form and finally the voice preview. The chosen
    engine is stored in ``config.ui["tts_engine"]`` and
    ``st.session_state["tts_engine"]``.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    engine_labels = get_tts_engine_options(tr)
    engine_details = get_tts_engine_descriptions(tr)
    engine_ids = list(engine_labels.keys())

    # Start from the saved engine, falling back to IndexTTS when the saved
    # value is not one of the offered engines.
    stored_engine = config.normalize_tts_engine_name(
        config.ui.get("tts_engine", config.INDEXTTS_ENGINE)
    )
    if stored_engine not in engine_labels:
        stored_engine = config.INDEXTTS_ENGINE

    # 1. Engine selector.
    selected_engine = st.selectbox(
        tr("Select TTS Engine"),
        options=engine_ids,
        format_func=lambda engine_id: engine_labels[engine_id],
        index=engine_ids.index(stored_engine),
        help=tr("Select TTS Engine Help"),
    )
    config.ui["tts_engine"] = selected_engine
    st.session_state['tts_engine'] = selected_engine

    # 2. Description of the selected engine.
    if selected_engine in engine_details:
        details = engine_details[selected_engine]
        details_title = tr("TTS Engine Details").format(engine=details['title'])
        with st.expander(details_title, expanded=False):
            st.markdown(f"**{tr('Features')}:** {details['features']}")
            st.markdown(f"**{tr('Use Case')}:** {details['use_case']}")
            registration_url = details['registration']
            if registration_url:
                st.markdown(f"**{tr('Registration URL')}:** [{registration_url}]({registration_url})")

    # 3. Engine-specific settings form.
    if selected_engine == "edge_tts":
        render_edge_tts_settings(tr)
    elif selected_engine == "azure_speech":
        render_azure_speech_settings(tr)
    elif selected_engine == "soulvoice":
        render_soulvoice_engine_settings(tr)
    elif selected_engine == "tencent_tts":
        render_tencent_tts_settings(tr)
    elif selected_engine == "qwen3_tts":
        render_qwen3_tts_settings(tr)
    elif selected_engine == config.INDEXTTS_ENGINE:
        render_indextts_tts_settings(tr)
    elif selected_engine == config.INDEXTTS2_ENGINE:
        render_indextts2_tts_settings(tr)
    elif selected_engine == config.OMNIVOICE_ENGINE:
        render_omnivoice_tts_settings(tr)
    elif selected_engine == "doubaotts":
        render_doubaotts_settings(tr)

    # 4. Voice preview.
    render_voice_preview_new(tr, selected_engine)
def render_edge_tts_settings(tr):
    """Render the Edge TTS engine settings.

    Lets the user pick a voice and adjust volume, rate and pitch. The
    choices are written to ``config.ui`` (``edge_voice_name``,
    ``voice_name``, ``edge_volume``, ``edge_rate``, ``edge_pitch``) and to
    ``st.session_state`` (``voice_volume``, ``voice_rate``, ``voice_pitch``).

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    # All voices supported by Edge TTS.
    edge_voices = voice.get_all_edge_voices()

    # Friendly label per voice: translated gender, "Neural" suffix removed.
    female_label = tr("Female")
    male_label = tr("Male")
    friendly_names = {
        v: v.replace("Female", female_label).replace("Male", male_label).replace("Neural", "")
        for v in edge_voices
    }

    saved_voice_name = config.ui.get("edge_voice_name", "zh-CN-XiaoxiaoNeural-Female")
    if saved_voice_name not in friendly_names:
        # Prefer the first voice matching the UI language, else the first
        # voice at all (or nothing when no voices are available).
        ui_language = str(st.session_state.get("ui_language", "zh-CN")).lower()
        saved_voice_name = next(
            (v for v in edge_voices if v.lower().startswith(ui_language)),
            edge_voices[0] if edge_voices else "",
        )

    # Select on the real voice name and only format the label. This avoids
    # mapping the label back to the voice, which failed when the voice list
    # was empty (the selectbox then returns None).
    voice_ids = list(friendly_names)
    voice_name = st.selectbox(
        tr("Voice Selection"),
        options=voice_ids,
        index=voice_ids.index(saved_voice_name) if saved_voice_name in friendly_names else 0,
        format_func=lambda v: friendly_names[v],
        help=tr("Select Edge TTS Voice")
    )

    # Voice overview.
    with st.expander(tr("Edge TTS Voice Description"), expanded=False):
        st.write(tr("Loaded voice count").format(count=len(edge_voices)))
        for v in edge_voices:
            gender = tr("Female Voice") if "Female" in v else tr("Male Voice")
            name = v.replace("-Female", "").replace("-Male", "").replace("Neural", "")
            st.write(f"{name} ({gender})")

    # Keep the previously saved voice when nothing could be selected.
    if voice_name:
        config.ui["edge_voice_name"] = voice_name
        config.ui["voice_name"] = voice_name  # kept for compatibility

    # Volume (percent).
    voice_volume = st.slider(
        tr("Voice Volume"),
        min_value=0,
        max_value=100,
        value=int(config.ui.get("edge_volume", 80)),
        step=1,
        help=tr("Voice Volume Help Percent")
    )
    config.ui["edge_volume"] = voice_volume
    st.session_state['voice_volume'] = voice_volume / 100.0

    # Speaking rate. The saved value is coerced to float because the slider
    # requires value, bounds and step to share one numeric type.
    voice_rate = st.slider(
        tr("Voice Rate"),
        min_value=0.5,
        max_value=2.0,
        value=float(config.ui.get("edge_rate", 1.0)),
        step=0.1,
        help=tr("Voice Rate Help 0.5-2.0")
    )
    config.ui["edge_rate"] = voice_rate
    st.session_state['voice_rate'] = voice_rate

    # Pitch (percent offset).
    voice_pitch = st.slider(
        tr("Voice Pitch"),
        min_value=-50,
        max_value=50,
        value=int(config.ui.get("edge_pitch", 0)),
        step=5,
        help=tr("Voice Pitch Help Percent")
    )
    config.ui["edge_pitch"] = voice_pitch
    # Session state holds the pitch as a ratio around 1.0.
    st.session_state['voice_pitch'] = 1.0 + (voice_pitch / 100.0)
def render_azure_speech_settings(tr):
    """Render the Azure Speech Services engine settings.

    Collects the service region, API key and voice name, plus volume, rate
    and pitch. Values are written to ``config.azure`` (``speech_region``,
    ``speech_key``), ``config.ui`` (``azure_voice_name``, ``voice_name``,
    ``azure_volume``, ``azure_rate``, ``azure_pitch``) and
    ``st.session_state`` (``voice_volume``, ``voice_rate``, ``voice_pitch``).

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    # Service region.
    azure_speech_region = st.text_input(
        tr("Service Region"),
        value=config.azure.get("speech_region", ""),
        placeholder=tr("Service Region Placeholder"),
        help=tr("Azure Service Region Help")
    )
    # API key.
    azure_speech_key = st.text_input(
        "API Key",
        value=config.azure.get("speech_key", ""),
        type="password",
        help=tr("Azure Speech Key Help")
    )
    config.azure["speech_region"] = azure_speech_region
    config.azure["speech_key"] = azure_speech_key

    # Voice name, entered as free text.
    saved_voice_name = config.ui.get("azure_voice_name", "zh-CN-XiaoxiaoMultilingualNeural")
    voice_name = st.text_input(
        tr("Voice Name"),
        value=saved_voice_name,
        help=tr("Azure Voice Name Help"),
        placeholder="zh-CN-YunzeNeural"
    )

    # Reference list of commonly used voices.
    with st.expander(tr("Common Voice Reference"), expanded=False):
        st.write(f"**{tr('Chinese Voices')}:**")
        st.write(f"• zh-CN-XiaoxiaoMultilingualNeural ({tr('Female Voice')}, {tr('Multilingual')})")
        st.write(f"• zh-CN-YunzeNeural ({tr('Male Voice')})")
        st.write(f"• zh-CN-YunxiNeural ({tr('Male Voice')})")
        st.write(f"• zh-CN-XiaochenNeural ({tr('Female Voice')})")
        st.write("")
        st.write(f"**{tr('English Voices')}:**")
        st.write(f"• en-US-AndrewMultilingualNeural ({tr('Male Voice')}, {tr('Multilingual')})")
        st.write(f"• en-US-AvaMultilingualNeural ({tr('Female Voice')}, {tr('Multilingual')})")
        st.write(f"• en-US-BrianMultilingualNeural ({tr('Male Voice')}, {tr('Multilingual')})")
        st.write(f"• en-US-EmmaMultilingualNeural ({tr('Female Voice')}, {tr('Multilingual')})")
        st.write("")
        st.info(tr("Azure Voices Docs Notice"))

    # Quick-select buttons.
    st.write(f"**{tr('Quick Select')}:**")
    quick_choices = [
        (tr("Chinese Female Voice"), "zh-CN-XiaoxiaoMultilingualNeural"),
        (tr("Chinese Male Voice"), "zh-CN-YunzeNeural"),
        (tr("English Female Voice"), "en-US-AvaMultilingualNeural"),
    ]
    for column, (label, quick_voice) in zip(st.columns(3), quick_choices):
        with column:
            if st.button(label, help=quick_voice):
                # Persist the choice BEFORE rerunning: st.rerun() aborts this
                # run, so assigning only a local variable would be lost and
                # the button would have no effect.
                config.ui["azure_voice_name"] = quick_voice
                config.ui["voice_name"] = quick_voice  # kept for compatibility
                st.rerun()

    # Validate the voice name format and show the result.
    if voice_name.strip():
        if is_valid_azure_voice_name(voice_name):
            st.success(tr("Voice name valid").format(voice=voice_name))
        else:
            st.warning(tr("Voice name format may be invalid").format(voice=voice_name))
            st.info(tr("Azure voice name format notice"))

    config.ui["azure_voice_name"] = voice_name
    config.ui["voice_name"] = voice_name  # kept for compatibility

    # Volume (percent).
    voice_volume = st.slider(
        tr("Voice Volume"),
        min_value=0,
        max_value=100,
        value=int(config.ui.get("azure_volume", 80)),
        step=1,
        help=tr("Voice Volume Help Percent")
    )
    config.ui["azure_volume"] = voice_volume
    st.session_state['voice_volume'] = voice_volume / 100.0

    # Speaking rate. The saved value is coerced to float because the slider
    # requires value, bounds and step to share one numeric type.
    voice_rate = st.slider(
        tr("Voice Rate"),
        min_value=0.5,
        max_value=2.0,
        value=float(config.ui.get("azure_rate", 1.0)),
        step=0.1,
        help=tr("Voice Rate Help 0.5-2.0")
    )
    config.ui["azure_rate"] = voice_rate
    st.session_state['voice_rate'] = voice_rate

    # Pitch (percent offset).
    voice_pitch = st.slider(
        tr("Voice Pitch"),
        min_value=-50,
        max_value=50,
        value=int(config.ui.get("azure_pitch", 0)),
        step=5,
        help=tr("Voice Pitch Help Percent")
    )
    config.ui["azure_pitch"] = voice_pitch
    # Session state holds the pitch as a ratio around 1.0.
    st.session_state['voice_pitch'] = 1.0 + (voice_pitch / 100.0)

    # Configuration status.
    if azure_speech_region and azure_speech_key:
        st.success(tr("Azure Speech Services configured"))
    elif not azure_speech_region:
        st.warning(tr("Please configure service region"))
    elif not azure_speech_key:
        st.warning(tr("Please configure API Key"))
def render_tencent_tts_settings(tr):
    """Render the Tencent Cloud TTS engine settings.

    Collects the credentials, service region, voice and speaking rate.
    Values are written to ``config.tencent`` (``secret_id``, ``secret_key``,
    ``region``) and ``config.ui`` (``tencent_voice_type``, ``tencent_rate``,
    ``voice_name``).

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    # Secret ID.
    secret_id = st.text_input(
        "Secret ID",
        value=config.tencent.get("secret_id", ""),
        help=tr("Tencent Secret ID Help")
    )
    # Secret Key.
    secret_key = st.text_input(
        "Secret Key",
        value=config.tencent.get("secret_key", ""),
        type="password",
        help=tr("Tencent Secret Key Help")
    )

    # Service region; a saved region outside the list is kept as an extra choice.
    region_options = [
        "ap-beijing",
        "ap-shanghai",
        "ap-guangzhou",
        "ap-chengdu",
        "ap-nanjing",
        "ap-singapore",
        "ap-hongkong"
    ]
    saved_region = config.tencent.get("region", "ap-beijing")
    if saved_region not in region_options:
        region_options.append(saved_region)
    region = st.selectbox(
        tr("Service Region"),
        options=region_options,
        index=region_options.index(saved_region),
        help=tr("Tencent Service Region Help")
    )

    # Voice id -> display label.
    voice_type_options = {
        "101001": "智瑜 - 女声(推荐)",
        "101002": "智聆 - 女声",
        "101003": "智美 - 女声",
        "101004": "智云 - 男声",
        "101005": "智莉 - 女声",
        "101006": "智言 - 男声",
        "101007": "智娜 - 女声",
        "101008": "智琪 - 女声",
        "101009": "智芸 - 女声",
        "101010": "智华 - 男声",
        "101011": "智燕 - 女声",
        "101012": "智丹 - 女声",
        "101013": "智辉 - 男声",
        "101014": "智宁 - 女声",
        "101015": "智萌 - 女声",
        "101016": "智甜 - 女声",
        "101017": "智蓉 - 女声",
        "101018": "智靖 - 男声"
    }
    saved_voice_type = config.ui.get("tencent_voice_type", "101001")
    # A saved id outside the table is offered as a custom entry.
    if saved_voice_type not in voice_type_options:
        voice_type_options[saved_voice_type] = f"{tr('Custom Voice')} ({saved_voice_type})"

    # Select on the voice id and only format the label, so the id never has
    # to be recovered from the display text.
    voice_type_ids = list(voice_type_options)
    voice_type = st.selectbox(
        tr("Voice Selection"),
        options=voice_type_ids,
        index=voice_type_ids.index(saved_voice_type),
        format_func=lambda voice_id: voice_type_options[voice_id],
        help=tr("Select Tencent TTS Voice")
    )

    # Speaking rate. The saved value is coerced to float because the slider
    # requires value, bounds and step to share one numeric type.
    voice_rate = st.slider(
        tr("Voice Rate"),
        min_value=0.5,
        max_value=2.0,
        value=float(config.ui.get("tencent_rate", 1.0)),
        step=0.1,
        help=tr("Voice Rate Help 0.5-2.0")
    )

    # Voice overview.
    with st.expander(tr("Tencent Cloud TTS Voice Description"), expanded=False):
        st.write(f"**{tr('Female Voices')}:**")
        female_voices = [(k, v) for k, v in voice_type_options.items() if "女声" in v]
        for voice_id, voice_desc in female_voices[:6]:  # only the first six are listed
            st.write(f"{voice_desc} (ID: {voice_id})")
        st.write("")
        st.write(f"**{tr('Male Voices')}:**")
        male_voices = [(k, v) for k, v in voice_type_options.items() if "男声" in v]
        for voice_id, voice_desc in male_voices:
            st.write(f"{voice_desc} (ID: {voice_id})")
        st.write("")
        st.info(tr("Tencent More Voices Notice"))

    # Persist the settings.
    config.tencent["secret_id"] = secret_id
    config.tencent["secret_key"] = secret_key
    config.tencent["region"] = region
    config.ui["tencent_voice_type"] = voice_type
    config.ui["tencent_rate"] = voice_rate
    # Compatibility value: store the voice selected now, not the previously
    # saved one, so it does not lag one selection behind.
    config.ui["voice_name"] = voice_type
def render_qwen3_tts_settings(tr):
    """Render the Qwen3 TTS engine settings.

    Collects the API key, model name, voice and speaking rate. Values are
    written to ``config.tts_qwen`` (``api_key``, ``model_name``) and
    ``config.ui`` (``qwen_voice_type``, ``qwen3_rate``, ``voice_name``).

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    api_key = st.text_input(
        "API Key",
        value=config.tts_qwen.get("api_key", ""),
        type="password",
        help=tr("Qwen DashScope API Key Help")
    )
    model_name = st.text_input(
        tr("TTS Model Name"),
        value=config.tts_qwen.get("model_name", "qwen3-tts-flash"),
        help=tr("Qwen TTS Model Help")
    )

    # Chinese display name -> voice parameter sent to the service.
    voice_options = {
        "芊悦": "Cherry",
        "晨煦": "Ethan",
        "不吃鱼": "Nofish",
        "詹妮弗": "Jennifer",
        "甜茶": "Ryan",
        "卡捷琳娜": "Katerina",
        "墨讲师": "Elias",
        "上海-阿珍": "Jada",
        "北京-晓东": "Dylan",
        "四川-晴儿": "Sunny",
        "南京-老李": "Li",
        "陕西-秦川": "Marcus",
        "闽南-阿杰": "Roy",
        "天津-李彼得": "Peter",
        "粤语-阿强": "Rocky",
        "粤语-阿清": "Kiki",
        "四川-程川": "Eric"
    }

    saved_voice_param = config.ui.get("qwen_voice_type", "Cherry")
    # Map the saved parameter back to its display name.
    saved_display_name = next(
        (
            chinese_name
            for chinese_name, english_param in voice_options.items()
            if english_param == saved_voice_param
        ),
        None,
    )
    if saved_display_name is None:
        if isinstance(saved_voice_param, str) and saved_voice_param in voice_options:
            # The saved value is already one of the display names.
            saved_display_name = saved_voice_param
        elif isinstance(saved_voice_param, str) and saved_voice_param:
            # Unknown saved voice: offer it as a custom entry instead of
            # silently replacing it with the default.
            saved_display_name = saved_voice_param
            voice_options[saved_display_name] = saved_voice_param
        else:
            saved_display_name = "芊悦"  # default voice

    display_names = list(voice_options.keys())
    selected_display_name = st.selectbox(
        tr("Voice Selection"),
        options=display_names,
        index=display_names.index(saved_display_name),
        help=tr("Select Qwen3 TTS Voice")
    )
    # Voice parameter for the chosen display name.
    voice_type = voice_options.get(selected_display_name, "Cherry")

    # Start the slider from the persisted rate (it is saved below as
    # "qwen3_rate"), clamped into the slider's range.
    saved_rate = min(2.0, max(0.5, float(config.ui.get("qwen3_rate", 1.0))))
    voice_rate = st.slider(
        tr("Voice Rate"),
        min_value=0.5,
        max_value=2.0,
        value=saved_rate,
        step=0.1,
        help=tr("Voice Rate Help 0.5-2.0")
    )

    # Persist the settings.
    config.tts_qwen["api_key"] = api_key
    config.tts_qwen["model_name"] = model_name
    config.ui["qwen_voice_type"] = voice_type
    config.ui["qwen3_rate"] = voice_rate
    config.ui["voice_name"] = voice_type  # kept for compatibility
def render_indextts_tts_settings(tr):
    """Render the IndexTTS-1.5 engine settings.

    Collects the API URL, reference audio, inference mode and the advanced
    sampling parameters, then writes them to ``config.indextts``. When a
    reference audio is set, ``config.ui["voice_name"]`` is set to the
    IndexTTS voice prefix followed by the audio path.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    stored = config.indextts

    # API endpoint.
    api_url = st.text_input(
        tr("API URL"),
        value=stored.get("api_url", "http://127.0.0.1:8081/tts"),
        help=tr("IndexTTS API URL Help")
    )

    reference_audio_source, reference_audio = render_indextts_reference_audio_selector(
        tr,
        config.indextts,
        "indextts",
    )

    # Inference mode: the stored value is the Chinese mode name, the label is translated.
    mode_values = ["普通推理", "快速推理"]
    mode_labels = [tr("Standard Inference"), tr("Fast Inference")]
    default_mode_position = 0 if stored.get("infer_mode", "普通推理") == "普通推理" else 1
    chosen_mode_position = st.selectbox(
        tr("Inference Mode"),
        options=range(len(mode_values)),
        index=default_mode_position,
        format_func=lambda position: mode_labels[position],
        help=tr("Inference Mode Help")
    )
    infer_mode = mode_values[chosen_mode_position]

    # Advanced parameters, laid out in two columns.
    with st.expander(tr("Advanced Parameters"), expanded=False):
        left_col, right_col = st.columns(2)
        with left_col:
            temperature = st.slider(
                tr("Sampling Temperature"),
                min_value=0.1,
                max_value=2.0,
                value=float(stored.get("temperature", 1.0)),
                step=0.1,
                help=tr("Sampling Temperature Help")
            )
            top_p = st.slider(
                "Top P",
                min_value=0.0,
                max_value=1.0,
                value=float(stored.get("top_p", 0.8)),
                step=0.05,
                help=tr("Top P Help")
            )
            top_k = st.slider(
                "Top K",
                min_value=0,
                max_value=100,
                value=int(stored.get("top_k", 30)),
                step=5,
                help=tr("Top K Help")
            )
        with right_col:
            num_beams = st.slider(
                tr("Num Beams"),
                min_value=1,
                max_value=10,
                value=int(stored.get("num_beams", 3)),
                step=1,
                help=tr("Num Beams Help")
            )
            repetition_penalty = st.slider(
                tr("Repetition Penalty"),
                min_value=1.0,
                max_value=20.0,
                value=float(stored.get("repetition_penalty", 10.0)),
                step=0.5,
                help=tr("Repetition Penalty Help")
            )
            do_sample = st.checkbox(
                tr("Enable Sampling"),
                value=stored.get("do_sample", True),
                help=tr("Enable Sampling Help")
            )

    # Usage instructions.
    with st.expander(tr("IndexTTS Usage Instructions Title"), expanded=False):
        st.markdown(tr("IndexTTS Usage Instructions"))

    # Persist the settings.
    updated_settings = {
        "api_url": api_url,
        "reference_audio_source": reference_audio_source,
        "reference_audio": reference_audio,
        "infer_mode": infer_mode,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "num_beams": num_beams,
        "repetition_penalty": repetition_penalty,
        "do_sample": do_sample,
    }
    for setting_name, setting_value in updated_settings.items():
        config.indextts[setting_name] = setting_value

    # Compatibility value: voice_name carries the prefixed reference audio path.
    if reference_audio:
        config.ui["voice_name"] = f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}"
def render_indextts2_tts_settings(tr):
    """Render the IndexTTS-2 TTS settings panel.

    Draws the API URL field, the reference-audio selector, the emotion
    controls and the advanced generation sliders, then stores every value in
    ``config.indextts2``. When a reference audio is selected,
    ``config.ui["voice_name"]`` is updated too. The session-level voice rate
    and pitch are set to 1.0 at the end.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
    """
    stored = config.indextts2

    api_url = st.text_input(
        tr("API URL"),
        value=stored.get("api_url", "http://192.168.3.6:7863/tts"),
        help=tr("IndexTTS2 API URL Help"),
    )
    reference_audio_source, reference_audio = render_indextts_reference_audio_selector(
        tr,
        config.indextts2,
        "indextts2",
    )

    # Mode ids and their translated labels live in two parallel lists.
    mode_ids = ["speaker", "audio", "vector", "text"]
    mode_labels = [
        tr("Emotion Mode Speaker"),
        tr("Emotion Mode Audio"),
        tr("Emotion Mode Vector"),
        tr("Emotion Mode Text"),
    ]
    stored_mode = stored.get("emotion_mode", "speaker")
    # An unrecognised stored mode falls back to "speaker", the first entry.
    default_mode_index = mode_ids.index(stored_mode) if stored_mode in mode_ids else 0

    with st.expander(tr("IndexTTS2 Emotion Parameters"), expanded=False):
        chosen_mode_index = st.selectbox(
            tr("Emotion Mode"),
            options=range(len(mode_ids)),
            index=default_mode_index,
            format_func=lambda position: mode_labels[position],
            help=tr("Emotion Mode Help"),
        )
        emotion_mode = mode_ids[chosen_mode_index]

        emotion_alpha = st.slider(
            tr("Emotion Alpha"),
            min_value=0.0,
            max_value=1.0,
            value=float(stored.get("emotion_alpha", 0.65)),
            step=0.05,
            help=tr("Emotion Alpha Help"),
        )

        # Start from the stored values; only the active mode shows an editor.
        emotion_audio = stored.get("emotion_audio", "")
        emotion_text = stored.get("emotion_text", "")

        if emotion_mode == "text":
            emotion_text = st.text_input(
                tr("Emotion Text"),
                value=emotion_text,
                help=tr("Emotion Text Help"),
                placeholder=tr("Emotion Text Placeholder"),
            )
        elif emotion_mode == "audio":
            path_col, button_col = st.columns([5, 1])
            with path_col:
                emotion_audio = st.text_input(
                    tr("Emotion Reference Audio Path"),
                    value=emotion_audio,
                    help=tr("Emotion Reference Audio Path Help"),
                )
            with button_col:
                render_reference_audio_preview_button(
                    emotion_audio,
                    "indextts2_emotion_audio_preview",
                    tr,
                    preview_state_key="indextts2_emotion_audio_preview_path",
                )
            # Play the file only when the previewed path is the one currently entered.
            previewed_path = st.session_state.get("indextts2_emotion_audio_preview_path", "")
            if previewed_path == emotion_audio and os.path.isfile(previewed_path):
                with open(previewed_path, "rb") as audio_file:
                    payload = audio_file.read()
                    st.audio(payload, format=get_audio_mime_type(previewed_path))

        use_random = st.checkbox(
            tr("Use Random Emotion"),
            value=bool(stored.get("use_random", False)),
            help=tr("Use Random Emotion Help"),
        )

        # (config field, default weight, translated label) for each emotion axis.
        vector_specs = [
            ("vec_happy", 0.0, tr("Emotion Happy")),
            ("vec_angry", 0.0, tr("Emotion Angry")),
            ("vec_sad", 0.0, tr("Emotion Sad")),
            ("vec_afraid", 0.0, tr("Emotion Afraid")),
            ("vec_disgusted", 0.0, tr("Emotion Disgusted")),
            ("vec_melancholic", 0.0, tr("Emotion Melancholic")),
            ("vec_surprised", 0.0, tr("Emotion Surprised")),
            ("vec_calm", 0.8, tr("Emotion Calm")),
        ]
        emotion_vector_values = {}
        if emotion_mode != "vector":
            # Outside vector mode the stored weights are carried over unchanged.
            for field, fallback, _label in vector_specs:
                emotion_vector_values[field] = float(stored.get(field, fallback))
        else:
            slider_cols = st.columns(2)
            for position, (field, fallback, label) in enumerate(vector_specs):
                # Alternate the sliders between the two columns.
                with slider_cols[position % 2]:
                    emotion_vector_values[field] = st.slider(
                        label,
                        min_value=0.0,
                        max_value=1.0,
                        value=float(stored.get(field, fallback)),
                        step=0.05,
                    )

    with st.expander(tr("Advanced Parameters"), expanded=False):
        left_col, right_col = st.columns(2)
        with left_col:
            temperature = st.slider(
                tr("Sampling Temperature"),
                min_value=0.1,
                max_value=2.0,
                value=float(stored.get("temperature", 0.8)),
                step=0.1,
                help=tr("Sampling Temperature Help"),
            )
            top_p = st.slider(
                "Top P",
                min_value=0.0,
                max_value=1.0,
                value=float(stored.get("top_p", 0.8)),
                step=0.05,
                help=tr("Top P Help"),
            )
            top_k = st.slider(
                "Top K",
                min_value=0,
                max_value=100,
                value=int(stored.get("top_k", 30)),
                step=5,
                help=tr("Top K Help"),
            )
            max_text_tokens_per_segment = st.slider(
                tr("Max Text Tokens Per Segment"),
                min_value=20,
                max_value=600,
                value=int(stored.get("max_text_tokens_per_segment", 120)),
                step=10,
                help=tr("Max Text Tokens Per Segment Help"),
            )
        with right_col:
            num_beams = st.slider(
                tr("Num Beams"),
                min_value=1,
                max_value=10,
                value=int(stored.get("num_beams", 3)),
                step=1,
                help=tr("Num Beams Help"),
            )
            repetition_penalty = st.slider(
                tr("Repetition Penalty"),
                min_value=0.1,
                max_value=20.0,
                value=float(stored.get("repetition_penalty", 10.0)),
                step=0.1,
                help=tr("Repetition Penalty Help"),
            )
            max_mel_tokens = st.slider(
                tr("Max Mel Tokens"),
                min_value=50,
                max_value=1815,
                value=int(stored.get("max_mel_tokens", 1500)),
                step=10,
                help=tr("Max Mel Tokens Help"),
            )

    with st.expander(tr("IndexTTS2 Usage Instructions Title"), expanded=False):
        st.markdown(tr("IndexTTS2 Usage Instructions"))

    # Write everything back in one pass.
    pending = {
        "api_url": api_url,
        "reference_audio_source": reference_audio_source,
        "reference_audio": reference_audio,
        "emotion_mode": emotion_mode,
        "emotion_audio": emotion_audio,
        "emotion_alpha": emotion_alpha,
        "emotion_text": emotion_text,
        "use_random": use_random,
        "max_text_tokens_per_segment": max_text_tokens_per_segment,
        **emotion_vector_values,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "num_beams": num_beams,
        "repetition_penalty": repetition_penalty,
        "max_mel_tokens": max_mel_tokens,
    }
    for option_name, option_value in pending.items():
        config.indextts2[option_name] = option_value

    if reference_audio:
        config.ui["voice_name"] = f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}"

    # This panel offers no rate or pitch control, so both are set to 1.0.
    st.session_state['voice_rate'] = 1.0
    st.session_state['voice_pitch'] = 1.0
def render_omnivoice_tts_settings(tr):
    """Render the OmniVoice TTS settings panel.

    Shows the API URL, language code and generation mode, then the
    mode-specific inputs (an instruction text for ``voice_design``; a
    reference audio plus its transcript for ``voice_clone``) and the advanced
    parameters. Every value is written back to ``config.omnivoice``, and
    ``config.ui["voice_name"]`` plus the session voice rate and pitch are
    updated.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
    """
    stored = config.omnivoice

    api_url = st.text_input(
        tr("API URL"),
        value=stored.get("api_url", "http://127.0.0.1:7866/tts"),
        help=tr("OmniVoice API URL Help"),
    )
    language = st.text_input(
        tr("OmniVoice Language Code"),
        value=stored.get("language", "zh"),
        help=tr("OmniVoice Language Code Help"),
        placeholder="zh",
    )

    # Mode ids and their translated labels live in two parallel lists.
    mode_ids = ["auto", "voice_design", "voice_clone"]
    mode_labels = [
        tr("OmniVoice Mode Auto"),
        tr("OmniVoice Mode Voice Design"),
        tr("OmniVoice Mode Voice Clone"),
    ]
    stored_mode = stored.get("mode", "auto")
    # An unrecognised stored mode falls back to "auto", the first entry.
    default_mode_index = mode_ids.index(stored_mode) if stored_mode in mode_ids else 0
    chosen_mode_index = st.selectbox(
        tr("OmniVoice Generation Mode"),
        options=range(len(mode_ids)),
        index=default_mode_index,
        format_func=lambda position: mode_labels[position],
        help=tr("OmniVoice Generation Mode Help"),
    )
    mode = mode_ids[chosen_mode_index]

    # Start from the stored values; only the active mode shows editors.
    instruct = stored.get("instruct", "")
    reference_audio_source = stored.get("reference_audio_source", "resource")
    reference_audio = stored.get("reference_audio", "")
    ref_text = stored.get("ref_text", "")

    if mode == "voice_clone":
        reference_audio_source, reference_audio = render_indextts_reference_audio_selector(
            tr,
            stored,
            "omnivoice",
        )
        ref_text = st.text_area(
            tr("OmniVoice Reference Text"),
            value=ref_text,
            help=tr("OmniVoice Reference Text Help"),
            placeholder=tr("OmniVoice Reference Text Placeholder"),
            height=90,
        )
    elif mode == "voice_design":
        instruct = st.text_area(
            tr("OmniVoice Instruct"),
            value=instruct,
            help=tr("OmniVoice Instruct Help"),
            placeholder=tr("OmniVoice Instruct Placeholder"),
            height=80,
        )

    with st.expander(tr("Advanced Parameters"), expanded=False):
        left_col, right_col = st.columns(2)
        with left_col:
            num_step = st.slider(
                "Num Step",
                min_value=4,
                max_value=64,
                value=int(stored.get("num_step", 32)),
                step=1,
                help=tr("OmniVoice Num Step Help"),
            )
            guidance_scale = st.slider(
                "Guidance Scale",
                min_value=0.1,
                max_value=10.0,
                value=float(stored.get("guidance_scale", 2.0)),
                step=0.1,
                help=tr("OmniVoice Guidance Scale Help"),
            )
            voice_rate = st.slider(
                tr("Voice Rate"),
                min_value=0.5,
                max_value=2.0,
                value=float(stored.get("speed", 1.0)),
                step=0.1,
                help=tr("Voice Rate Help 0.5-2.0"),
            )
        with right_col:
            # The duration is stored as "" when unset; the input shows that as 0.
            stored_duration = stored.get("duration", "")
            if stored_duration in (None, ""):
                initial_duration = 0.0
            else:
                initial_duration = float(stored_duration)
            duration = st.number_input(
                tr("OmniVoice Duration"),
                min_value=0.0,
                max_value=120.0,
                value=initial_duration,
                step=0.5,
                help=tr("OmniVoice Duration Help"),
            )
            denoise = st.checkbox(
                tr("OmniVoice Denoise"),
                value=bool(stored.get("denoise", True)),
                help=tr("OmniVoice Denoise Help"),
            )
            postprocess_output = st.checkbox(
                tr("OmniVoice Postprocess Output"),
                value=bool(stored.get("postprocess_output", True)),
                help=tr("OmniVoice Postprocess Output Help"),
            )
            preprocess_prompt = st.checkbox(
                tr("OmniVoice Preprocess Prompt"),
                value=bool(stored.get("preprocess_prompt", True)),
                help=tr("OmniVoice Preprocess Prompt Help"),
            )

    with st.expander(tr("OmniVoice Usage Instructions Title"), expanded=False):
        st.markdown(tr("OmniVoice Usage Instructions"))

    # Write everything back in one pass; a zero duration is stored as "".
    pending = {
        "api_url": api_url,
        "language": language,
        "mode": mode,
        "instruct": instruct,
        "reference_audio_source": reference_audio_source,
        "reference_audio": reference_audio,
        "ref_text": ref_text,
        "num_step": num_step,
        "guidance_scale": guidance_scale,
        "speed": voice_rate,
        "duration": duration if duration > 0 else "",
        "denoise": denoise,
        "postprocess_output": postprocess_output,
        "preprocess_prompt": preprocess_prompt,
    }
    for option_name, option_value in pending.items():
        config.omnivoice[option_name] = option_value

    # The voice name carries the reference audio in clone mode, else the mode id.
    use_reference = mode == "voice_clone" and reference_audio
    voice_suffix = reference_audio if use_reference else mode
    config.ui["voice_name"] = f"{config.OMNIVOICE_VOICE_PREFIX}{voice_suffix}"

    st.session_state["voice_rate"] = voice_rate
    st.session_state["voice_pitch"] = 1.0
def render_doubaotts_settings(tr):
    """Render the Doubao TTS settings panel.

    Collects the credentials (Access Key, Secret Key, AppID, Token), the
    cluster name, the voice and the advanced audio parameters, then writes
    them to ``config.doubaotts`` / ``config.ui`` and reports whether all four
    credentials are filled in.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
    """
    # Access Key input
    ak = st.text_input(
        "Access Key",
        value=config.doubaotts.get("ak", ""),
        help=tr("Volcengine Access Key Help")
    )
    # Secret Key input
    sk = st.text_input(
        "Secret Key",
        value=config.doubaotts.get("sk", ""),
        type="password",
        help=tr("Volcengine Secret Key Help")
    )
    # AppID input
    appid = st.text_input(
        "AppID",
        value=config.doubaotts.get("appid", ""),
        help=tr("Doubao AppID Help")
    )
    # Token input
    token = st.text_input(
        "Token",
        value=config.doubaotts.get("token", ""),
        type="password",
        help=tr("Doubao Token Help")
    )
    # Cluster name
    cluster = st.text_input(
        tr("Cluster"),
        value=config.doubaotts.get("cluster", "volcano_tts"),
        help=tr("Doubao Cluster Help")
    )

    # Voice selection: voice id -> display name.
    voice_options = {
        "BV700_V2_streaming": "灿灿 2.0",
        "BV705_streaming": "炀炀",
        "BV701_V2_streaming": "擎苍 2.0",
        "BV001_V2_streaming": "通用女声 2.0",
        "BV700_streaming": "灿灿",
        "BV406_V2_streaming": "超自然音色-梓梓2.0",
        "BV406_streaming": "超自然音色-梓梓",
        "BV407_V2_streaming": "超自然音色-燃燃2.0",
        "BV407_streaming": "超自然音色-燃燃",
        "BV001_streaming": "通用女声",
        "BV002_streaming": "通用男声",
        "BV701_streaming": "擎苍",
        "BV123_streaming": "阳光青年",
        "BV120_streaming": "反卷青年",
        "BV119_streaming": "通用赘婿",
        "BV115_streaming": "古风少御",
        "BV107_streaming": "霸气青叔",
        "BV100_streaming": "质朴青年",
        "BV104_streaming": "温柔淑女",
        "BV004_streaming": "开朗青年",
        "BV113_streaming": "甜宠少御",
        "BV102_streaming": "儒雅青年",
        "BV405_streaming": "甜美小源",
        "BV007_streaming": "亲切女声",
        "BV009_streaming": "知性女声",
        "BV419_streaming": "诚诚",
        "BV415_streaming": "童童",
        "BV008_streaming": "亲切男声",
        "BV408_streaming": "译制片男声",
        "BV426_streaming": "懒小羊",
        "BV428_streaming": "清新文艺女声",
        "BV403_streaming": "鸡汤女声",
        "BV158_streaming": "智慧老者",
        "BV157_streaming": "慈爱姥姥",
        "BR001_streaming": "说唱小哥",
        "BV410_streaming": "活力解说男",
        "BV411_streaming": "影视解说小帅",
        "BV437_streaming": "解说小帅-多情感",
        "BV412_streaming": "影视解说小美",
        "BV159_streaming": "纨绔青年",
        "BV418_streaming": "直播一姐",
        "BV142_streaming": "沉稳解说男",
        "BV143_streaming": "潇洒青年",
        "BV056_streaming": "阳光男声",
        "BV005_streaming": "活泼女声",
        "BV064_streaming": "小萝莉",
        "BV051_streaming": "奶气萌娃",
        "BV063_streaming": "动漫海绵",
        "BV417_streaming": "动漫海星",
        "BV050_streaming": "动漫小新",
        "BV061_streaming": "天才童声",
        "BV401_streaming": "促销男声",
        "BV402_streaming": "促销女声",
        "BV006_streaming": "磁性男声",
        "BV011_streaming": "新闻女声",
        "BV012_streaming": "新闻男声",
        "BV034_streaming": "知性姐姐-双语",
        "BV033_streaming": "温柔小哥",
        "BV511_streaming": "慵懒女声-Ava",
        "BV505_streaming": "议论女声-Alicia",
        "BV138_streaming": "情感女声-Lawrence",
        "BV027_streaming": "美式女声-Amelia",
        "BV502_streaming": "讲述女声-Amanda",
        "BV503_streaming": "活力女声-Ariana",
        "BV504_streaming": "活力男声-Jackson",
        "BV421_streaming": "天才少女",
        "BV702_streaming": "Stefan",
        "BV506_streaming": "天真萌娃-Lily",
        "BV040_streaming": "亲切女声-Anna",
        "BV516_streaming": "澳洲男声-Henry",
        "BV520_streaming": "元气少女",
        "BV521_streaming": "萌系少女",
        "BV522_streaming": "气质女声",
        "BV524_streaming": "日语男声",
        "BV531_streaming": "活力男声Carlos巴西地区",
        "BV530_streaming": "活力女声(巴西地区)",
        "BV065_streaming": "气质御姐(墨西哥地区)",
        "BV021_streaming": "东北老铁",
        "BV020_streaming": "东北丫头",
        "BV704_streaming": "方言灿灿",
        "BV210_streaming": "西安佟掌柜",
        "BV217_streaming": "沪上阿姐",
        "BV213_streaming": "广西表哥",
        "BV025_streaming": "甜美台妹",
        "BV227_streaming": "台普男声",
        "BV026_streaming": "港剧男神",
        "BV424_streaming": "广东女仔",
        "BV212_streaming": "相声演员",
        "BV019_streaming": "重庆小伙",
        "BV221_streaming": "四川甜妹儿",
        "BV423_streaming": "重庆幺妹儿",
        "BV214_streaming": "乡村企业家",
        "BV226_streaming": "湖南妹坨",
        "BV216_streaming": "长沙靓女"
    }
    saved_voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
    # A saved id that is not in the list is kept as an extra "custom" entry.
    if saved_voice_type not in voice_options:
        voice_options[saved_voice_type] = f"{tr('Custom Voice')} ({saved_voice_type})"

    # Select on the voice id and only format it as a display name, so the id
    # never has to be recovered by searching the display names.
    voice_ids = list(voice_options)
    voice_type = st.selectbox(
        tr("Voice Selection"),
        options=voice_ids,
        index=voice_ids.index(saved_voice_type),
        format_func=lambda voice_id: voice_options[voice_id],
        help=tr("Select Doubao TTS Voice")
    )

    # Advanced parameters. Stored values are cast to float so that an integer
    # in the config (e.g. 1) matches the float bounds of the sliders.
    with st.expander(tr("Advanced Parameters"), expanded=False):
        col1, col2 = st.columns(2)
        with col1:
            # Speech rate
            voice_rate = st.slider(
                tr("Voice Rate"),
                min_value=0.2,
                max_value=3.0,
                value=float(config.ui.get("doubaotts_rate", 1.0)),
                step=0.1,
                help=tr("Voice Rate Help 0.2-3.0")
            )
            # Volume
            voice_volume = st.slider(
                tr("Voice Volume"),
                min_value=0.1,
                max_value=2.0,
                value=float(config.doubaotts.get("volume", 1.0)),
                step=0.1,
                help=tr("Voice Volume Help 0.1-2.0")
            )
        with col2:
            # Pitch
            voice_pitch = st.slider(
                tr("Voice Pitch"),
                min_value=0.5,
                max_value=1.5,
                value=float(config.doubaotts.get("pitch", 1.0)),
                step=0.1,
                help=tr("Voice Pitch Help 0.5-1.5")
            )
            # Silence appended at the end of each sentence
            silence_duration = st.slider(
                tr("Sentence Silence Duration"),
                min_value=0.0,
                max_value=2.0,
                value=float(config.doubaotts.get("silence_duration", 0.125)),
                step=0.05,
                help=tr("Sentence Silence Duration Help")
            )

    # How to apply for the API credentials
    with st.expander(tr("Doubao TTS API Key Application Process"), expanded=False):
        st.write(f"**{tr('Application Steps')}:**")
        st.write(tr("Doubao TTS Step 1"))
        st.write(tr("Doubao TTS Step 2"))
        st.write(tr("Doubao TTS Step 3"))
        st.write(tr("Doubao TTS Step 4"))
        st.write(tr("Doubao TTS Step 5"))
        st.write(tr("Doubao TTS Step 6"))
        st.write("")
        st.info(tr("Doubao TTS Fill Credentials Notice"))

    # Persist the configuration
    config.doubaotts["ak"] = ak
    config.doubaotts["sk"] = sk
    config.doubaotts["appid"] = appid
    config.doubaotts["token"] = token
    config.doubaotts["cluster"] = cluster
    config.doubaotts["volume"] = voice_volume
    config.doubaotts["pitch"] = voice_pitch
    config.doubaotts["silence_duration"] = silence_duration
    config.ui["doubaotts_voice_type"] = voice_type
    config.ui["doubaotts_rate"] = voice_rate
    config.ui["voice_name"] = voice_type  # kept for compatibility
    st.session_state['voice_rate'] = voice_rate  # make the rate available via session state

    # Report configuration status: list whichever credentials are still empty.
    credentials = {
        "Access Key": ak,
        "Secret Key": sk,
        "AppID": appid,
        "Token": token,
    }
    missing = [field_name for field_name, field_value in credentials.items() if not field_value]
    if missing:
        st.warning(tr("Please configure missing fields").format(fields=', '.join(missing)))
    else:
        st.success(tr("Doubao TTS configured"))
def render_voice_preview_new(tr, selected_engine):
    """Render the voice preview button and play a synthesized sample.

    When the button is pressed, the voice name, rate and pitch are resolved
    from the configuration of ``selected_engine``, a short sample is
    synthesized into a temporary file and played back. The temporary file is
    removed afterwards, whether or not synthesis succeeded.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
        selected_engine: Identifier of the TTS engine whose settings are used.
    """
    if not st.button(tr("Preview Voice Synthesis"), use_container_width=True):
        return

    play_content = tr("Voice Preview Sample")

    # Resolve the voice settings for the selected engine.
    voice_name = ""
    voice_rate = 1.0
    voice_pitch = 1.0
    if selected_engine == "edge_tts":
        voice_name = config.ui.get("edge_voice_name", "zh-CN-XiaoyiNeural-Female")
        voice_rate = config.ui.get("edge_rate", 1.0)
        voice_pitch = 1.0 + (config.ui.get("edge_pitch", 0) / 100.0)
    elif selected_engine == "azure_speech":
        voice_name = config.ui.get("azure_voice_name", "zh-CN-XiaoxiaoMultilingualNeural")
        voice_rate = config.ui.get("azure_rate", 1.0)
        voice_pitch = 1.0 + (config.ui.get("azure_pitch", 0) / 100.0)
    elif selected_engine == "soulvoice":
        voice_uri = config.soulvoice.get("voice_uri", "")
        if voice_uri:
            # Anything not already prefixed with "soulvoice:" (including
            # "speech:" URIs) gets the prefix added.
            if voice_uri.startswith("soulvoice:"):
                voice_name = voice_uri
            else:
                voice_name = f"soulvoice:{voice_uri}"
        # SoulVoice previews use the default rate and pitch of 1.0.
    elif selected_engine == "tencent_tts":
        voice_type = config.ui.get("tencent_voice_type", "101001")
        voice_name = f"tencent:{voice_type}"
        voice_rate = config.ui.get("tencent_rate", 1.0)
    elif selected_engine == "qwen3_tts":
        vt = config.ui.get("qwen_voice_type", "Cherry")
        voice_name = f"qwen3:{vt}"
        voice_rate = config.ui.get("qwen3_rate", 1.0)
    elif selected_engine == config.INDEXTTS_ENGINE:
        reference_audio = config.indextts.get("reference_audio", "")
        if reference_audio:
            voice_name = f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}"
    elif selected_engine == config.INDEXTTS2_ENGINE:
        reference_audio = config.indextts2.get("reference_audio", "")
        if reference_audio:
            voice_name = f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}"
    elif selected_engine == config.OMNIVOICE_ENGINE:
        mode = config.omnivoice.get("mode", "auto")
        reference_audio = config.omnivoice.get("reference_audio", "")
        if mode == "voice_clone" and reference_audio:
            voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}"
        else:
            voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{mode}"
        voice_rate = config.omnivoice.get("speed", 1.0)
    elif selected_engine == "doubaotts":
        voice_name = config.ui.get("doubaotts_voice_type", "BV700_streaming")
        voice_rate = config.ui.get("doubaotts_rate", 1.0)
        # The preview keeps the pitch at 1.0 for Doubao TTS.

    if not voice_name:
        st.error(tr("Please configure voice settings first"))
        return

    with st.spinner(tr("Synthesizing Voice")):
        temp_dir = utils.storage_dir("temp", create=True)
        # These three engines are previewed as WAV; all others as MP3.
        audio_format = "audio/wav" if selected_engine in (
            config.INDEXTTS_ENGINE,
            config.INDEXTTS2_ENGINE,
            config.OMNIVOICE_ENGINE,
        ) else "audio/mp3"
        audio_extension = ".wav" if audio_format == "audio/wav" else ".mp3"
        audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}{audio_extension}")
        try:
            sub_maker = voice.tts(
                text=play_content,
                voice_name=voice_name,
                voice_rate=voice_rate,
                voice_pitch=voice_pitch,
                voice_file=audio_file,
                tts_engine=st.session_state.get('tts_engine')
            )
            if sub_maker and os.path.exists(audio_file):
                st.success(tr("Voice synthesis successful"))
                # Read the bytes first so the player does not depend on the file.
                with open(audio_file, 'rb') as audio_file_obj:
                    audio_bytes = audio_file_obj.read()
                st.audio(audio_bytes, format=audio_format)
            else:
                st.error(tr("Voice synthesis failed"))
        finally:
            # Best-effort cleanup, also on failure; a missing or locked file
            # must not break the page.
            try:
                os.remove(audio_file)
            except OSError:
                pass
def render_azure_v2_settings(tr):
    """Render the Azure V2 speech settings (kept for compatibility).

    Shows the speech region and speech key inputs, pre-filled from
    ``config.azure``, and writes the entered values back to it.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
    """
    region_default = config.azure.get("speech_region", "")
    key_default = config.azure.get("speech_key", "")

    entered_region = st.text_input(tr("Speech Region"), value=region_default)
    # The key is masked in the UI.
    entered_key = st.text_input(tr("Speech Key"), value=key_default, type="password")

    for option_name, option_value in (
        ("speech_region", entered_region),
        ("speech_key", entered_key),
    ):
        config.azure[option_name] = option_value
def render_voice_parameters(tr, voice_name):
    """Render the volume, rate and pitch controls (kept for compatibility).

    The chosen values are stored in ``st.session_state`` under
    ``voice_volume``, ``voice_rate`` and ``voice_pitch``. SoulVoice voices get
    a rate slider and a fixed pitch of 1.0; all other voices pick rate and
    pitch from a preset list.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
        voice_name: Name of the selected voice; decides which controls appear.
    """
    # Preset rate/pitch choices; index 2 selects 1.0 as the default.
    preset_steps = (0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0)

    # Volume uses the shared project-wide defaults.
    st.session_state['voice_volume'] = st.slider(
        tr("Speech Volume"),
        min_value=AudioVolumeDefaults.MIN_VOLUME,
        max_value=AudioVolumeDefaults.MAX_VOLUME,
        value=AudioVolumeDefaults.VOICE_VOLUME,
        step=0.01,
        help=tr("Adjust the volume of the original audio"),
    )

    soulvoice_selected = voice.is_soulvoice_voice(voice_name)

    # Speech rate: preset list for regular voices, slider for SoulVoice.
    if not soulvoice_selected:
        chosen_rate = st.selectbox(
            tr("Speech Rate"),
            options=list(preset_steps),
            index=2,
        )
    else:
        chosen_rate = st.slider(
            tr("Speech Rate"),
            min_value=0.5,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="SoulVoice 语音速度控制",
        )
    st.session_state['voice_rate'] = chosen_rate

    # Pitch: SoulVoice gets a fixed 1.0 and a notice instead of a control.
    if soulvoice_selected:
        st.session_state['voice_pitch'] = 1.0
        st.info(tr("SoulVoice pitch not supported"))
    else:
        st.session_state['voice_pitch'] = st.selectbox(
            tr("Speech Pitch"),
            options=list(preset_steps),
            index=2,
        )
def render_voice_preview(tr, voice_name):
    """Render the "play voice" button and play a synthesized sample.

    When the button is pressed, a fixed sample sentence is synthesized with
    the rate and pitch stored in ``st.session_state``. If that fails, a
    fallback English sentence is tried once. The temporary audio file is
    removed afterwards, whether or not synthesis succeeded.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
        voice_name: Name of the voice to synthesize with.
    """
    if not st.button(tr("Play Voice")):
        return

    # The sample text is a non-empty literal, so no fallback to the video
    # script or a translated example is needed.
    play_content = "感谢关注 NarratoAI有任何问题或建议可以关注微信公众号求助或讨论"

    with st.spinner(tr("Synthesizing Voice")):
        temp_dir = utils.storage_dir("temp", create=True)
        audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")

        def synthesize(text):
            """Synthesize *text* into ``audio_file`` with the session's rate and pitch."""
            return voice.tts(
                text=text,
                voice_name=voice_name,
                voice_rate=st.session_state.get('voice_rate', 1.0),
                voice_pitch=st.session_state.get('voice_pitch', 1.0),
                voice_file=audio_file,
            )

        try:
            sub_maker = synthesize(play_content)
            # If synthesis failed, retry once with the default content.
            if not sub_maker:
                sub_maker = synthesize(
                    "This is a example voice. if you hear this, the voice synthesis failed with the original content."
                )
            if sub_maker and os.path.exists(audio_file):
                st.success(tr("Voice synthesis successful"))
                # Read the bytes first so playback does not depend on the
                # temporary file, which is deleted right after.
                with open(audio_file, "rb") as audio_file_obj:
                    audio_bytes = audio_file_obj.read()
                st.audio(audio_bytes, format="audio/mp3")
            else:
                st.error(tr("Voice synthesis failed"))
        finally:
            # Best-effort cleanup, also when synthesis failed.
            try:
                os.remove(audio_file)
            except OSError:
                pass
def render_bgm_settings(tr):
    """Render the background-music settings.

    Lets the user pick a track from the resource directory, upload one, or
    disable background music, offers a preview, and stores the result in
    ``st.session_state`` under ``bgm_source``, ``bgm_type``, ``bgm_file``,
    ``bgm_name`` and ``bgm_volume``.

    Args:
        tr: Callable that maps a translation key to the text shown in the UI.
    """
    saved_bgm_file = st.session_state.get('bgm_file', '')
    saved_bgm_source = st.session_state.get('bgm_source', 'resource')
    # This function stores an empty bgm_type when no music is in effect;
    # restore that as the "none" source.
    if st.session_state.get('bgm_type') == "":
        saved_bgm_source = "none"

    # Translated label -> internal source id.
    bgm_source_options = {
        tr("Select from Resource Directory"): "resource",
        tr("Upload Background Music"): "upload",
        tr("No Background Music"): "none",
    }
    if saved_bgm_source not in bgm_source_options.values():
        saved_bgm_source = "resource"
    default_bgm_source_label = next(
        label
        for label, source_value in bgm_source_options.items()
        if source_value == saved_bgm_source
    )

    st.markdown(f"**{tr('Background Music')}**")
    bgm_source_label = st.pills(
        tr("Background Music Source"),
        options=list(bgm_source_options.keys()),
        selection_mode="single",
        default=default_bgm_source_label,
        key="bgm_source_selection",
        help=tr("Background Music Source Help"),
        label_visibility="collapsed",
        width="stretch",
    )
    # The pills widget may return nothing; fall back to the default label.
    if not bgm_source_label:
        bgm_source_label = default_bgm_source_label
    bgm_source = bgm_source_options[bgm_source_label]

    bgm_file = ""
    bgm_name = ""
    if bgm_source == "resource":
        bgm_options = get_bgm_resource_options()
        if bgm_options:
            selected_bgm_index = get_bgm_resource_index(bgm_options, saved_bgm_file)
            select_col, preview_col = st.columns([5, 1])
            with select_col:
                selected_bgm_option = bgm_options[st.selectbox(
                    tr("Background Music"),
                    options=range(len(bgm_options)),
                    index=selected_bgm_index,
                    format_func=lambda x: format_bgm_resource_option(bgm_options[x]),
                    help=tr("Background Music Path Help"),
                    label_visibility="collapsed"
                )]
            bgm_file = selected_bgm_option["path"]
            bgm_name = selected_bgm_option["title"]
            with preview_col:
                render_bgm_preview_button(
                    bgm_file,
                    "resource_bgm_preview",
                    tr,
                )
        else:
            st.warning(tr("No Background Music Resources Found"))
    elif bgm_source == "upload":
        # A file saved under another source must not be shown as an upload.
        if st.session_state.get('bgm_source') != "upload":
            saved_bgm_file = ""
        bgm_file = saved_bgm_file if saved_bgm_file and os.path.isfile(saved_bgm_file) else ""
        bgm_name = os.path.splitext(os.path.basename(bgm_file))[0] if bgm_file else ""
        upload_col, preview_col = st.columns([5, 1])
        with upload_col:
            uploaded_file = st.file_uploader(
                tr("Upload Background Music File"),
                type=[extension.lstrip(".") for extension in BGM_AUDIO_EXTENSIONS],
                help=tr("Upload Background Music Help"),
                label_visibility="collapsed"
            )
            if uploaded_file is not None:
                # The file name comes from the client. Keep only its final
                # path component so it cannot point outside the upload dir.
                safe_name = os.path.basename(uploaded_file.name)
                target_dir = utils.storage_dir(BGM_UPLOAD_SUBDIR, create=True)
                bgm_file = os.path.join(target_dir, f"uploaded_{safe_name}")
                with open(bgm_file, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                bgm_name = os.path.splitext(safe_name)[0]
                st.success(tr("Background Music uploaded").format(path=bgm_file))
        with preview_col:
            render_bgm_preview_button(
                bgm_file,
                "upload_bgm_preview",
                tr,
            )

    # Play the track only when the previewed path is the one currently selected.
    preview_bgm_path = st.session_state.get("bgm_preview_path", "")
    if preview_bgm_path == bgm_file and os.path.isfile(preview_bgm_path):
        with open(preview_bgm_path, "rb") as audio_file:
            st.audio(audio_file.read(), format=get_audio_mime_type(preview_bgm_path))

    # No source or no file means no background music at all.
    bgm_type = "" if bgm_source == "none" or not bgm_file else "custom"
    st.session_state['bgm_source'] = bgm_source
    st.session_state['bgm_type'] = bgm_type
    st.session_state['bgm_file'] = bgm_file if bgm_type else ""
    st.session_state['bgm_name'] = bgm_name if bgm_type else ""

    # Background-music volume uses the shared project-wide defaults.
    bgm_volume = st.slider(
        tr("Background Music Volume"),
        min_value=AudioVolumeDefaults.MIN_VOLUME,
        max_value=AudioVolumeDefaults.MAX_VOLUME,
        value=AudioVolumeDefaults.BGM_VOLUME,
        step=0.01,
        help=tr("Adjust the volume of the original audio")
    )
    st.session_state['bgm_volume'] = bgm_volume
def get_audio_params():
    """Collect the current audio parameters into a dict.

    The voice name comes from ``config.ui``; everything else is read from
    ``st.session_state`` with a default for keys that are not set yet.

    Returns:
        dict: Keys ``voice_name``, ``voice_volume``, ``voice_rate``,
        ``voice_pitch``, ``bgm_name``, ``bgm_type``, ``bgm_file``,
        ``bgm_volume`` and ``tts_engine``.
    """
    state = st.session_state
    params = dict(
        voice_name=config.ui.get("voice_name", ""),
        voice_volume=state.get('voice_volume', AudioVolumeDefaults.VOICE_VOLUME),
        voice_rate=state.get('voice_rate', 1.0),
        voice_pitch=state.get('voice_pitch', 1.0),
        bgm_name=state.get('bgm_name', ''),
        bgm_type=state.get('bgm_type', 'random'),
        bgm_file=state.get('bgm_file', ''),
        bgm_volume=state.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME),
        tts_engine=state.get('tts_engine', config.INDEXTTS_ENGINE),
    )
    return params