Merge pull request #237 from aw123456dew/feature/doubao-tts

add doubao tts
This commit is contained in:
viccy 2026-04-08 15:14:10 +08:00 committed by GitHub
commit 8c129790c7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 428 additions and 8 deletions

View File

@ -82,6 +82,7 @@ def save_config():
_cfg["ui"] = ui
_cfg["tts_qwen"] = tts_qwen
_cfg["indextts2"] = indextts2
_cfg["doubaotts"] = doubaotts
f.write(toml.dumps(_cfg))
@ -96,6 +97,7 @@ ui = _cfg.get("ui", {})
frames = _cfg.get("frames", {})
tts_qwen = _cfg.get("tts_qwen", {})
indextts2 = _cfg.get("indextts2", {})
doubaotts = _cfg.get("doubaotts", {})
hostname = socket.gethostname()

View File

@ -1116,6 +1116,125 @@ def should_use_azure_speech_services(voice_name: str) -> bool:
return False
def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Synthesize speech with the Doubao (Volcengine) TTS HTTP API and write it to ``voice_file``.

    Credentials and tuning values are read from ``config.doubaotts``. The request
    is attempted up to three times; a response counts as successful only when the
    HTTP status is 200, the JSON ``code`` field is 3000 and ``data`` is non-empty.

    Args:
        text: Text to synthesize; surrounding whitespace is stripped.
        voice_name: Sent to the API as ``voice_type``.
        voice_file: Path the decoded MP3 bytes are written to.
        speed: Speech rate, clamped to the range 0.2-3.0.

    Returns:
        A fresh, empty SubMaker (no timestamps are produced) on success,
        or None when the configuration is incomplete, the text is empty,
        or every attempt failed.
    """
    # Function-scope imports: hoisted out of the retry loop so they run once per call.
    import base64
    import uuid

    import requests

    def _as_float(raw, fallback: float) -> float:
        """Convert a user-editable config value to float, falling back on bad input."""
        try:
            return float(raw)
        except (TypeError, ValueError):
            return fallback

    # Load the engine configuration; only appid and token are required by this API call.
    doubaotts_cfg = getattr(config, "doubaotts", {}) or {}
    appid = doubaotts_cfg.get("appid", "")
    token = doubaotts_cfg.get("token", "")
    cluster = doubaotts_cfg.get("cluster", "volcano_tts")

    if not appid or not token:
        logger.error("豆包语音 TTS 配置未完成")
        return None

    text = (text or "").strip()
    if not text:
        # Nothing to synthesize; avoid burning three API calls on an empty request.
        logger.error("豆包语音 TTS 文本为空")
        return None

    safe_speed = max(0.2, min(3.0, _as_float(speed, 1.0)))

    # Advanced parameters come from a hand-editable TOML file, so coerce defensively
    # instead of letting a malformed value raise before the request is even built.
    volume = _as_float(doubaotts_cfg.get("volume", 1.0), 1.0)
    pitch = _as_float(doubaotts_cfg.get("pitch", 1.0), 1.0)
    silence_duration = _as_float(doubaotts_cfg.get("silence_duration", 0.125), 0.125)

    payload = {
        "app": {
            "appid": appid,
            "token": token,
            "cluster": cluster
        },
        "user": {
            "uid": "NarratoAI"
        },
        "audio": {
            "voice_type": voice_name,
            "encoding": "mp3",
            "rate": 24000,
            "speed_ratio": safe_speed,
            "volume_ratio": volume,
            "pitch_ratio": pitch
        },
        "request": {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "text_type": "plain",
            "operation": "query"
        }
    }

    # Only send the trailing-silence setting when it is positive.
    # NOTE(review): confirm against the API reference that this field belongs
    # under "audio" and is expressed in seconds.
    if silence_duration > 0:
        payload["audio"]["silence_duration"] = silence_duration

    url = "https://openspeech.bytedance.com/api/v1/tts"

    # The header value is "Bearer;<token>" with a semicolon separator.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{token}"
    }

    # Proxy settings do not change between attempts, so resolve them once.
    proxies = None
    proxy_cfg = getattr(config, "proxy", None) or {}
    if proxy_cfg.get("enabled", False):
        proxy_url = proxy_cfg.get("https", proxy_cfg.get("http", ""))
        if proxy_url:
            proxies = {"https": proxy_url, "http": proxy_url}

    max_attempts = 3
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            logger.info(f"=== 豆包语音 TTS 请求参数 (第 {attempt + 1} 次调用) ===")

            response = requests.post(url, json=payload, headers=headers, proxies=proxies, timeout=60)

            if response.status_code != 200:
                logger.error(f"豆包语音 TTS API 请求失败: {response.status_code}, {response.text}")
            else:
                result = response.json()
                if result.get("code") != 3000:
                    logger.error(f"豆包语音 TTS 失败: {result.get('message', '未知错误')}")
                else:
                    audio_data = result.get("data", "")
                    if not audio_data:
                        logger.error("豆包语音 TTS 响应中无音频数据")
                    else:
                        # The audio arrives base64-encoded inside the JSON body.
                        audio_bytes = base64.b64decode(audio_data)
                        with open(voice_file, "wb") as f:
                            f.write(audio_bytes)

                        logger.success(f"豆包语音 TTS 合成成功: {voice_file}")

                        # Simplified SubMaker: this engine provides no timestamps.
                        return new_sub_maker()

            # Short pause before retrying after an API-level failure.
            if not is_last_attempt:
                time.sleep(1)
        except Exception as e:
            # Best-effort by design: any failure (network, JSON, file I/O) is logged
            # and retried after a longer pause rather than propagated to the caller.
            logger.error(f"豆包语音 TTS 错误: {str(e)}")
            if not is_last_attempt:
                time.sleep(3)

    return None
def tts(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str, tts_engine: str
) -> Union[SubMaker, None]:
@ -1147,6 +1266,10 @@ def tts(
if tts_engine == "indextts2":
logger.info("分发到 IndexTTS2")
return indextts2_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == "doubaotts":
logger.info("分发到豆包语音 TTS")
return doubaotts_tts(text, voice_name, voice_file, speed=voice_rate)
# Fallback for unknown engine - default to azure v1
logger.warning(f"未知的 TTS 引擎: '{tts_engine}', 将默认使用 Edge TTS (Azure V1)。")
@ -1606,8 +1729,8 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
f"或者使用其他 tts 引擎")
continue
else:
# SoulVoice、Qwen3、IndexTTS2 引擎不生成字幕文件
if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2":
# SoulVoice、Qwen3、IndexTTS2、豆包语音 引擎不生成字幕文件
if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2" or tts_engine == "doubaotts":
# 获取实际音频文件的时长
duration = get_audio_duration_from_file(audio_file)
if duration <= 0:
@ -1615,8 +1738,27 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
duration = get_audio_duration(sub_maker)
if duration <= 0:
# 最后的 fallback基于文本长度估算
duration = max(1.0, len(text) / 3.0)
logger.warning(f"无法获取音频时长,使用文本估算: {duration:.2f}")
# 对于英文文本,使用更准确的估算方法
# 英文平均语速约为每分钟150-180个单词即每秒2.5-3个单词
# 对于中文文本约为每秒3-4字
import re
# 计算英文单词数
english_words = len(re.findall(r'\b\w+\b', text))
# 计算中文字符数
chinese_chars = len(re.findall(r'[\u4e00-\u9fa5]', text))
if english_words > chinese_chars:
# 主要是英文文本
# 假设平均每个单词需要0.35秒
estimated_duration = max(1.0, english_words * 0.35)
else:
# 主要是中文文本
# 假设平均每个汉字需要0.3秒
estimated_duration = max(1.0, chinese_chars * 0.3)
# 确保估算时长合理
duration = max(1.0, estimated_duration)
logger.warning(f"无法获取音频时长,使用文本估算: {duration:.2f}秒 (英文单词: {english_words}, 中文字符: {chinese_chars})")
# 不创建字幕文件
subtitle_file = ""
else:
@ -1658,8 +1800,6 @@ def get_audio_duration_from_file(audio_file: str) -> float:
# 但实际文件还包含头部信息,所以调整系数
estimated_duration = max(1.0, file_size / 20000) # 调整为更保守的估算
# 对于中文语音,根据文本长度进行二次校正
# 一般中文语音速度约为 3-4 字/秒
logger.warning(f"使用文件大小估算音频时长: {estimated_duration:.2f}")
return estimated_duration
except Exception as e:

View File

@ -114,9 +114,25 @@
do_sample = true
num_beams = 3
repetition_penalty = 10.0
[doubaotts]
# 豆包语音 TTS 配置
# 申请流程:
# 1. 打开 https://console.volcengine.com/iam/keymanage 新建 Access Key 和 Secret Key
# 2. 打开 https://www.volcengine.com/product/voice-tech 点击立即使用
# 3. 在 API 服务中心找到音频生成下面的语音合成,获取 APPID 和 Token
ak = ""
sk = ""
appid = ""
token = ""
cluster = "volcano_tts"
# 高级参数
volume = 1.0
pitch = 1.0
silence_duration = 0.125
[ui]
# TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen)
# TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen, doubaotts)
tts_engine = "edge_tts"
# Edge TTS 配置
@ -130,6 +146,10 @@
azure_volume = 80
azure_rate = 1.0
azure_pitch = 0
# 豆包语音 TTS 配置
doubaotts_voice_type = "BV700_V2_streaming"
doubaotts_rate = 1.0
##########################################
# 代理和网络配置

View File

@ -26,7 +26,8 @@ def get_tts_engine_options():
"azure_speech": "Azure Speech Services",
"tencent_tts": "腾讯云 TTS",
"qwen3_tts": "通义千问 Qwen3 TTS",
"indextts2": "IndexTTS2 语音克隆"
"indextts2": "IndexTTS2 语音克隆",
"doubaotts": "豆包语音 TTS"
}
@ -62,6 +63,12 @@ def get_tts_engine_descriptions():
"features": "零样本语音克隆,上传参考音频即可合成相同音色的语音,需要本地或私有部署",
"use_case": "下载地址https://pan.quark.cn/s/0767c9bcefd5",
"registration": None
},
"doubaotts": {
"title": "豆包语音 TTS",
"features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快",
"use_case": "需要高质量中文语音合成的用户",
"registration": "https://www.volcengine.com/product/voice-tech"
}
}
@ -147,6 +154,8 @@ def render_tts_settings(tr):
render_qwen3_tts_settings(tr)
elif selected_engine == "indextts2":
render_indextts2_tts_settings(tr)
elif selected_engine == "doubaotts":
render_doubaotts_settings(tr)
# 4. 试听功能
render_voice_preview_new(tr, selected_engine)
@ -703,6 +712,250 @@ def render_indextts2_tts_settings(tr):
config.ui["voice_name"] = f"indextts2:{reference_audio}"
def _doubaotts_slider_value(raw, default: float, low: float, high: float) -> float:
    """Coerce a stored setting to a float inside ``[low, high]`` for use as a slider value."""
    try:
        number = float(raw)
    except (TypeError, ValueError):
        number = default
    return max(low, min(high, number))


def render_doubaotts_settings(tr):
    """Render the Doubao TTS settings panel and write the chosen values back to ``config``.

    Draws credential inputs, a voice selector, and an "advanced parameters"
    expander (rate, volume, pitch, trailing silence), then stores everything in
    ``config.doubaotts`` / ``config.ui`` and mirrors the rate into
    ``st.session_state['voice_rate']``. Finally shows whether all four
    credential fields are filled in.

    Args:
        tr: Translation callable passed by the caller; not used in this panel.
    """
    # Access Key
    ak = st.text_input(
        "Access Key",
        value=config.doubaotts.get("ak", ""),
        help="火山引擎 Access Key"
    )

    # Secret Key (masked)
    sk = st.text_input(
        "Secret Key",
        value=config.doubaotts.get("sk", ""),
        type="password",
        help="火山引擎 Secret Key"
    )

    # AppID
    appid = st.text_input(
        "AppID",
        value=config.doubaotts.get("appid", ""),
        help="豆包语音应用 AppID"
    )

    # Token (masked)
    token = st.text_input(
        "Token",
        value=config.doubaotts.get("token", ""),
        type="password",
        help="豆包语音应用 Token"
    )

    # Cluster
    cluster = st.text_input(
        "集群",
        value=config.doubaotts.get("cluster", "volcano_tts"),
        help="业务集群,标准音色使用 volcano_tts"
    )

    # Voice ID -> display name for the online voices.
    voice_options = {
        "BV700_V2_streaming": "灿灿 2.0",
        "BV705_streaming": "炀炀",
        "BV701_V2_streaming": "擎苍 2.0",
        "BV001_V2_streaming": "通用女声 2.0",
        "BV700_streaming": "灿灿",
        "BV406_V2_streaming": "超自然音色-梓梓2.0",
        "BV406_streaming": "超自然音色-梓梓",
        "BV407_V2_streaming": "超自然音色-燃燃2.0",
        "BV407_streaming": "超自然音色-燃燃",
        "BV001_streaming": "通用女声",
        "BV002_streaming": "通用男声",
        "BV701_streaming": "擎苍",
        "BV123_streaming": "阳光青年",
        "BV120_streaming": "反卷青年",
        "BV119_streaming": "通用赘婿",
        "BV115_streaming": "古风少御",
        "BV107_streaming": "霸气青叔",
        "BV100_streaming": "质朴青年",
        "BV104_streaming": "温柔淑女",
        "BV004_streaming": "开朗青年",
        "BV113_streaming": "甜宠少御",
        "BV102_streaming": "儒雅青年",
        "BV405_streaming": "甜美小源",
        "BV007_streaming": "亲切女声",
        "BV009_streaming": "知性女声",
        "BV419_streaming": "诚诚",
        "BV415_streaming": "童童",
        "BV008_streaming": "亲切男声",
        "BV408_streaming": "译制片男声",
        "BV426_streaming": "懒小羊",
        "BV428_streaming": "清新文艺女声",
        "BV403_streaming": "鸡汤女声",
        "BV158_streaming": "智慧老者",
        "BV157_streaming": "慈爱姥姥",
        "BR001_streaming": "说唱小哥",
        "BV410_streaming": "活力解说男",
        "BV411_streaming": "影视解说小帅",
        "BV437_streaming": "解说小帅-多情感",
        "BV412_streaming": "影视解说小美",
        "BV159_streaming": "纨绔青年",
        "BV418_streaming": "直播一姐",
        "BV142_streaming": "沉稳解说男",
        "BV143_streaming": "潇洒青年",
        "BV056_streaming": "阳光男声",
        "BV005_streaming": "活泼女声",
        "BV064_streaming": "小萝莉",
        "BV051_streaming": "奶气萌娃",
        "BV063_streaming": "动漫海绵",
        "BV417_streaming": "动漫海星",
        "BV050_streaming": "动漫小新",
        "BV061_streaming": "天才童声",
        "BV401_streaming": "促销男声",
        "BV402_streaming": "促销女声",
        "BV006_streaming": "磁性男声",
        "BV011_streaming": "新闻女声",
        "BV012_streaming": "新闻男声",
        "BV034_streaming": "知性姐姐-双语",
        "BV033_streaming": "温柔小哥",
        "BV511_streaming": "慵懒女声-Ava",
        "BV505_streaming": "议论女声-Alicia",
        "BV138_streaming": "情感女声-Lawrence",
        "BV027_streaming": "美式女声-Amelia",
        "BV502_streaming": "讲述女声-Amanda",
        "BV503_streaming": "活力女声-Ariana",
        "BV504_streaming": "活力男声-Jackson",
        "BV421_streaming": "天才少女",
        "BV702_streaming": "Stefan",
        "BV506_streaming": "天真萌娃-Lily",
        "BV040_streaming": "亲切女声-Anna",
        "BV516_streaming": "澳洲男声-Henry",
        "BV520_streaming": "元气少女",
        "BV521_streaming": "萌系少女",
        "BV522_streaming": "气质女声",
        "BV524_streaming": "日语男声",
        "BV531_streaming": "活力男声Carlos巴西地区",
        "BV530_streaming": "活力女声(巴西地区)",
        "BV065_streaming": "气质御姐(墨西哥地区)",
        "BV021_streaming": "东北老铁",
        "BV020_streaming": "东北丫头",
        "BV704_streaming": "方言灿灿",
        "BV210_streaming": "西安佟掌柜",
        "BV217_streaming": "沪上阿姐",
        "BV213_streaming": "广西表哥",
        "BV025_streaming": "甜美台妹",
        "BV227_streaming": "台普男声",
        "BV026_streaming": "港剧男神",
        "BV424_streaming": "广东女仔",
        "BV212_streaming": "相声演员",
        "BV019_streaming": "重庆小伙",
        "BV221_streaming": "四川甜妹儿",
        "BV423_streaming": "重庆幺妹儿",
        "BV214_streaming": "乡村企业家",
        "BV226_streaming": "湖南妹坨",
        "BV216_streaming": "长沙靓女"
    }

    # A voice ID saved in config but missing from the table is kept selectable
    # as a "custom voice" entry instead of being silently replaced.
    saved_voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
    if saved_voice_type not in voice_options:
        voice_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})"

    # Select on the voice ID itself and only *display* the friendly name, so the
    # result never depends on display names being unique.
    voice_ids = list(voice_options)
    voice_type = st.selectbox(
        "音色选择",
        options=voice_ids,
        index=voice_ids.index(saved_voice_type),
        format_func=lambda voice_id: voice_options[voice_id],
        help="选择豆包语音 TTS 音色"
    )

    # Advanced parameters. Stored values are coerced to float and clamped to each
    # slider's range first: a hand-edited config (e.g. an integer `1` or an
    # out-of-range number) would otherwise make st.slider raise.
    with st.expander("🔧 高级参数", expanded=False):
        col1, col2 = st.columns(2)

        with col1:
            # Speech rate
            voice_rate = st.slider(
                "语速调节",
                min_value=0.2,
                max_value=3.0,
                value=_doubaotts_slider_value(config.ui.get("doubaotts_rate", 1.0), 1.0, 0.2, 3.0),
                step=0.1,
                help="调节语音速度 (0.2-3.0)"
            )

            # Volume
            voice_volume = st.slider(
                "音量调节",
                min_value=0.1,
                max_value=2.0,
                value=_doubaotts_slider_value(config.doubaotts.get("volume", 1.0), 1.0, 0.1, 2.0),
                step=0.1,
                help="调节语音音量 (0.1-2.0)"
            )

        with col2:
            # Pitch
            voice_pitch = st.slider(
                "音高调节",
                min_value=0.5,
                max_value=1.5,
                value=_doubaotts_slider_value(config.doubaotts.get("pitch", 1.0), 1.0, 0.5, 1.5),
                step=0.1,
                help="调节语音音高 (0.5-1.5)"
            )

            # Trailing silence
            silence_duration = st.slider(
                "句尾静音时长 (秒)",
                min_value=0.0,
                max_value=2.0,
                value=_doubaotts_slider_value(config.doubaotts.get("silence_duration", 0.125), 0.125, 0.0, 2.0),
                step=0.05,
                help="调节句尾静音时长 (0.0-2.0秒)"
            )

    # How to obtain the credentials.
    with st.expander("💡 豆包语音 TTS API Key申请流程", expanded=False):
        st.write("**申请步骤:**")
        st.write("1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)")
        st.write("2. 新建 Access Key 和 Secret Key")
        st.write("3. 打开 [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)")
        st.write("4. 点击立即使用")
        st.write("5. 在最左边的API服务中心找到音频生成下面的语音合成注意是语音合成不是语音合成大模型")
        st.write("6. 翻到最下面获取 APPID 和 Access Token")
        st.write("")
        st.info("💡 请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中")

    # Persist everything back to the in-memory config.
    config.doubaotts["ak"] = ak
    config.doubaotts["sk"] = sk
    config.doubaotts["appid"] = appid
    config.doubaotts["token"] = token
    config.doubaotts["cluster"] = cluster
    config.doubaotts["volume"] = voice_volume
    config.doubaotts["pitch"] = voice_pitch
    config.doubaotts["silence_duration"] = silence_duration
    config.ui["doubaotts_voice_type"] = voice_type
    config.ui["doubaotts_rate"] = voice_rate
    config.ui["voice_name"] = voice_type  # kept for compatibility with code that reads voice_name
    st.session_state['voice_rate'] = voice_rate  # make sure the rate is also available via session state

    # Configuration status: list whichever credential fields are still empty.
    credentials = {
        "Access Key": ak,
        "Secret Key": sk,
        "AppID": appid,
        "Token": token,
    }
    missing = [label for label, value in credentials.items() if not value]
    if missing:
        st.warning(f"⚠️ 请配置: {', '.join(missing)}")
    else:
        st.success("✅ 豆包语音 TTS 配置已设置")
def render_voice_preview_new(tr, selected_engine):
"""渲染新的语音试听功能"""
if st.button("🎵 试听语音合成", use_container_width=True):
@ -746,6 +999,11 @@ def render_voice_preview_new(tr, selected_engine):
voice_name = f"indextts2:{reference_audio}"
voice_rate = 1.0 # IndexTTS2 不支持速度调节
voice_pitch = 1.0 # IndexTTS2 不支持音调调节
elif selected_engine == "doubaotts":
voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
voice_name = voice_type
voice_rate = config.ui.get("doubaotts_rate", 1.0)
voice_pitch = 1.0 # 豆包语音 TTS 不支持音调调节
if not voice_name:
st.error("请先配置语音设置")