新增qwen3 tts服务

This commit is contained in:
harry 2025-10-11 16:58:11 +08:00
parent debf1f95b1
commit 01c8c8097e
6 changed files with 229 additions and 8 deletions

View File

@ -51,6 +51,7 @@ def save_config():
_cfg["tencent"] = tencent
_cfg["soulvoice"] = soulvoice
_cfg["ui"] = ui
_cfg["tts_qwen"] = tts_qwen
f.write(toml.dumps(_cfg))
@ -63,6 +64,7 @@ tencent = _cfg.get("tencent", {})
soulvoice = _cfg.get("soulvoice", {})
ui = _cfg.get("ui", {})
frames = _cfg.get("frames", {})
tts_qwen = _cfg.get("tts_qwen", {})
hostname = socket.gethostname()

View File

@ -1089,6 +1089,10 @@ def tts(
logger.info("分发到腾讯云 TTS")
return tencent_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == "qwen3_tts":
logger.info("分发到 Qwen3 TTS", voice_name)
return qwen3_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == "soulvoice":
logger.info("分发到 SoulVoice TTS")
return soulvoice_tts(text, voice_name, voice_file, speed=voice_rate)
@ -1538,7 +1542,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
continue
else:
# SoulVoice 引擎不生成字幕文件
if is_soulvoice_voice(voice_name):
if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine):
# 获取实际音频文件的时长
duration = get_audio_duration_from_file(audio_file)
if duration <= 0:
@ -1619,6 +1623,111 @@ def parse_tencent_voice(voice_name: str) -> str:
return voice_name
def parse_qwen3_voice(voice_name: str) -> str:
    """
    Strip the ``qwen3:`` prefix from a voice identifier.

    Values that are not strings, or that do not start with the prefix, are
    returned unchanged.
    """
    prefix = "qwen3:"
    if not isinstance(voice_name, str):
        return voice_name
    if not voice_name.startswith(prefix):
        return voice_name
    return voice_name[len(prefix):]
def qwen3_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Synthesize speech with Qwen3 TTS through the DashScope SDK.

    The API key and model name are read from ``config.tts_qwen``. The SDK
    response is searched for an audio URL, which is downloaded and written to
    ``voice_file``. Up to three attempts are made, one second apart.

    Args:
        text: Text to synthesize; surrounding whitespace is stripped.
        voice_name: Voice identifier, with or without the ``qwen3:`` prefix.
            Falls back to ``"Cherry"`` when empty.
        voice_file: Path the downloaded audio bytes are written to.
        speed: Accepted so the signature matches the other engine functions;
            it is not forwarded to the DashScope call made here.

    Returns:
        A ``SubMaker`` holding a single cue whose length is estimated from the
        text length, or ``None`` when the API key is missing, the text is
        empty, the SDK cannot be imported, or every attempt fails.
    """
    max_attempts = 3
    retry_delay_seconds = 1
    default_model = "qwen3-tts-flash"

    # Read configuration.
    tts_qwen_cfg = getattr(config, "tts_qwen", {}) or {}
    api_key = tts_qwen_cfg.get("api_key", "")
    model_name = tts_qwen_cfg.get("model_name", default_model) or default_model

    if not api_key:
        logger.error("Qwen3 TTS API key 未配置")
        return None

    # Nothing to synthesize: bail out before spending API calls and retries.
    text = (text or "").strip()
    if not text:
        logger.error("Qwen3 TTS 文本为空")
        return None

    # The SDK is imported lazily so a missing package only affects this engine.
    try:
        import dashscope
    except ImportError:
        logger.error("未安装 dashscope SDK请执行: pip install dashscope")
        return None
    except Exception as e:
        logger.error(f"DashScope SDK 初始化失败: {e}")
        return None

    # The voice parameter is passed through as-is (no name mapping).
    voice = parse_qwen3_voice(voice_name) or "Cherry"

    for attempt in range(1, max_attempts + 1):
        is_last_attempt = attempt == max_attempts
        try:
            logger.info(f"=== Qwen3 TTS 请求参数 (第 {attempt} 次调用) ===")
            result = dashscope.MultiModalConversation.call(
                model=model_name,
                api_key=api_key,
                text=text,
                voice=voice,
            )
            logger.info(f"Qwen3 TTS API 响应: {result}")

            # Pull the audio URL out of the response and download it. Any
            # failure here is logged and treated as "no audio" so the retry
            # logic below handles it.
            audio_bytes = None
            try:
                audio_url = None
                if result.output and result.output.audio:
                    audio_url = result.output.audio.url
                if audio_url:
                    response = requests.get(audio_url, timeout=30)
                    response.raise_for_status()
                    audio_bytes = response.content
                else:
                    logger.warning("API响应中未找到音频URL")
            except Exception as e:
                logger.error(f"解析API响应失败: {str(e)}")

            if not audio_bytes:
                logger.warning("DashScope SDK 返回空音频数据,重试")
                if not is_last_attempt:
                    time.sleep(retry_delay_seconds)
                continue

            with open(voice_file, "wb") as f:
                f.write(audio_bytes)

            # No timing data is read from the response; the single cue length
            # is a rough estimate (180 per character, minimum 800).
            sub = SubMaker()
            est_ms = max(800, int(len(text) * 180))
            sub.create_sub((0, est_ms), text)
            logger.info(f"Qwen3 TTS 生成成功DashScope SDK文件大小: {len(audio_bytes)} 字节")
            return sub
        except Exception as e:
            logger.error(f"DashScope SDK 合成失败: {e}")
            if not is_last_attempt:
                time.sleep(retry_delay_seconds)

    return None
def tencent_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
"""
使用腾讯云 TTS 生成语音
@ -1819,6 +1928,8 @@ def is_soulvoice_voice(voice_name: str) -> bool:
"""
return voice_name.startswith("soulvoice:") or voice_name.startswith("speech:")
def is_qwen_engine(tts_engine: str) -> bool:
    """Return whether ``tts_engine`` is the Qwen3 TTS engine identifier."""
    qwen_engine_id = "qwen3_tts"
    return tts_engine == qwen_engine_id
def parse_soulvoice_voice(voice_name: str) -> str:
"""

View File

@ -114,8 +114,14 @@
# 默认模型(可选)
model = "FunAudioLLM/CosyVoice2-0.5B"
[tts_qwen]
# 通义千问 Qwen3 TTS 配置
# 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥
api_key = ""
model_name = "qwen3-tts-flash"
[ui]
# TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts)
# TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, qwen3_tts)
tts_engine = "edge_tts"
# Edge TTS 配置

View File

@ -15,6 +15,7 @@ openai>=1.77.0
google-generativeai>=0.8.5
azure-cognitiveservices-speech>=1.37.0
tencentcloud-sdk-python>=3.0.1200
dashscope>=1.24.6
# 图像处理依赖
Pillow>=10.3.0

View File

@ -1,3 +1,4 @@
from venv import logger
import streamlit as st
import os
from uuid import uuid4
@ -24,7 +25,8 @@ def get_tts_engine_options():
return {
"edge_tts": "Edge TTS",
"azure_speech": "Azure Speech Services",
"tencent_tts": "腾讯云 TTS"
"tencent_tts": "腾讯云 TTS",
"qwen3_tts": "通义千问 Qwen3 TTS"
}
@ -48,6 +50,12 @@ def get_tts_engine_descriptions():
"features": "提供免费额度,音质优秀,支持多种音色,国内访问速度快",
"use_case": "个人和企业用户,需要稳定的中文语音合成",
"registration": "https://console.cloud.tencent.com/tts"
},
"qwen3_tts": {
"title": "通义千问 Qwen3 TTS",
"features": "阿里云通义千问语音合成,音质优秀,支持多种音色",
"use_case": "需要高质量中文语音合成的用户",
"registration": "https://dashscope.aliyuncs.com/"
}
}
@ -129,6 +137,8 @@ def render_tts_settings(tr):
render_soulvoice_engine_settings(tr)
elif selected_engine == "tencent_tts":
render_tencent_tts_settings(tr)
elif selected_engine == "qwen3_tts":
render_qwen3_tts_settings(tr)
# 4. 试听功能
render_voice_preview_new(tr, selected_engine)
@ -469,8 +479,87 @@ def render_tencent_tts_settings(tr):
config.tencent["region"] = region
config.ui["tencent_voice_type"] = voice_type
config.ui["tencent_rate"] = voice_rate
config.ui["voice_name"] = saved_voice_type #兼容性
def render_qwen3_tts_settings(tr):
    """
    Render the Qwen3 TTS settings widgets and store their values in config.

    Draws inputs for the DashScope API key, the model name, the voice and the
    speech rate, then writes the current widget values to ``config.tts_qwen``
    and ``config.ui``.

    Args:
        tr: Unused here; kept so the signature matches the other engine
            settings renderers.
    """
    api_key = st.text_input(
        "API Key",
        value=config.tts_qwen.get("api_key", ""),
        type="password",
        help="通义千问 DashScope API Key"
    )

    model_name = st.text_input(
        "模型名称",
        value=config.tts_qwen.get("model_name", "qwen3-tts-flash"),
        help="Qwen TTS 模型名,例如 qwen3-tts-flash"
    )

    # Display name shown to the user -> voice parameter stored in config.
    voice_options = {
        "芊悦": "Cherry",
        "晨煦": "Ethan",
        "不吃鱼": "Nofish",
        "詹妮弗": "Jennifer",
        "甜茶": "Ryan",
        "卡捷琳娜": "Katerina",
        "墨讲师": "Elias",
        "上海-阿珍": "Jada",
        "北京-晓东": "Dylan",
        "四川-晴儿": "Sunny",
        "南京-老李": "Li",
        "陕西-秦川": "Marcus",
        "闽南-阿杰": "Roy",
        "天津-李彼得": "Peter",
        "粤语-阿强": "Rocky",
        "粤语-阿清": "Kiki",
        "四川-程川": "Eric"
    }

    saved_voice_param = config.ui.get("qwen_voice_type", "Cherry")

    # Reverse lookup: which display name maps to the saved parameter?
    saved_display_name = next(
        (name for name, param in voice_options.items() if param == saved_voice_param),
        None,
    )
    if saved_display_name is None:
        saved_key = str(saved_voice_param) if saved_voice_param else ""
        if not saved_key:
            # Nothing usable saved: fall back to the default voice.
            saved_display_name = "芊悦"
        else:
            # A saved voice outside the built-in list is offered as its own
            # option so it is not silently replaced by the default. If the
            # saved value happens to be a display name, reuse that entry.
            if saved_key not in voice_options:
                voice_options[saved_key] = saved_voice_param
            saved_display_name = saved_key

    display_names = list(voice_options.keys())

    selected_display_name = st.selectbox(
        "音色选择",
        options=display_names,
        index=display_names.index(saved_display_name),
        help="选择Qwen3 TTS音色"
    )

    # Voice parameter that corresponds to the chosen display name.
    voice_type = voice_options.get(selected_display_name, "Cherry")

    # Start the slider from the saved rate, coerced to float and clamped to
    # the slider range so an out-of-range or malformed value cannot break it.
    try:
        saved_rate = float(config.ui.get("qwen3_rate", 1.0))
    except (TypeError, ValueError):
        saved_rate = 1.0
    saved_rate = min(2.0, max(0.5, saved_rate))

    voice_rate = st.slider(
        "语速调节",
        min_value=0.5,
        max_value=2.0,
        value=saved_rate,
        step=0.1,
        help="调节语音速度 (0.5-2.0)"
    )

    # Persist the current widget values.
    config.tts_qwen["api_key"] = api_key
    config.tts_qwen["model_name"] = model_name
    config.ui["qwen_voice_type"] = voice_type
    config.ui["qwen3_rate"] = voice_rate
    config.ui["voice_name"] = voice_type  # kept for compatibility with code reading voice_name
def render_voice_preview_new(tr, selected_engine):
"""渲染新的语音试听功能"""
if st.button("🎵 试听语音合成", use_container_width=True):
@ -503,6 +592,11 @@ def render_voice_preview_new(tr, selected_engine):
voice_name = f"tencent:{voice_type}"
voice_rate = config.ui.get("tencent_rate", 1.0)
voice_pitch = 1.0 # 腾讯云 TTS 不支持音调调节
elif selected_engine == "qwen3_tts":
vt = config.ui.get("qwen_voice_type", "Cherry")
voice_name = f"qwen3:{vt}"
voice_rate = config.ui.get("qwen3_rate", 1.0)
voice_pitch = 1.0 # Qwen3 TTS 不支持音调调节
if not voice_name:
st.error("请先配置语音设置")

View File

@ -1,3 +1,5 @@
from loguru import logger
import streamlit as st
from app.config import config
from webui.utils.cache import get_fonts_cache
@ -9,14 +11,15 @@ def render_subtitle_panel(tr):
with st.container(border=True):
st.write(tr("Subtitle Settings"))
# 检查是否选择了 SoulVoice 引擎
# 检查是否选择了 SoulVoice qwen3_tts引擎
from app.services import voice
current_voice = st.session_state.get('voice_name', '')
is_soulvoice = voice.is_soulvoice_voice(current_voice)
# current_voice = st.session_state.get('voice_name', '')
tts_engine = config.ui.get('tts_engine', '')
is_disabled_subtitle = is_disabled_subtitle_settings(tts_engine)
if is_soulvoice:
if is_disabled_subtitle:
# SoulVoice 引擎时显示禁用提示
st.warning("⚠️ SoulVoice TTS 不支持精确字幕生成")
st.warning(f"⚠️ {tts_engine}不支持精确字幕生成")
st.info("💡 建议使用专业剪辑工具如剪映、PR等手动添加字幕")
# 强制禁用字幕
@ -84,6 +87,10 @@ def render_font_settings(tr):
st.session_state['font_size'] = font_size
def is_disabled_subtitle_settings(tts_engine: str) -> bool:
    """Return whether the subtitle settings are disabled for ``tts_engine``."""
    engines_without_subtitles = ("soulvoice", "qwen3_tts")
    return any(tts_engine == engine for engine in engines_without_subtitles)
def render_position_settings(tr):
"""渲染位置设置"""
subtitle_positions = [