新增腾讯云 TTS 服务

This commit is contained in:
Emily-LMH 2025-09-16 14:40:08 +08:00 committed by linyq
parent da27d8d8a1
commit a1474bed02
11 changed files with 348 additions and 44 deletions

View File

@ -22,10 +22,9 @@ RUN python -m pip install --upgrade pip setuptools wheel && \
# 激活虚拟环境
ENV PATH="/opt/venv/bin:$PATH"
# 复制 requirements.txt 并安装 Python 依赖
# 复制 requirements.txt 并使用镜像安装 Python 依赖
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
# 运行阶段
FROM python:3.12-slim-bookworm
@ -48,7 +47,7 @@ ENV PATH="/opt/venv/bin:$PATH" \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8
# 安装运行时系统依赖
# 一次性安装所有依赖、创建用户、配置系统,减少层级
RUN apt-get update && apt-get install -y --no-install-recommends \
imagemagick \
ffmpeg \
@ -56,32 +55,25 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
git-lfs \
ca-certificates \
dos2unix \
&& sed -i 's/<policy domain="path" rights="none" pattern="@\*"/<policy domain="path" rights="read|write" pattern="@\*"/' /etc/ImageMagick-6/policy.xml || true \
&& git lfs install \
&& groupadd -r narratoai && useradd -r -g narratoai -d /NarratoAI -s /bin/bash narratoai \
&& rm -rf /var/lib/apt/lists/*
# 配置 ImageMagick 策略(允许处理更多格式)
RUN sed -i 's/<policy domain="path" rights="none" pattern="@\*"/<policy domain="path" rights="read|write" pattern="@\*"/' /etc/ImageMagick-6/policy.xml || true
# 复制入口脚本并修复换行符问题
COPY --chown=narratoai:narratoai docker-entrypoint.sh /usr/local/bin/
RUN dos2unix /usr/local/bin/docker-entrypoint.sh && chmod +x /usr/local/bin/docker-entrypoint.sh
# 初始化 git-lfs
RUN git lfs install
# 创建非 root 用户(安全最佳实践)
RUN groupadd -r narratoai && useradd -r -g narratoai -d /NarratoAI -s /bin/bash narratoai
# 复制应用代码
# 复制其余的应用代码
COPY --chown=narratoai:narratoai . .
# 确保配置文件存在
RUN if [ ! -f config.toml ]; then cp config.example.toml config.toml; fi
# 创建必要的目录并设置权限
# 创建目录、复制配置、设置权限
RUN mkdir -p storage/temp storage/tasks storage/json storage/narration_scripts storage/drama_analysis && \
if [ ! -f config.toml ]; then cp config.example.toml config.toml; fi && \
chown -R narratoai:narratoai /NarratoAI && \
chmod -R 755 /NarratoAI
# 复制并设置入口点脚本
COPY --chown=narratoai:narratoai docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
# 切换到非 root 用户
USER narratoai
@ -93,5 +85,5 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:8501/_stcore/health || exit 1
# 设置入口点
ENTRYPOINT ["docker-entrypoint.sh"]
CMD ["webui"]
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["webui"]

View File

@ -48,6 +48,7 @@ def save_config():
with open(config_file, "w", encoding="utf-8") as f:
_cfg["app"] = app
_cfg["azure"] = azure
_cfg["tencent"] = tencent
_cfg["soulvoice"] = soulvoice
_cfg["ui"] = ui
f.write(toml.dumps(_cfg))
@ -58,6 +59,7 @@ app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
proxy = _cfg.get("proxy", {})
azure = _cfg.get("azure", {})
tencent = _cfg.get("tencent", {})
soulvoice = _cfg.get("soulvoice", {})
ui = _cfg.get("ui", {})
frames = _cfg.get("frames", {})

View File

@ -176,7 +176,7 @@ class VideoClipParams(BaseModel):
voice_volume: Optional[float] = Field(default=AudioVolumeDefaults.VOICE_VOLUME, description="解说语音音量")
voice_rate: Optional[float] = Field(default=1.0, description="语速")
voice_pitch: Optional[float] = Field(default=1.0, description="语调")
tts_engine: Optional[str] = Field(default="tencent", description="TTS 引擎")
bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
bgm_file: Optional[str] = Field(default="", description="背景音乐文件")

View File

@ -18,7 +18,6 @@ from pathlib import Path
from app.utils import ffmpeg_utils
def parse_timestamp(timestamp: str) -> tuple:
"""
解析时间戳字符串返回开始和结束时间

View File

@ -43,5 +43,5 @@ __all__ = [
'QwenTextProvider',
'DeepSeekTextProvider',
'SiliconflowVisionProvider',
'SiliconflowTextProvider'
'SiliconflowTextProvider',
]

View File

@ -73,6 +73,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
tts_results = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
tts_engine=params.tts_engine,
voice_name=params.voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
@ -317,6 +318,7 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
tts_results = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
tts_engine=params.tts_engine,
voice_name=params.voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,

View File

@ -5,6 +5,7 @@ import traceback
import edge_tts
import asyncio
import requests
import uuid
from loguru import logger
from typing import List, Union, Tuple
from datetime import datetime
@ -1080,17 +1081,27 @@ def should_use_azure_speech_services(voice_name: str) -> bool:
def tts(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str, tts_engine: str = "azure"
) -> Union[SubMaker, None]:
# 检查是否为 SoulVoice 引擎
if is_soulvoice_voice(voice_name):
logger.info(f"使用 TTS 引擎: '{tts_engine}', 语音: '{voice_name}'")
if tts_engine == "tencent":
logger.info("分发到腾讯云 TTS")
return tencent_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == "soulvoice":
logger.info("分发到 SoulVoice TTS")
return soulvoice_tts(text, voice_name, voice_file, speed=voice_rate)
# 检查是否应该使用 Azure Speech Services
if should_use_azure_speech_services(voice_name):
return azure_tts_v2(text, voice_name, voice_file)
if tts_engine == "azure":
if should_use_azure_speech_services(voice_name):
logger.info("分发到 Azure Speech Services (V2)")
return azure_tts_v2(text, voice_name, voice_file)
logger.info("分发到 Edge TTS (Azure V1)")
return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
# 默认使用 Edge TTS (Azure V1)
# Fallback for unknown engine - default to azure v1
logger.warning(f"未知的 TTS 引擎: '{tts_engine}', 将默认使用 Edge TTS (Azure V1)。")
return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
@ -1483,7 +1494,7 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
return sub_maker.offset[-1][1] / 10000000
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float):
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float, tts_engine: str = "azure"):
"""
根据JSON文件中的多段文本进行TTS转换
@ -1491,6 +1502,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
:param list_script: 脚本列表
:param voice_name: 语音名称
:param voice_rate: 语音速率
:param tts_engine: TTS 引擎
:return: 生成的音频文件列表
"""
voice_name = parse_voice_name(voice_name)
@ -1512,6 +1524,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
voice_rate=voice_rate,
voice_pitch=voice_pitch,
voice_file=audio_file,
tts_engine=tts_engine,
)
if sub_maker is None:
@ -1581,14 +1594,6 @@ def get_audio_duration_from_file(audio_file: str) -> float:
# 如果所有方法都失败,返回一个基于文本长度的估算
return 3.0 # 默认3秒避免返回0
def is_soulvoice_voice(voice_name: str) -> bool:
    """
    Report whether a voice name refers to a SoulVoice voice.

    A name counts as SoulVoice when it begins with ``soulvoice:`` or
    ``speech:``.
    """
    soulvoice_prefixes = ("soulvoice:", "speech:")
    return voice_name.startswith(soulvoice_prefixes)
def parse_soulvoice_voice(voice_name: str) -> str:
"""
解析 SoulVoice 语音名称
@ -1600,6 +1605,118 @@ def parse_soulvoice_voice(voice_name: str) -> str:
return voice_name[10:] # 移除 "soulvoice:" 前缀
return voice_name
def parse_tencent_voice(voice_name: str) -> str:
    """
    Extract the Tencent Cloud TTS voice id from a voice name.

    Accepts the prefixed form ``tencent:101001`` and returns the part after
    the prefix; any other value is returned unchanged.
    """
    prefix = "tencent:"
    has_prefix = voice_name.startswith(prefix)
    return voice_name[len(prefix):] if has_prefix else voice_name
def tencent_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Synthesize speech with Tencent Cloud TTS and write it to ``voice_file``.

    Up to three attempts are made; after a failed attempt (empty audio or an
    exception) the function sleeps one second before retrying.

    :param text: text to synthesize
    :param voice_name: voice id, optionally prefixed with ``tencent:``; a
        non-numeric id falls back to voice type 101001
    :param voice_file: path the decoded MP3 audio is written to
    :param speed: speed multiplier where 1.0 is normal; converted to the
        request's ``Speed`` field as ``(speed - 1.0) * 2`` clamped to [-2, 2]
    :return: a SubMaker holding the subtitle timings, or None when the SDK is
        not installed, the credentials are missing, or every attempt failed
    """
    try:
        # Imported lazily so the module still loads when the SDK is absent.
        from tencentcloud.common import credential
        from tencentcloud.common.profile.client_profile import ClientProfile
        from tencentcloud.common.profile.http_profile import HttpProfile
        from tencentcloud.tts.v20190823 import tts_client, models
        import base64
    except ImportError as e:
        logger.error(f"腾讯云 SDK 未安装: {e}")
        return None

    # Read the Tencent Cloud settings from the application config.
    tencent_config = config.tencent
    secret_id = tencent_config.get("secret_id")
    secret_key = tencent_config.get("secret_key")
    # ``or`` also covers a region that is present but empty.
    region = tencent_config.get("region") or "ap-beijing"

    if not secret_id or not secret_key:
        logger.error("腾讯云 TTS 配置不完整,请检查 secret_id 和 secret_key")
        return None

    # Resolve the numeric voice id once; it does not change between attempts.
    # isdecimal() accepts exactly the digit characters int() can parse.
    voice_type = parse_tencent_voice(voice_name)
    voice_id = int(voice_type) if voice_type.isdecimal() else 101001

    # Map the multiplier onto the request's Speed value.
    # NOTE(review): the clamp assumes the service accepts -2..2 — confirm
    # against the TextToVoice API reference.
    speed_value = max(-2.0, min(2.0, (speed - 1.0) * 2))

    max_attempts = 3
    for i in range(max_attempts):
        try:
            logger.info(f"{i+1} 次使用腾讯云 TTS 生成音频")

            # Build credentials, HTTP profile and client for this attempt.
            cred = credential.Credential(secret_id, secret_key)

            http_profile = HttpProfile()
            http_profile.endpoint = "tts.tencentcloudapi.com"

            client_profile = ClientProfile()
            client_profile.httpProfile = http_profile

            client = tts_client.TtsClient(cred, region, client_profile)

            req = models.TextToVoiceRequest()
            req.Text = text
            req.SessionId = str(uuid.uuid4())
            req.VoiceType = voice_id
            req.Speed = speed_value
            req.SampleRate = 16000
            req.Codec = "mp3"
            req.ProjectId = 0
            req.ModelType = 1
            req.PrimaryLanguage = 1
            req.EnableSubtitle = True

            resp = client.TextToVoice(req)

            # An empty payload counts as a failed attempt.
            if not resp.Audio:
                logger.warning("腾讯云 TTS 返回空音频数据")
                if i < max_attempts - 1:
                    time.sleep(1)
                continue

            # The audio arrives base64-encoded; decode and persist it.
            audio_data = base64.b64decode(resp.Audio)
            with open(voice_file, "wb") as f:
                f.write(audio_data)

            sub_maker = SubMaker()
            if resp.Subtitles:
                for sub in resp.Subtitles:
                    # Scale the subtitle times by 10000 (the values are
                    # treated as milliseconds and converted to 100 ns units).
                    # The subtitle text is passed directly so the ``text``
                    # parameter is never overwritten and a retry still sends
                    # the full input.
                    sub_maker.create_sub(
                        (sub.BeginTime * 10000, sub.EndTime * 10000), sub.Text
                    )
            else:
                # No subtitles returned: estimate 200 ms per character.
                duration_ms = len(text) * 200
                sub_maker.create_sub((0, duration_ms * 10000), text)

            logger.info(f"腾讯云 TTS 生成成功,文件大小: {len(audio_data)} 字节")
            return sub_maker

        except Exception as e:
            logger.error(f"腾讯云 TTS 生成音频时出错: {str(e)}")
            if i < max_attempts - 1:
                time.sleep(1)

    return None
def soulvoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
"""

View File

@ -96,6 +96,14 @@
speech_key = ""
speech_region = ""
[tencent]
# 腾讯云 TTS 配置
# 访问 https://console.cloud.tencent.com/cam/capi 获取你的密钥
secret_id = ""
secret_key = ""
# 地域配置,默认为 ap-beijing
region = "ap-beijing"
[soulvoice]
# SoulVoice TTS API 密钥
api_key = ""
@ -107,7 +115,7 @@
model = "FunAudioLLM/CosyVoice2-0.5B"
[ui]
# TTS引擎选择 (edge_tts, azure_speech, soulvoice)
# TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts)
tts_engine = "edge_tts"
# Edge TTS 配置

View File

@ -6,6 +6,61 @@ log() {
echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
}
# 函数:安装运行时依赖
install_runtime_dependencies() {
    # Install the Python dependencies listed in requirements.txt at container
    # start. A stamp file under /tmp records the last install; the install is
    # repeated only when the stamp is missing or older than requirements.txt.
    log "检查并安装运行时依赖..."

    local requirements_file="requirements.txt"
    local installed_packages_file="/tmp/installed_packages.txt"

    # Nothing to do without a requirements file.
    if [ ! -f "$requirements_file" ]; then
        log "未找到 requirements.txt 文件"
        return 0
    fi

    # Skip when the stamp exists and requirements.txt is not newer than it.
    if [ -f "$installed_packages_file" ] && ! [ "$requirements_file" -nt "$installed_packages_file" ]; then
        log "依赖已是最新版本,跳过安装"
        return 0
    fi

    log "发现新的依赖需求,开始安装..."

    # Prefer a system-wide install when passwordless sudo is available.
    if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then
        log "尝试使用sudo安装依赖..."
        # IFS= and -r keep pip's output lines intact (no backslash or
        # whitespace mangling) when they are forwarded to the log.
        sudo pip install --no-cache-dir -r "$requirements_file" 2>&1 | while IFS= read -r line; do
            log "pip: $line"
        done
        # PIPESTATUS[0] is pip's exit status, not the logging loop's.
        INSTALL_RESULT=${PIPESTATUS[0]}
    else
        INSTALL_RESULT=1 # mark as failed so the user-level install runs
    fi

    # Fall back to a per-user install when the sudo path was unavailable or failed.
    if [ "$INSTALL_RESULT" -ne 0 ]; then
        log "尝试用户级安装依赖..."
        pip install --user --no-cache-dir -r "$requirements_file" 2>&1 | while IFS= read -r line; do
            log "pip: $line"
        done

        # Make scripts from the user-level install reachable.
        export PATH="$HOME/.local/bin:$PATH"
    fi

    # Make sure the Tencent Cloud SDK is present even if the bulk install missed it.
    log "确保腾讯云SDK已安装..."
    if ! pip show tencentcloud-sdk-python >/dev/null 2>&1; then
        log "安装腾讯云SDK..."
        # The specifier must be quoted: an unquoted ">=" is parsed by the shell
        # as an output redirection and the version constraint is lost.
        pip install --user "tencentcloud-sdk-python>=3.0.1200"
    else
        log "腾讯云SDK已安装"
    fi

    # Record the install time for the freshness check above.
    touch "$installed_packages_file"
    log "依赖安装完成"
}
# 函数:检查必要的文件和目录
check_requirements() {
log "检查应用环境..."
@ -27,6 +82,9 @@ check_requirements() {
mkdir -p "$dir"
fi
done
# 安装运行时依赖
install_runtime_dependencies
log "环境检查完成"
}

View File

@ -14,6 +14,7 @@ pysrt==1.1.2
openai>=1.77.0
google-generativeai>=0.8.5
azure-cognitiveservices-speech>=1.37.0
tencentcloud-sdk-python>=3.0.1200
# 图像处理依赖
Pillow>=10.3.0

View File

@ -24,7 +24,8 @@ def get_tts_engine_options():
return {
"edge_tts": "Edge TTS",
"azure_speech": "Azure Speech Services",
"soulvoice": "SoulVoice"
"soulvoice": "SoulVoice",
"tencent_tts": "腾讯云 TTS"
}
@ -48,6 +49,12 @@ def get_tts_engine_descriptions():
"features": "提供免费额度,支持语音克隆,支持微信购买额度,无需信用卡,性价比极高",
"use_case": "个人用户和中小企业,需要语音克隆功能",
"registration": "https://soulvoice.scsmtech.cn/"
},
"tencent_tts": {
"title": "腾讯云 TTS",
"features": "提供免费额度,音质优秀,支持多种音色,国内访问速度快",
"use_case": "个人和企业用户,需要稳定的中文语音合成",
"registration": "https://console.cloud.tencent.com/tts"
}
}
@ -126,6 +133,8 @@ def render_tts_settings(tr):
render_azure_speech_settings(tr)
elif selected_engine == "soulvoice":
render_soulvoice_engine_settings(tr)
elif selected_engine == "tencent_tts":
render_tencent_tts_settings(tr)
# 4. 试听功能
render_voice_preview_new(tr, selected_engine)
@ -357,6 +366,117 @@ def render_azure_speech_settings(tr):
st.warning("⚠️ 请配置 API Key")
def render_tencent_tts_settings(tr):
    """
    Render the Tencent Cloud TTS engine settings and store them in config.

    Draws inputs for the Secret ID, Secret Key, service region, voice type and
    speaking rate, then writes the chosen values to ``config.tencent``
    (credentials, region) and ``config.ui`` (voice type, rate).

    :param tr: translation helper passed to every settings renderer; not used
        by this function
    """
    # Secret ID input
    secret_id = st.text_input(
        "Secret ID",
        value=config.tencent.get("secret_id", ""),
        help="请输入您的腾讯云 Secret ID"
    )

    # Secret Key input (masked)
    secret_key = st.text_input(
        "Secret Key",
        value=config.tencent.get("secret_key", ""),
        type="password",
        help="请输入您的腾讯云 Secret Key"
    )

    # Region selection
    region_options = [
        "ap-beijing",
        "ap-shanghai",
        "ap-guangzhou",
        "ap-chengdu",
        "ap-nanjing",
        "ap-singapore",
        "ap-hongkong",
    ]

    # ``or`` also covers a region that is saved but empty.
    saved_region = config.tencent.get("region") or "ap-beijing"
    if saved_region not in region_options:
        # Keep a region the user configured by hand selectable.
        region_options.append(saved_region)

    region = st.selectbox(
        "服务地域",
        options=region_options,
        index=region_options.index(saved_region),
        help="选择腾讯云 TTS 服务地域"
    )

    # Voice selection: voice id -> display label
    voice_type_options = {
        "101001": "智瑜 - 女声(推荐)",
        "101002": "智聆 - 女声",
        "101003": "智美 - 女声",
        "101004": "智云 - 男声",
        "101005": "智莉 - 女声",
        "101006": "智言 - 男声",
        "101007": "智娜 - 女声",
        "101008": "智琪 - 女声",
        "101009": "智芸 - 女声",
        "101010": "智华 - 男声",
        "101011": "智燕 - 女声",
        "101012": "智丹 - 女声",
        "101013": "智辉 - 男声",
        "101014": "智宁 - 女声",
        "101015": "智萌 - 女声",
        "101016": "智甜 - 女声",
        "101017": "智蓉 - 女声",
        "101018": "智靖 - 男声",
    }

    # Normalise to str so an integer written into the config file still
    # matches the string keys above.
    saved_voice_type = str(config.ui.get("tencent_voice_type", "101001"))
    if saved_voice_type not in voice_type_options:
        voice_type_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})"

    # Select over the ids and only format them for display; this yields the
    # id directly instead of looking it up again by its label.
    voice_ids = list(voice_type_options)
    voice_type = st.selectbox(
        "音色选择",
        options=voice_ids,
        index=voice_ids.index(saved_voice_type),
        format_func=lambda voice_id: voice_type_options[voice_id],
        help="选择腾讯云 TTS 音色"
    )

    # Speaking rate: guard against a hand-edited config value that is not a
    # float or lies outside the slider's range.
    try:
        saved_rate = float(config.ui.get("tencent_rate", 1.0))
    except (TypeError, ValueError):
        saved_rate = 1.0
    saved_rate = min(2.0, max(0.5, saved_rate))

    voice_rate = st.slider(
        "语速调节",
        min_value=0.5,
        max_value=2.0,
        value=saved_rate,
        step=0.1,
        help="调节语音速度 (0.5-2.0)"
    )

    # Voice reference, collapsed by default
    with st.expander("💡 腾讯云 TTS 音色说明", expanded=False):
        st.write("**女声音色:**")
        female_voices = [(k, v) for k, v in voice_type_options.items() if "女声" in v]
        for voice_id, voice_desc in female_voices[:6]:  # only the first six are listed
            st.write(f"{voice_desc} (ID: {voice_id})")

        st.write("")
        st.write("**男声音色:**")
        male_voices = [(k, v) for k, v in voice_type_options.items() if "男声" in v]
        for voice_id, voice_desc in male_voices:
            st.write(f"{voice_desc} (ID: {voice_id})")

        st.write("")
        st.info("💡 更多音色请参考腾讯云官方文档")

    # Persist the selections
    config.tencent["secret_id"] = secret_id
    config.tencent["secret_key"] = secret_key
    config.tencent["region"] = region
    config.ui["tencent_voice_type"] = voice_type
    config.ui["tencent_rate"] = voice_rate
def render_soulvoice_engine_settings(tr):
"""渲染 SoulVoice 引擎设置"""
# API Key 输入
@ -453,6 +573,11 @@ def render_voice_preview_new(tr, selected_engine):
voice_name = voice_uri if voice_uri.startswith("soulvoice:") else f"soulvoice:{voice_uri}"
voice_rate = 1.0 # SoulVoice 使用默认语速
voice_pitch = 1.0 # SoulVoice 不支持音调调节
elif selected_engine == "tencent_tts":
voice_type = config.ui.get("tencent_voice_type", "101001")
voice_name = f"tencent:{voice_type}"
voice_rate = config.ui.get("tencent_rate", 1.0)
voice_pitch = 1.0 # 腾讯云 TTS 不支持音调调节
if not voice_name:
st.error("请先配置语音设置")