新增腾讯云 TTS 服务

2026-03-17 03:04:23 +00:00 · 2025-09-16 14:40:08 +08:00 · 2025-09-16 14:40:08 +08:00 · a1474bed02
commit a1474bed02
parent da27d8d8a1
11 changed files with 348 additions and 44 deletions
--- a/38
+++ b/38
@ -22,10 +22,9 @@ RUN python -m pip install --upgrade pip setuptools wheel && \
 # 激活虚拟环境
 ENV PATH="/opt/venv/bin:$PATH"
-# 复制 requirements.txt 并安装 Python 依赖
+# 复制 requirements.txt 并使用镜像安装 Python 依赖
 COPY requirements.txt .
-RUN pip install --no-cache-dir --upgrade pip && \
+RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
    pip install --no-cache-dir -r requirements.txt
 # 运行阶段
 FROM python:3.12-slim-bookworm
@ -48,7 +47,7 @@ ENV PATH="/opt/venv/bin:$PATH" \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
-# 安装运行时系统依赖
+# 一次性安装所有依赖、创建用户、配置系统，减少层级
 RUN apt-get update && apt-get install -y --no-install-recommends \
    imagemagick \
    ffmpeg \
@ -56,32 +55,25 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    git-lfs \
    ca-certificates \
    dos2unix \
    && sed -i 's/<policy domain="path" rights="none" pattern="@\*"/<policy domain="path" rights="read|write" pattern="@\*"/' /etc/ImageMagick-6/policy.xml || true \
    && git lfs install \
    && groupadd -r narratoai && useradd -r -g narratoai -d /NarratoAI -s /bin/bash narratoai \
    && rm -rf /var/lib/apt/lists/*
-# 配置 ImageMagick 策略（允许处理更多格式）
+# 复制入口脚本并修复换行符问题
-RUN sed -i 's/<policy domain="path" rights="none" pattern="@\*"/<policy domain="path" rights="read|write" pattern="@\*"/' /etc/ImageMagick-6/policy.xml || true
+COPY --chown=narratoai:narratoai docker-entrypoint.sh /usr/local/bin/
 RUN dos2unix /usr/local/bin/docker-entrypoint.sh && chmod +x /usr/local/bin/docker-entrypoint.sh
-# 初始化 git-lfs
+# 复制其余的应用代码
 RUN git lfs install
 # 创建非 root 用户（安全最佳实践）
 RUN groupadd -r narratoai && useradd -r -g narratoai -d /NarratoAI -s /bin/bash narratoai
 # 复制应用代码
 COPY --chown=narratoai:narratoai . .
-# 确保配置文件存在
+# 创建目录、复制配置、设置权限
 RUN if [ ! -f config.toml ]; then cp config.example.toml config.toml; fi
 # 创建必要的目录并设置权限
 RUN mkdir -p storage/temp storage/tasks storage/json storage/narration_scripts storage/drama_analysis && \
    if [ ! -f config.toml ]; then cp config.example.toml config.toml; fi && \
    chown -R narratoai:narratoai /NarratoAI && \
    chmod -R 755 /NarratoAI
 # 复制并设置入口点脚本
 COPY --chown=narratoai:narratoai docker-entrypoint.sh /usr/local/bin/
 RUN chmod +x /usr/local/bin/docker-entrypoint.sh
 # 切换到非 root 用户
 USER narratoai
@ -93,5 +85,5 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8501/_stcore/health || exit 1
 # 设置入口点
-ENTRYPOINT ["docker-entrypoint.sh"]
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
-CMD ["webui"]
+CMD ["webui"]
--- a/app/config/config.py
+++ b/app/config/config.py
@ -48,6 +48,7 @@ def save_config():
    with open(config_file, "w", encoding="utf-8") as f:
        _cfg["app"] = app
        _cfg["azure"] = azure
        _cfg["tencent"] = tencent
        _cfg["soulvoice"] = soulvoice
        _cfg["ui"] = ui
        f.write(toml.dumps(_cfg))
@ -58,6 +59,7 @@ app = _cfg.get("app", {})
 whisper = _cfg.get("whisper", {})
 proxy = _cfg.get("proxy", {})
 azure = _cfg.get("azure", {})
 tencent = _cfg.get("tencent", {})
 soulvoice = _cfg.get("soulvoice", {})
 ui = _cfg.get("ui", {})
 frames = _cfg.get("frames", {})
--- a/app/models/schema.py
+++ b/app/models/schema.py
@ -176,7 +176,7 @@ class VideoClipParams(BaseModel):
    voice_volume: Optional[float] = Field(default=AudioVolumeDefaults.VOICE_VOLUME, description="解说语音音量")
    voice_rate: Optional[float] = Field(default=1.0, description="语速")
    voice_pitch: Optional[float] = Field(default=1.0, description="语调")
-
+    tts_engine: Optional[str] = Field(default="tencent", description="TTS 引擎")
    bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
    bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
    bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
--- a/app/services/clip_video.py
+++ b/app/services/clip_video.py
@ -18,7 +18,6 @@ from pathlib import Path
 from app.utils import ffmpeg_utils
 def parse_timestamp(timestamp: str) -> tuple:
    """
    解析时间戳字符串，返回开始和结束时间
--- a/app/services/llm/providers/init.py
+++ b/app/services/llm/providers/init.py
@ -43,5 +43,5 @@ __all__ = [
    'QwenTextProvider',
    'DeepSeekTextProvider',
    'SiliconflowVisionProvider',
-    'SiliconflowTextProvider'
+    'SiliconflowTextProvider',
 ]
--- a/app/services/task.py
+++ b/app/services/task.py
@ -73,6 +73,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
    tts_results = voice.tts_multiple(
        task_id=task_id,
        list_script=tts_segments,  # 只传入需要TTS的片段
        tts_engine=params.tts_engine,
        voice_name=params.voice_name,
        voice_rate=params.voice_rate,
        voice_pitch=params.voice_pitch,
@ -317,6 +318,7 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
    tts_results = voice.tts_multiple(
        task_id=task_id,
        list_script=tts_segments,  # 只传入需要TTS的片段
        tts_engine=params.tts_engine,
        voice_name=params.voice_name,
        voice_rate=params.voice_rate,
        voice_pitch=params.voice_pitch,
--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -5,6 +5,7 @@ import traceback
 import edge_tts
 import asyncio
 import requests
 import uuid
 from loguru import logger
 from typing import List, Union, Tuple
 from datetime import datetime
@ -1080,17 +1081,27 @@ def should_use_azure_speech_services(voice_name: str) -> bool:
 def tts(
    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str, tts_engine: str = "azure"
 ) -> Union[SubMaker, None]:
    """
    Dispatch a single text-to-speech request to the backend named by ``tts_engine``.

    Both the short backend names ("tencent", "azure") and the keys used by the
    web UI / config file ("tencent_tts", "azure_speech", "edge_tts") are
    accepted, so a value taken straight from the UI selection reaches the
    intended backend instead of the unknown-engine fallback.

    :param text: text to synthesize
    :param voice_name: backend-specific voice identifier
    :param voice_rate: speech rate; forwarded as ``speed`` to Tencent/SoulVoice
        and as the rate to Edge TTS
    :param voice_pitch: pitch; only forwarded to Edge TTS
    :param voice_file: path of the audio file to produce
    :param tts_engine: engine key, defaults to "azure"
    :return: whatever the selected backend returns (a SubMaker, or None on failure)
    """
    logger.info(f"使用 TTS 引擎: '{tts_engine}', 语音: '{voice_name}'")

    if tts_engine in ("tencent", "tencent_tts"):
        logger.info("分发到腾讯云 TTS")
        return tencent_tts(text, voice_name, voice_file, speed=voice_rate)

    if tts_engine == "soulvoice":
        logger.info("分发到 SoulVoice TTS")
        return soulvoice_tts(text, voice_name, voice_file, speed=voice_rate)

    if tts_engine in ("azure", "azure_speech"):
        # The voice name decides between Azure Speech Services (V2) and Edge TTS (V1).
        if should_use_azure_speech_services(voice_name):
            logger.info("分发到 Azure Speech Services (V2)")
            return azure_tts_v2(text, voice_name, voice_file)
        logger.info("分发到 Edge TTS (Azure V1)")
        return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)

    if tts_engine == "edge_tts":
        # UI key for Edge TTS: same backend as the fallback, but not an error.
        logger.info("分发到 Edge TTS (Azure V1)")
        return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)

    # Unknown engine: warn and fall back to Edge TTS (Azure V1).
    logger.warning(f"未知的 TTS 引擎: '{tts_engine}', 将默认使用 Edge TTS (Azure V1)。")
    return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
@ -1483,7 +1494,7 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
    return sub_maker.offset[-1][1] / 10000000
-def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float):
+def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float, tts_engine: str = "azure"):
    """
    根据JSON文件中的多段文本进行TTS转换
@ -1491,6 +1502,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
    :param list_script: 脚本列表
    :param voice_name: 语音名称
    :param voice_rate: 语音速率
    :param tts_engine: TTS 引擎
    :return: 生成的音频文件列表
    """
    voice_name = parse_voice_name(voice_name)
@ -1512,6 +1524,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
                voice_rate=voice_rate,
                voice_pitch=voice_pitch,
                voice_file=audio_file,
                tts_engine=tts_engine,
            )
            if sub_maker is None:
@ -1581,14 +1594,6 @@ def get_audio_duration_from_file(audio_file: str) -> float:
        # 如果所有方法都失败，返回一个基于文本长度的估算
        return 3.0  # 默认3秒，避免返回0
 def is_soulvoice_voice(voice_name: str) -> bool:
    """
    Tell whether a voice name is a SoulVoice voice.

    A name counts as SoulVoice when it starts with "soulvoice:" or "speech:".
    """
    soulvoice_prefixes = ("soulvoice:", "speech:")
    return voice_name.startswith(soulvoice_prefixes)
 def parse_soulvoice_voice(voice_name: str) -> str:
    """
    解析 SoulVoice 语音名称
@ -1600,6 +1605,118 @@ def parse_soulvoice_voice(voice_name: str) -> str:
        return voice_name[10:]  # 移除 "soulvoice:" 前缀
    return voice_name
 def parse_tencent_voice(voice_name: str) -> str:
    """
    Strip the optional "tencent:" prefix from a Tencent Cloud TTS voice name.

    "tencent:101001" becomes "101001"; a name without the prefix is returned
    unchanged.
    """
    prefix = "tencent:"
    has_prefix = voice_name.startswith(prefix)
    return voice_name[len(prefix):] if has_prefix else voice_name
 def tencent_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Synthesize ``text`` with Tencent Cloud TTS and write the audio to ``voice_file``.

    Credentials and region are read from ``config.tencent``. The request is
    attempted up to three times with a one-second pause between attempts.

    :param text: text to synthesize
    :param voice_name: Tencent voice id, optionally prefixed with "tencent:";
        a non-numeric id falls back to voice 101001
    :param voice_file: path the decoded MP3 data is written to
    :param speed: speed multiplier where 1.0 is normal speed
    :return: a SubMaker whose timings are in 100 ns units, or None when the
        SDK is missing, credentials are not configured, or every attempt fails
    """
    try:
        # The SDK is imported lazily so the module still loads without it.
        from tencentcloud.common import credential
        from tencentcloud.common.profile.client_profile import ClientProfile
        from tencentcloud.common.profile.http_profile import HttpProfile
        from tencentcloud.tts.v20190823 import tts_client, models
        import base64
    except ImportError as e:
        logger.error(f"腾讯云 SDK 未安装: {e}")
        return None

    # Read the Tencent Cloud credentials and region from the app config.
    tencent_config = config.tencent
    secret_id = tencent_config.get("secret_id")
    secret_key = tencent_config.get("secret_key")
    region = tencent_config.get("region", "ap-beijing")

    if not secret_id or not secret_key:
        logger.error("腾讯云 TTS 配置不完整，请检查 secret_id 和 secret_key")
        return None

    # Resolve the numeric voice id once; it does not change between attempts.
    voice_type = parse_tencent_voice(voice_name)
    voice_type_id = int(voice_type) if voice_type.isdigit() else 101001

    # Map the multiplier linearly onto Tencent's Speed value and clamp to [-2, 2].
    # NOTE(review): Tencent's Speed scale is not linear in playback rate and
    # reportedly extends past 2 - confirm the mapping against the API reference.
    speed_value = max(-2.0, min(2.0, (speed - 1.0) * 2))

    max_attempts = 3
    for i in range(max_attempts):
        try:
            logger.info(f"第 {i+1} 次使用腾讯云 TTS 生成音频")

            cred = credential.Credential(secret_id, secret_key)

            http_profile = HttpProfile()
            http_profile.endpoint = "tts.tencentcloudapi.com"

            client_profile = ClientProfile()
            client_profile.httpProfile = http_profile

            client = tts_client.TtsClient(cred, region, client_profile)

            req = models.TextToVoiceRequest()
            req.Text = text
            req.SessionId = str(uuid.uuid4())
            req.VoiceType = voice_type_id
            req.Speed = speed_value
            req.SampleRate = 16000
            req.Codec = "mp3"
            req.ProjectId = 0
            req.ModelType = 1
            req.PrimaryLanguage = 1
            req.EnableSubtitle = True

            resp = client.TextToVoice(req)

            if not resp.Audio:
                logger.warning("腾讯云 TTS 返回空音频数据")
                if i < max_attempts - 1:
                    time.sleep(1)
                continue

            # The audio arrives base64-encoded.
            audio_data = base64.b64decode(resp.Audio)

            with open(voice_file, "wb") as f:
                f.write(audio_data)

            sub_maker = SubMaker()
            if resp.Subtitles:
                for sub in resp.Subtitles:
                    # Milliseconds -> 100 ns units.
                    begin = sub.BeginTime * 10000
                    end = sub.EndTime * 10000
                    # edge_tts' SubMaker.create_sub takes (offset, duration) and
                    # stores (offset, offset + duration), so pass the duration,
                    # not the end time. A separate name keeps the ``text``
                    # parameter intact should a later attempt need it.
                    sub_maker.create_sub((begin, end - begin), sub.Text)
            else:
                # No subtitles returned: estimate 200 ms per character.
                duration_ms = len(text) * 200
                sub_maker.create_sub((0, duration_ms * 10000), text)

            logger.info(f"腾讯云 TTS 生成成功，文件大小: {len(audio_data)} 字节")
            return sub_maker

        except Exception as e:
            # Any SDK/network/file error counts as a failed attempt.
            logger.error(f"腾讯云 TTS 生成音频时出错: {str(e)}")
            if i < max_attempts - 1:
                time.sleep(1)

    return None
 def soulvoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
--- a/config.example.toml
+++ b/config.example.toml
@ -96,6 +96,14 @@
    speech_key = ""
    speech_region = ""
 [tencent]
    # 腾讯云 TTS 配置
    # 访问 https://console.cloud.tencent.com/cam/capi 获取你的密钥
    secret_id = ""
    secret_key = ""
    # 地域配置，默认为 ap-beijing
    region = "ap-beijing"
 [soulvoice]
    # SoulVoice TTS API 密钥
    api_key = ""
@ -107,7 +115,7 @@
    model = "FunAudioLLM/CosyVoice2-0.5B"
 [ui]
-    # TTS引擎选择 (edge_tts, azure_speech, soulvoice)
+    # TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts)
    tts_engine = "edge_tts"
    # Edge TTS 配置
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@ -6,6 +6,61 @@ log() {
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
 }
 # 函数：安装运行时依赖
 # Install Python dependencies at container start when requirements.txt is
 # newer than the marker file left by the previous successful run.
 # Tries a passwordless-sudo install first, then a per-user install, and
 # finally makes sure the Tencent Cloud SDK is present. Never aborts startup:
 # failures are logged and the marker is left untouched so the next start retries.
 install_runtime_dependencies() {
    log "检查并安装运行时依赖..."

    local requirements_file="requirements.txt"
    local installed_packages_file="/tmp/installed_packages.txt"

    if [ ! -f "$requirements_file" ]; then
        log "未找到 requirements.txt 文件"
        return 0
    fi

    # Skip when the marker exists and requirements.txt is not newer than it.
    if [ -f "$installed_packages_file" ] && [ ! "$requirements_file" -nt "$installed_packages_file" ]; then
        log "依赖已是最新版本，跳过安装"
        return 0
    fi

    log "发现新的依赖需求，开始安装..."

    # Prefer a system-wide install when passwordless sudo is available.
    if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then
        log "尝试使用sudo安装依赖..."
        sudo pip install --no-cache-dir -r "$requirements_file" 2>&1 | while IFS= read -r line; do
            log "pip: $line"
        done
        # Exit status of pip itself, not of the logging loop.
        INSTALL_RESULT=${PIPESTATUS[0]}
    else
        INSTALL_RESULT=1  # no usable sudo: force the user-level install below
    fi

    # Fall back to a per-user install.
    # NOTE(review): pip rejects --user inside a virtualenv that hides user
    # site-packages; if the image runs from a venv this fallback may always
    # fail - confirm.
    if [ "$INSTALL_RESULT" -ne 0 ]; then
        log "尝试用户级安装依赖..."
        pip install --user --no-cache-dir -r "$requirements_file" 2>&1 | while IFS= read -r line; do
            log "pip: $line"
        done
        INSTALL_RESULT=${PIPESTATUS[0]}

        # Make console scripts from the user-level install reachable.
        export PATH="$HOME/.local/bin:$PATH"
    fi

    # Make sure the Tencent Cloud SDK is present even if the bulk install failed.
    log "确保腾讯云SDK已安装..."
    if ! pip list 2>/dev/null | grep -q "tencentcloud-sdk-python"; then
        log "安装腾讯云SDK..."
        # The requirement must be quoted: an unquoted ">=3.0.1200" is parsed by
        # the shell as a redirection to a file named "=3.0.1200".
        if ! pip install --user "tencentcloud-sdk-python>=3.0.1200"; then
            log "腾讯云SDK安装失败"
            INSTALL_RESULT=1
        fi
    else
        log "腾讯云SDK已安装"
    fi

    # Only record success when everything installed, so a failed run is retried.
    if [ "$INSTALL_RESULT" -ne 0 ]; then
        log "依赖安装未完全成功，下次启动时将重试"
        return 0
    fi

    touch "$installed_packages_file"
    log "依赖安装完成"
 }
 # 函数：检查必要的文件和目录
 check_requirements() {
    log "检查应用环境..."
@ -27,6 +82,9 @@ check_requirements() {
            mkdir -p "$dir"
        fi
    done
    # 安装运行时依赖
    install_runtime_dependencies
    log "环境检查完成"
 }
--- a/requirements.txt
+++ b/requirements.txt
@ -14,6 +14,7 @@ pysrt==1.1.2
 openai>=1.77.0
 google-generativeai>=0.8.5
 azure-cognitiveservices-speech>=1.37.0
 tencentcloud-sdk-python>=3.0.1200
 # 图像处理依赖
 Pillow>=10.3.0
--- a/webui/components/audio_settings.py
+++ b/webui/components/audio_settings.py
@ -24,7 +24,8 @@ def get_tts_engine_options():
    return {
        "edge_tts": "Edge TTS",
        "azure_speech": "Azure Speech Services",
-        "soulvoice": "SoulVoice"
+        "soulvoice": "SoulVoice",
        "tencent_tts": "腾讯云 TTS"
    }
@ -48,6 +49,12 @@ def get_tts_engine_descriptions():
            "features": "提供免费额度，支持语音克隆，支持微信购买额度，无需信用卡，性价比极高",
            "use_case": "个人用户和中小企业，需要语音克隆功能",
            "registration": "https://soulvoice.scsmtech.cn/"
        },
        "tencent_tts": {
            "title": "腾讯云 TTS",
            "features": "提供免费额度，音质优秀，支持多种音色，国内访问速度快",
            "use_case": "个人和企业用户，需要稳定的中文语音合成",
            "registration": "https://console.cloud.tencent.com/tts"
        }
    }
@ -126,6 +133,8 @@ def render_tts_settings(tr):
        render_azure_speech_settings(tr)
    elif selected_engine == "soulvoice":
        render_soulvoice_engine_settings(tr)
    elif selected_engine == "tencent_tts":
        render_tencent_tts_settings(tr)
    # 4. 试听功能
    render_voice_preview_new(tr, selected_engine)
@ -357,6 +366,117 @@ def render_azure_speech_settings(tr):
        st.warning("⚠️ 请配置 API Key")
 def render_tencent_tts_settings(tr):
    """
    Render the Tencent Cloud TTS settings widgets and store the chosen values.

    Reads the current values from ``config.tencent`` / ``config.ui``, shows
    inputs for the credentials, region, voice and speech rate, and writes the
    selections back to ``config.tencent`` (secret_id, secret_key, region) and
    ``config.ui`` (tencent_voice_type, tencent_rate).

    :param tr: translation callable passed to every settings renderer
        (not used by this panel)
    """
    # Secret ID input
    secret_id = st.text_input(
        "Secret ID",
        value=config.tencent.get("secret_id", ""),
        help="请输入您的腾讯云 Secret ID"
    )

    # Secret Key input (masked)
    secret_key = st.text_input(
        "Secret Key",
        value=config.tencent.get("secret_key", ""),
        type="password",
        help="请输入您的腾讯云 Secret Key"
    )

    # Region selection; a saved region outside the preset list is kept selectable.
    region_options = [
        "ap-beijing",
        "ap-shanghai",
        "ap-guangzhou",
        "ap-chengdu",
        "ap-nanjing",
        "ap-singapore",
        "ap-hongkong"
    ]
    saved_region = config.tencent.get("region", "ap-beijing")
    if saved_region not in region_options:
        region_options.append(saved_region)

    region = st.selectbox(
        "服务地域",
        options=region_options,
        index=region_options.index(saved_region),
        help="选择腾讯云 TTS 服务地域"
    )

    # Voice selection: voice id -> display label
    voice_type_options = {
        "101001": "智瑜 - 女声（推荐）",
        "101002": "智聆 - 女声",
        "101003": "智美 - 女声",
        "101004": "智云 - 男声",
        "101005": "智莉 - 女声",
        "101006": "智言 - 男声",
        "101007": "智娜 - 女声",
        "101008": "智琪 - 女声",
        "101009": "智芸 - 女声",
        "101010": "智华 - 男声",
        "101011": "智燕 - 女声",
        "101012": "智丹 - 女声",
        "101013": "智辉 - 男声",
        "101014": "智宁 - 女声",
        "101015": "智萌 - 女声",
        "101016": "智甜 - 女声",
        "101017": "智蓉 - 女声",
        "101018": "智靖 - 男声"
    }

    # The id may have been saved as a number in the config file; the keys are strings.
    saved_voice_type = str(config.ui.get("tencent_voice_type", "101001"))
    if saved_voice_type not in voice_type_options:
        voice_type_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})"

    # Select on the ids and only format them as labels, so the chosen id comes
    # back directly instead of being looked up again from its display text.
    voice_ids = list(voice_type_options)
    voice_type = st.selectbox(
        "音色选择",
        options=voice_ids,
        index=voice_ids.index(saved_voice_type),
        format_func=voice_type_options.get,
        help="选择腾讯云 TTS 音色"
    )

    # Speech rate. The slider needs a float inside its bounds, so coerce and
    # clamp whatever was stored in the config.
    try:
        saved_rate = float(config.ui.get("tencent_rate", 1.0))
    except (TypeError, ValueError):
        saved_rate = 1.0
    saved_rate = min(2.0, max(0.5, saved_rate))

    voice_rate = st.slider(
        "语速调节",
        min_value=0.5,
        max_value=2.0,
        value=saved_rate,
        step=0.1,
        help="调节语音速度 (0.5-2.0)"
    )

    # Voice overview
    with st.expander("💡 腾讯云 TTS 音色说明", expanded=False):
        st.write("**女声音色：**")
        female_voices = [(k, v) for k, v in voice_type_options.items() if "女声" in v]
        for voice_id, voice_desc in female_voices[:6]:  # only the first six
            st.write(f"• {voice_desc} (ID: {voice_id})")

        st.write("")
        st.write("**男声音色：**")
        male_voices = [(k, v) for k, v in voice_type_options.items() if "男声" in v]
        for voice_id, voice_desc in male_voices:
            st.write(f"• {voice_desc} (ID: {voice_id})")

        st.write("")
        st.info("💡 更多音色请参考腾讯云官方文档")

    # Store the selections in the in-memory config.
    config.tencent["secret_id"] = secret_id
    config.tencent["secret_key"] = secret_key
    config.tencent["region"] = region
    config.ui["tencent_voice_type"] = voice_type
    config.ui["tencent_rate"] = voice_rate
 def render_soulvoice_engine_settings(tr):
    """渲染 SoulVoice 引擎设置"""
    # API Key 输入
@ -453,6 +573,11 @@ def render_voice_preview_new(tr, selected_engine):
                    voice_name = voice_uri if voice_uri.startswith("soulvoice:") else f"soulvoice:{voice_uri}"
            voice_rate = 1.0  # SoulVoice 使用默认语速
            voice_pitch = 1.0  # SoulVoice 不支持音调调节
        elif selected_engine == "tencent_tts":
            voice_type = config.ui.get("tencent_voice_type", "101001")
            voice_name = f"tencent:{voice_type}"
            voice_rate = config.ui.get("tencent_rate", 1.0)
            voice_pitch = 1.0  # 腾讯云 TTS 不支持音调调节
        if not voice_name:
            st.error("请先配置语音设置")