feat: support Doubao TTS API key auth

2026-07-02 12:25:35 +00:00 · 2026-07-02 11:35:23 +08:00 · 2026-07-02 11:35:23 +08:00 · 1b7bd79654
commit 1b7bd79654
parent d02c848977
9 changed files with 207 additions and 72 deletions
--- a/README-en.md
+++ b/README-en.md
@ -33,6 +33,7 @@ NarratoAI is an automated video narration tool that provides an all-in-one solut
 </div>

 ## Latest News
+- 2026.07.02 Released version 0.8.4 with Doubao TTS API Key setup and legacy credential compatibility
 - 2026.04.03 Released version 0.7.8, refactored the documentary frame-analysis pipeline with a shared service and improved extraction, caching, vision batching, and narration generation
 - 2025.05.11 Released new version 0.6.0, supports **short drama commentary** and optimized editing process
 - 2025.03.06 Released new version 0.5.2, supports DeepSeek R1 and DeepSeek V3 models for short drama mixing
--- a/README.md
+++ b/README.md
@ -41,6 +41,7 @@ NarratoAI 是一款自动化影视解说工具，基于 LLM 实现文案撰写
 本项目仅供学习和研究使用，不得商用。如需商业授权，请联系作者。

 ## 最新资讯
+- 2026.07.02 发布新版本 0.8.4，升级豆包语音 TTS 新版 API Key 配置并保留旧版凭据兼容
 - 2026.06.10 发布新版本 0.8.1，**大版本更新**，优化多个核心流程
 - 2026.04.27 发布新版本 0.7.9，新增 **Fun-ASR一键转录字幕**
 - 2026.04.03 发布新版本 0.7.8，重构纪录片逐帧分析链路，统一共享服务并优化抽帧、缓存、视觉并发与文案生成流程
--- a/app/services/test_doubaotts_tts_unittest.py
+++ b/app/services/test_doubaotts_tts_unittest.py
@ -0,0 +1,116 @@
+import base64
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+from app.services import voice
+
+
+class FakeDoubaoResponse:
+    status_code = 200
+    text = "OK"
+
+    def json(self):
+        return {
+            "code": 3000,
+            "data": base64.b64encode(b"mp3-bytes").decode("ascii"),
+        }
+
+
+class DoubaoTtsTests(unittest.TestCase):
+    def setUp(self):
+        self.original_doubaotts = dict(voice.config.doubaotts)
+        self.original_proxy = dict(voice.config.proxy)
+
+    def tearDown(self):
+        voice.config.doubaotts.clear()
+        voice.config.doubaotts.update(self.original_doubaotts)
+        voice.config.proxy.clear()
+        voice.config.proxy.update(self.original_proxy)
+
+    def test_api_key_auth_does_not_require_legacy_appid_or_token(self):
+        voice.config.doubaotts.clear()
+        voice.config.doubaotts.update(
+            {
+                "api_key": "db-api-key",
+                "cluster": "volcano_tts",
+                "volume": 1.2,
+                "pitch": 0.9,
+                "silence_duration": 0.25,
+            }
+        )
+        voice.config.proxy.clear()
+        voice.config.proxy.update({"enabled": False})
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_file = Path(temp_dir) / "doubao.mp3"
+            sub_maker = object()
+
+            with patch("requests.post", return_value=FakeDoubaoResponse()) as post, patch(
+                "app.services.voice.new_sub_maker", return_value=sub_maker
+            ):
+                result = voice.doubaotts_tts(
+                    text=" 你好，豆包新版鉴权。 ",
+                    voice_name="BV700_V2_streaming",
+                    voice_file=str(output_file),
+                    speed=1.25,
+                )
+            output_bytes = output_file.read_bytes() if output_file.exists() else b""
+
+        self.assertIs(result, sub_maker)
+        self.assertEqual(output_bytes, b"mp3-bytes")
+
+        _, kwargs = post.call_args
+        self.assertEqual(kwargs["headers"]["X-Api-Key"], "db-api-key")
+        self.assertNotIn("Authorization", kwargs["headers"])
+        self.assertEqual(kwargs["json"]["app"], {"cluster": "volcano_tts"})
+        self.assertEqual(kwargs["json"]["request"]["text"], "你好，豆包新版鉴权。")
+        self.assertEqual(kwargs["json"]["audio"]["voice_type"], "BV700_V2_streaming")
+        self.assertEqual(kwargs["json"]["audio"]["speed_ratio"], 1.25)
+        self.assertEqual(kwargs["json"]["audio"]["volume_ratio"], 1.2)
+        self.assertEqual(kwargs["json"]["audio"]["pitch_ratio"], 0.9)
+        self.assertEqual(kwargs["json"]["audio"]["silence_duration"], 0.25)
+
+    def test_legacy_token_auth_still_sends_appid_and_token(self):
+        voice.config.doubaotts.clear()
+        voice.config.doubaotts.update(
+            {
+                "appid": "legacy-appid",
+                "token": "legacy-token",
+                "cluster": "volcano_tts",
+            }
+        )
+        voice.config.proxy.clear()
+        voice.config.proxy.update({"enabled": False})
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_file = Path(temp_dir) / "doubao.mp3"
+
+            with patch("requests.post", return_value=FakeDoubaoResponse()) as post:
+                result = voice.doubaotts_tts(
+                    text="旧版鉴权仍然可用",
+                    voice_name="BV700_streaming",
+                    voice_file=str(output_file),
+                    speed=1.0,
+                )
+            output_bytes = output_file.read_bytes()
+
+        self.assertIsNotNone(result)
+        self.assertEqual(output_bytes, b"mp3-bytes")
+
+        _, kwargs = post.call_args
+        self.assertEqual(kwargs["headers"]["Authorization"], "Bearer;legacy-token")
+        self.assertNotIn("X-Api-Key", kwargs["headers"])
+        self.assertEqual(
+            kwargs["json"]["app"],
+            {
+                "appid": "legacy-appid",
+                "token": "legacy-token",
+                "cluster": "volcano_tts",
+            },
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -1150,14 +1150,13 @@ def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
    """
    # 读取配置
    doubaotts_cfg = getattr(config, "doubaotts", {}) or {}
+    api_key = (doubaotts_cfg.get("api_key", "") or doubaotts_cfg.get("apikey", "")).strip()
    appid = doubaotts_cfg.get("appid", "")
    token = doubaotts_cfg.get("token", "")
-    ak = doubaotts_cfg.get("ak", "")
-    sk = doubaotts_cfg.get("sk", "")
    cluster = doubaotts_cfg.get("cluster", "volcano_tts")
    
-    if not appid or not token:
-        logger.error("豆包语音 TTS 配置未完成")
+    if not api_key and (not appid or not token):
+        logger.error("豆包语音 TTS 配置未完成，请配置 API Key，或填写旧版 AppID 和 Token")
        return None

    # 准备参数
@ -1174,12 +1173,15 @@ def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
    pitch = doubaotts_cfg.get("pitch", 1.0)
    silence_duration = doubaotts_cfg.get("silence_duration", 0.125)
    
-    payload = {
-        "app": {
+    app_payload = {"cluster": cluster}
+    if not api_key:
+        app_payload.update({
            "appid": appid,
            "token": token,
-            "cluster": cluster
-        },
+        })
+
+    payload = {
+        "app": app_payload,
        "user": {
            "uid": "NarratoAI"
        },
@ -1206,11 +1208,14 @@ def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
    # API 地址
    url = "https://openspeech.bytedance.com/api/v1/tts"
    
-    # 构建请求头（使用Bearer Token认证）
+    # 构建请求头。新版控制台优先使用 API Key，旧配置继续使用 Token。
    headers = {
        "Content-Type": "application/json",
-        "Authorization": f"Bearer;{token}"
    }
+    if api_key:
+        headers["X-Api-Key"] = api_key
+    else:
+        headers["Authorization"] = f"Bearer;{token}"

    for i in range(3):
        try:
--- a/config.example.toml
+++ b/config.example.toml
@ -1,5 +1,5 @@
 [app]
-    project_version="0.7.8"
+    project_version="0.8.4"

    # LLM API 超时配置（秒）
    llm_vision_timeout = 120  # 视觉模型基础超时时间
@ -221,10 +221,14 @@

 [doubaotts]
    # 豆包语音 TTS 配置
+    # 新版配置优先填写 API Key；旧版 appid/token 配置仍兼容
    # 申请流程：
-    # 1. 打开 https://console.volcengine.com/iam/keymanage 新建 Access Key 和 Secret Key
-    # 2. 打开 https://www.volcengine.com/product/voice-tech 点击立即使用
-    # 3. 在 API 服务中心找到音频生成下面的语音合成，获取 APPID 和 Token
+    # 1. 打开火山引擎豆包语音控制台
+    # 2. 进入 API Key 管理并创建 API Key
+    # 3. 确认已开通豆包语音合成服务
+    api_key = ""
+
+    # 旧版配置（兼容保留）
    ak = ""
    sk = ""
    appid = ""
--- a/2
+++ b/2
@ -1 +1 @@
-0.8.3
+0.8.4
--- a/webui/components/audio_settings.py
+++ b/webui/components/audio_settings.py
@ -1464,42 +1464,55 @@ def render_omnivoice_tts_settings(tr):

 def render_doubaotts_settings(tr):
    """渲染豆包语音 TTS 设置"""
-    # AK 输入
-    ak = st.text_input(
-        "Access Key",
-        value=config.doubaotts.get("ak", ""),
-        help=tr("Volcengine Access Key Help")
-    )
-
-    # SK 输入
-    sk = st.text_input(
-        "Secret Key",
-        value=config.doubaotts.get("sk", ""),
+    api_key = st.text_input(
+        "API Key",
+        value=config.doubaotts.get("api_key", ""),
        type="password",
-        help=tr("Volcengine Secret Key Help")
+        help=tr("Doubao API Key Help")
    )
+    ak = config.doubaotts.get("ak", "")
+    sk = config.doubaotts.get("sk", "")
+    appid = config.doubaotts.get("appid", "")
+    token = config.doubaotts.get("token", "")
+    cluster = config.doubaotts.get("cluster", "volcano_tts")

-    # AppID 输入
-    appid = st.text_input(
-        "AppID",
-        value=config.doubaotts.get("appid", ""),
-        help=tr("Doubao AppID Help")
-    )
+    with st.expander(tr("Doubao Legacy Credentials"), expanded=False):
+        # AK 输入
+        ak = st.text_input(
+            "Access Key",
+            value=ak,
+            help=tr("Volcengine Access Key Help")
+        )

-    # Token 输入
-    token = st.text_input(
-        "Token",
-        value=config.doubaotts.get("token", ""),
-        type="password",
-        help=tr("Doubao Token Help")
-    )
+        # SK 输入
+        sk = st.text_input(
+            "Secret Key",
+            value=sk,
+            type="password",
+            help=tr("Volcengine Secret Key Help")
+        )

-    # 集群配置
-    cluster = st.text_input(
-        tr("Cluster"),
-        value=config.doubaotts.get("cluster", "volcano_tts"),
-        help=tr("Doubao Cluster Help")
-    )
+        # AppID 输入
+        appid = st.text_input(
+            "AppID",
+            value=appid,
+            help=tr("Doubao AppID Help")
+        )
+
+        # Token 输入
+        token = st.text_input(
+            "Token",
+            value=token,
+            type="password",
+            help=tr("Doubao Token Help")
+        )
+
+        # 集群配置
+        cluster = st.text_input(
+            tr("Cluster"),
+            value=cluster,
+            help=tr("Doubao Cluster Help")
+        )

    # 音色选择
    # 在线音色列表（从文档中提取）
@ -1676,6 +1689,7 @@ def render_doubaotts_settings(tr):
        st.info(tr("Doubao TTS Fill Credentials Notice"))
    
    # 保存配置
+    config.doubaotts["api_key"] = api_key
    config.doubaotts["ak"] = ak
    config.doubaotts["sk"] = sk
    config.doubaotts["appid"] = appid
@ -1690,20 +1704,10 @@ def render_doubaotts_settings(tr):
    st.session_state['voice_rate'] = voice_rate # 确保语速参数被保存到session state

    # 显示配置状态
-    if ak and sk and appid and token:
+    if api_key or (appid and token):
        st.success(tr("Doubao TTS configured"))
    else:
-        missing = []
-        if not ak:
-            missing.append("Access Key")
-        if not sk:
-            missing.append("Secret Key")
-        if not appid:
-            missing.append("AppID")
-        if not token:
-            missing.append("Token")
-        if missing:
-            st.warning(tr("Please configure missing fields").format(fields=', '.join(missing)))
+        st.warning(tr("Please configure missing fields").format(fields="API Key / AppID + Token"))


 def render_voice_preview_new(tr, selected_engine):
--- a/webui/i18n/en.json
+++ b/webui/i18n/en.json
@ -652,6 +652,8 @@
    "OmniVoice Usage Instructions": "**OmniVoice-Pack speech synthesis**\n\n1. **Automatic voice**: set the API URL and language, then synthesize directly.\n2. **Voice design**: fill instruct with the desired gender, pitch, accent, or style.\n3. **Reference-audio clone**: upload or choose reference audio and fill its matching transcript.\n\n**Notes**:\n- The default service URL is http://127.0.0.1:7866/tts\n- Reference-audio cloning requires reference text when the service has no ASR model loaded\n- OmniVoice returns WAV audio, and NarratoAI estimates subtitle segment timing from the audio duration",
    "Volcengine Access Key Help": "Volcengine Access Key",
    "Volcengine Secret Key Help": "Volcengine Secret Key",
+    "Doubao API Key Help": "API Key from the new Doubao Speech console. Preferred over the legacy credentials; when it is set, AppID and Token are not required.",
+    "Doubao Legacy Credentials": "Legacy AppID / Token Credentials",
    "Doubao AppID Help": "Doubao TTS application AppID",
    "Doubao Token Help": "Doubao TTS application Token",
    "Cluster": "Cluster",
@ -664,13 +666,13 @@
    "Sentence Silence Duration Help": "Adjust sentence-end silence duration (0.0-2.0 seconds)",
    "Doubao TTS API Key Application Process": "Doubao TTS API Key Application Process",
    "Application Steps": "Application Steps",
-    "Doubao TTS Step 1": "1. Open [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)",
-    "Doubao TTS Step 2": "2. Create a new Access Key and Secret Key",
-    "Doubao TTS Step 3": "3. Open [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)",
-    "Doubao TTS Step 4": "4. Click Start Now",
-    "Doubao TTS Step 5": "5. In the left API Service Center, find Speech Synthesis under Audio Generation (note: Speech Synthesis, not the speech synthesis large model)",
-    "Doubao TTS Step 6": "6. Scroll to the bottom to get the APPID and Access Token",
-    "Doubao TTS Fill Credentials Notice": "Fill the Access Key, Secret Key, AppID, and Token above.",
+    "Doubao TTS Step 1": "1. Open the Volcengine Doubao Speech console",
+    "Doubao TTS Step 2": "2. Open API Key management and create an API Key",
+    "Doubao TTS Step 3": "3. Make sure Doubao speech synthesis is enabled",
+    "Doubao TTS Step 4": "4. Copy the API Key into the API Key field above",
+    "Doubao TTS Step 5": "5. The default cluster is volcano_tts and usually does not need to be changed",
+    "Doubao TTS Step 6": "6. Legacy AppID/Token users can keep using the compatibility fields",
+    "Doubao TTS Fill Credentials Notice": "The new setup only requires an API Key. Legacy AppID/Token credentials remain supported.",
    "Doubao TTS configured": "Doubao TTS is configured",
    "Please configure missing fields": "Please configure: {fields}",
    "Preview Voice Synthesis": "Preview Voice Synthesis",
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@ -591,6 +591,8 @@
    "OmniVoice Usage Instructions": "**OmniVoice-Pack 语音合成**\n\n1. **自动音色**：只需要设置 API 地址和语言，可直接合成。\n2. **指令音色**：填写 instruct 描述想要的性别、音高、口音或风格。\n3. **参考音频克隆**：上传或选择参考音频，并填写该音频对应文本。\n\n**注意事项**：\n- 当前默认服务地址为 http://127.0.0.1:7866/tts\n- 参考音频克隆在服务未加载 ASR 模型时必须填写参考文本\n- OmniVoice 返回 WAV 音频，系统会按音频时长估算字幕段落",
    "Volcengine Access Key Help": "火山引擎 Access Key",
    "Volcengine Secret Key Help": "火山引擎 Secret Key",
+    "Doubao API Key Help": "新版豆包语音 API Key；优先使用该字段，无需填写 AppID 和 Token",
+    "Doubao Legacy Credentials": "旧版 AppID / Token 配置（兼容）",
    "Doubao AppID Help": "豆包语音应用 AppID",
    "Doubao Token Help": "豆包语音应用 Token",
    "Cluster": "集群",
@ -603,13 +605,13 @@
    "Sentence Silence Duration Help": "调节句尾静音时长 (0.0-2.0 秒)",
    "Doubao TTS API Key Application Process": "豆包语音 TTS API Key申请流程",
    "Application Steps": "申请步骤",
-    "Doubao TTS Step 1": "1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)",
-    "Doubao TTS Step 2": "2. 新建 Access Key 和 Secret Key",
-    "Doubao TTS Step 3": "3. 打开 [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)",
-    "Doubao TTS Step 4": "4. 点击立即使用",
-    "Doubao TTS Step 5": "5. 在最左边的 API 服务中心找到音频生成下面的语音合成（注意：是语音合成，不是语音合成大模型）",
-    "Doubao TTS Step 6": "6. 翻到最下面获取 APPID 和 Access Token",
-    "Doubao TTS Fill Credentials Notice": "请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中",
+    "Doubao TTS Step 1": "1. 打开火山引擎豆包语音控制台",
+    "Doubao TTS Step 2": "2. 进入 API Key 管理并创建 API Key",
+    "Doubao TTS Step 3": "3. 确认已开通豆包语音合成服务",
+    "Doubao TTS Step 4": "4. 复制 API Key 并填写到上方 API Key 输入框",
+    "Doubao TTS Step 5": "5. 默认集群使用 volcano_tts，通常无需修改",
+    "Doubao TTS Step 6": "6. 旧版 AppID/Token 用户可继续在兼容配置中填写原凭据",
+    "Doubao TTS Fill Credentials Notice": "新版配置只需要填写 API Key；旧版 AppID/Token 仍保留兼容",
    "Doubao TTS configured": "豆包语音 TTS 配置已设置",
    "Please configure missing fields": "请配置: {fields}",
    "Preview Voice Synthesis": "试听语音合成",
 @ -1 +1 @@
 .8.3
 .8.4