Merge pull request #237 from aw123456dew/feature/doubao-tts

add doubao tts
2026-07-29 17:35:53 +00:00 · 2026-04-08 15:14:10 +08:00 · 2026-04-08 15:14:10 +08:00 · 8c129790c7
commit 8c129790c7
parent de33c6d0bd 71dfc99839
4 changed files with 428 additions and 8 deletions
--- a/app/config/config.py
+++ b/app/config/config.py
@ -82,6 +82,7 @@ def save_config():
        _cfg["ui"] = ui
        _cfg["tts_qwen"] = tts_qwen
        _cfg["indextts2"] = indextts2
+        _cfg["doubaotts"] = doubaotts
        f.write(toml.dumps(_cfg))


@ -96,6 +97,7 @@ ui = _cfg.get("ui", {})
 frames = _cfg.get("frames", {})
 tts_qwen = _cfg.get("tts_qwen", {})
 indextts2 = _cfg.get("indextts2", {})
+doubaotts = _cfg.get("doubaotts", {})

 hostname = socket.gethostname()

--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -1116,6 +1116,125 @@ def should_use_azure_speech_services(voice_name: str) -> bool:
    return False


+def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
+    """
+    Generate speech with the Doubao TTS HTTP API and write it to ``voice_file``.
+
+    Settings are read from ``config.doubaotts``: ``appid`` and ``token`` are
+    required; ``cluster``, ``volume``, ``pitch`` and ``silence_duration`` are
+    optional. The request is attempted up to three times.
+
+    Args:
+        text: Text to synthesize; surrounding whitespace is stripped.
+        voice_name: Sent to the API as ``voice_type``.
+        voice_file: Destination path for the decoded audio bytes (MP3 is requested).
+        speed: Speech speed ratio, clamped to the range 0.2-3.0.
+
+    Returns:
+        A new SubMaker without timestamps on success. None when the
+        configuration is incomplete, the text is empty, a numeric setting
+        cannot be converted to float, or every attempt fails.
+    """
+    import base64
+    import uuid
+
+    # Load configuration; tolerate a missing or empty section
+    doubaotts_cfg = getattr(config, "doubaotts", {}) or {}
+    appid = doubaotts_cfg.get("appid", "")
+    token = doubaotts_cfg.get("token", "")
+    cluster = doubaotts_cfg.get("cluster", "volcano_tts")
+
+    if not appid or not token:
+        logger.error("豆包语音 TTS 配置未完成")
+        return None
+
+    text = text.strip()
+    if not text:
+        # Nothing to synthesize: fail fast instead of sending an empty request three times
+        logger.error("豆包语音 TTS 文本为空")
+        return None
+
+    # Convert every numeric setting up front so a mis-typed config value
+    # (for example a string) is reported instead of raising out of this function
+    try:
+        safe_speed = max(0.2, min(3.0, float(speed)))
+        volume = float(doubaotts_cfg.get("volume", 1.0))
+        pitch = float(doubaotts_cfg.get("pitch", 1.0))
+        silence_duration = float(doubaotts_cfg.get("silence_duration", 0.125))
+    except (TypeError, ValueError) as e:
+        logger.error(f"豆包语音 TTS 参数无效: {e}")
+        return None
+
+    # Build the request body
+    payload = {
+        "app": {
+            "appid": appid,
+            "token": token,
+            "cluster": cluster
+        },
+        "user": {
+            "uid": "NarratoAI"
+        },
+        "audio": {
+            "voice_type": voice_name,
+            "encoding": "mp3",
+            "rate": 24000,
+            "speed_ratio": safe_speed,
+            "volume_ratio": volume,
+            "pitch_ratio": pitch
+        },
+        "request": {
+            "reqid": str(uuid.uuid4()),
+            "text": text,
+            "text_type": "plain",
+            "operation": "query"
+        }
+    }
+
+    # Only send the trailing-silence setting when it is positive
+    if silence_duration > 0:
+        payload["audio"]["silence_duration"] = silence_duration
+
+    # API endpoint
+    url = "https://openspeech.bytedance.com/api/v1/tts"
+
+    # NOTE(review): the "Bearer;<token>" form (semicolon, no space) is kept
+    # exactly as the request was originally written; confirm against the API docs.
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer;{token}"
+    }
+
+    # Proxy settings do not change between attempts, so resolve them once
+    proxies = None
+    proxy_cfg = getattr(config, "proxy", {}) or {}
+    if proxy_cfg.get("enabled", False):
+        proxy_url = proxy_cfg.get("https", proxy_cfg.get("http", ""))
+        if proxy_url:
+            proxies = {"https": proxy_url, "http": proxy_url}
+
+    max_attempts = 3
+    for attempt in range(1, max_attempts + 1):
+        # Wait 1s after an API-level failure, 3s after an exception
+        retry_delay = 1
+        try:
+            logger.info(f"=== 豆包语音 TTS 请求参数 (第 {attempt} 次调用) ===")
+
+            # Imported inside the try so a missing dependency is logged like any other failure
+            import requests
+            response = requests.post(url, json=payload, headers=headers, proxies=proxies, timeout=60)
+
+            if response.status_code != 200:
+                logger.error(f"豆包语音 TTS API 请求失败: {response.status_code}, {response.text}")
+            else:
+                result = response.json()
+                if result.get("code") != 3000:
+                    # Any code other than 3000 is treated as a failure
+                    logger.error(f"豆包语音 TTS 失败: {result.get('message', '未知错误')}")
+                else:
+                    audio_data = result.get("data", "")
+                    if not audio_data:
+                        logger.error("豆包语音 TTS 响应中无音频数据")
+                    else:
+                        # The audio arrives base64-encoded; decode and write it out
+                        with open(voice_file, "wb") as f:
+                            f.write(base64.b64decode(audio_data))
+
+                        logger.success(f"豆包语音 TTS 合成成功: {voice_file}")
+
+                        # Simplified SubMaker: no timestamps are available from this call
+                        return new_sub_maker()
+        except Exception as e:
+            logger.error(f"豆包语音 TTS 错误: {str(e)}")
+            retry_delay = 3
+
+        if attempt < max_attempts:
+            time.sleep(retry_delay)
+
+    return None
+
+
 def tts(
    text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str, tts_engine: str
 ) -> Union[SubMaker, None]:
@ -1147,6 +1266,10 @@ def tts(
    if tts_engine == "indextts2":
        logger.info("分发到 IndexTTS2")
        return indextts2_tts(text, voice_name, voice_file, speed=voice_rate)
+    
+    if tts_engine == "doubaotts":
+        logger.info("分发到豆包语音 TTS")
+        return doubaotts_tts(text, voice_name, voice_file, speed=voice_rate)

    # Fallback for unknown engine - default to azure v1
    logger.warning(f"未知的 TTS 引擎: '{tts_engine}', 将默认使用 Edge TTS (Azure V1)。")
@ -1606,8 +1729,8 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
                             f"或者使用其他 tts 引擎")
                continue
            else:
-                # SoulVoice、Qwen3、IndexTTS2 引擎不生成字幕文件
-                if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2":
+                # SoulVoice、Qwen3、IndexTTS2、豆包语音 引擎不生成字幕文件
+                if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2" or tts_engine == "doubaotts":
                    # 获取实际音频文件的时长
                    duration = get_audio_duration_from_file(audio_file)
                    if duration <= 0:
@ -1615,8 +1738,27 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
                        duration = get_audio_duration(sub_maker)
                        if duration <= 0:
                            # 最后的 fallback，基于文本长度估算
-                            duration = max(1.0, len(text) / 3.0)
-                            logger.warning(f"无法获取音频时长，使用文本估算: {duration:.2f}秒")
+                            # 对于英文文本，使用更准确的估算方法
+                            # 英文平均语速约为每分钟150-180个单词，即每秒2.5-3个单词
+                            # 对于中文文本，约为每秒3-4字
+                            import re
+                            # 计算英文单词数
+                            english_words = len(re.findall(r'\b\w+\b', text))
+                            # 计算中文字符数
+                            chinese_chars = len(re.findall(r'[\u4e00-\u9fa5]', text))
+                            
+                            if english_words > chinese_chars:
+                                # 主要是英文文本
+                                # 假设平均每个单词需要0.35秒
+                                estimated_duration = max(1.0, english_words * 0.35)
+                            else:
+                                # 主要是中文文本
+                                # 假设平均每个汉字需要0.3秒
+                                estimated_duration = max(1.0, chinese_chars * 0.3)
+                            
+                            # 确保估算时长合理
+                            duration = max(1.0, estimated_duration)
+                            logger.warning(f"无法获取音频时长，使用文本估算: {duration:.2f}秒 (英文单词: {english_words}, 中文字符: {chinese_chars})")
                    # 不创建字幕文件
                    subtitle_file = ""
                else:
@ -1658,8 +1800,6 @@ def get_audio_duration_from_file(audio_file: str) -> float:
        # 但实际文件还包含头部信息，所以调整系数
        estimated_duration = max(1.0, file_size / 20000)  # 调整为更保守的估算

-        # 对于中文语音，根据文本长度进行二次校正
-        # 一般中文语音速度约为 3-4 字/秒
        logger.warning(f"使用文件大小估算音频时长: {estimated_duration:.2f}秒")
        return estimated_duration
    except Exception as e:
--- a/config.example.toml
+++ b/config.example.toml
@ -114,9 +114,25 @@
    do_sample = true
    num_beams = 3
    repetition_penalty = 10.0
+[doubaotts]
+    # 豆包语音 TTS 配置
+    # 申请流程：
+    # 1. 打开 https://console.volcengine.com/iam/keymanage 新建 Access Key 和 Secret Key
+    # 2. 打开 https://www.volcengine.com/product/voice-tech 点击立即使用
+    # 3. 在 API 服务中心找到音频生成下面的语音合成，获取 APPID 和 Token
+    ak = ""
+    sk = ""
+    appid = ""
+    token = ""
+    cluster = "volcano_tts"
+    
+    # 高级参数
+    volume = 1.0
+    pitch = 1.0
+    silence_duration = 0.125

 [ui]
-    # TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen)
+    # TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen, indextts2, doubaotts)
    tts_engine = "edge_tts"

    # Edge TTS 配置
@ -130,6 +146,10 @@
    azure_volume = 80
    azure_rate = 1.0
    azure_pitch = 0
+    
+    # 豆包语音 TTS 配置
+    doubaotts_voice_type = "BV700_V2_streaming"
+    doubaotts_rate = 1.0

 ##########################################
 # 代理和网络配置
--- a/webui/components/audio_settings.py
+++ b/webui/components/audio_settings.py
@ -26,7 +26,8 @@ def get_tts_engine_options():
        "azure_speech": "Azure Speech Services",
        "tencent_tts": "腾讯云 TTS",
        "qwen3_tts": "通义千问 Qwen3 TTS",
-        "indextts2": "IndexTTS2 语音克隆"
+        "indextts2": "IndexTTS2 语音克隆",
+        "doubaotts": "豆包语音 TTS"
    }


@ -62,6 +63,12 @@ def get_tts_engine_descriptions():
            "features": "零样本语音克隆，上传参考音频即可合成相同音色的语音，需要本地或私有部署",
            "use_case": "下载地址：https://pan.quark.cn/s/0767c9bcefd5",
            "registration": None
+        },
+        "doubaotts": {
+            "title": "豆包语音 TTS",
+            "features": "火山引擎豆包语音合成，支持多种音色和情感，国内访问速度快",
+            "use_case": "需要高质量中文语音合成的用户",
+            "registration": "https://www.volcengine.com/product/voice-tech"
        }
    }

@ -147,6 +154,8 @@ def render_tts_settings(tr):
        render_qwen3_tts_settings(tr)
    elif selected_engine == "indextts2":
        render_indextts2_tts_settings(tr)
+    elif selected_engine == "doubaotts":
+        render_doubaotts_settings(tr)

    # 4. 试听功能
    render_voice_preview_new(tr, selected_engine)
@ -703,6 +712,250 @@ def render_indextts2_tts_settings(tr):
        config.ui["voice_name"] = f"indextts2:{reference_audio}"


+def render_doubaotts_settings(tr):
+    """
+    Render the Doubao TTS settings form and store the entered values in config.
+
+    Draws inputs for the credentials (Access Key, Secret Key, AppID, Token,
+    cluster), a voice selector, and sliders for speed, volume, pitch and
+    trailing silence. On every run the current widget values are written to
+    ``config.doubaotts``, ``config.ui`` and ``st.session_state['voice_rate']``,
+    and a status message lists any credential field that is still empty.
+
+    Args:
+        tr: Translation helper passed by the caller; not used in this function.
+    """
+
+    def _slider_value(raw, low, high, default):
+        # Stored values may be ints (e.g. `volume = 1` in TOML), out of range,
+        # or not numeric at all. The sliders below use float bounds, and
+        # Streamlit is expected to reject a value whose type or range does not
+        # match them, so normalise to a float inside [low, high] first.
+        try:
+            number = float(raw)
+        except (TypeError, ValueError):
+            number = default
+        return max(low, min(high, number))
+
+    # Access Key input
+    ak = st.text_input(
+        "Access Key",
+        value=config.doubaotts.get("ak", ""),
+        help="火山引擎 Access Key"
+    )
+
+    # Secret Key input
+    sk = st.text_input(
+        "Secret Key",
+        value=config.doubaotts.get("sk", ""),
+        type="password",
+        help="火山引擎 Secret Key"
+    )
+
+    # AppID input
+    appid = st.text_input(
+        "AppID",
+        value=config.doubaotts.get("appid", ""),
+        help="豆包语音应用 AppID"
+    )
+
+    # Token input
+    token = st.text_input(
+        "Token",
+        value=config.doubaotts.get("token", ""),
+        type="password",
+        help="豆包语音应用 Token"
+    )
+
+    # Cluster setting
+    cluster = st.text_input(
+        "集群",
+        value=config.doubaotts.get("cluster", "volcano_tts"),
+        help="业务集群，标准音色使用 volcano_tts"
+    )
+
+    # Voice selection: voice ID -> display name
+    voice_options = {
+        "BV700_V2_streaming": "灿灿 2.0",
+        "BV705_streaming": "炀炀",
+        "BV701_V2_streaming": "擎苍 2.0",
+        "BV001_V2_streaming": "通用女声 2.0",
+        "BV700_streaming": "灿灿",
+        "BV406_V2_streaming": "超自然音色-梓梓2.0",
+        "BV406_streaming": "超自然音色-梓梓",
+        "BV407_V2_streaming": "超自然音色-燃燃2.0",
+        "BV407_streaming": "超自然音色-燃燃",
+        "BV001_streaming": "通用女声",
+        "BV002_streaming": "通用男声",
+        "BV701_streaming": "擎苍",
+        "BV123_streaming": "阳光青年",
+        "BV120_streaming": "反卷青年",
+        "BV119_streaming": "通用赘婿",
+        "BV115_streaming": "古风少御",
+        "BV107_streaming": "霸气青叔",
+        "BV100_streaming": "质朴青年",
+        "BV104_streaming": "温柔淑女",
+        "BV004_streaming": "开朗青年",
+        "BV113_streaming": "甜宠少御",
+        "BV102_streaming": "儒雅青年",
+        "BV405_streaming": "甜美小源",
+        "BV007_streaming": "亲切女声",
+        "BV009_streaming": "知性女声",
+        "BV419_streaming": "诚诚",
+        "BV415_streaming": "童童",
+        "BV008_streaming": "亲切男声",
+        "BV408_streaming": "译制片男声",
+        "BV426_streaming": "懒小羊",
+        "BV428_streaming": "清新文艺女声",
+        "BV403_streaming": "鸡汤女声",
+        "BV158_streaming": "智慧老者",
+        "BV157_streaming": "慈爱姥姥",
+        "BR001_streaming": "说唱小哥",
+        "BV410_streaming": "活力解说男",
+        "BV411_streaming": "影视解说小帅",
+        "BV437_streaming": "解说小帅-多情感",
+        "BV412_streaming": "影视解说小美",
+        "BV159_streaming": "纨绔青年",
+        "BV418_streaming": "直播一姐",
+        "BV142_streaming": "沉稳解说男",
+        "BV143_streaming": "潇洒青年",
+        "BV056_streaming": "阳光男声",
+        "BV005_streaming": "活泼女声",
+        "BV064_streaming": "小萝莉",
+        "BV051_streaming": "奶气萌娃",
+        "BV063_streaming": "动漫海绵",
+        "BV417_streaming": "动漫海星",
+        "BV050_streaming": "动漫小新",
+        "BV061_streaming": "天才童声",
+        "BV401_streaming": "促销男声",
+        "BV402_streaming": "促销女声",
+        "BV006_streaming": "磁性男声",
+        "BV011_streaming": "新闻女声",
+        "BV012_streaming": "新闻男声",
+        "BV034_streaming": "知性姐姐-双语",
+        "BV033_streaming": "温柔小哥",
+        "BV511_streaming": "慵懒女声-Ava",
+        "BV505_streaming": "议论女声-Alicia",
+        "BV138_streaming": "情感女声-Lawrence",
+        "BV027_streaming": "美式女声-Amelia",
+        "BV502_streaming": "讲述女声-Amanda",
+        "BV503_streaming": "活力女声-Ariana",
+        "BV504_streaming": "活力男声-Jackson",
+        "BV421_streaming": "天才少女",
+        "BV702_streaming": "Stefan",
+        "BV506_streaming": "天真萌娃-Lily",
+        "BV040_streaming": "亲切女声-Anna",
+        "BV516_streaming": "澳洲男声-Henry",
+        "BV520_streaming": "元气少女",
+        "BV521_streaming": "萌系少女",
+        "BV522_streaming": "气质女声",
+        "BV524_streaming": "日语男声",
+        "BV531_streaming": "活力男声Carlos（巴西地区）",
+        "BV530_streaming": "活力女声（巴西地区）",
+        "BV065_streaming": "气质御姐（墨西哥地区）",
+        "BV021_streaming": "东北老铁",
+        "BV020_streaming": "东北丫头",
+        "BV704_streaming": "方言灿灿",
+        "BV210_streaming": "西安佟掌柜",
+        "BV217_streaming": "沪上阿姐",
+        "BV213_streaming": "广西表哥",
+        "BV025_streaming": "甜美台妹",
+        "BV227_streaming": "台普男声",
+        "BV026_streaming": "港剧男神",
+        "BV424_streaming": "广东女仔",
+        "BV212_streaming": "相声演员",
+        "BV019_streaming": "重庆小伙",
+        "BV221_streaming": "四川甜妹儿",
+        "BV423_streaming": "重庆幺妹儿",
+        "BV214_streaming": "乡村企业家",
+        "BV226_streaming": "湖南妹坨",
+        "BV216_streaming": "长沙靓女"
+    }
+
+    # A saved voice that is not in the built-in list is offered as a custom entry
+    saved_voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
+    if saved_voice_type not in voice_options:
+        voice_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})"
+
+    # Select on the voice IDs and only format them for display, so the chosen
+    # ID never has to be recovered from its display name
+    voice_ids = list(voice_options)
+    voice_type = st.selectbox(
+        "音色选择",
+        options=voice_ids,
+        index=voice_ids.index(saved_voice_type),
+        format_func=lambda voice_id: voice_options[voice_id],
+        help="选择豆包语音 TTS 音色"
+    )
+
+    # Collapsible panel with the advanced parameters
+    with st.expander("🔧 高级参数", expanded=False):
+        col1, col2 = st.columns(2)
+
+        with col1:
+            # Speech speed
+            voice_rate = st.slider(
+                "语速调节",
+                min_value=0.2,
+                max_value=3.0,
+                value=_slider_value(config.ui.get("doubaotts_rate", 1.0), 0.2, 3.0, 1.0),
+                step=0.1,
+                help="调节语音速度 (0.2-3.0)"
+            )
+
+            # Volume
+            voice_volume = st.slider(
+                "音量调节",
+                min_value=0.1,
+                max_value=2.0,
+                value=_slider_value(config.doubaotts.get("volume", 1.0), 0.1, 2.0, 1.0),
+                step=0.1,
+                help="调节语音音量 (0.1-2.0)"
+            )
+
+        with col2:
+            # Pitch
+            voice_pitch = st.slider(
+                "音高调节",
+                min_value=0.5,
+                max_value=1.5,
+                value=_slider_value(config.doubaotts.get("pitch", 1.0), 0.5, 1.5, 1.0),
+                step=0.1,
+                help="调节语音音高 (0.5-1.5)"
+            )
+
+            # Silence appended at the end of a sentence
+            silence_duration = st.slider(
+                "句尾静音时长 (秒)",
+                min_value=0.0,
+                max_value=2.0,
+                value=_slider_value(config.doubaotts.get("silence_duration", 0.125), 0.0, 2.0, 0.125),
+                step=0.05,
+                help="调节句尾静音时长 (0.0-2.0秒)"
+            )
+
+    # Instructions for obtaining the credentials
+    with st.expander("💡 豆包语音 TTS API Key申请流程", expanded=False):
+        st.write("**申请步骤：**")
+        st.write("1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)")
+        st.write("2. 新建 Access Key 和 Secret Key")
+        st.write("3. 打开 [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)")
+        st.write("4. 点击立即使用")
+        st.write("5. 在最左边的API服务中心找到音频生成下面的语音合成（注意：是语音合成，不是语音合成大模型）")
+        st.write("6. 翻到最下面获取 APPID 和 Access Token")
+
+        st.write("")
+        st.info("💡 请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中")
+
+    # Persist the current widget values
+    config.doubaotts["ak"] = ak
+    config.doubaotts["sk"] = sk
+    config.doubaotts["appid"] = appid
+    config.doubaotts["token"] = token
+    config.doubaotts["cluster"] = cluster
+    config.doubaotts["volume"] = voice_volume
+    config.doubaotts["pitch"] = voice_pitch
+    config.doubaotts["silence_duration"] = silence_duration
+    config.ui["doubaotts_voice_type"] = voice_type
+    config.ui["doubaotts_rate"] = voice_rate
+    config.ui["voice_name"] = voice_type  # kept for compatibility
+    st.session_state['voice_rate'] = voice_rate  # make the speed available via session state
+
+    # Configuration status: list every credential field that is still empty
+    required_fields = {
+        "Access Key": ak,
+        "Secret Key": sk,
+        "AppID": appid,
+        "Token": token,
+    }
+    missing = [label for label, value in required_fields.items() if not value]
+    if missing:
+        st.warning(f"⚠️ 请配置: {', '.join(missing)}")
+    else:
+        st.success("✅ 豆包语音 TTS 配置已设置")
+
+
 def render_voice_preview_new(tr, selected_engine):
    """渲染新的语音试听功能"""
    if st.button("🎵 试听语音合成", use_container_width=True):
@ -746,6 +999,11 @@ def render_voice_preview_new(tr, selected_engine):
                voice_name = f"indextts2:{reference_audio}"
            voice_rate = 1.0  # IndexTTS2 不支持速度调节
            voice_pitch = 1.0  # IndexTTS2 不支持音调调节
+        elif selected_engine == "doubaotts":
+            voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
+            voice_name = voice_type
+            voice_rate = config.ui.get("doubaotts_rate", 1.0)
+            voice_pitch = 1.0  # 豆包语音 TTS 不支持音调调节

        if not voice_name:
            st.error("请先配置语音设置")