diff --git a/app/config/config.py b/app/config/config.py
index 21026e8..ddc091c 100644
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -82,6 +82,7 @@ def save_config():
         _cfg["ui"] = ui
         _cfg["tts_qwen"] = tts_qwen
         _cfg["indextts2"] = indextts2
+        _cfg["doubaotts"] = doubaotts
         f.write(toml.dumps(_cfg))
 
 
@@ -96,6 +97,7 @@ ui = _cfg.get("ui", {})
 frames = _cfg.get("frames", {})
 tts_qwen = _cfg.get("tts_qwen", {})
 indextts2 = _cfg.get("indextts2", {})
+doubaotts = _cfg.get("doubaotts", {})
 
 hostname = socket.gethostname()
 
diff --git a/app/services/voice.py b/app/services/voice.py
index 4e5e419..bda672b 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -1116,6 +1116,144 @@ def should_use_azure_speech_services(voice_name: str) -> bool:
     return False
 
 
+def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
+    """
+    Synthesize speech with the Doubao (Volcengine) TTS HTTP API.
+
+    Credentials and tuning values are read from ``config.doubaotts``. The
+    request is attempted up to three times; on success the base64 audio in
+    the response is decoded and written to ``voice_file``.
+
+    Args:
+        text: Text to synthesize; surrounding whitespace is stripped.
+        voice_name: Doubao voice id, sent as ``audio.voice_type``.
+        voice_file: Path the decoded audio bytes are written to.
+        speed: Speech rate, clamped to the range 0.2-3.0.
+
+    Returns:
+        The object returned by ``new_sub_maker()`` (no timestamps are added)
+        on success; None when the configuration is incomplete, the text is
+        empty, or every attempt failed.
+    """
+    # Function-scope imports, hoisted out of the retry loop.
+    import base64
+    import uuid
+
+    import requests
+
+    doubaotts_cfg = getattr(config, "doubaotts", {}) or {}
+    appid = doubaotts_cfg.get("appid", "")
+    token = doubaotts_cfg.get("token", "")
+    cluster = doubaotts_cfg.get("cluster", "volcano_tts")
+
+    if not appid or not token:
+        logger.error("豆包语音 TTS 配置未完成")
+        return None
+
+    text = text.strip()
+    if not text:
+        # Nothing to synthesize: avoid three doomed requests and their sleeps.
+        logger.error("豆包语音 TTS 文本为空")
+        return None
+
+    safe_speed = float(max(0.2, min(3.0, speed)))
+
+    # Advanced parameters, converted once so that a string or int coming from
+    # the TOML file cannot break the numeric comparison further down.
+    volume = float(doubaotts_cfg.get("volume", 1.0))
+    pitch = float(doubaotts_cfg.get("pitch", 1.0))
+    silence_duration = float(doubaotts_cfg.get("silence_duration", 0.125))
+
+    payload = {
+        "app": {
+            "appid": appid,
+            "token": token,
+            "cluster": cluster,
+        },
+        "user": {
+            "uid": "NarratoAI",
+        },
+        "audio": {
+            "voice_type": voice_name,
+            "encoding": "mp3",
+            "rate": 24000,
+            "speed_ratio": safe_speed,
+            "volume_ratio": volume,
+            "pitch_ratio": pitch,
+        },
+        "request": {
+            "text": text,
+            "text_type": "plain",
+            "operation": "query",
+        },
+    }
+
+    if silence_duration > 0:
+        # NOTE(review): the Volcengine parameter table appears to list
+        # silence_duration under "request" as integer milliseconds, while the
+        # config stores seconds - confirm against the current API docs.
+        payload["request"]["silence_duration"] = int(round(silence_duration * 1000))
+
+    url = "https://openspeech.bytedance.com/api/v1/tts"
+
+    # Bearer token authentication header.
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer;{token}",
+    }
+
+    # Proxy settings do not change between attempts, so resolve them once.
+    proxies = None
+    proxy_cfg = getattr(config, "proxy", {}) or {}
+    if proxy_cfg.get("enabled", False):
+        proxy_url = proxy_cfg.get("https", proxy_cfg.get("http", ""))
+        if proxy_url:
+            proxies = {"https": proxy_url, "http": proxy_url}
+
+    max_attempts = 3
+    for i in range(max_attempts):
+        is_last_attempt = i == max_attempts - 1
+        try:
+            logger.info(f"=== 豆包语音 TTS 请求参数 (第 {i+1} 次调用) ===")
+
+            # Use a fresh request id for every attempt so a retry is never
+            # sent with the id of a request the server has already seen.
+            payload["request"]["reqid"] = str(uuid.uuid4())
+            response = requests.post(url, json=payload, headers=headers, proxies=proxies, timeout=60)
+
+            if response.status_code != 200:
+                logger.error(f"豆包语音 TTS API 请求失败: {response.status_code}, {response.text}")
+            else:
+                result = response.json()
+                if result.get("code") != 3000:
+                    logger.error(f"豆包语音 TTS 失败: {result.get('message', '未知错误')}")
+                else:
+                    audio_data = result.get("data", "")
+                    if audio_data:
+                        # Decode before opening the file so a bad payload
+                        # does not leave an empty audio file behind.
+                        audio_bytes = base64.b64decode(audio_data)
+                        with open(voice_file, "wb") as f:
+                            f.write(audio_bytes)
+
+                        logger.success(f"豆包语音 TTS 合成成功: {voice_file}")
+
+                        # Simplified result: a new SubMaker to which no
+                        # timestamps are added.
+                        return new_sub_maker()
+
+                    logger.error("豆包语音 TTS 响应中无音频数据")
+
+            if not is_last_attempt:
+                time.sleep(1)
+        except Exception as e:
+            logger.error(f"豆包语音 TTS 错误: {str(e)}")
+            if not is_last_attempt:
+                time.sleep(3)
+
+    return None
+
+
 def tts(
     text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str, tts_engine: str
 ) -> Union[SubMaker, None]:
@@ -1147,6 +1285,10 @@ def tts(
     if tts_engine == "indextts2":
         logger.info("分发到 IndexTTS2")
         return indextts2_tts(text, voice_name, voice_file, speed=voice_rate)
+
+    if tts_engine == "doubaotts":
+        logger.info("分发到豆包语音 TTS")
+        return doubaotts_tts(text, voice_name, voice_file, speed=voice_rate)
 
     # Fallback for unknown
engine - default to azure v1 logger.warning(f"未知的 TTS 引擎: '{tts_engine}', 将默认使用 Edge TTS (Azure V1)。") @@ -1606,8 +1729,8 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"或者使用其他 tts 引擎") continue else: - # SoulVoice、Qwen3、IndexTTS2 引擎不生成字幕文件 - if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2": + # SoulVoice、Qwen3、IndexTTS2、豆包语音 引擎不生成字幕文件 + if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2" or tts_engine == "doubaotts": # 获取实际音频文件的时长 duration = get_audio_duration_from_file(audio_file) if duration <= 0: @@ -1615,8 +1738,27 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f duration = get_audio_duration(sub_maker) if duration <= 0: # 最后的 fallback,基于文本长度估算 - duration = max(1.0, len(text) / 3.0) - logger.warning(f"无法获取音频时长,使用文本估算: {duration:.2f}秒") + # 对于英文文本,使用更准确的估算方法 + # 英文平均语速约为每分钟150-180个单词,即每秒2.5-3个单词 + # 对于中文文本,约为每秒3-4字 + import re + # 计算英文单词数 + english_words = len(re.findall(r'\b\w+\b', text)) + # 计算中文字符数 + chinese_chars = len(re.findall(r'[\u4e00-\u9fa5]', text)) + + if english_words > chinese_chars: + # 主要是英文文本 + # 假设平均每个单词需要0.35秒 + estimated_duration = max(1.0, english_words * 0.35) + else: + # 主要是中文文本 + # 假设平均每个汉字需要0.3秒 + estimated_duration = max(1.0, chinese_chars * 0.3) + + # 确保估算时长合理 + duration = max(1.0, estimated_duration) + logger.warning(f"无法获取音频时长,使用文本估算: {duration:.2f}秒 (英文单词: {english_words}, 中文字符: {chinese_chars})") # 不创建字幕文件 subtitle_file = "" else: @@ -1658,8 +1800,6 @@ def get_audio_duration_from_file(audio_file: str) -> float: # 但实际文件还包含头部信息,所以调整系数 estimated_duration = max(1.0, file_size / 20000) # 调整为更保守的估算 - # 对于中文语音,根据文本长度进行二次校正 - # 一般中文语音速度约为 3-4 字/秒 logger.warning(f"使用文件大小估算音频时长: {estimated_duration:.2f}秒") return estimated_duration except Exception as e: diff --git a/config.example.toml b/config.example.toml index 781aaa6..5674e39 100644 --- a/config.example.toml +++ 
b/config.example.toml @@ -114,9 +114,25 @@ do_sample = true num_beams = 3 repetition_penalty = 10.0 +[doubaotts] + # 豆包语音 TTS 配置 + # 申请流程: + # 1. 打开 https://console.volcengine.com/iam/keymanage 新建 Access Key 和 Secret Key + # 2. 打开 https://www.volcengine.com/product/voice-tech 点击立即使用 + # 3. 在 API 服务中心找到音频生成下面的语音合成,获取 APPID 和 Token + ak = "" + sk = "" + appid = "" + token = "" + cluster = "volcano_tts" + + # 高级参数 + volume = 1.0 + pitch = 1.0 + silence_duration = 0.125 [ui] - # TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen) + # TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen, doubaotts) tts_engine = "edge_tts" # Edge TTS 配置 @@ -130,6 +146,10 @@ azure_volume = 80 azure_rate = 1.0 azure_pitch = 0 + + # 豆包语音 TTS 配置 + doubaotts_voice_type = "BV700_V2_streaming" + doubaotts_rate = 1.0 ########################################## # 代理和网络配置 diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index d75a34a..ed86698 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -26,7 +26,8 @@ def get_tts_engine_options(): "azure_speech": "Azure Speech Services", "tencent_tts": "腾讯云 TTS", "qwen3_tts": "通义千问 Qwen3 TTS", - "indextts2": "IndexTTS2 语音克隆" + "indextts2": "IndexTTS2 语音克隆", + "doubaotts": "豆包语音 TTS" } @@ -62,6 +63,12 @@ def get_tts_engine_descriptions(): "features": "零样本语音克隆,上传参考音频即可合成相同音色的语音,需要本地或私有部署", "use_case": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", "registration": None + }, + "doubaotts": { + "title": "豆包语音 TTS", + "features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", + "use_case": "需要高质量中文语音合成的用户", + "registration": "https://www.volcengine.com/product/voice-tech" } } @@ -147,6 +154,8 @@ def render_tts_settings(tr): render_qwen3_tts_settings(tr) elif selected_engine == "indextts2": render_indextts2_tts_settings(tr) + elif selected_engine == "doubaotts": + render_doubaotts_settings(tr) # 4. 
试听功能
     render_voice_preview_new(tr, selected_engine)
@@ -703,6 +712,254 @@ def render_indextts2_tts_settings(tr):
         config.ui["voice_name"] = f"indextts2:{reference_audio}"
 
 
+def _doubaotts_slider_value(raw, low: float, high: float, default: float) -> float:
+    """Coerce a stored setting to a float inside the slider range [low, high].
+
+    Streamlit sliders reject a ``value`` whose type differs from the float
+    bounds or that lies outside them, which an int such as ``rate = 1`` or a
+    hand-edited value in the TOML file would trigger.
+    """
+    try:
+        value = float(raw)
+    except (TypeError, ValueError):
+        value = default
+    return min(high, max(low, value))
+
+
+def render_doubaotts_settings(tr):
+    """Render the Doubao TTS settings form and store the values in config.
+
+    Draws inputs for the credentials, cluster, voice and advanced audio
+    parameters, writes every value back to ``config.doubaotts``,
+    ``config.ui`` and ``st.session_state``, then reports which credentials
+    are still missing.
+
+    Args:
+        tr: Caller-supplied argument (presumably the translation function);
+            it is not used here.
+    """
+    ak = st.text_input(
+        "Access Key",
+        value=config.doubaotts.get("ak", ""),
+        help="火山引擎 Access Key"
+    )
+
+    sk = st.text_input(
+        "Secret Key",
+        value=config.doubaotts.get("sk", ""),
+        type="password",
+        help="火山引擎 Secret Key"
+    )
+
+    appid = st.text_input(
+        "AppID",
+        value=config.doubaotts.get("appid", ""),
+        help="豆包语音应用 AppID"
+    )
+
+    token = st.text_input(
+        "Token",
+        value=config.doubaotts.get("token", ""),
+        type="password",
+        help="豆包语音应用 Token"
+    )
+
+    cluster = st.text_input(
+        "集群",
+        value=config.doubaotts.get("cluster", "volcano_tts"),
+        help="业务集群,标准音色使用 volcano_tts"
+    )
+
+    # Online voice catalogue: voice id -> display name.
+    voice_options = {
+        "BV700_V2_streaming": "灿灿 2.0",
+        "BV705_streaming": "炀炀",
+        "BV701_V2_streaming": "擎苍 2.0",
+        "BV001_V2_streaming": "通用女声 2.0",
+        "BV700_streaming": "灿灿",
+        "BV406_V2_streaming": "超自然音色-梓梓2.0",
+        "BV406_streaming": "超自然音色-梓梓",
+        "BV407_V2_streaming": "超自然音色-燃燃2.0",
+        "BV407_streaming": "超自然音色-燃燃",
+        "BV001_streaming": "通用女声",
+        "BV002_streaming": "通用男声",
+        "BV701_streaming": "擎苍",
+        "BV123_streaming": "阳光青年",
+        "BV120_streaming": "反卷青年",
+        "BV119_streaming": "通用赘婿",
+        "BV115_streaming": "古风少御",
+        "BV107_streaming": "霸气青叔",
+        "BV100_streaming": "质朴青年",
+        "BV104_streaming": "温柔淑女",
+        "BV004_streaming": "开朗青年",
+        "BV113_streaming": "甜宠少御",
+        "BV102_streaming": "儒雅青年",
+        "BV405_streaming": "甜美小源",
+        "BV007_streaming": "亲切女声",
+        "BV009_streaming": "知性女声",
+        "BV419_streaming": "诚诚",
+        "BV415_streaming": "童童",
+        "BV008_streaming": "亲切男声",
+        "BV408_streaming": "译制片男声",
+        "BV426_streaming": "懒小羊",
+        "BV428_streaming": "清新文艺女声",
+        "BV403_streaming": "鸡汤女声",
+        "BV158_streaming": "智慧老者",
+        "BV157_streaming": "慈爱姥姥",
+        "BR001_streaming": "说唱小哥",
+        "BV410_streaming": "活力解说男",
+        "BV411_streaming": "影视解说小帅",
+        "BV437_streaming": "解说小帅-多情感",
+        "BV412_streaming": "影视解说小美",
+        "BV159_streaming": "纨绔青年",
+        "BV418_streaming": "直播一姐",
+        "BV142_streaming": "沉稳解说男",
+        "BV143_streaming": "潇洒青年",
+        "BV056_streaming": "阳光男声",
+        "BV005_streaming": "活泼女声",
+        "BV064_streaming": "小萝莉",
+        "BV051_streaming": "奶气萌娃",
+        "BV063_streaming": "动漫海绵",
+        "BV417_streaming": "动漫海星",
+        "BV050_streaming": "动漫小新",
+        "BV061_streaming": "天才童声",
+        "BV401_streaming": "促销男声",
+        "BV402_streaming": "促销女声",
+        "BV006_streaming": "磁性男声",
+        "BV011_streaming": "新闻女声",
+        "BV012_streaming": "新闻男声",
+        "BV034_streaming": "知性姐姐-双语",
+        "BV033_streaming": "温柔小哥",
+        "BV511_streaming": "慵懒女声-Ava",
+        "BV505_streaming": "议论女声-Alicia",
+        "BV138_streaming": "情感女声-Lawrence",
+        "BV027_streaming": "美式女声-Amelia",
+        "BV502_streaming": "讲述女声-Amanda",
+        "BV503_streaming": "活力女声-Ariana",
+        "BV504_streaming": "活力男声-Jackson",
+        "BV421_streaming": "天才少女",
+        "BV702_streaming": "Stefan",
+        "BV506_streaming": "天真萌娃-Lily",
+        "BV040_streaming": "亲切女声-Anna",
+        "BV516_streaming": "澳洲男声-Henry",
+        "BV520_streaming": "元气少女",
+        "BV521_streaming": "萌系少女",
+        "BV522_streaming": "气质女声",
+        "BV524_streaming": "日语男声",
+        "BV531_streaming": "活力男声Carlos(巴西地区)",
+        "BV530_streaming": "活力女声(巴西地区)",
+        "BV065_streaming": "气质御姐(墨西哥地区)",
+        "BV021_streaming": "东北老铁",
+        "BV020_streaming": "东北丫头",
+        "BV704_streaming": "方言灿灿",
+        "BV210_streaming": "西安佟掌柜",
+        "BV217_streaming": "沪上阿姐",
+        "BV213_streaming": "广西表哥",
+        "BV025_streaming": "甜美台妹",
+        "BV227_streaming": "台普男声",
+        "BV026_streaming": "港剧男神",
+        "BV424_streaming": "广东女仔",
+        "BV212_streaming": "相声演员",
+        "BV019_streaming": "重庆小伙",
+        "BV221_streaming": "四川甜妹儿",
+        "BV423_streaming": "重庆幺妹儿",
+        "BV214_streaming": "乡村企业家",
+        "BV226_streaming": "湖南妹坨",
+        "BV216_streaming": "长沙靓女",
+    }
+
+    saved_voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
+    if saved_voice_type not in voice_options:
+        voice_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})"
+
+    # The selectbox holds the voice ids and only formats them for display, so
+    # the chosen id never has to be looked up again from its label.
+    voice_ids = list(voice_options)
+    voice_type = st.selectbox(
+        "音色选择",
+        options=voice_ids,
+        index=voice_ids.index(saved_voice_type),
+        format_func=lambda voice_id: voice_options[voice_id],
+        help="选择豆包语音 TTS 音色"
+    )
+
+    with st.expander("🔧 高级参数", expanded=False):
+        col1, col2 = st.columns(2)
+
+        with col1:
+            voice_rate = st.slider(
+                "语速调节",
+                min_value=0.2,
+                max_value=3.0,
+                value=_doubaotts_slider_value(config.ui.get("doubaotts_rate", 1.0), 0.2, 3.0, 1.0),
+                step=0.1,
+                help="调节语音速度 (0.2-3.0)"
+            )
+
+            voice_volume = st.slider(
+                "音量调节",
+                min_value=0.1,
+                max_value=2.0,
+                value=_doubaotts_slider_value(config.doubaotts.get("volume", 1.0), 0.1, 2.0, 1.0),
+                step=0.1,
+                help="调节语音音量 (0.1-2.0)"
+            )
+
+        with col2:
+            voice_pitch = st.slider(
+                "音高调节",
+                min_value=0.5,
+                max_value=1.5,
+                value=_doubaotts_slider_value(config.doubaotts.get("pitch", 1.0), 0.5, 1.5, 1.0),
+                step=0.1,
+                help="调节语音音高 (0.5-1.5)"
+            )
+
+            silence_duration = st.slider(
+                "句尾静音时长 (秒)",
+                min_value=0.0,
+                max_value=2.0,
+                value=_doubaotts_slider_value(config.doubaotts.get("silence_duration", 0.125), 0.0, 2.0, 0.125),
+                step=0.05,
+                help="调节句尾静音时长 (0.0-2.0秒)"
+            )
+
+    # Step-by-step guide for obtaining the credentials.
+    with st.expander("💡 豆包语音 TTS API Key申请流程", expanded=False):
+        st.write("**申请步骤:**")
+        st.write("1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)")
+        st.write("2. 新建 Access Key 和 Secret Key")
+        st.write("3. 打开 [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)")
+        st.write("4. 点击立即使用")
+        st.write("5. 在最左边的API服务中心找到音频生成下面的语音合成(注意:是语音合成,不是语音合成大模型)")
+        st.write("6. 翻到最下面获取 APPID 和 Access Token")
+
+        st.write("")
+        st.info("💡 请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中")
+
+    # Write every value back to the in-memory config on each rerun.
+    config.doubaotts["ak"] = ak
+    config.doubaotts["sk"] = sk
+    config.doubaotts["appid"] = appid
+    config.doubaotts["token"] = token
+    config.doubaotts["cluster"] = cluster
+    config.doubaotts["volume"] = voice_volume
+    config.doubaotts["pitch"] = voice_pitch
+    config.doubaotts["silence_duration"] = silence_duration
+    config.ui["doubaotts_voice_type"] = voice_type
+    config.ui["doubaotts_rate"] = voice_rate
+    config.ui["voice_name"] = voice_type  # kept for compatibility
+    st.session_state['voice_rate'] = voice_rate  # make the rate available via session state
+
+    # Report which credentials are still empty.
+    required = {"Access Key": ak, "Secret Key": sk, "AppID": appid, "Token": token}
+    missing = [label for label, value in required.items() if not value]
+    if missing:
+        st.warning(f"⚠️ 请配置: {', '.join(missing)}")
+    else:
+        st.success("✅ 豆包语音 TTS 配置已设置")
+
+
 def render_voice_preview_new(tr, selected_engine):
     """渲染新的语音试听功能"""
     if st.button("🎵 试听语音合成", use_container_width=True):
@@ -746,6 +1003,11 @@ def render_voice_preview_new(tr, selected_engine):
                 voice_name = f"indextts2:{reference_audio}"
                 voice_rate = 1.0  # IndexTTS2 不支持速度调节
                 voice_pitch = 1.0  # IndexTTS2 不支持音调调节
+        elif selected_engine == "doubaotts":
+            voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
+            voice_name = voice_type
+            voice_rate = config.ui.get("doubaotts_rate", 1.0)
+            voice_pitch = 1.0  # not forwarded to doubaotts_tts; pitch comes from config.doubaotts
 
         if not voice_name:
             st.error("请先配置语音设置")