diff --git a/app/config/config.py b/app/config/config.py index cb9d1fe..67bb88a 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -52,6 +52,7 @@ def save_config(): _cfg["soulvoice"] = soulvoice _cfg["ui"] = ui _cfg["tts_qwen"] = tts_qwen + _cfg["indextts2"] = indextts2 f.write(toml.dumps(_cfg)) @@ -65,6 +66,7 @@ soulvoice = _cfg.get("soulvoice", {}) ui = _cfg.get("ui", {}) frames = _cfg.get("frames", {}) tts_qwen = _cfg.get("tts_qwen", {}) +indextts2 = _cfg.get("indextts2", {}) hostname = socket.gethostname() diff --git a/app/services/voice.py b/app/services/voice.py index fdc07c8..a1acf94 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1107,6 +1107,10 @@ def tts( if tts_engine == "edge_tts": logger.info("分发到 Edge TTS") return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file) + + if tts_engine == "indextts2": + logger.info("分发到 IndexTTS2") + return indextts2_tts(text, voice_name, voice_file, speed=voice_rate) # Fallback for unknown engine - default to azure v1 logger.warning(f"未知的 TTS 引擎: '{tts_engine}', 将默认使用 Edge TTS (Azure V1)。") @@ -1541,8 +1545,8 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"或者使用其他 tts 引擎") continue else: - # SoulVoice 引擎不生成字幕文件 - if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine): + # SoulVoice、Qwen3、IndexTTS2 引擎不生成字幕文件 + if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2": # 获取实际音频文件的时长 duration = get_audio_duration_from_file(audio_file) if duration <= 0: @@ -1943,4 +1947,127 @@ def parse_soulvoice_voice(voice_name: str) -> str: return voice_name +def parse_indextts2_voice(voice_name: str) -> str: + """ + 解析 IndexTTS2 语音名称 + 支持格式:indextts2:reference_audio_path + 返回参考音频文件路径 + """ + if voice_name.startswith("indextts2:"): + return voice_name[10:] # 移除 "indextts2:" 前缀 + return voice_name + + +def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: + """ + 使用 IndexTTS2 API 进行零样本语音克隆 + + Args: + text: 要转换的文本 + voice_name: 参考音频路径(格式:indextts2:path/to/audio.wav) + voice_file: 输出音频文件路径 + speed: 语音速度(此引擎暂不支持速度调节) + + Returns: + SubMaker: 包含时间戳信息的字幕制作器,失败时返回 None + """ + # 获取配置 + api_url = config.indextts2.get("api_url", "http://192.168.3.6:8081/tts") + infer_mode = config.indextts2.get("infer_mode", "普通推理") + temperature = config.indextts2.get("temperature", 1.0) + top_p = config.indextts2.get("top_p", 0.8) + top_k = config.indextts2.get("top_k", 30) + do_sample = config.indextts2.get("do_sample", True) + num_beams = config.indextts2.get("num_beams", 3) + repetition_penalty = config.indextts2.get("repetition_penalty", 10.0) + + # 解析参考音频路径 + reference_audio_path = parse_indextts2_voice(voice_name) + + if not reference_audio_path or not os.path.exists(reference_audio_path): + logger.error(f"IndexTTS2 参考音频文件不存在: {reference_audio_path}") + return None + + # 准备请求数据 + files = { + 'prompt_audio': open(reference_audio_path, 'rb') + } + + data = { + 'text': text.strip(), + 'infer_mode': infer_mode, + 'temperature': temperature, + 'top_p': top_p, + 'top_k': top_k, + 'do_sample': do_sample, + 'num_beams': num_beams, + 'repetition_penalty': repetition_penalty, + } + + # 重试机制 + for attempt in range(3): + try: + logger.info(f"第 {attempt + 1} 次调用 IndexTTS2 API") + + # 设置代理 + proxies = {} + if config.proxy.get("http"): + proxies = { + 'http': config.proxy.get("http"), + 'https': config.proxy.get("https", config.proxy.get("http")) + } + + # 调用 API + response = requests.post( + api_url, + files=files, + data=data, + proxies=proxies, + timeout=120 # IndexTTS2 推理可能需要较长时间 + ) + + if response.status_code == 200: + # 保存音频文件 + with open(voice_file, 'wb') as f: + f.write(response.content) + + logger.info(f"IndexTTS2 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节") + + # IndexTTS2 不支持精确字幕生成,返回简单的 SubMaker 对象 + sub_maker = SubMaker() + # 估算音频时长(基于文本长度) + estimated_duration_ms = max(1000, int(len(text) * 200)) + sub_maker.create_sub((0, estimated_duration_ms * 10000), text) + + return sub_maker + + else: + logger.error(f"IndexTTS2 API 调用失败: {response.status_code} - {response.text}") + + except requests.exceptions.Timeout: + logger.error(f"IndexTTS2 API 调用超时 (尝试 {attempt + 1}/3)") + except requests.exceptions.RequestException as e: + logger.error(f"IndexTTS2 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)") + except Exception as e: + logger.error(f"IndexTTS2 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)") + finally: + # 确保关闭文件 + try: + files['prompt_audio'].close() + except: + pass + + if attempt < 2: # 不是最后一次尝试 + time.sleep(2) # 等待2秒后重试 + # 重新打开文件用于下次重试 + if attempt < 2: + try: + files['prompt_audio'] = open(reference_audio_path, 'rb') + except: + pass + + logger.error("IndexTTS2 TTS 生成失败,已达到最大重试次数") + return None + + diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index 89291da..7d63ef9 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -1,3 +1,4 @@ +from tkinter import N from venv import logger import streamlit as st import os @@ -26,7 +27,8 @@ def get_tts_engine_options(): "edge_tts": "Edge TTS", "azure_speech": "Azure Speech Services", "tencent_tts": "腾讯云 TTS", - "qwen3_tts": "通义千问 Qwen3 TTS" + "qwen3_tts": "通义千问 Qwen3 TTS", + "indextts2": "IndexTTS2 语音克隆" } @@ -56,6 +58,12 @@ def get_tts_engine_descriptions(): "features": "阿里云通义千问语音合成,音质优秀,支持多种音色", "use_case": "需要高质量中文语音合成的用户", "registration": "https://dashscope.aliyuncs.com/" + }, + "indextts2": { + "title": "IndexTTS2 语音克隆", + "features": "零样本语音克隆,上传参考音频即可合成相同音色的语音,需要本地或私有部署", + "use_case": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", + "registration": None } } @@ -139,6 +147,8 @@ def render_tts_settings(tr): render_tencent_tts_settings(tr) elif selected_engine == "qwen3_tts": render_qwen3_tts_settings(tr) + elif selected_engine == "indextts2": + render_indextts2_tts_settings(tr) # 4. 试听功能 render_voice_preview_new(tr, selected_engine) @@ -562,6 +572,139 @@ def render_qwen3_tts_settings(tr): config.ui["qwen3_rate"] = voice_rate config.ui["voice_name"] = voice_type #兼容性 + +def render_indextts2_tts_settings(tr): + """渲染 IndexTTS2 TTS 设置""" + import os + + # API 地址配置 + api_url = st.text_input( + "API 地址", + value=config.indextts2.get("api_url", "http://127.0.0.1:8081/tts"), + help="IndexTTS2 API 服务地址" + ) + + # 参考音频文件路径 + reference_audio = st.text_input( + "参考音频路径", + value=config.indextts2.get("reference_audio", ""), + help="用于语音克隆的参考音频文件路径(WAV 格式,建议 3-10 秒)" + ) + + # 文件上传功能 + uploaded_file = st.file_uploader( + "或上传参考音频文件", + type=["wav", "mp3"], + help="上传一段清晰的音频用于语音克隆" + ) + + if uploaded_file is not None: + # 保存上传的文件 + import tempfile + temp_dir = tempfile.gettempdir() + audio_path = os.path.join(temp_dir, f"indextts2_ref_{uploaded_file.name}") + with open(audio_path, "wb") as f: + f.write(uploaded_file.getbuffer()) + reference_audio = audio_path + st.success(f"✅ 音频已上传: {audio_path}") + + # 推理模式 + infer_mode = st.selectbox( + "推理模式", + options=["普通推理", "快速推理"], + index=0 if config.indextts2.get("infer_mode", "普通推理") == "普通推理" else 1, + help="普通推理质量更高但速度较慢,快速推理速度更快但质量略低" + ) + + # 高级参数折叠面板 + with st.expander("🔧 高级参数", expanded=False): + col1, col2 = st.columns(2) + + with col1: + temperature = st.slider( + "采样温度 (Temperature)", + min_value=0.1, + max_value=2.0, + value=float(config.indextts2.get("temperature", 1.0)), + step=0.1, + help="控制随机性,值越高输出越随机,值越低越确定" + ) + + top_p = st.slider( + "Top P", + min_value=0.0, + max_value=1.0, + value=float(config.indextts2.get("top_p", 0.8)), + step=0.05, + help="nucleus 采样的概率阈值,值越小结果越确定" + ) + + top_k = st.slider( + "Top K", + min_value=0, + max_value=100, + value=int(config.indextts2.get("top_k", 30)), + step=5, + help="top-k 采样的 k 值,0 表示不使用 top-k" + ) + + with col2: + num_beams = st.slider( + "束搜索 (Num Beams)", + min_value=1, + max_value=10, + value=int(config.indextts2.get("num_beams", 3)), + step=1, + help="束搜索的 beam 数量,值越大质量可能越好但速度越慢" + ) + + repetition_penalty = st.slider( + "重复惩罚 (Repetition Penalty)", + min_value=1.0, + max_value=20.0, + value=float(config.indextts2.get("repetition_penalty", 10.0)), + step=0.5, + help="值越大越能避免重复,但过大可能导致不自然" + ) + + do_sample = st.checkbox( + "启用采样", + value=config.indextts2.get("do_sample", True), + help="启用采样可以获得更自然的语音" + ) + + # 显示使用说明 + with st.expander("💡 IndexTTS2 使用说明", expanded=False): + st.markdown(""" + **零样本语音克隆** + + 1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒) + 2. **设置 API 地址**:确保 IndexTTS2 服务正常运行 + 3. **开始合成**:系统会自动使用参考音频的音色合成新语音 + + **注意事项**: + - 参考音频质量直接影响合成效果 + - 建议使用无背景噪音的清晰音频 + - 文本长度建议控制在合理范围内 + - 首次合成可能需要较长时间 + """) + + # 保存配置 + config.indextts2["api_url"] = api_url + config.indextts2["reference_audio"] = reference_audio + config.indextts2["infer_mode"] = infer_mode + config.indextts2["temperature"] = temperature + config.indextts2["top_p"] = top_p + config.indextts2["top_k"] = top_k + config.indextts2["num_beams"] = num_beams + config.indextts2["repetition_penalty"] = repetition_penalty + config.indextts2["do_sample"] = do_sample + + # 保存 voice_name 用于兼容性 + if reference_audio: + config.ui["voice_name"] = f"indextts2:{reference_audio}" + + def render_voice_preview_new(tr, selected_engine): """渲染新的语音试听功能""" if st.button("🎵 试听语音合成", use_container_width=True): @@ -599,6 +742,12 @@ def render_voice_preview_new(tr, selected_engine): voice_name = f"qwen3:{vt}" voice_rate = config.ui.get("qwen3_rate", 1.0) voice_pitch = 1.0 # Qwen3 TTS 不支持音调调节 + elif selected_engine == "indextts2": + reference_audio = config.indextts2.get("reference_audio", "") + if reference_audio: + voice_name = f"indextts2:{reference_audio}" + voice_rate = 1.0 # IndexTTS2 不支持速度调节 + voice_pitch = 1.0 # IndexTTS2 不支持音调调节 if not voice_name: st.error("请先配置语音设置")