From 283617deb001f099e9861c4755a74844ca726a38 Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 14:57:00 +0800 Subject: [PATCH 01/24] =?UTF-8?q?feat(jianying,=20webui):=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9EIndexTTS2=E6=94=AF=E6=8C=81=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?TTS=E5=A4=84=E7=90=86=E5=B9=B6=E6=B7=BB=E5=8A=A0=E5=8D=95?= =?UTF-8?q?=E5=85=83=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 重构WebUI的TTS语音名称获取逻辑,适配多种TTS引擎 - 为IndexTTS2添加参考音频校验与归一化处理 - 新增剪映任务工具函数的完整单元测试用例 - 修复音频时长取整逻辑以提升匹配精度 - 更新默认TTS引擎为配置值而非硬编码内容 --- app/services/jianying_task.py | 33 ++++++++++++++++ app/services/test_jianying_task_unittest.py | 43 +++++++++++++++++++++ webui.py | 27 ++++++++++--- 3 files changed, 97 insertions(+), 6 deletions(-) create mode 100644 app/services/test_jianying_task_unittest.py diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index 25b4a74..f06d2f0 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -43,6 +43,37 @@ def get_audio_duration_ffprobe(audio_file: str) -> float: raise +def _strip_indextts2_prefix(voice_name: str) -> str: + prefix = "indextts2:" + if voice_name.startswith(prefix): + return voice_name[len(prefix):] + return voice_name + + +def _floor_duration_to_milliseconds(duration: float) -> float: + return int(duration * 1000) / 1000.0 + + +def _normalize_indextts2_reference_audio(params: VideoClipParams) -> None: + """Ensure IndexTTS2 uses the configured reference audio instead of a stale UI voice.""" + if params.tts_engine != "indextts2": + return + + candidate = _strip_indextts2_prefix(getattr(params, "voice_name", "") or "") + if candidate and os.path.isfile(candidate): + params.voice_name = f"indextts2:{candidate}" + logger.info(f"IndexTTS2 使用参考音频: {candidate}") + return + + configured_ref = _strip_indextts2_prefix(config.indextts2.get("reference_audio", "") or "") + if configured_ref and os.path.isfile(configured_ref): + 
params.voice_name = f"indextts2:{configured_ref}" + logger.info(f"IndexTTS2 使用配置中的参考音频: {configured_ref}") + return + + raise ValueError("IndexTTS2 参考音频不存在,请在音频设置中上传或填写有效的参考音频路径") + + def start_export_jianying_draft(task_id: str, params: VideoClipParams): """ 导出到剪映草稿的后台任务 @@ -83,6 +114,7 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): 2. 使用 TTS 生成音频素材 """ logger.info("\n\n## 2. 根据OST设置生成音频列表") + _normalize_indextts2_reference_audio(params) tts_segments = [ segment for segment in list_script if segment['OST'] in [0, 2] @@ -199,6 +231,7 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): if os.path.exists(audio_file): # 使用ffprobe获取精确的音频时长,避免因TTS引擎差异导致时长不匹配 actual_audio_duration = get_audio_duration_ffprobe(audio_file) + actual_audio_duration = _floor_duration_to_milliseconds(actual_audio_duration) logger.info(f"音频文件实际时长: {actual_audio_duration:.6f}秒, 脚本时长(视频): {duration:.3f}秒") # 使用音频实际时长和视频时长中的较小值,确保不超过素材时长 diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py new file mode 100644 index 0000000..24a87fe --- /dev/null +++ b/app/services/test_jianying_task_unittest.py @@ -0,0 +1,43 @@ +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch + +from app.models.schema import VideoClipParams +from app.services import jianying_task + + +class JianyingTaskTests(unittest.TestCase): + def test_normalize_indextts2_uses_valid_param_reference(self): + with tempfile.NamedTemporaryFile(suffix=".wav") as ref: + params = VideoClipParams(tts_engine="indextts2", voice_name=ref.name) + + jianying_task._normalize_indextts2_reference_audio(params) + + self.assertEqual(f"indextts2:{ref.name}", params.voice_name) + + def test_normalize_indextts2_uses_config_reference_when_param_is_stale(self): + with tempfile.TemporaryDirectory() as temp_dir: + ref_path = Path(temp_dir) / "reference.wav" + ref_path.write_bytes(b"fake wav") + params = 
VideoClipParams(tts_engine="indextts2", voice_name="zh-CN-YunjianNeural") + + with patch.dict(jianying_task.config.indextts2, {"reference_audio": str(ref_path)}, clear=False): + jianying_task._normalize_indextts2_reference_audio(params) + + self.assertEqual(f"indextts2:{ref_path}", params.voice_name) + + def test_normalize_indextts2_requires_existing_reference_audio(self): + params = VideoClipParams(tts_engine="indextts2", voice_name="zh-CN-YunjianNeural") + + with patch.dict(jianying_task.config.indextts2, {"reference_audio": ""}, clear=False): + with self.assertRaisesRegex(ValueError, "IndexTTS2 参考音频不存在"): + jianying_task._normalize_indextts2_reference_audio(params) + + def test_floor_duration_to_milliseconds(self): + self.assertAlmostEqual(6.997, jianying_task._floor_duration_to_milliseconds(6.997333)) + self.assertAlmostEqual(7.0, jianying_task._floor_duration_to_milliseconds(7.000999)) + + +if __name__ == "__main__": + unittest.main() diff --git a/webui.py b/webui.py index d2ab42b..3b9c12b 100644 --- a/webui.py +++ b/webui.py @@ -224,17 +224,32 @@ def render_generate_button(): def get_voice_name_for_tts_engine(tts_engine: str) -> str: """根据TTS引擎获取用户选择的音色""" + if tts_engine == 'edge_tts': + return config.ui.get('edge_voice_name', 'zh-CN-XiaoxiaoNeural-Female') + if tts_engine == 'azure_speech': + return config.ui.get('azure_voice_name', 'zh-CN-XiaoxiaoMultilingualNeural') + if tts_engine == 'tencent_tts': + return f"tencent:{config.ui.get('tencent_voice_type', '101001')}" + if tts_engine == 'qwen3_tts': + return f"qwen3:{config.ui.get('qwen_voice_type', 'Cherry')}" + if tts_engine == 'indextts2': + reference_audio = config.indextts2.get('reference_audio', '') + if reference_audio: + return f"indextts2:{reference_audio}" + return config.ui.get('voice_name', '') if tts_engine == 'doubaotts': - return st.session_state.get('voice_name', config.ui.get('doubaotts_voice_type', 'BV700_streaming')) - elif tts_engine == 'azure_speech': - return 
st.session_state.get('voice_name', config.ui.get('azure_voice_name', 'zh-CN-XiaoxiaoMultilingualNeural')) - else: - return st.session_state.get('voice_name', config.ui.get('edge_voice_name', 'zh-CN-XiaoxiaoNeural-Female')) + return config.ui.get('doubaotts_voice_type', 'BV700_streaming') + if tts_engine == 'soulvoice': + voice_uri = config.soulvoice.get('voice_uri', '') + if voice_uri and not voice_uri.startswith(('soulvoice:', 'speech:')): + return f"soulvoice:{voice_uri}" + return voice_uri + return config.ui.get('voice_name', config.ui.get('edge_voice_name', 'zh-CN-XiaoxiaoNeural-Female')) def get_jianying_export_params() -> VideoClipParams: """获取导出到剪映草稿的参数""" - tts_engine = st.session_state.get('tts_engine', 'azure') + tts_engine = st.session_state.get('tts_engine', config.ui.get('tts_engine', 'edge_tts')) voice_name = get_voice_name_for_tts_engine(tts_engine) voice_rate = st.session_state.get('voice_rate', 1.0) voice_pitch = st.session_state.get('voice_pitch', 1.0) From a1b434fbdae5813eef855638cb5412924f42c655 Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 15:29:15 +0800 Subject: [PATCH 02/24] =?UTF-8?q?refactor:=20=E4=BC=98=E5=8C=96=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E8=AE=BE=E7=BD=AE=E4=B8=8E=E9=85=8D=E7=BD=AE=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 重构 script_settings.py 中的脚本模式状态处理逻辑,完善会话状态 fallback 与默认值处理,新增 required 参数修复空选择问题。新增 update_app_config_if_changed 工具函数,仅在配置值实际变更时更新配置,替换基础设置中多处直接修改配置的代码,统一逻辑并减少不必要的状态更新。 --- webui/components/basic_settings.py | 45 +++++++++++++++++++++-------- webui/components/script_settings.py | 23 ++++++++++----- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index f219a33..95db275 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -140,6 +140,15 @@ def show_config_validation_errors(errors: 
list): st.error(error) +def update_app_config_if_changed(key: str, value) -> bool: + """Update app config only when the value really changed.""" + if config.app.get(key) == value: + return False + + config.app[key] = value + return True + + def render_basic_settings(tr): """渲染基础设置面板""" with st.expander(tr("Basic Settings"), expanded=False): @@ -546,9 +555,11 @@ def render_vision_llm_settings(tr): # 这里的验证逻辑可能需要微调,因为我们现在是自动组合的 is_valid, error_msg = validate_openai_compatible_model_name(st_vision_model_name, "视频分析") if is_valid: - config.app["vision_openai_model_name"] = st_vision_model_name + config_changed |= update_app_config_if_changed( + "vision_openai_model_name", + st_vision_model_name + ) st.session_state["vision_openai_model_name"] = st_vision_model_name - config_changed = True else: validation_errors.append(error_msg) @@ -556,9 +567,11 @@ def render_vision_llm_settings(tr): if st_vision_api_key: is_valid, error_msg = validate_api_key(st_vision_api_key, "视频分析") if is_valid: - config.app["vision_openai_api_key"] = st_vision_api_key + config_changed |= update_app_config_if_changed( + "vision_openai_api_key", + st_vision_api_key + ) st.session_state["vision_openai_api_key"] = st_vision_api_key - config_changed = True else: validation_errors.append(error_msg) @@ -566,9 +579,11 @@ def render_vision_llm_settings(tr): if st_vision_base_url: is_valid, error_msg = validate_base_url(st_vision_base_url, "视频分析") if is_valid: - config.app["vision_openai_base_url"] = st_vision_base_url + config_changed |= update_app_config_if_changed( + "vision_openai_base_url", + st_vision_base_url + ) st.session_state["vision_openai_base_url"] = st_vision_base_url - config_changed = True else: validation_errors.append(error_msg) @@ -804,9 +819,11 @@ def render_text_llm_settings(tr): if st_text_model_name: is_valid, error_msg = validate_openai_compatible_model_name(st_text_model_name, "文案生成") if is_valid: - config.app["text_openai_model_name"] = st_text_model_name + text_config_changed |= 
update_app_config_if_changed( + "text_openai_model_name", + st_text_model_name + ) st.session_state["text_openai_model_name"] = st_text_model_name - text_config_changed = True else: text_validation_errors.append(error_msg) @@ -814,9 +831,11 @@ def render_text_llm_settings(tr): if st_text_api_key: is_valid, error_msg = validate_api_key(st_text_api_key, "文案生成") if is_valid: - config.app["text_openai_api_key"] = st_text_api_key + text_config_changed |= update_app_config_if_changed( + "text_openai_api_key", + st_text_api_key + ) st.session_state["text_openai_api_key"] = st_text_api_key - text_config_changed = True else: text_validation_errors.append(error_msg) @@ -824,9 +843,11 @@ def render_text_llm_settings(tr): if st_text_base_url: is_valid, error_msg = validate_base_url(st_text_base_url, "文案生成") if is_valid: - config.app["text_openai_base_url"] = st_text_base_url + text_config_changed |= update_app_config_if_changed( + "text_openai_base_url", + st_text_base_url + ) st.session_state["text_openai_base_url"] = st_text_base_url - text_config_changed = True else: text_validation_errors.append(error_msg) diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 7c7a3f2..4ac6f2b 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -56,11 +56,6 @@ def render_script_file(tr, params): MODE_SHORT = "short" MODE_SUMMARY = "summary" - # 处理保存脚本后的模式切换(必须在 widget 实例化之前) - if st.session_state.get('_switch_to_file_mode'): - st.session_state['script_mode_selection'] = tr("Select/Upload Script") - del st.session_state['_switch_to_file_mode'] - # 模式选项映射 mode_options = { tr("Select/Upload Script"): MODE_FILE, @@ -88,6 +83,18 @@ def render_script_file(tr, params): # 1. 
渲染功能选择下拉框 # 使用 segmented_control 替代 selectbox,提供更好的视觉体验 default_mode_label = mode_keys[default_index] + default_mode = mode_options[default_mode_label] + + if st.session_state.get('_switch_to_file_mode'): + st.session_state['script_mode_selection'] = tr("Select/Upload Script") + del st.session_state['_switch_to_file_mode'] + elif ( + 'script_mode_selection' not in st.session_state + or st.session_state['script_mode_selection'] not in mode_options + ): + st.session_state['script_mode_selection'] = default_mode_label + elif mode_options[st.session_state['script_mode_selection']] != default_mode: + st.session_state['script_mode_selection'] = default_mode_label # 定义回调函数来处理状态更新 def update_script_mode(): @@ -107,12 +114,12 @@ def render_script_file(tr, params): selected_mode_label = st.segmented_control( tr("Video Type"), options=mode_keys, - default=default_mode_label, key="script_mode_selection", - on_change=update_script_mode + on_change=update_script_mode, + required=True ) - # 处理未选择的情况(虽然有default,但在某些交互下可能为空) + # 处理旧状态为空的兜底情况 if not selected_mode_label: selected_mode_label = default_mode_label From 35477a933d251331d37a8881bf51d7420a208b64 Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 15:59:54 +0800 Subject: [PATCH 03/24] =?UTF-8?q?feat(edge-tts,=20webui):=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E5=8A=A8=E6=80=81=E8=8E=B7=E5=8F=96=E9=9F=B3=E8=89=B2?= =?UTF-8?q?=E5=88=97=E8=A1=A8=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E5=86=85=E7=BD=AE=E5=88=97=E8=A1=A8=E5=B9=B6=E7=AE=80=E5=8C=96?= =?UTF-8?q?UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增`get_all_edge_voices()`工具函数链,支持缓存与代理,拉取失败时回退到内置音色列表 - 更新内置音色文档与列表,修正部分命名并新增多语言神经音色及因纽特语相关音色 - 简化WebUI的Edge TTS设置界面,不再硬编码筛选音色,直接展示所有已加载的音色 --- app/services/voice.py | 79 ++++++++++++++++++++++++++++-- docs/voice-list.txt | 32 ++++++++++-- webui/components/audio_settings.py | 28 +++-------- 3 files changed, 109 insertions(+), 30 deletions(-) diff --git 
a/app/services/voice.py b/app/services/voice.py index bda672b..e6c94f7 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -6,6 +6,7 @@ import edge_tts import asyncio import requests import uuid +from functools import lru_cache from loguru import logger from typing import List, Union, Tuple from datetime import datetime @@ -282,7 +283,7 @@ Gender: Male Name: en-AU-NatashaNeural Gender: Female -Name: en-AU-WilliamNeural +Name: en-AU-WilliamMultilingualNeural Gender: Male Name: en-CA-ClaraNeural @@ -369,21 +370,33 @@ Gender: Female Name: en-US-AndrewNeural Gender: Male +Name: en-US-AndrewMultilingualNeural +Gender: Male + Name: en-US-AriaNeural Gender: Female Name: en-US-AvaNeural Gender: Female +Name: en-US-AvaMultilingualNeural +Gender: Female + Name: en-US-BrianNeural Gender: Male +Name: en-US-BrianMultilingualNeural +Gender: Male + Name: en-US-ChristopherNeural Gender: Male Name: en-US-EmmaNeural Gender: Female +Name: en-US-EmmaMultilingualNeural +Gender: Female + Name: en-US-EricNeural Gender: Male @@ -666,12 +679,24 @@ Gender: Male Name: it-IT-ElsaNeural Gender: Female -Name: it-IT-GiuseppeNeural +Name: it-IT-GiuseppeMultilingualNeural Gender: Male Name: it-IT-IsabellaNeural Gender: Female +Name: iu-Cans-CA-SiqiniqNeural +Gender: Female + +Name: iu-Cans-CA-TaqqiqNeural +Gender: Male + +Name: iu-Latn-CA-SiqiniqNeural +Gender: Female + +Name: iu-Latn-CA-TaqqiqNeural +Gender: Male + Name: ja-JP-KeitaNeural Gender: Male @@ -708,7 +733,7 @@ Gender: Male Name: kn-IN-SapnaNeural Gender: Female -Name: ko-KR-HyunsuNeural +Name: ko-KR-HyunsuMultilingualNeural Gender: Male Name: ko-KR-InJoonNeural @@ -822,7 +847,7 @@ Gender: Male Name: pt-BR-FranciscaNeural Gender: Female -Name: pt-BR-ThalitaNeural +Name: pt-BR-ThalitaMultilingualNeural Gender: Female Name: pt-PT-DuarteNeural @@ -1306,6 +1331,52 @@ def get_edge_tts_proxy() -> str | None: return proxy_url or None +def _run_async_safely(coro_func, *args, **kwargs): + """在同步代码里安全运行异步 edge_tts 调用。""" + def 
run_in_new_loop(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(coro_func(*args, **kwargs)) + finally: + loop.close() + asyncio.set_event_loop(None) + + try: + asyncio.get_running_loop() + except RuntimeError: + return run_in_new_loop() + + import concurrent.futures + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + return executor.submit(run_in_new_loop).result() + + +@lru_cache(maxsize=8) +def _get_all_edge_voices_cached(proxy: str | None) -> list[str]: + async def _list_voices(): + return await edge_tts.list_voices(proxy=proxy) + + voices = [] + for item in _run_async_safely(_list_voices): + name = item.get("ShortName", "").strip() + gender = item.get("Gender", "").strip() + if name and gender: + voices.append(f"{name}-{gender}") + + voices.sort() + return voices + + +def get_all_edge_voices() -> list[str]: + """获取 Edge TTS 当前支持的全部语言和音色,失败时回退到内置列表。""" + try: + return _get_all_edge_voices_cached(get_edge_tts_proxy()) + except Exception as e: + logger.warning(f"获取 Edge TTS 在线音色列表失败,使用内置音色列表: {e}") + return [v for v in get_all_azure_voices(filter_locals=[]) if "-V2" not in v] + + def azure_tts_v1( text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str ) -> Union[SubMaker, None]: diff --git a/docs/voice-list.txt b/docs/voice-list.txt index 4672117..b22486d 100644 --- a/docs/voice-list.txt +++ b/docs/voice-list.txt @@ -199,7 +199,7 @@ Gender: Male Name: en-AU-NatashaNeural Gender: Female -Name: en-AU-WilliamNeural +Name: en-AU-WilliamMultilingualNeural Gender: Male Name: en-CA-ClaraNeural @@ -286,21 +286,33 @@ Gender: Female Name: en-US-AndrewNeural Gender: Male +Name: en-US-AndrewMultilingualNeural +Gender: Male + Name: en-US-AriaNeural Gender: Female Name: en-US-AvaNeural Gender: Female +Name: en-US-AvaMultilingualNeural +Gender: Female + Name: en-US-BrianNeural Gender: Male +Name: en-US-BrianMultilingualNeural +Gender: Male + Name: 
en-US-ChristopherNeural Gender: Male Name: en-US-EmmaNeural Gender: Female +Name: en-US-EmmaMultilingualNeural +Gender: Female + Name: en-US-EricNeural Gender: Male @@ -583,12 +595,24 @@ Gender: Male Name: it-IT-ElsaNeural Gender: Female -Name: it-IT-GiuseppeNeural +Name: it-IT-GiuseppeMultilingualNeural Gender: Male Name: it-IT-IsabellaNeural Gender: Female +Name: iu-Cans-CA-SiqiniqNeural +Gender: Female + +Name: iu-Cans-CA-TaqqiqNeural +Gender: Male + +Name: iu-Latn-CA-SiqiniqNeural +Gender: Female + +Name: iu-Latn-CA-TaqqiqNeural +Gender: Male + Name: ja-JP-KeitaNeural Gender: Male @@ -625,7 +649,7 @@ Gender: Male Name: kn-IN-SapnaNeural Gender: Female -Name: ko-KR-HyunsuNeural +Name: ko-KR-HyunsuMultilingualNeural Gender: Male Name: ko-KR-InJoonNeural @@ -739,7 +763,7 @@ Gender: Male Name: pt-BR-FranciscaNeural Gender: Female -Name: pt-BR-ThalitaNeural +Name: pt-BR-ThalitaMultilingualNeural Gender: Female Name: pt-PT-DuarteNeural diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index ed86698..ce4aa8e 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -163,12 +163,8 @@ def render_tts_settings(tr): def render_edge_tts_settings(tr): """渲染 Edge TTS 引擎设置""" - # 获取支持的语音列表 - support_locales = ["zh-CN", "en-US"] - all_voices = voice.get_all_azure_voices(filter_locals=support_locales) - - # 只保留标准版本的语音(Edge TTS专用,不包含V2) - edge_voices = [v for v in all_voices if "-V2" not in v] + # 获取 Edge TTS 支持的全部语言和音色 + edge_voices = voice.get_all_edge_voices() # 创建友好的显示名称 friendly_names = {} @@ -189,7 +185,7 @@ def render_edge_tts_settings(tr): # 如果没找到匹配的,使用第一个 saved_voice_name = edge_voices[0] if edge_voices else "" - # 音色选择下拉框(Edge TTS音色相对较少,保留下拉框) + # 音色选择下拉框 selected_friendly_name = st.selectbox( "音色选择", options=list(friendly_names.values()), @@ -204,24 +200,12 @@ def render_edge_tts_settings(tr): # 显示音色信息 with st.expander("💡 Edge TTS 音色说明", expanded=False): - st.write("**中文音色:**") - zh_voices = [v for v 
in edge_voices if v.startswith("zh-CN")] - for v in zh_voices: + st.write(f"已加载 {len(edge_voices)} 个音色") + for v in edge_voices: gender = "女声" if "Female" in v else "男声" - name = v.replace("-Female", "").replace("-Male", "").replace("zh-CN-", "").replace("Neural", "") + name = v.replace("-Female", "").replace("-Male", "").replace("Neural", "") st.write(f"• {name} ({gender})") - st.write("") - st.write("**英文音色:**") - en_voices = [v for v in edge_voices if v.startswith("en-US")][:5] # 只显示前5个 - for v in en_voices: - gender = "女声" if "Female" in v else "男声" - name = v.replace("-Female", "").replace("-Male", "").replace("en-US-", "").replace("Neural", "") - st.write(f"• {name} ({gender})") - - if len([v for v in edge_voices if v.startswith("en-US")]) > 5: - st.write("• ... 更多英文音色") - config.ui["edge_voice_name"] = voice_name config.ui["voice_name"] = voice_name # 兼容性 From 89eebb8b41d1d3f0e42310fa9e9b1c453c2f7f71 Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 16:36:03 +0800 Subject: [PATCH 04/24] =?UTF-8?q?feat(webui):=20=E5=AE=8C=E5=96=84?= =?UTF-8?q?=E5=9B=BD=E9=99=85=E5=8C=96=E5=A4=9A=E8=AF=AD=E8=A8=80=E9=80=82?= =?UTF-8?q?=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将webui所有页面的硬编码中文提示文本替换为多语言翻译调用,为相关函数添加兼容的tr参数,同时补充zh.json和en.json中的对应翻译词条。 --- webui.py | 42 ++-- webui/components/audio_settings.py | 332 +++++++++++++------------- webui/components/basic_settings.py | 106 ++++---- webui/components/script_settings.py | 50 ++-- webui/components/subtitle_settings.py | 8 +- webui/i18n/en.json | 312 +++++++++++++++++++++++- webui/i18n/zh.json | 231 +++++++++++++++++- webui/tools/generate_script_docu.py | 18 +- webui/tools/generate_script_short.py | 24 +- webui/tools/generate_short_summary.py | 34 +-- 10 files changed, 847 insertions(+), 310 deletions(-) diff --git a/webui.py b/webui.py index 3b9c12b..7c70ad7 100644 --- a/webui.py +++ b/webui.py @@ -143,10 +143,10 @@ def render_generate_button(): # 
移除task_id检查 - 现在使用统一裁剪策略,不再需要预裁剪 # 直接检查必要的文件是否存在 if not st.session_state.get('video_clip_json_path'): - st.error(tr("脚本文件不能为空")) + st.error(tr("Script file cannot be empty")) return if not st.session_state.get('video_origin_path'): - st.error(tr("视频文件不能为空")) + st.error(tr("Video file cannot be empty")) return # 获取所有参数 @@ -199,7 +199,7 @@ def render_generate_button(): status_text.text(f"Processing... {progress}%") if state == const.TASK_STATE_COMPLETE: - status_text.text(tr("视频生成完成")) + status_text.text(tr("Video Generation Completed")) progress_bar.progress(1.0) # 显示结果 @@ -212,11 +212,11 @@ def render_generate_button(): except Exception as e: logger.error(f"播放视频失败: {e}") - st.success(tr("视频生成完成")) + st.success(tr("Video Generation Completed")) break elif state == const.TASK_STATE_FAILED: - st.error(f"任务失败: {task.get('message', 'Unknown error')}") + st.error(f"{tr('Task failed')}: {task.get('message', 'Unknown error')}") break time.sleep(0.5) @@ -291,23 +291,23 @@ def render_export_jianying_button(): if 'jianying_export_error' not in st.session_state: st.session_state['jianying_export_error'] = None - if st.button("📤 导出到剪映草稿", use_container_width=True, type="secondary"): + if st.button(tr("Export to Jianying Draft"), use_container_width=True, type="secondary"): config.save_config() if not st.session_state.get('video_clip_json_path'): - st.error("脚本文件不能为空") + st.error(tr("Script file cannot be empty")) return if not st.session_state.get('video_origin_path'): - st.error("视频文件不能为空") + st.error(tr("Video file cannot be empty")) return jianying_draft_path = config.ui.get("jianying_draft_path", "") if not jianying_draft_path: - st.error("请在基础设置中配置剪映草稿地址") + st.error(tr("Please configure Jianying draft folder in basic settings")) return if not os.path.exists(jianying_draft_path): - st.error(f"剪映草稿文件夹不存在: {jianying_draft_path}") + st.error(tr("Jianying draft folder does not exist").format(path=jianying_draft_path)) return # 显示导出表单 @@ -318,17 +318,17 @@ def 
render_export_jianying_button(): # 显示导出表单 if st.session_state['show_jianying_export_form']: st.markdown("---") - st.subheader("导出到剪映草稿") + st.subheader(tr("Export to Jianying Draft")) draft_name = st.text_input( - "请输入剪映草稿名称", + tr("Please enter Jianying draft name"), value=f"NarratoAI_{int(time.time())}", key="draft_name_input" ) - if st.button("确认导出", key="confirm_export"): + if st.button(tr("Confirm Export"), key="confirm_export"): if not draft_name: - st.error("请输入草稿名称") + st.error(tr("Please enter draft name")) return # 创建任务ID @@ -340,10 +340,10 @@ def render_export_jianying_button(): params = get_jianying_export_params() except Exception as e: logger.error(f"构建参数失败: {e}") - st.error(f"参数构建失败: {e}") + st.error(f"{tr('Failed to build parameters')}: {e}") return - with st.spinner("正在导出到剪映草稿,请稍候..."): + with st.spinner(tr("Exporting to Jianying draft...")): try: from app.services import jianying_task @@ -359,17 +359,17 @@ def render_export_jianying_button(): st.session_state['jianying_export_error'] = None st.session_state['show_jianying_export_form'] = False - st.success(f"✅ 成功导出到剪映草稿: {result['draft_name']}") - st.info(f"📁 草稿已保存到: {result['draft_path']}") + st.success(tr("Jianying draft exported successfully").format(name=result['draft_name'])) + st.info(tr("Draft saved to").format(path=result['draft_path'])) except Exception as e: logger.error(f"导出到剪映草稿失败: {e}") import traceback logger.error(f"错误详情: {traceback.format_exc()}") st.session_state['jianying_export_error'] = str(e) st.session_state['jianying_export_result'] = None - st.error(f"❌ 导出到剪映草稿失败: {e}") + st.error(f"{tr('Failed to export Jianying draft')}: {e}") - if st.button("取消", key="cancel_export"): + if st.button(tr("Cancel"), key="cancel_export"): st.session_state['show_jianying_export_form'] = False st.session_state['jianying_export_result'] = None st.session_state['jianying_export_error'] = None @@ -394,7 +394,7 @@ def main(): logger.error(f"❌ LLM 提供商注册失败: {str(e)}") import traceback 
logger.error(traceback.format_exc()) - st.error(f"⚠️ LLM 初始化失败: {str(e)}\n\n请检查配置文件和依赖是否正确安装。") + st.error(tr("LLM initialization failed").format(error=str(e))) # 不抛出异常,允许应用继续运行(但 LLM 功能不可用) # 检测FFmpeg硬件加速,但只打印一次日志(使用 session_state 持久化) diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index ce4aa8e..862457d 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -19,55 +19,55 @@ def get_soulvoice_voices(): return ["soulvoice:custom"] -def get_tts_engine_options(): +def get_tts_engine_options(tr=lambda key: key): """获取TTS引擎选项""" return { "edge_tts": "Edge TTS", "azure_speech": "Azure Speech Services", - "tencent_tts": "腾讯云 TTS", - "qwen3_tts": "通义千问 Qwen3 TTS", - "indextts2": "IndexTTS2 语音克隆", - "doubaotts": "豆包语音 TTS" + "tencent_tts": tr("Tencent Cloud TTS"), + "qwen3_tts": tr("Tongyi Qwen3 TTS"), + "indextts2": tr("IndexTTS2 Voice Clone"), + "doubaotts": tr("Doubao TTS") } -def get_tts_engine_descriptions(): +def get_tts_engine_descriptions(tr=lambda key: key): """获取TTS引擎详细描述""" return { "edge_tts": { "title": "Edge TTS", - "features": "完全免费,但服务稳定性一般,不支持语音克隆功能", - "use_case": "测试和轻量级使用", + "features": tr("Edge TTS features"), + "use_case": tr("Edge TTS use case"), "registration": None }, "azure_speech": { "title": "Azure Speech Services", - "features": "提供一定免费额度,超出后按量付费,需要绑定海外信用卡", - "use_case": "企业级应用,需要稳定服务", + "features": tr("Azure Speech Services features"), + "use_case": tr("Azure Speech Services use case"), "registration": "https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices" }, "tencent_tts": { - "title": "腾讯云 TTS", - "features": "提供免费额度,音质优秀,支持多种音色,国内访问速度快", - "use_case": "个人和企业用户,需要稳定的中文语音合成", + "title": tr("Tencent Cloud TTS"), + "features": tr("Tencent Cloud TTS features"), + "use_case": tr("Tencent Cloud TTS use case"), "registration": "https://console.cloud.tencent.com/tts" }, "qwen3_tts": { - "title": "通义千问 Qwen3 TTS", - "features": 
"阿里云通义千问语音合成,音质优秀,支持多种音色", - "use_case": "需要高质量中文语音合成的用户", + "title": tr("Tongyi Qwen3 TTS"), + "features": tr("Tongyi Qwen3 TTS features"), + "use_case": tr("High-quality Chinese speech synthesis use case"), "registration": "https://dashscope.aliyuncs.com/" }, "indextts2": { - "title": "IndexTTS2 语音克隆", - "features": "零样本语音克隆,上传参考音频即可合成相同音色的语音,需要本地或私有部署", - "use_case": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", + "title": tr("IndexTTS2 Voice Clone"), + "features": tr("IndexTTS2 features"), + "use_case": tr("IndexTTS2 download link"), "registration": None }, "doubaotts": { - "title": "豆包语音 TTS", - "features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", - "use_case": "需要高质量中文语音合成的用户", + "title": tr("Doubao TTS"), + "features": tr("Doubao TTS features"), + "use_case": tr("High-quality Chinese speech synthesis use case"), "registration": "https://www.volcengine.com/product/voice-tech" } } @@ -105,8 +105,8 @@ def render_tts_settings(tr): # 1. TTS引擎选择器 # st.subheader("🎤 TTS引擎选择") - engine_options = get_tts_engine_options() - engine_descriptions = get_tts_engine_descriptions() + engine_options = get_tts_engine_options(tr) + engine_descriptions = get_tts_engine_descriptions(tr) # 获取保存的TTS引擎设置 saved_tts_engine = config.ui.get("tts_engine", "edge_tts") @@ -117,11 +117,11 @@ def render_tts_settings(tr): # TTS引擎选择下拉框 selected_engine = st.selectbox( - "选择TTS引擎", + tr("Select TTS Engine"), options=list(engine_options.keys()), format_func=lambda x: engine_options[x], index=list(engine_options.keys()).index(saved_tts_engine), - help="选择您要使用的文本转语音引擎" + help=tr("Select TTS Engine Help") ) # 保存TTS引擎选择 @@ -132,12 +132,12 @@ def render_tts_settings(tr): if selected_engine in engine_descriptions: desc = engine_descriptions[selected_engine] - with st.expander(f"📋 {desc['title']} 详细说明", expanded=True): - st.markdown(f"**特点:** {desc['features']}") - st.markdown(f"**适用场景:** {desc['use_case']}") + with st.expander(tr("TTS Engine Details").format(engine=desc['title']), expanded=True): + 
st.markdown(f"**{tr('Features')}:** {desc['features']}") + st.markdown(f"**{tr('Use Case')}:** {desc['use_case']}") if desc['registration']: - st.markdown(f"**注册地址:** [{desc['registration']}]({desc['registration']})") + st.markdown(f"**{tr('Registration URL')}:** [{desc['registration']}]({desc['registration']})") # 3. 根据选择的引擎渲染对应的配置界面 # st.subheader("⚙️ 引擎配置") @@ -187,10 +187,10 @@ def render_edge_tts_settings(tr): # 音色选择下拉框 selected_friendly_name = st.selectbox( - "音色选择", + tr("Voice Selection"), options=list(friendly_names.values()), index=list(friendly_names.keys()).index(saved_voice_name) if saved_voice_name in friendly_names else 0, - help="选择Edge TTS音色" + help=tr("Select Edge TTS Voice") ) # 获取实际的语音名称 @@ -199,10 +199,10 @@ def render_edge_tts_settings(tr): ] # 显示音色信息 - with st.expander("💡 Edge TTS 音色说明", expanded=False): - st.write(f"已加载 {len(edge_voices)} 个音色") + with st.expander(tr("Edge TTS Voice Description"), expanded=False): + st.write(tr("Loaded voice count").format(count=len(edge_voices))) for v in edge_voices: - gender = "女声" if "Female" in v else "男声" + gender = tr("Female Voice") if "Female" in v else tr("Male Voice") name = v.replace("-Female", "").replace("-Male", "").replace("Neural", "") st.write(f"• {name} ({gender})") @@ -211,36 +211,36 @@ def render_edge_tts_settings(tr): # 音量调节 voice_volume = st.slider( - "音量调节", + tr("Voice Volume"), min_value=0, max_value=100, value=int(config.ui.get("edge_volume", 80)), step=1, - help="调节语音音量 (0-100)" + help=tr("Voice Volume Help Percent") ) config.ui["edge_volume"] = voice_volume st.session_state['voice_volume'] = voice_volume / 100.0 # 语速调节 voice_rate = st.slider( - "语速调节", + tr("Voice Rate"), min_value=0.5, max_value=2.0, value=config.ui.get("edge_rate", 1.0), step=0.1, - help="调节语音速度 (0.5-2.0倍速)" + help=tr("Voice Rate Help 0.5-2.0") ) config.ui["edge_rate"] = voice_rate st.session_state['voice_rate'] = voice_rate # 语调调节 voice_pitch = st.slider( - "语调调节", + tr("Voice Pitch"), min_value=-50, 
max_value=50, value=int(config.ui.get("edge_pitch", 0)), step=5, - help="调节语音音调 (-50%到+50%)" + help=tr("Voice Pitch Help Percent") ) config.ui["edge_pitch"] = voice_pitch # 转换为比例值 @@ -251,10 +251,10 @@ def render_azure_speech_settings(tr): """渲染 Azure Speech Services 引擎设置""" # 服务区域配置 azure_speech_region = st.text_input( - "服务区域", + tr("Service Region"), value=config.azure.get("speech_region", ""), - placeholder="例如:eastus", - help="Azure Speech Services 服务区域,如:eastus, westus2, eastasia 等" + placeholder=tr("Service Region Placeholder"), + help=tr("Azure Service Region Help") ) # API Key配置 @@ -262,7 +262,7 @@ def render_azure_speech_settings(tr): "API Key", value=config.azure.get("speech_key", ""), type="password", - help="Azure Speech Services API 密钥" + help=tr("Azure Speech Key Help") ) # 保存Azure配置 @@ -274,41 +274,41 @@ def render_azure_speech_settings(tr): # 音色名称输入 voice_name = st.text_input( - "音色名称", + tr("Voice Name"), value=saved_voice_name, - help="输入Azure Speech Services音色名称,直接使用官方音色名称即可。例如:zh-CN-YunzeNeural", + help=tr("Azure Voice Name Help"), placeholder="zh-CN-YunzeNeural" ) # 显示常用音色示例 - with st.expander("💡 常用音色参考", expanded=False): - st.write("**中文音色:**") - st.write("• zh-CN-XiaoxiaoMultilingualNeural (女声,多语言)") - st.write("• zh-CN-YunzeNeural (男声)") - st.write("• zh-CN-YunxiNeural (男声)") - st.write("• zh-CN-XiaochenNeural (女声)") + with st.expander(tr("Common Voice Reference"), expanded=False): + st.write(f"**{tr('Chinese Voices')}:**") + st.write(f"• zh-CN-XiaoxiaoMultilingualNeural ({tr('Female Voice')}, {tr('Multilingual')})") + st.write(f"• zh-CN-YunzeNeural ({tr('Male Voice')})") + st.write(f"• zh-CN-YunxiNeural ({tr('Male Voice')})") + st.write(f"• zh-CN-XiaochenNeural ({tr('Female Voice')})") st.write("") - st.write("**英文音色:**") - st.write("• en-US-AndrewMultilingualNeural (男声,多语言)") - st.write("• en-US-AvaMultilingualNeural (女声,多语言)") - st.write("• en-US-BrianMultilingualNeural (男声,多语言)") - st.write("• en-US-EmmaMultilingualNeural (女声,多语言)") + 
st.write(f"**{tr('English Voices')}:**") + st.write(f"• en-US-AndrewMultilingualNeural ({tr('Male Voice')}, {tr('Multilingual')})") + st.write(f"• en-US-AvaMultilingualNeural ({tr('Female Voice')}, {tr('Multilingual')})") + st.write(f"• en-US-BrianMultilingualNeural ({tr('Male Voice')}, {tr('Multilingual')})") + st.write(f"• en-US-EmmaMultilingualNeural ({tr('Female Voice')}, {tr('Multilingual')})") st.write("") - st.info("💡 更多音色请参考 [Azure Speech Services 官方文档](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support)") + st.info(tr("Azure Voices Docs Notice")) # 快速选择按钮 - st.write("**快速选择:**") + st.write(f"**{tr('Quick Select')}:**") cols = st.columns(3) with cols[0]: - if st.button("中文女声", help="zh-CN-XiaoxiaoMultilingualNeural"): + if st.button(tr("Chinese Female Voice"), help="zh-CN-XiaoxiaoMultilingualNeural"): voice_name = "zh-CN-XiaoxiaoMultilingualNeural" st.rerun() with cols[1]: - if st.button("中文男声", help="zh-CN-YunzeNeural"): + if st.button(tr("Chinese Male Voice"), help="zh-CN-YunzeNeural"): voice_name = "zh-CN-YunzeNeural" st.rerun() with cols[2]: - if st.button("英文女声", help="en-US-AvaMultilingualNeural"): + if st.button(tr("English Female Voice"), help="en-US-AvaMultilingualNeural"): voice_name = "en-US-AvaMultilingualNeural" st.rerun() @@ -316,10 +316,10 @@ def render_azure_speech_settings(tr): if voice_name.strip(): # 检查是否为有效的Azure音色格式 if is_valid_azure_voice_name(voice_name): - st.success(f"✅ 音色名称有效: {voice_name}") + st.success(tr("Voice name valid").format(voice=voice_name)) else: - st.warning(f"⚠️ 音色名称格式可能不正确: {voice_name}") - st.info("💡 Azure音色名称通常格式为: [语言]-[地区]-[名称]Neural") + st.warning(tr("Voice name format may be invalid").format(voice=voice_name)) + st.info(tr("Azure voice name format notice")) # 保存配置 config.ui["azure_voice_name"] = voice_name @@ -327,36 +327,36 @@ def render_azure_speech_settings(tr): # 音量调节 voice_volume = st.slider( - "音量调节", + tr("Voice Volume"), min_value=0, max_value=100, 
value=int(config.ui.get("azure_volume", 80)), step=1, - help="调节语音音量 (0-100)" + help=tr("Voice Volume Help Percent") ) config.ui["azure_volume"] = voice_volume st.session_state['voice_volume'] = voice_volume / 100.0 # 语速调节 voice_rate = st.slider( - "语速调节", + tr("Voice Rate"), min_value=0.5, max_value=2.0, value=config.ui.get("azure_rate", 1.0), step=0.1, - help="调节语音速度 (0.5-2.0倍速)" + help=tr("Voice Rate Help 0.5-2.0") ) config.ui["azure_rate"] = voice_rate st.session_state['voice_rate'] = voice_rate # 语调调节 voice_pitch = st.slider( - "语调调节", + tr("Voice Pitch"), min_value=-50, max_value=50, value=int(config.ui.get("azure_pitch", 0)), step=5, - help="调节语音音调 (-50%到+50%)" + help=tr("Voice Pitch Help Percent") ) config.ui["azure_pitch"] = voice_pitch # 转换为比例值 @@ -364,11 +364,11 @@ def render_azure_speech_settings(tr): # 显示配置状态 if azure_speech_region and azure_speech_key: - st.success("✅ Azure Speech Services 配置已设置") + st.success(tr("Azure Speech Services configured")) elif not azure_speech_region: - st.warning("⚠️ 请配置服务区域") + st.warning(tr("Please configure service region")) elif not azure_speech_key: - st.warning("⚠️ 请配置 API Key") + st.warning(tr("Please configure API Key")) def render_tencent_tts_settings(tr): @@ -377,7 +377,7 @@ def render_tencent_tts_settings(tr): secret_id = st.text_input( "Secret ID", value=config.tencent.get("secret_id", ""), - help="请输入您的腾讯云 Secret ID" + help=tr("Tencent Secret ID Help") ) # Secret Key 输入 @@ -385,7 +385,7 @@ def render_tencent_tts_settings(tr): "Secret Key", value=config.tencent.get("secret_key", ""), type="password", - help="请输入您的腾讯云 Secret Key" + help=tr("Tencent Secret Key Help") ) # 地域选择 @@ -404,10 +404,10 @@ def render_tencent_tts_settings(tr): region_options.append(saved_region) region = st.selectbox( - "服务地域", + tr("Service Region"), options=region_options, index=region_options.index(saved_region), - help="选择腾讯云 TTS 服务地域" + help=tr("Tencent Service Region Help") ) # 音色选择 @@ -434,13 +434,13 @@ def 
render_tencent_tts_settings(tr): saved_voice_type = config.ui.get("tencent_voice_type", "101001") if saved_voice_type not in voice_type_options: - voice_type_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})" + voice_type_options[saved_voice_type] = f"{tr('Custom Voice')} ({saved_voice_type})" selected_voice_display = st.selectbox( - "音色选择", + tr("Voice Selection"), options=list(voice_type_options.values()), index=list(voice_type_options.keys()).index(saved_voice_type), - help="选择腾讯云 TTS 音色" + help=tr("Select Tencent TTS Voice") ) # 获取实际的音色ID @@ -450,31 +450,31 @@ def render_tencent_tts_settings(tr): # 语速调节 voice_rate = st.slider( - "语速调节", + tr("Voice Rate"), min_value=0.5, max_value=2.0, value=config.ui.get("tencent_rate", 1.0), step=0.1, - help="调节语音速度 (0.5-2.0)" + help=tr("Voice Rate Help 0.5-2.0") ) config.ui["voice_name"] = saved_voice_type # 兼容性 # 显示音色说明 - with st.expander("💡 腾讯云 TTS 音色说明", expanded=False): - st.write("**女声音色:**") + with st.expander(tr("Tencent Cloud TTS Voice Description"), expanded=False): + st.write(f"**{tr('Female Voices')}:**") female_voices = [(k, v) for k, v in voice_type_options.items() if "女声" in v] for voice_id, voice_desc in female_voices[:6]: # 显示前6个 st.write(f"• {voice_desc} (ID: {voice_id})") st.write("") - st.write("**男声音色:**") + st.write(f"**{tr('Male Voices')}:**") male_voices = [(k, v) for k, v in voice_type_options.items() if "男声" in v] for voice_id, voice_desc in male_voices: st.write(f"• {voice_desc} (ID: {voice_id})") st.write("") - st.info("💡 更多音色请参考腾讯云官方文档") + st.info(tr("Tencent More Voices Notice")) # 保存配置 config.tencent["secret_id"] = secret_id @@ -491,13 +491,13 @@ def render_qwen3_tts_settings(tr): "API Key", value=config.tts_qwen.get("api_key", ""), type="password", - help="通义千问 DashScope API Key" + help=tr("Qwen DashScope API Key Help") ) model_name = st.text_input( - "模型名称", + tr("TTS Model Name"), value=config.tts_qwen.get("model_name", "qwen3-tts-flash"), - help="Qwen TTS 模型名,例如 qwen3-tts-flash" + 
help=tr("Qwen TTS Model Help") ) # Qwen3 TTS 音色选项 - 中文名: 英文参数 @@ -538,22 +538,22 @@ def render_qwen3_tts_settings(tr): voice_options[saved_display_name] = saved_voice_param selected_display_name = st.selectbox( - "音色选择", + tr("Voice Selection"), options=display_names, index=display_names.index(saved_display_name) if saved_display_name in display_names else 0, - help="选择Qwen3 TTS音色" + help=tr("Select Qwen3 TTS Voice") ) # 获取对应的英文参数 voice_type = voice_options.get(selected_display_name, "Cherry") voice_rate = st.slider( - "语速调节", + tr("Voice Rate"), min_value=0.5, max_value=2.0, value=1.0, step=0.1, - help="调节语音速度 (0.5-2.0)" + help=tr("Voice Rate Help 0.5-2.0") ) # 保存配置 @@ -570,23 +570,23 @@ def render_indextts2_tts_settings(tr): # API 地址配置 api_url = st.text_input( - "API 地址", + tr("API URL"), value=config.indextts2.get("api_url", "http://127.0.0.1:8081/tts"), - help="IndexTTS2 API 服务地址" + help=tr("IndexTTS2 API URL Help") ) # 参考音频文件路径 reference_audio = st.text_input( - "参考音频路径", + tr("Reference Audio Path"), value=config.indextts2.get("reference_audio", ""), - help="用于语音克隆的参考音频文件路径(WAV 格式,建议 3-10 秒)" + help=tr("Reference Audio Path Help") ) # 文件上传功能 uploaded_file = st.file_uploader( - "或上传参考音频文件", + tr("Upload Reference Audio File"), type=["wav", "mp3"], - help="上传一段清晰的音频用于语音克隆" + help=tr("Upload Reference Audio Help") ) if uploaded_file is not None: @@ -597,28 +597,34 @@ def render_indextts2_tts_settings(tr): with open(audio_path, "wb") as f: f.write(uploaded_file.getbuffer()) reference_audio = audio_path - st.success(f"✅ 音频已上传: {audio_path}") + st.success(tr("Audio uploaded").format(path=audio_path)) # 推理模式 - infer_mode = st.selectbox( - "推理模式", - options=["普通推理", "快速推理"], - index=0 if config.indextts2.get("infer_mode", "普通推理") == "普通推理" else 1, - help="普通推理质量更高但速度较慢,快速推理速度更快但质量略低" - ) + infer_mode_options = [ + ("普通推理", tr("Standard Inference")), + ("快速推理", tr("Fast Inference")), + ] + infer_mode_index = 0 if config.indextts2.get("infer_mode", "普通推理") == "普通推理" 
else 1 + infer_mode = infer_mode_options[st.selectbox( + tr("Inference Mode"), + options=range(len(infer_mode_options)), + index=infer_mode_index, + format_func=lambda x: infer_mode_options[x][1], + help=tr("Inference Mode Help") + )][0] # 高级参数折叠面板 - with st.expander("🔧 高级参数", expanded=False): + with st.expander(tr("Advanced Parameters"), expanded=False): col1, col2 = st.columns(2) with col1: temperature = st.slider( - "采样温度 (Temperature)", + tr("Sampling Temperature"), min_value=0.1, max_value=2.0, value=float(config.indextts2.get("temperature", 1.0)), step=0.1, - help="控制随机性,值越高输出越随机,值越低越确定" + help=tr("Sampling Temperature Help") ) top_p = st.slider( @@ -627,7 +633,7 @@ def render_indextts2_tts_settings(tr): max_value=1.0, value=float(config.indextts2.get("top_p", 0.8)), step=0.05, - help="nucleus 采样的概率阈值,值越小结果越确定" + help=tr("Top P Help") ) top_k = st.slider( @@ -636,49 +642,37 @@ def render_indextts2_tts_settings(tr): max_value=100, value=int(config.indextts2.get("top_k", 30)), step=5, - help="top-k 采样的 k 值,0 表示不使用 top-k" + help=tr("Top K Help") ) with col2: num_beams = st.slider( - "束搜索 (Num Beams)", + tr("Num Beams"), min_value=1, max_value=10, value=int(config.indextts2.get("num_beams", 3)), step=1, - help="束搜索的 beam 数量,值越大质量可能越好但速度越慢" + help=tr("Num Beams Help") ) repetition_penalty = st.slider( - "重复惩罚 (Repetition Penalty)", + tr("Repetition Penalty"), min_value=1.0, max_value=20.0, value=float(config.indextts2.get("repetition_penalty", 10.0)), step=0.5, - help="值越大越能避免重复,但过大可能导致不自然" + help=tr("Repetition Penalty Help") ) do_sample = st.checkbox( - "启用采样", + tr("Enable Sampling"), value=config.indextts2.get("do_sample", True), - help="启用采样可以获得更自然的语音" + help=tr("Enable Sampling Help") ) # 显示使用说明 - with st.expander("💡 IndexTTS2 使用说明", expanded=False): - st.markdown(""" - **零样本语音克隆** - - 1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒) - 2. **设置 API 地址**:确保 IndexTTS2 服务正常运行 - 3. 
**开始合成**:系统会自动使用参考音频的音色合成新语音 - - **注意事项**: - - 参考音频质量直接影响合成效果 - - 建议使用无背景噪音的清晰音频 - - 文本长度建议控制在合理范围内 - - 首次合成可能需要较长时间 - """) + with st.expander(tr("IndexTTS2 Usage Instructions Title"), expanded=False): + st.markdown(tr("IndexTTS2 Usage Instructions")) # 保存配置 config.indextts2["api_url"] = api_url @@ -702,7 +696,7 @@ def render_doubaotts_settings(tr): ak = st.text_input( "Access Key", value=config.doubaotts.get("ak", ""), - help="火山引擎 Access Key" + help=tr("Volcengine Access Key Help") ) # SK 输入 @@ -710,14 +704,14 @@ def render_doubaotts_settings(tr): "Secret Key", value=config.doubaotts.get("sk", ""), type="password", - help="火山引擎 Secret Key" + help=tr("Volcengine Secret Key Help") ) # AppID 输入 appid = st.text_input( "AppID", value=config.doubaotts.get("appid", ""), - help="豆包语音应用 AppID" + help=tr("Doubao AppID Help") ) # Token 输入 @@ -725,14 +719,14 @@ def render_doubaotts_settings(tr): "Token", value=config.doubaotts.get("token", ""), type="password", - help="豆包语音应用 Token" + help=tr("Doubao Token Help") ) # 集群配置 cluster = st.text_input( - "集群", + tr("Cluster"), value=config.doubaotts.get("cluster", "volcano_tts"), - help="业务集群,标准音色使用 volcano_tts" + help=tr("Doubao Cluster Help") ) # 音色选择 @@ -836,13 +830,13 @@ def render_doubaotts_settings(tr): saved_voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming") if saved_voice_type not in voice_options: - voice_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})" + voice_options[saved_voice_type] = f"{tr('Custom Voice')} ({saved_voice_type})" selected_voice_display = st.selectbox( - "音色选择", + tr("Voice Selection"), options=list(voice_options.values()), index=list(voice_options.keys()).index(saved_voice_type) if saved_voice_type in voice_options else 0, - help="选择豆包语音 TTS 音色" + help=tr("Select Doubao TTS Voice") ) # 获取实际的音色ID @@ -851,63 +845,63 @@ def render_doubaotts_settings(tr): ] # 高级参数折叠面板 - with st.expander("🔧 高级参数", expanded=False): + with st.expander(tr("Advanced Parameters"), expanded=False): 
col1, col2 = st.columns(2) with col1: # 语速调节 voice_rate = st.slider( - "语速调节", + tr("Voice Rate"), min_value=0.2, max_value=3.0, value=config.ui.get("doubaotts_rate", 1.0), step=0.1, - help="调节语音速度 (0.2-3.0)" + help=tr("Voice Rate Help 0.2-3.0") ) # 音量调节 voice_volume = st.slider( - "音量调节", + tr("Voice Volume"), min_value=0.1, max_value=2.0, value=config.doubaotts.get("volume", 1.0), step=0.1, - help="调节语音音量 (0.1-2.0)" + help=tr("Voice Volume Help 0.1-2.0") ) with col2: # 音高调节 voice_pitch = st.slider( - "音高调节", + tr("Voice Pitch"), min_value=0.5, max_value=1.5, value=config.doubaotts.get("pitch", 1.0), step=0.1, - help="调节语音音高 (0.5-1.5)" + help=tr("Voice Pitch Help 0.5-1.5") ) # 句尾静音时长 silence_duration = st.slider( - "句尾静音时长 (秒)", + tr("Sentence Silence Duration"), min_value=0.0, max_value=2.0, value=config.doubaotts.get("silence_duration", 0.125), step=0.05, - help="调节句尾静音时长 (0.0-2.0秒)" + help=tr("Sentence Silence Duration Help") ) # 显示API Key申请流程 - with st.expander("💡 豆包语音 TTS API Key申请流程", expanded=False): - st.write("**申请步骤:**") - st.write("1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)") - st.write("2. 新建 Access Key 和 Secret Key") - st.write("3. 打开 [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)") - st.write("4. 点击立即使用") - st.write("5. 在最左边的API服务中心找到音频生成下面的语音合成(注意:是语音合成,不是语音合成大模型)") - st.write("6. 
翻到最下面获取 APPID 和 Access Token") + with st.expander(tr("Doubao TTS API Key Application Process"), expanded=False): + st.write(f"**{tr('Application Steps')}:**") + st.write(tr("Doubao TTS Step 1")) + st.write(tr("Doubao TTS Step 2")) + st.write(tr("Doubao TTS Step 3")) + st.write(tr("Doubao TTS Step 4")) + st.write(tr("Doubao TTS Step 5")) + st.write(tr("Doubao TTS Step 6")) st.write("") - st.info("💡 请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中") + st.info(tr("Doubao TTS Fill Credentials Notice")) # 保存配置 config.doubaotts["ak"] = ak @@ -925,7 +919,7 @@ def render_doubaotts_settings(tr): # 显示配置状态 if ak and sk and appid and token: - st.success("✅ 豆包语音 TTS 配置已设置") + st.success(tr("Doubao TTS configured")) else: missing = [] if not ak: @@ -937,13 +931,13 @@ def render_doubaotts_settings(tr): if not token: missing.append("Token") if missing: - st.warning(f"⚠️ 请配置: {', '.join(missing)}") + st.warning(tr("Please configure missing fields").format(fields=', '.join(missing))) def render_voice_preview_new(tr, selected_engine): """渲染新的语音试听功能""" - if st.button("🎵 试听语音合成", use_container_width=True): - play_content = "感谢关注 NarratoAI,有任何问题或建议,可以关注微信公众号,求助或讨论" + if st.button(tr("Preview Voice Synthesis"), use_container_width=True): + play_content = tr("Voice Preview Sample") # 根据选择的引擎获取对应的语音配置 voice_name = "" @@ -990,10 +984,10 @@ def render_voice_preview_new(tr, selected_engine): voice_pitch = 1.0 # 豆包语音 TTS 不支持音调调节 if not voice_name: - st.error("请先配置语音设置") + st.error(tr("Please configure voice settings first")) return - with st.spinner("正在合成语音..."): + with st.spinner(tr("Synthesizing Voice")): temp_dir = utils.storage_dir("temp", create=True) audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3") @@ -1007,7 +1001,7 @@ def render_voice_preview_new(tr, selected_engine): ) if sub_maker and os.path.exists(audio_file): - st.success("✅ 语音合成成功!") + st.success(tr("Voice synthesis successful")) # 播放音频 with open(audio_file, 'rb') as audio_file_obj: @@ -1020,7 +1014,7 @@ 
def render_voice_preview_new(tr, selected_engine): except: pass else: - st.error("❌ 语音合成失败,请检查配置") + st.error(tr("Voice synthesis failed")) def render_azure_v2_settings(tr): @@ -1089,7 +1083,7 @@ def render_voice_parameters(tr, voice_name): else: # SoulVoice 不支持音调调节,设置默认值 st.session_state['voice_pitch'] = 1.0 - st.info("ℹ️ SoulVoice 引擎不支持音调调节") + st.info(tr("SoulVoice pitch not supported")) def render_voice_preview(tr, voice_name): diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index 95db275..842a500 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -26,7 +26,7 @@ OPENAI_COMPATIBLE_GATEWAY_BASE_URLS = { } -def build_base_url_help(provider: str, model_type: str) -> tuple[str, bool, str]: +def build_base_url_help(provider: str, model_type: str, tr=lambda key: key) -> tuple[str, bool, str]: """ 根据 provider 返回 Base URL 的帮助文案 @@ -35,14 +35,14 @@ def build_base_url_help(provider: str, model_type: str) -> tuple[str, bool, str] requires_base: 是否强制提示必须填写 Base URL placeholder: 推荐的默认值(可为空字符串) """ - default_help = "自定义 API 端点(可选),当使用自建或第三方代理时需要填写" + default_help = tr("Custom API endpoint help") provider_key = (provider or "").lower() example_url = OPENAI_COMPATIBLE_GATEWAY_BASE_URLS.get(provider_key) if example_url is not None: - extra = f"\n推荐接口地址: {example_url}" if example_url else "" + extra = f"\n{tr('Recommended API endpoint')}: {example_url}" if example_url else "" help_text = ( - f"{model_type} 选择的提供商基于 OpenAI 兼容网关,必须填写完整的接口地址。" + f"{tr('OpenAI compatible gateway help').format(model_type=model_type)}" f"{extra}" ) return help_text, True, example_url @@ -227,11 +227,11 @@ def render_proxy_settings(tr): config.proxy["https"] = "" # 剪映草稿地址设置 - st.subheader("剪映草稿设置") + st.subheader(tr("Jianying Draft Settings")) jianying_draft_path = st.text_input( - "剪映草稿文件夹路径", + tr("Jianying Draft Folder Path"), value=config.ui.get("jianying_draft_path", ""), - 
help="剪映草稿文件夹路径,例如:C:\\Users\\用户名\\Documents\\JianyingPro Drafts" + help=tr("Jianying Draft Folder Path Help") ) config.ui["jianying_draft_path"] = jianying_draft_path @@ -479,13 +479,15 @@ def render_vision_llm_settings(tr): model_name_input = st.text_input( tr("Vision Model Name"), value=current_model, - help="输入完整模型名称\n\n" - "常用示例:\n" - "• Qwen/Qwen3.5-122B-A10B\n" - "• gemini/gemini-2.0-flash-lite\n" - "• gpt-4o\n" - "• Qwen/Qwen2.5-VL-32B-Instruct (SiliconFlow)\n\n" - "支持常见 OpenAI 兼容网关(如 OpenAI/DeepSeek/OpenRouter/SiliconFlow)", + help=( + tr("Model Name Input Help") + + "\n\n" + + "• Qwen/Qwen3.5-122B-A10B\n" + + "• gemini/gemini-2.0-flash-lite\n" + + "• gpt-4o\n" + + "• Qwen/Qwen2.5-VL-32B-Instruct (SiliconFlow)\n\n" + + tr("OpenAI compatible providers help") + ), key="vision_model_input" ) @@ -496,16 +498,18 @@ def render_vision_llm_settings(tr): tr("Vision API Key"), value=vision_api_key, type="password", - help="对应 provider 的 API 密钥\n\n" - "获取地址:\n" - "• Gemini: https://makersuite.google.com/app/apikey\n" - "• OpenAI: https://platform.openai.com/api-keys\n" - "• Qwen: https://bailian.console.aliyun.com/\n" - "• SiliconFlow: https://cloud.siliconflow.cn/account/ak" + help=( + tr("Provider API Key Help") + + "\n\n" + + "• Gemini: https://makersuite.google.com/app/apikey\n" + + "• OpenAI: https://platform.openai.com/api-keys\n" + + "• Qwen: https://bailian.console.aliyun.com/\n" + + "• SiliconFlow: https://cloud.siliconflow.cn/account/ak" + ) ) vision_base_help, vision_base_required, vision_placeholder = build_base_url_help( - selected_provider, "视频分析模型" + selected_provider, tr("Vision model"), tr ) st_vision_base_url = st.text_input( tr("Vision Base URL"), @@ -515,15 +519,15 @@ def render_vision_llm_settings(tr): ) if vision_base_required and not st_vision_base_url: info_example = vision_placeholder or "https://your-openai-compatible-endpoint/v1" - st.info(f"请在上方填写 OpenAI 兼容网关地址,例如:{info_example}") + st.info(tr("Please fill OpenAI compatible 
gateway").format(example=info_example)) # 添加测试连接按钮 if st.button(tr("Test Connection"), key="test_vision_connection"): test_errors = [] if not st_vision_api_key: - test_errors.append("请先输入 API 密钥") + test_errors.append(tr("Please enter API key")) if not model_name_input: - test_errors.append("请先输入模型名称") + test_errors.append(tr("Please enter model name")) if test_errors: for error in test_errors: @@ -543,7 +547,7 @@ def render_vision_llm_settings(tr): else: st.error(message) except Exception as e: - st.error(f"测试连接时发生错误: {str(e)}") + st.error(f"{tr('Connection test error')}: {str(e)}") logger.error(f"OpenAI 兼容 视频分析模型连接测试失败: {str(e)}") # 验证和保存配置 @@ -597,9 +601,9 @@ def render_vision_llm_settings(tr): # 清除缓存,确保下次使用新配置 UnifiedLLMService.clear_cache() if st_vision_api_key or st_vision_base_url or st_vision_model_name: - st.success(f"视频分析模型配置已保存(OpenAI 兼容)") + st.success(tr("Vision model config saved")) except Exception as e: - st.error(f"保存配置失败: {str(e)}") + st.error(f"{tr('Failed to save config')}: {str(e)}") logger.error(f"保存视频分析配置失败: {str(e)}") @@ -742,13 +746,15 @@ def render_text_llm_settings(tr): model_name_input = st.text_input( tr("Text Model Name"), value=current_model, - help="输入完整模型名称\n\n" - "常用示例:\n" - "• Pro/zai-org/GLM-5\n" - "• deepseek/deepseek-chat\n" - "• gpt-4o\n" - "• deepseek-ai/DeepSeek-R1 (SiliconFlow)\n\n" - "支持常见 OpenAI 兼容网关(如 OpenAI/DeepSeek/OpenRouter/SiliconFlow)", + help=( + tr("Model Name Input Help") + + "\n\n" + + "• Pro/zai-org/GLM-5\n" + + "• deepseek/deepseek-chat\n" + + "• gpt-4o\n" + + "• deepseek-ai/DeepSeek-R1 (SiliconFlow)\n\n" + + tr("OpenAI compatible providers help") + ), key="text_model_input" ) @@ -759,18 +765,20 @@ def render_text_llm_settings(tr): tr("Text API Key"), value=text_api_key, type="password", - help="对应 provider 的 API 密钥\n\n" - "获取地址:\n" - "• DeepSeek: https://platform.deepseek.com/api_keys\n" - "• Gemini: https://makersuite.google.com/app/apikey\n" - "• OpenAI: https://platform.openai.com/api-keys\n" - "• Qwen: 
https://bailian.console.aliyun.com/\n" - "• SiliconFlow: https://cloud.siliconflow.cn/account/ak\n" - "• Moonshot: https://platform.moonshot.cn/console/api-keys" + help=( + tr("Provider API Key Help") + + "\n\n" + + "• DeepSeek: https://platform.deepseek.com/api_keys\n" + + "• Gemini: https://makersuite.google.com/app/apikey\n" + + "• OpenAI: https://platform.openai.com/api-keys\n" + + "• Qwen: https://bailian.console.aliyun.com/\n" + + "• SiliconFlow: https://cloud.siliconflow.cn/account/ak\n" + + "• Moonshot: https://platform.moonshot.cn/console/api-keys" + ) ) text_base_help, text_base_required, text_placeholder = build_base_url_help( - selected_provider, "文案生成模型" + selected_provider, tr("Text model"), tr ) st_text_base_url = st.text_input( tr("Text Base URL"), @@ -780,15 +788,15 @@ def render_text_llm_settings(tr): ) if text_base_required and not st_text_base_url: info_example = text_placeholder or "https://your-openai-compatible-endpoint/v1" - st.info(f"请在上方填写 OpenAI 兼容网关地址,例如:{info_example}") + st.info(tr("Please fill OpenAI compatible gateway").format(example=info_example)) # 添加测试连接按钮 if st.button(tr("Test Connection"), key="test_text_connection"): test_errors = [] if not st_text_api_key: - test_errors.append("请先输入 API 密钥") + test_errors.append(tr("Please enter API key")) if not model_name_input: - test_errors.append("请先输入模型名称") + test_errors.append(tr("Please enter model name")) if test_errors: for error in test_errors: @@ -808,7 +816,7 @@ def render_text_llm_settings(tr): else: st.error(message) except Exception as e: - st.error(f"测试连接时发生错误: {str(e)}") + st.error(f"{tr('Connection test error')}: {str(e)}") logger.error(f"OpenAI 兼容 文案生成模型连接测试失败: {str(e)}") # 验证和保存配置 @@ -861,9 +869,9 @@ def render_text_llm_settings(tr): # 清除缓存,确保下次使用新配置 UnifiedLLMService.clear_cache() if st_text_api_key or st_text_base_url or st_text_model_name: - st.success(f"文案生成模型配置已保存(OpenAI 兼容)") + st.success(tr("Text model config saved")) except Exception as e: - st.error(f"保存配置失败: 
{str(e)}") + st.error(f"{tr('Failed to save config')}: {str(e)}") logger.error(f"保存文案生成配置失败: {str(e)}") # # Cloudflare 特殊配置 diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 4ac6f2b..772906b 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -346,7 +346,7 @@ def short_drama_summary(tr): # 显示当前已上传的字幕文件路径 if 'subtitle_path' in st.session_state and st.session_state['subtitle_path']: - st.info(f"已上传字幕: {os.path.basename(st.session_state['subtitle_path'])}") + st.info(tr("Uploaded subtitle").format(file=os.path.basename(st.session_state['subtitle_path']))) if st.button(tr("清除已上传字幕")): st.session_state['subtitle_path'] = None st.session_state['subtitle_content'] = None @@ -388,8 +388,8 @@ def short_drama_summary(tr): # 更新状态 st.success( f"{tr('字幕上传成功')} " - f"(编码: {detected_encoding.upper()}, " - f"大小: {len(script_content)} 字符)" + f"({tr('Encoding')}: {detected_encoding.upper()}, " + f"{tr('Size')}: {len(script_content)} {tr('Characters')})" ) st.session_state['subtitle_path'] = script_file_path st.session_state['subtitle_content'] = script_content @@ -417,23 +417,23 @@ def render_fun_asr_transcription(tr): st.session_state['subtitle_content'] = None st.session_state['subtitle_file_processed'] = False - with st.expander("阿里百炼 Fun-ASR 字幕转录", expanded=False): - st.caption("上传本地音频/视频后,将自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。") + with st.expander(tr("Ali Bailian Fun-ASR Subtitle Transcription"), expanded=False): + st.caption(tr("Fun-ASR upload caption")) st.markdown( - "API Key 获取地址:" + f"{tr('API Key URL')}: " "[https://bailian.console.aliyun.com/?tab=model#/api-key]" "(https://bailian.console.aliyun.com/?tab=model#/api-key)" ) api_key = st.text_input( - "阿里百炼 API Key", + tr("Ali Bailian API Key"), value=config.fun_asr.get("api_key", ""), type="password", - help="请输入你自己的阿里百炼 API Key;保存配置后会写入本地 config.toml", + help=tr("Ali Bailian API Key Help"), key="fun_asr_api_key", ) uploaded_media = 
st.file_uploader( - "上传需要转录的音频/视频", + tr("Upload media to transcribe"), type=[ "aac", "amr", "avi", "flac", "flv", "m4a", "mkv", "mov", "mp3", "mp4", "mpeg", "ogg", "opus", "wav", "webm", "wma", "wmv", @@ -442,14 +442,14 @@ def render_fun_asr_transcription(tr): key="fun_asr_media_uploader", ) - if st.button("转写生成字幕", key="fun_asr_transcribe"): + if st.button(tr("Transcribe subtitles"), key="fun_asr_transcribe"): if not api_key.strip(): clear_fun_asr_subtitle_state() - st.error("请先输入阿里百炼 API Key") + st.error(tr("Please enter Ali Bailian API Key")) return if uploaded_media is None: clear_fun_asr_subtitle_state() - st.error("请先上传需要转录的音频或视频文件") + st.error(tr("Please upload media to transcribe")) return try: @@ -474,7 +474,7 @@ def render_fun_asr_transcription(tr): subtitle_name = f"{os.path.splitext(os.path.basename(media_path))[0]}_fun_asr.srt" subtitle_path = os.path.join(utils.subtitle_dir(), subtitle_name) - with st.spinner("正在使用阿里百炼 Fun-ASR 转写字幕,请稍候..."): + with st.spinner(tr("Transcribing with Fun-ASR...")): generated_path = fun_asr_subtitle.create_with_fun_asr( local_file=media_path, subtitle_file=subtitle_path, @@ -483,7 +483,7 @@ def render_fun_asr_transcription(tr): if not generated_path or not os.path.exists(generated_path): clear_fun_asr_subtitle_state() - st.error("Fun-ASR 转写失败:未生成字幕文件") + st.error(tr("Fun-ASR failed without subtitle file")) return with open(generated_path, "r", encoding="utf-8") as f: @@ -492,11 +492,11 @@ def render_fun_asr_transcription(tr): st.session_state['subtitle_path'] = generated_path st.session_state['subtitle_content'] = subtitle_content st.session_state['subtitle_file_processed'] = True - st.success(f"字幕转写成功: {os.path.basename(generated_path)}") + st.success(tr("Subtitle transcription succeeded").format(file=os.path.basename(generated_path))) except Exception as e: clear_fun_asr_subtitle_state() logger.error(f"Fun-ASR 字幕转写失败: {traceback.format_exc()}") - st.error(f"Fun-ASR 字幕转写失败: {str(e)}") + st.error(f"{tr('Fun-ASR 
transcription failed')}: {str(e)}") def render_script_buttons(tr, params): @@ -519,7 +519,7 @@ def render_script_buttons(tr, params): if st.button(button_name, key="script_action", disabled=not script_path): if script_path == "auto": # 执行纪录片视频脚本生成(视频无字幕无配音) - generate_script_docu(params) + generate_script_docu(params, tr) elif script_path == "short": # 执行 短剧混剪 脚本生成 custom_clips = st.session_state.get('custom_clips') @@ -529,7 +529,7 @@ def render_script_buttons(tr, params): subtitle_path = st.session_state.get('subtitle_path') video_theme = st.session_state.get('video_theme') temperature = st.session_state.get('temperature') - generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature) + generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature, tr) else: load_script(tr, script_path) @@ -566,7 +566,7 @@ def save_script_with_validation(tr, video_clip_json_details): st.stop() # 第一步:格式验证 - with st.spinner("正在验证脚本格式..."): + with st.spinner(tr("Validating script format...")): try: result = check_script.check_format(video_clip_json_details) if not result.get('success'): @@ -574,13 +574,13 @@ def save_script_with_validation(tr, video_clip_json_details): error_message = result.get('message', '未知错误') error_details = result.get('details', '') - st.error(f"**脚本格式验证失败**") - st.error(f"**错误信息:** {error_message}") + st.error(f"**{tr('Script format validation failed')}**") + st.error(f"**{tr('Error Message')}:** {error_message}") if error_details: - st.error(f"**详细说明:** {error_details}") + st.error(f"**{tr('Details')}:** {error_details}") # 显示正确格式示例 - st.info("**正确的脚本格式示例:**") + st.info(f"**{tr('Correct script format example')}:**") example_script = [ { "_id": 1, @@ -601,7 +601,7 @@ def save_script_with_validation(tr, video_clip_json_details): st.stop() except Exception as e: - st.error(f"格式验证过程中发生错误: {str(e)}") + st.error(f"{tr('Script format validation error')}: {str(e)}") st.stop() # 第二步:保存脚本 @@ -624,7 +624,7 @@ def 
save_script_with_validation(tr, video_clip_json_details): config.app["video_clip_json_path"] = save_path # 显示成功消息 - st.success("✅ 脚本格式验证通过,保存成功!") + st.success(tr("Script validated and saved successfully")) # 强制重新加载页面更新选择框 time.sleep(0.5) # 给一点时间让用户看到成功消息 diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py index ed1141a..41fac95 100644 --- a/webui/components/subtitle_settings.py +++ b/webui/components/subtitle_settings.py @@ -10,7 +10,7 @@ def render_subtitle_panel(tr): """渲染字幕设置面板""" with st.container(border=True): st.write(tr("Subtitle Settings")) - st.info("💡 提示:目前仅 **edge-tts** 引擎支持自动生成字幕,其他 TTS 引擎暂不支持。") + st.info(tr("Subtitle TTS support notice")) # 检查是否选择了 SoulVoice qwen3_tts引擎 from app.services import voice @@ -20,8 +20,8 @@ def render_subtitle_panel(tr): if is_disabled_subtitle: # SoulVoice 引擎时显示禁用提示 - st.warning(f"⚠️ {tts_engine}不支持精确字幕生成") - st.info("💡 建议使用专业剪辑工具(如剪映、PR等)手动添加字幕") + st.warning(tr("TTS engine does not support precise subtitles").format(engine=tts_engine)) + st.info(tr("Manual subtitle editing recommendation")) # 强制禁用字幕 st.session_state['subtitle_enabled'] = False @@ -31,7 +31,7 @@ def render_subtitle_panel(tr): tr("Enable Subtitles"), value=False, disabled=True, - help="SoulVoice 引擎不支持字幕生成,请使用其他 TTS 引擎" + help=tr("Disabled subtitles help") ) else: # 其他引擎正常显示字幕选项 diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 3a69807..bfab52e 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -86,6 +86,314 @@ "Hide Log": "Hide Log", "Upload Local Files": "Upload Local Files", "File Uploaded Successfully": "File Uploaded Successfully", - "Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)" + "Frame Interval (seconds)": "Frame Interval (seconds)", + "Generate Video Script": "Generate Video Script", + "Video Theme": "Video Theme", + "Generation Prompt": "Custom Prompt", + "Video LLM Provider": "Video Analysis Model", + "timestamp": "Timestamp", + "Picture 
description": "Picture Description", + "Narration": "Narration", + "Rebuild": "Regenerate", + "Load Video Script": "Load Video Script", + "Speech Pitch": "Speech Pitch", + "Please Select Script File": "Please Select Script File", + "Check Format": "Check Format", + "Script Loaded Successfully": "Script Loaded Successfully", + "Script loaded successfully": "Script loaded successfully", + "Script format check passed": "Script format check passed", + "Script format check failed": "Script format check failed", + "Failed to Load Script": "Failed to Load Script", + "Failed to load script": "Failed to load script", + "Failed to Save Script": "Failed to Save Script", + "Failed to save script": "Failed to save script", + "Script saved successfully": "Script saved successfully", + "Video Quality": "Video Quality", + "Custom prompt for LLM, leave empty to use default prompt": "Custom prompt for LLM. Leave empty to use the default prompt.", + "Proxy Settings": "Proxy Settings", + "HTTP_PROXY": "HTTP Proxy", + "HTTPs_PROXY": "HTTPS Proxy", + "Vision Model Settings": "Vision Model Settings", + "Vision Model Provider": "Vision Model Provider", + "Vision API Key": "Vision API Key", + "Vision Base URL": "Vision Base URL", + "Vision Model Name": "Vision Model Name", + "Text Generation Model Settings": "Text Generation Model Settings", + "LLM Model Name": "LLM Model Name", + "LLM Model API Key": "LLM Model API Key", + "Text Model Provider": "Text Model Provider", + "Text API Key": "Text API Key", + "Text Base URL": "Text Base URL", + "Text Model Name": "Text Model Name", + "Skip the first few seconds": "Skip the first few seconds", + "Difference threshold": "Difference Threshold", + "Vision processing batch size": "Vision Processing Batch Size", + "Test Connection": "Test Connection", + "Testing connection...": "Testing connection...", + "gemini model is available": "Gemini model is available", + "gemini model is not available": "Gemini model is not available", + "Unsupported 
provider": "Unsupported provider", + "0: Keep the audio only, 1: Keep the original sound only, 2: Keep the original sound and audio": "0: Keep the narration only, 1: Keep the original sound only, 2: Keep both original sound and narration", + "Text model is not available": "Text model is not available", + "Text model is available": "Text model is available", + "Upload Script": "Upload Script", + "Upload Script File": "Upload Script File", + "Script Uploaded Successfully": "Script Uploaded Successfully", + "Invalid JSON format": "Invalid JSON format", + "Upload failed": "Upload failed", + "Enable Proxy": "Enable Proxy", + "QwenVL model is available": "QwenVL model is available", + "QwenVL model is not available": "QwenVL model is not available", + "QwenVL model returned invalid response": "QwenVL model returned an invalid response", + "System settings": "System Settings", + "Clear Cache": "Clear Cache", + "Cache cleared": "Cache cleared", + "storage directory does not exist": "Storage directory does not exist", + "Failed to clear cache": "Failed to clear cache", + "Clear frames": "Clear frames", + "Clear clip videos": "Clear clip videos", + "Clear tasks": "Clear tasks", + "Directory cleared": "Directory cleared", + "Directory does not exist": "Directory does not exist", + "Failed to clear directory": "Failed to clear directory", + "Subtitle Preview": "Subtitle Preview", + "One-Click Transcribe": "One-Click Transcribe", + "Transcribing...": "Transcribing...", + "Transcription Complete!": "Transcription Complete!", + "Transcription Failed. Please try again.": "Transcription failed. Please try again.", + "API rate limit exceeded. Please wait about an hour and try again.": "API rate limit exceeded. Please wait about an hour and try again.", + "Resources exhausted. Please try again later.": "Resources exhausted. 
Please try again later.", + "Transcription Failed": "Transcription Failed", + "Short Generate": "Short Drama Mix", + "Generate Short Video Script": "Generate Short Video Script", + "Adjust the volume of the original audio": "Adjust the volume of the original audio", + "Original Volume": "Original Volume", + "Frame Interval (seconds) (More keyframes consume more tokens)": "Frame Interval (seconds) (More keyframes consume more tokens)", + "Batch Size": "Batch Size", + "Batch Size (More keyframes consume more tokens)": "Batch Size (smaller batches consume more tokens)", + "Short Drama Summary": "Short Drama Summary", + "Video Type": "Video Type", + "Select/Upload Script": "Select/Upload Script", + "原生Gemini模型连接成功": "Native Gemini model connection succeeded", + "原生Gemini模型连接失败": "Native Gemini model connection failed", + "OpenAI兼容Gemini代理连接成功": "OpenAI-compatible Gemini proxy connection succeeded", + "OpenAI兼容Gemini代理连接失败": "OpenAI-compatible Gemini proxy connection failed", + "Connection failed": "Connection failed", + "自定义片段": "Custom Clips", + "设置需要生成的短视频片段数量": "Set the number of short video clips to generate", + "上传字幕文件": "Upload Subtitle File", + "清除已上传字幕": "Clear Uploaded Subtitle", + "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)": "Unable to read the subtitle file. Please check the file encoding. Supported encodings: UTF-8, UTF-16, GBK, GB2312.", + "字幕文件内容似乎为空,请检查文件": "The subtitle file appears to be empty. Please check the file.", + "字幕上传成功": "Subtitle uploaded successfully", + "短剧名称": "Short Drama Name", + "生成短剧解说脚本": "Generate Short Drama Narration Script", + "请输入视频脚本": "Please enter the video script", + "Subtitle TTS support notice": "💡 Note: currently only the **edge-tts** engine supports automatic subtitle generation. 
Other TTS engines are not supported yet.", + "TTS engine does not support precise subtitles": "⚠️ {engine} does not support precise subtitle generation", + "Manual subtitle editing recommendation": "💡 We recommend adding subtitles manually in a professional editor such as CapCut or Premiere Pro.", + "Disabled subtitles help": "This TTS engine does not support subtitle generation. Please use another TTS engine.", + "Tencent Cloud TTS": "Tencent Cloud TTS", + "Tongyi Qwen3 TTS": "Tongyi Qwen3 TTS", + "IndexTTS2 Voice Clone": "IndexTTS2 Voice Clone", + "Doubao TTS": "Doubao TTS", + "Edge TTS features": "Completely free, but service stability can vary and voice cloning is not supported.", + "Edge TTS use case": "Testing and lightweight use", + "Azure Speech Services features": "Includes a free quota, then pay-as-you-go billing. An overseas credit card may be required.", + "Azure Speech Services use case": "Enterprise use cases that need a stable service", + "Tencent Cloud TTS features": "Includes a free quota, good voice quality, multiple voices, and fast access in mainland China.", + "Tencent Cloud TTS use case": "Personal and enterprise users who need stable Chinese speech synthesis", + "Tongyi Qwen3 TTS features": "Alibaba Cloud Tongyi Qwen speech synthesis with high-quality voices and multiple voice options.", + "High-quality Chinese speech synthesis use case": "Users who need high-quality Chinese speech synthesis", + "IndexTTS2 features": "Zero-shot voice cloning. Upload a reference audio file to synthesize speech with a matching voice. 
Requires local or private deployment.", + "IndexTTS2 download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5", + "Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.", + "Select TTS Engine": "Select TTS Engine", + "Select TTS Engine Help": "Choose the text-to-speech engine you want to use.", + "TTS Engine Details": "📋 {engine} Details", + "Features": "Features", + "Use Case": "Use Case", + "Registration URL": "Registration URL", + "Voice Selection": "Voice Selection", + "Select Edge TTS Voice": "Select an Edge TTS voice", + "Edge TTS Voice Description": "💡 Edge TTS Voice Notes", + "Loaded voice count": "Loaded {count} voices", + "Female Voice": "Female voice", + "Male Voice": "Male voice", + "Voice Volume": "Voice Volume", + "Voice Volume Help Percent": "Adjust voice volume (0-100)", + "Voice Rate": "Voice Rate", + "Voice Rate Help 0.5-2.0": "Adjust voice speed (0.5-2.0x)", + "Voice Pitch": "Voice Pitch", + "Voice Pitch Help Percent": "Adjust voice pitch (-50% to +50%)", + "Service Region": "Service Region", + "Service Region Placeholder": "e.g. eastus", + "Azure Service Region Help": "Azure Speech Services region, such as eastus, westus2, or eastasia.", + "Azure Speech Key Help": "Azure Speech Services API key", + "Voice Name": "Voice Name", + "Azure Voice Name Help": "Enter an Azure Speech Services voice name. 
You can use the official voice name directly, such as zh-CN-YunzeNeural.", + "Common Voice Reference": "💡 Common Voice Reference", + "Chinese Voices": "Chinese Voices", + "English Voices": "English Voices", + "Multilingual": "multilingual", + "Azure Voices Docs Notice": "💡 For more voices, see the [Azure Speech Services documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support).", + "Quick Select": "Quick Select", + "Chinese Female Voice": "Chinese Female Voice", + "Chinese Male Voice": "Chinese Male Voice", + "English Female Voice": "English Female Voice", + "Voice name valid": "✅ Voice name is valid: {voice}", + "Voice name format may be invalid": "⚠️ Voice name format may be incorrect: {voice}", + "Azure voice name format notice": "💡 Azure voice names usually follow this format: [language]-[region]-[name]Neural", + "Azure Speech Services configured": "✅ Azure Speech Services is configured", + "Please configure service region": "⚠️ Please configure the service region", + "Please configure API Key": "⚠️ Please configure the API Key", + "Task failed": "Task failed", + "Script file cannot be empty": "Script file cannot be empty", + "Video file cannot be empty": "Video file cannot be empty", + "Export to Jianying Draft": "📤 Export to Jianying Draft", + "Please configure Jianying draft folder in basic settings": "Please configure the Jianying draft folder in Basic Settings", + "Jianying draft folder does not exist": "Jianying draft folder does not exist: {path}", + "Please enter Jianying draft name": "Please enter the Jianying draft name", + "Confirm Export": "Confirm Export", + "Please enter draft name": "Please enter a draft name", + "Failed to build parameters": "Failed to build parameters", + "Exporting to Jianying draft...": "Exporting to Jianying draft, please wait...", + "Jianying draft exported successfully": "✅ Successfully exported to Jianying draft: {name}", + "Draft saved to": "📁 Draft saved to: {path}", + 
"Failed to export Jianying draft": "❌ Failed to export Jianying draft", + "Cancel": "Cancel", + "LLM initialization failed": "⚠️ LLM initialization failed: {error}\n\nPlease check whether the configuration file and dependencies are installed correctly.", + "Jianying Draft Settings": "Jianying Draft Settings", + "Jianying Draft Folder Path": "Jianying Draft Folder Path", + "Jianying Draft Folder Path Help": "Jianying draft folder path, for example: C:\\Users\\Username\\Documents\\JianyingPro Drafts", + "Custom API endpoint help": "Custom API endpoint (optional). Required when using a self-hosted or third-party proxy.", + "Recommended API endpoint": "Recommended endpoint", + "OpenAI compatible gateway help": "{model_type} uses an OpenAI-compatible gateway provider, so a complete endpoint URL is required.", + "Vision model": "Vision model", + "Text model": "Text model", + "Model Name Input Help": "Enter the full model name.\n\nCommon examples:", + "OpenAI compatible providers help": "Supports common OpenAI-compatible gateways such as OpenAI, DeepSeek, OpenRouter, and SiliconFlow.", + "Provider API Key Help": "API key for the selected provider.\n\nWhere to get one:", + "Please fill OpenAI compatible gateway": "Please fill in the OpenAI-compatible gateway URL above, for example: {example}", + "Please enter API key": "Please enter the API key first", + "Please enter model name": "Please enter the model name first", + "Connection test error": "An error occurred while testing the connection", + "Vision model config saved": "Vision model configuration saved (OpenAI compatible)", + "Text model config saved": "Text generation model configuration saved (OpenAI compatible)", + "Failed to save config": "Failed to save configuration", + "Custom Position (% from top)": "Custom Position (% from top)", + "Please enter a value between 0 and 100": "Please enter a value between 0 and 100", + "Please enter a valid number": "Please enter a valid number", + "None": "None", + "Uploaded 
subtitle": "Uploaded subtitle: {file}", + "Encoding": "Encoding", + "Size": "Size", + "Characters": "characters", + "Ali Bailian Fun-ASR Subtitle Transcription": "Ali Bailian Fun-ASR Subtitle Transcription", + "Fun-ASR upload caption": "After uploading a local audio/video file, it will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.", + "API Key URL": "API Key URL", + "Ali Bailian API Key": "Ali Bailian API Key", + "Ali Bailian API Key Help": "Enter your Ali Bailian API Key. After saving, it will be written to the local config.toml file.", + "Upload media to transcribe": "Upload audio/video to transcribe", + "Transcribe subtitles": "Transcribe Subtitles", + "Please enter Ali Bailian API Key": "Please enter the Ali Bailian API Key first", + "Please upload media to transcribe": "Please upload the audio or video file to transcribe first", + "Transcribing with Fun-ASR...": "Transcribing subtitles with Ali Bailian Fun-ASR, please wait...", + "Fun-ASR failed without subtitle file": "Fun-ASR transcription failed: no subtitle file was generated", + "Subtitle transcription succeeded": "Subtitle transcription succeeded: {file}", + "Fun-ASR transcription failed": "Fun-ASR transcription failed", + "Validating script format...": "Validating script format...", + "Script format validation failed": "Script format validation failed", + "Error Message": "Error Message", + "Details": "Details", + "Correct script format example": "Correct script format example", + "Script format validation error": "An error occurred during script format validation", + "Script validated and saved successfully": "✅ Script format validated and saved successfully!", + "Tencent Secret ID Help": "Enter your Tencent Cloud Secret ID", + "Tencent Secret Key Help": "Enter your Tencent Cloud Secret Key", + "Tencent Service Region Help": "Select the Tencent Cloud TTS service region", + "Custom Voice": "Custom Voice", + "Select Tencent TTS Voice": "Select a Tencent Cloud 
TTS voice", + "Tencent Cloud TTS Voice Description": "💡 Tencent Cloud TTS Voice Notes", + "Female Voices": "Female Voices", + "Male Voices": "Male Voices", + "Tencent More Voices Notice": "💡 See the official Tencent Cloud documentation for more voices.", + "Qwen DashScope API Key Help": "Tongyi Qwen DashScope API Key", + "TTS Model Name": "TTS Model Name", + "Qwen TTS Model Help": "Qwen TTS model name, for example qwen3-tts-flash", + "Select Qwen3 TTS Voice": "Select a Qwen3 TTS voice", + "API URL": "API URL", + "IndexTTS2 API URL Help": "IndexTTS2 API service URL", + "Reference Audio Path": "Reference Audio Path", + "Reference Audio Path Help": "Reference audio file path for voice cloning (WAV format, 3-10 seconds recommended)", + "Upload Reference Audio File": "Or Upload Reference Audio File", + "Upload Reference Audio Help": "Upload a clear audio clip for voice cloning", + "Audio uploaded": "✅ Audio uploaded: {path}", + "Inference Mode": "Inference Mode", + "Standard Inference": "Standard Inference", + "Fast Inference": "Fast Inference", + "Inference Mode Help": "Standard inference has higher quality but is slower. Fast inference is faster with slightly lower quality.", + "Advanced Parameters": "🔧 Advanced Parameters", + "Sampling Temperature": "Sampling Temperature", + "Sampling Temperature Help": "Controls randomness. Higher values are more random; lower values are more deterministic.", + "Top P Help": "Probability threshold for nucleus sampling. Smaller values make results more deterministic.", + "Top K Help": "The k value for top-k sampling. 0 disables top-k.", + "Num Beams": "Num Beams", + "Num Beams Help": "Number of beams for beam search. 
Higher values may improve quality but slow generation.", + "Repetition Penalty": "Repetition Penalty", + "Repetition Penalty Help": "Higher values reduce repetition, but overly high values may sound unnatural.", + "Enable Sampling": "Enable Sampling", + "Enable Sampling Help": "Enable sampling for more natural speech.", + "IndexTTS2 Usage Instructions Title": "💡 IndexTTS2 Usage Instructions", + "IndexTTS2 Usage Instructions": "**Zero-shot voice cloning**\n\n1. **Prepare reference audio**: upload or specify a clear audio file (3-10 seconds recommended)\n2. **Set API URL**: make sure the IndexTTS2 service is running\n3. **Start synthesis**: the system will use the reference voice to synthesize new speech\n\n**Notes**:\n- Reference audio quality directly affects synthesis quality\n- Use clean audio without background noise when possible\n- Keep text length within a reasonable range\n- The first synthesis may take longer", + "Volcengine Access Key Help": "Volcengine Access Key", + "Volcengine Secret Key Help": "Volcengine Secret Key", + "Doubao AppID Help": "Doubao TTS application AppID", + "Doubao Token Help": "Doubao TTS application Token", + "Cluster": "Cluster", + "Doubao Cluster Help": "Business cluster. Standard voices use volcano_tts.", + "Select Doubao TTS Voice": "Select a Doubao TTS voice", + "Voice Rate Help 0.2-3.0": "Adjust voice speed (0.2-3.0)", + "Voice Volume Help 0.1-2.0": "Adjust voice volume (0.1-2.0)", + "Voice Pitch Help 0.5-1.5": "Adjust voice pitch (0.5-1.5)", + "Sentence Silence Duration": "Sentence-end Silence Duration (seconds)", + "Sentence Silence Duration Help": "Adjust sentence-end silence duration (0.0-2.0 seconds)", + "Doubao TTS API Key Application Process": "💡 Doubao TTS API Key Application Process", + "Application Steps": "Application Steps", + "Doubao TTS Step 1": "1. Open [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)", + "Doubao TTS Step 2": "2. 
Create a new Access Key and Secret Key", + "Doubao TTS Step 3": "3. Open [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)", + "Doubao TTS Step 4": "4. Click Start Now", + "Doubao TTS Step 5": "5. In the left API Service Center, find Speech Synthesis under Audio Generation (note: Speech Synthesis, not the speech synthesis large model)", + "Doubao TTS Step 6": "6. Scroll to the bottom to get the APPID and Access Token", + "Doubao TTS Fill Credentials Notice": "💡 Fill the Access Key, Secret Key, AppID, and Token above.", + "Doubao TTS configured": "✅ Doubao TTS is configured", + "Please configure missing fields": "⚠️ Please configure: {fields}", + "Preview Voice Synthesis": "🎵 Preview Voice Synthesis", + "Voice Preview Sample": "Thanks for using NarratoAI. If you have any questions or suggestions, please join the community for help and discussion.", + "Please configure voice settings first": "Please configure voice settings first", + "Voice synthesis successful": "✅ Voice synthesis successful!", + "Voice synthesis failed": "❌ Voice synthesis failed. Please check your configuration.", + "SoulVoice pitch not supported": "ℹ️ SoulVoice does not support pitch adjustment", + "Progress": "Progress", + "Generating script...": "Generating script...", + "Please select video file first": "Please select a video file first", + "Extracting keyframes...": "Extracting keyframes...", + "Script generation completed": "Script generation completed", + "Script generation completed!": "Script generation completed!", + "Video script generated successfully": "✅ Video script generated successfully!", + "Generation error": "❌ An error occurred during generation", + "Please upload subtitle file first": "Please upload a subtitle file first", + "Video": "Video", + "Subtitle": "Subtitle", + "Preparing script generation": "Preparing script generation", + "Script generation failed check logs": "Script generation failed. 
Please check the logs.", + "Parsing subtitles...": "Parsing subtitles...", + "Subtitle file does not exist": "Subtitle file does not exist", + "Subtitle file is empty or unreadable": "Subtitle file is empty or unreadable", + "Generating narration copy...": "Generating narration copy...", + "Generated narration JSON parse failed": "The generated narration format is invalid and could not be parsed as JSON", + "Generated narration missing items field": "The generated narration is missing the required 'items' field", + "Preparing output...": "Preparing output..." } -} \ No newline at end of file +} diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index f9a2da1..4c732cc 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -163,6 +163,233 @@ "Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多", "Short Drama Summary": "短剧解说", "Video Type": "视频类型", - "Select/Upload Script": "选择/上传脚本" + "Select/Upload Script": "选择/上传脚本", + "Script loaded successfully": "脚本加载成功", + "Failed to load script": "加载脚本失败", + "Failed to save script": "保存脚本失败", + "QwenVL model returned invalid response": "QwenVL 模型返回了无效响应", + "Testing connection...": "正在测试连接...", + "Connection failed": "连接失败", + "Subtitle TTS support notice": "💡 提示:目前仅 **edge-tts** 引擎支持自动生成字幕,其他 TTS 引擎暂不支持。", + "TTS engine does not support precise subtitles": "⚠️ {engine} 不支持精确字幕生成", + "Manual subtitle editing recommendation": "💡 建议使用专业剪辑工具(如剪映、PR 等)手动添加字幕", + "Disabled subtitles help": "当前 TTS 引擎不支持字幕生成,请使用其他 TTS 引擎", + "Tencent Cloud TTS": "腾讯云 TTS", + "Tongyi Qwen3 TTS": "通义千问 Qwen3 TTS", + "IndexTTS2 Voice Clone": "IndexTTS2 语音克隆", + "Doubao TTS": "豆包语音 TTS", + "Edge TTS features": "完全免费,但服务稳定性一般,不支持语音克隆功能", + "Edge TTS use case": "测试和轻量级使用", + "Azure Speech Services features": "提供一定免费额度,超出后按量付费,需要绑定海外信用卡", + "Azure Speech Services use case": "企业级应用,需要稳定服务", + "Tencent Cloud TTS features": "提供免费额度,音质优秀,支持多种音色,国内访问速度快", + "Tencent Cloud TTS use case": "个人和企业用户,需要稳定的中文语音合成", + "Tongyi Qwen3 TTS 
features": "阿里云通义千问语音合成,音质优秀,支持多种音色", + "High-quality Chinese speech synthesis use case": "需要高质量中文语音合成的用户", + "IndexTTS2 features": "零样本语音克隆,上传参考音频即可合成相同音色的语音,需要本地或私有部署", + "IndexTTS2 download link": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", + "Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", + "Select TTS Engine": "选择 TTS 引擎", + "Select TTS Engine Help": "选择您要使用的文本转语音引擎", + "TTS Engine Details": "📋 {engine} 详细说明", + "Features": "特点", + "Use Case": "适用场景", + "Registration URL": "注册地址", + "Voice Selection": "音色选择", + "Select Edge TTS Voice": "选择 Edge TTS 音色", + "Edge TTS Voice Description": "💡 Edge TTS 音色说明", + "Loaded voice count": "已加载 {count} 个音色", + "Female Voice": "女声", + "Male Voice": "男声", + "Voice Volume": "音量调节", + "Voice Volume Help Percent": "调节语音音量 (0-100)", + "Voice Rate": "语速调节", + "Voice Rate Help 0.5-2.0": "调节语音速度 (0.5-2.0 倍速)", + "Voice Pitch": "语调调节", + "Voice Pitch Help Percent": "调节语音音调 (-50% 到 +50%)", + "Service Region": "服务区域", + "Service Region Placeholder": "例如:eastus", + "Azure Service Region Help": "Azure Speech Services 服务区域,如:eastus、westus2、eastasia 等", + "Azure Speech Key Help": "Azure Speech Services API 密钥", + "Voice Name": "音色名称", + "Azure Voice Name Help": "输入 Azure Speech Services 音色名称,直接使用官方音色名称即可。例如:zh-CN-YunzeNeural", + "Common Voice Reference": "💡 常用音色参考", + "Chinese Voices": "中文音色", + "English Voices": "英文音色", + "Multilingual": "多语言", + "Azure Voices Docs Notice": "💡 更多音色请参考 [Azure Speech Services 官方文档](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support)", + "Quick Select": "快速选择", + "Chinese Female Voice": "中文女声", + "Chinese Male Voice": "中文男声", + "English Female Voice": "英文女声", + "Voice name valid": "✅ 音色名称有效: {voice}", + "Voice name format may be invalid": "⚠️ 音色名称格式可能不正确: {voice}", + "Azure voice name format notice": "💡 Azure 音色名称通常格式为: [语言]-[地区]-[名称]Neural", + "Azure Speech Services configured": "✅ Azure Speech Services 配置已设置", + "Please configure service region": "⚠️ 请配置服务区域", 
+ "Please configure API Key": "⚠️ 请配置 API Key", + "Language": "界面语言", + "Task failed": "任务失败", + "Script file cannot be empty": "脚本文件不能为空", + "Video file cannot be empty": "视频文件不能为空", + "Export to Jianying Draft": "📤 导出到剪映草稿", + "Please configure Jianying draft folder in basic settings": "请在基础设置中配置剪映草稿地址", + "Jianying draft folder does not exist": "剪映草稿文件夹不存在: {path}", + "Please enter Jianying draft name": "请输入剪映草稿名称", + "Confirm Export": "确认导出", + "Please enter draft name": "请输入草稿名称", + "Failed to build parameters": "参数构建失败", + "Exporting to Jianying draft...": "正在导出到剪映草稿,请稍候...", + "Jianying draft exported successfully": "✅ 成功导出到剪映草稿: {name}", + "Draft saved to": "📁 草稿已保存到: {path}", + "Failed to export Jianying draft": "❌ 导出到剪映草稿失败", + "Cancel": "取消", + "LLM initialization failed": "⚠️ LLM 初始化失败: {error}\n\n请检查配置文件和依赖是否正确安装。", + "Jianying Draft Settings": "剪映草稿设置", + "Jianying Draft Folder Path": "剪映草稿文件夹路径", + "Jianying Draft Folder Path Help": "剪映草稿文件夹路径,例如:C:\\Users\\用户名\\Documents\\JianyingPro Drafts", + "Custom API endpoint help": "自定义 API 端点(可选),当使用自建或第三方代理时需要填写", + "Recommended API endpoint": "推荐接口地址", + "OpenAI compatible gateway help": "{model_type} 选择的提供商基于 OpenAI 兼容网关,必须填写完整的接口地址。", + "Vision model": "视频分析模型", + "Text model": "文案生成模型", + "Model Name Input Help": "输入完整模型名称\n\n常用示例:", + "OpenAI compatible providers help": "支持常见 OpenAI 兼容网关(如 OpenAI/DeepSeek/OpenRouter/SiliconFlow)", + "Provider API Key Help": "对应 provider 的 API 密钥\n\n获取地址:", + "Please fill OpenAI compatible gateway": "请在上方填写 OpenAI 兼容网关地址,例如:{example}", + "Please enter API key": "请先输入 API 密钥", + "Please enter model name": "请先输入模型名称", + "Connection test error": "测试连接时发生错误", + "Vision model config saved": "视频分析模型配置已保存(OpenAI 兼容)", + "Text model config saved": "文案生成模型配置已保存(OpenAI 兼容)", + "Failed to save config": "保存配置失败", + "Custom Position (% from top)": "自定义位置(距顶部百分比)", + "Please enter a value between 0 and 100": "请输入 0 到 100 之间的值", + "Please enter a valid number": "请输入有效数字", + "None": 
"无", + "Uploaded subtitle": "已上传字幕: {file}", + "Encoding": "编码", + "Size": "大小", + "Characters": "字符", + "Ali Bailian Fun-ASR Subtitle Transcription": "阿里百炼 Fun-ASR 字幕转录", + "Fun-ASR upload caption": "上传本地音频/视频后,将自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。", + "API Key URL": "API Key 获取地址", + "Ali Bailian API Key": "阿里百炼 API Key", + "Ali Bailian API Key Help": "请输入你自己的阿里百炼 API Key;保存配置后会写入本地 config.toml", + "Upload media to transcribe": "上传需要转录的音频/视频", + "Transcribe subtitles": "转写生成字幕", + "Please enter Ali Bailian API Key": "请先输入阿里百炼 API Key", + "Please upload media to transcribe": "请先上传需要转录的音频或视频文件", + "Transcribing with Fun-ASR...": "正在使用阿里百炼 Fun-ASR 转写字幕,请稍候...", + "Fun-ASR failed without subtitle file": "Fun-ASR 转写失败:未生成字幕文件", + "Subtitle transcription succeeded": "字幕转写成功: {file}", + "Fun-ASR transcription failed": "Fun-ASR 字幕转写失败", + "Validating script format...": "正在验证脚本格式...", + "Script format validation failed": "脚本格式验证失败", + "Error Message": "错误信息", + "Details": "详细说明", + "Correct script format example": "正确的脚本格式示例", + "Script format validation error": "格式验证过程中发生错误", + "Script validated and saved successfully": "✅ 脚本格式验证通过,保存成功!", + "Tencent Secret ID Help": "请输入您的腾讯云 Secret ID", + "Tencent Secret Key Help": "请输入您的腾讯云 Secret Key", + "Tencent Service Region Help": "选择腾讯云 TTS 服务地域", + "Custom Voice": "自定义音色", + "Select Tencent TTS Voice": "选择腾讯云 TTS 音色", + "Tencent Cloud TTS Voice Description": "💡 腾讯云 TTS 音色说明", + "Female Voices": "女声音色", + "Male Voices": "男声音色", + "Tencent More Voices Notice": "💡 更多音色请参考腾讯云官方文档", + "Qwen DashScope API Key Help": "通义千问 DashScope API Key", + "TTS Model Name": "模型名称", + "Qwen TTS Model Help": "Qwen TTS 模型名,例如 qwen3-tts-flash", + "Select Qwen3 TTS Voice": "选择 Qwen3 TTS 音色", + "API URL": "API 地址", + "IndexTTS2 API URL Help": "IndexTTS2 API 服务地址", + "Reference Audio Path": "参考音频路径", + "Reference Audio Path Help": "用于语音克隆的参考音频文件路径(WAV 格式,建议 3-10 秒)", + "Upload Reference Audio File": "或上传参考音频文件", + "Upload Reference Audio Help": 
"上传一段清晰的音频用于语音克隆", + "Audio uploaded": "✅ 音频已上传: {path}", + "Inference Mode": "推理模式", + "Standard Inference": "普通推理", + "Fast Inference": "快速推理", + "Inference Mode Help": "普通推理质量更高但速度较慢,快速推理速度更快但质量略低", + "Advanced Parameters": "🔧 高级参数", + "Sampling Temperature": "采样温度 (Temperature)", + "Sampling Temperature Help": "控制随机性,值越高输出越随机,值越低越确定", + "Top P Help": "nucleus 采样的概率阈值,值越小结果越确定", + "Top K Help": "top-k 采样的 k 值,0 表示不使用 top-k", + "Num Beams": "束搜索 (Num Beams)", + "Num Beams Help": "束搜索的 beam 数量,值越大质量可能越好但速度越慢", + "Repetition Penalty": "重复惩罚 (Repetition Penalty)", + "Repetition Penalty Help": "值越大越能避免重复,但过大可能导致不自然", + "Enable Sampling": "启用采样", + "Enable Sampling Help": "启用采样可以获得更自然的语音", + "IndexTTS2 Usage Instructions Title": "💡 IndexTTS2 使用说明", + "IndexTTS2 Usage Instructions": "**零样本语音克隆**\n\n1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒)\n2. **设置 API 地址**:确保 IndexTTS2 服务正常运行\n3. **开始合成**:系统会自动使用参考音频的音色合成新语音\n\n**注意事项**:\n- 参考音频质量直接影响合成效果\n- 建议使用无背景噪音的清晰音频\n- 文本长度建议控制在合理范围内\n- 首次合成可能需要较长时间", + "Volcengine Access Key Help": "火山引擎 Access Key", + "Volcengine Secret Key Help": "火山引擎 Secret Key", + "Doubao AppID Help": "豆包语音应用 AppID", + "Doubao Token Help": "豆包语音应用 Token", + "Cluster": "集群", + "Doubao Cluster Help": "业务集群,标准音色使用 volcano_tts", + "Select Doubao TTS Voice": "选择豆包语音 TTS 音色", + "Voice Rate Help 0.2-3.0": "调节语音速度 (0.2-3.0)", + "Voice Volume Help 0.1-2.0": "调节语音音量 (0.1-2.0)", + "Voice Pitch Help 0.5-1.5": "调节语音音高 (0.5-1.5)", + "Sentence Silence Duration": "句尾静音时长 (秒)", + "Sentence Silence Duration Help": "调节句尾静音时长 (0.0-2.0 秒)", + "Doubao TTS API Key Application Process": "💡 豆包语音 TTS API Key申请流程", + "Application Steps": "申请步骤", + "Doubao TTS Step 1": "1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)", + "Doubao TTS Step 2": "2. 新建 Access Key 和 Secret Key", + "Doubao TTS Step 3": "3. 打开 [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)", + "Doubao TTS Step 4": "4. 
点击立即使用", + "Doubao TTS Step 5": "5. 在最左边的 API 服务中心找到音频生成下面的语音合成(注意:是语音合成,不是语音合成大模型)", + "Doubao TTS Step 6": "6. 翻到最下面获取 APPID 和 Access Token", + "Doubao TTS Fill Credentials Notice": "💡 请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中", + "Doubao TTS configured": "✅ 豆包语音 TTS 配置已设置", + "Please configure missing fields": "⚠️ 请配置: {fields}", + "Preview Voice Synthesis": "🎵 试听语音合成", + "Voice Preview Sample": "感谢关注 NarratoAI,有任何问题或建议,可以加入社区频道求助或讨论", + "Please configure voice settings first": "请先配置语音设置", + "Voice synthesis successful": "✅ 语音合成成功!", + "Voice synthesis failed": "❌ 语音合成失败,请检查配置", + "SoulVoice pitch not supported": "ℹ️ SoulVoice 引擎不支持音调调节", + "上传字幕文件": "上传字幕文件", + "清除已上传字幕": "清除已上传字幕", + "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)": "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)", + "字幕文件内容似乎为空,请检查文件": "字幕文件内容似乎为空,请检查文件", + "字幕上传成功": "字幕上传成功", + "短剧名称": "短剧名称", + "生成短剧解说脚本": "生成短剧解说脚本", + "请输入视频脚本": "请输入视频脚本", + "自定义片段": "自定义片段", + "设置需要生成的短视频片段数量": "设置需要生成的短视频片段数量", + "原生Gemini模型连接成功": "原生 Gemini 模型连接成功", + "原生Gemini模型连接失败": "原生 Gemini 模型连接失败", + "OpenAI兼容Gemini代理连接成功": "OpenAI 兼容 Gemini 代理连接成功", + "OpenAI兼容Gemini代理连接失败": "OpenAI 兼容 Gemini 代理连接失败", + "Progress": "进度", + "Generating script...": "正在生成脚本...", + "Please select video file first": "请先选择视频文件", + "Extracting keyframes...": "正在提取关键帧...", + "Script generation completed": "脚本生成完成", + "Script generation completed!": "🎉 脚本生成完成!", + "Video script generated successfully": "✅ 视频脚本生成成功!", + "Generation error": "❌ 生成过程中发生错误", + "Please upload subtitle file first": "请先上传字幕文件", + "Video": "视频", + "Subtitle": "字幕", + "Preparing script generation": "开始准备生成脚本", + "Script generation failed check logs": "生成脚本失败,请检查日志", + "Parsing subtitles...": "正在解析字幕...", + "Subtitle file does not exist": "字幕文件不存在", + "Subtitle file is empty or unreadable": "字幕文件内容为空或无法读取", + "Generating narration copy...": "正在生成文案...", + "Generated narration JSON parse failed": "生成的解说文案格式错误,无法解析为 JSON", + "Generated narration missing items 
field": "生成的解说文案缺少必要的 'items' 字段", + "Preparing output...": "整理输出..." } -} \ No newline at end of file +} diff --git a/webui/tools/generate_script_docu.py b/webui/tools/generate_script_docu.py index b366156..9f6052b 100644 --- a/webui/tools/generate_script_docu.py +++ b/webui/tools/generate_script_docu.py @@ -24,7 +24,7 @@ def _normalize_progress_value(progress: float | int) -> int: return max(0, min(100, int(round(value)))) -def generate_script_docu(params): +def generate_script_docu(params, tr=lambda key: key): """ 生成纪录片视频脚本。 要求: 原视频无字幕无配音 @@ -39,12 +39,12 @@ def generate_script_docu(params): if message: status_text.text(f"🎬 {message}") else: - status_text.text(f"📊 进度: {normalized_progress}%") + status_text.text(f"📊 {tr('Progress')}: {normalized_progress}%") try: - with st.spinner("正在生成脚本..."): + with st.spinner(tr("Generating script...")): if not params.video_origin_path: - st.error("请先选择视频文件") + st.error(tr("Please select video file first")) return vision_llm_provider = ( @@ -76,7 +76,7 @@ def generate_script_docu(params): "vision_max_concurrency", 2 ) - update_progress(10, "正在提取关键帧...") + update_progress(10, tr("Extracting keyframes...")) service = DocumentaryFrameAnalysisService() script_items = asyncio.run( service.generate_documentary_script( @@ -100,15 +100,15 @@ def generate_script_docu(params): st.session_state["video_clip_json"] = script elif isinstance(script, str): st.session_state["video_clip_json"] = json.loads(script) - update_progress(100, "脚本生成完成") + update_progress(100, tr("Script generation completed")) time.sleep(0.1) progress_bar.progress(100) - status_text.text("🎉 脚本生成完成!") - st.success("✅ 视频脚本生成成功!") + status_text.text(tr("Script generation completed!")) + st.success(tr("Video script generated successfully")) except Exception as err: - st.error(f"❌ 生成过程中发生错误: {str(err)}") + st.error(f"{tr('Generation error')}: {str(err)}") logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}") finally: time.sleep(2) diff --git 
a/webui/tools/generate_script_short.py b/webui/tools/generate_script_short.py index 2f6ef9b..5c7b43d 100644 --- a/webui/tools/generate_script_short.py +++ b/webui/tools/generate_script_short.py @@ -27,21 +27,21 @@ def generate_script_short(tr, params, custom_clips=5): if message: status_text.text(f"{progress}% - {message}") else: - status_text.text(f"进度: {progress}%") + status_text.text(f"{tr('Progress')}: {progress}%") try: - with st.spinner("正在生成脚本..."): + with st.spinner(tr("Generating script...")): # ========== 严格验证:必须上传视频和字幕(与短剧解说保持一致)========== # 1. 验证视频文件 video_path = getattr(params, "video_origin_path", None) if not video_path or not str(video_path).strip(): - st.error("请先选择视频文件") + st.error(tr("Please select video file first")) st.stop() try: ensure_existing_file( str(video_path), - label="视频", + label=tr("Video"), allowed_exts=(".mp4", ".mov", ".avi", ".flv", ".mkv"), ) except InputValidationError as e: @@ -51,13 +51,13 @@ def generate_script_short(tr, params, custom_clips=5): # 2. 
验证字幕文件(移除推断逻辑,必须上传) subtitle_path = st.session_state.get("subtitle_path") if not subtitle_path or not str(subtitle_path).strip(): - st.error("请先上传字幕文件") + st.error(tr("Please upload subtitle file first")) st.stop() try: subtitle_path = ensure_existing_file( str(subtitle_path), - label="字幕", + label=tr("Subtitle"), allowed_exts=(".srt",), ) except InputValidationError as e: @@ -78,7 +78,7 @@ def generate_script_short(tr, params, custom_clips=5): vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name') or config.app.get(f'vision_{vision_llm_provider}_model_name', "") vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url') or config.app.get(f'vision_{vision_llm_provider}_base_url', "") - update_progress(20, "开始准备生成脚本") + update_progress(20, tr("Preparing script generation")) # ========== 调用后端生成脚本 ========== from app.services.SDP.generate_script_short import generate_script_result @@ -103,7 +103,7 @@ def generate_script_short(tr, params, custom_clips=5): ) if result.get("status") != "success": - st.error(result.get("message", "生成脚本失败,请检查日志")) + st.error(result.get("message", tr("Script generation failed check logs"))) st.stop() script = result.get("script") @@ -114,14 +114,14 @@ def generate_script_short(tr, params, custom_clips=5): elif isinstance(script, str): st.session_state['video_clip_json'] = json.loads(script) - update_progress(80, "脚本生成完成") + update_progress(80, tr("Script generation completed")) time.sleep(0.1) progress_bar.progress(100) - status_text.text("脚本生成完成!") - st.success("视频脚本生成成功!") + status_text.text(tr("Script generation completed!")) + st.success(tr("Video script generated successfully")) except Exception as err: progress_bar.progress(100) - st.error(f"生成过程中发生错误: {str(err)}") + st.error(f"{tr('Generation error')}: {str(err)}") logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}") diff --git a/webui/tools/generate_short_summary.py b/webui/tools/generate_short_summary.py index 0ead867..fe2d223 
100644 --- a/webui/tools/generate_short_summary.py +++ b/webui/tools/generate_short_summary.py @@ -135,7 +135,7 @@ def parse_and_fix_json(json_string): return None -def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature): +def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature, tr=lambda key: key): """ 生成 短剧解说 视频脚本 要求: 提供高质量短剧字幕 @@ -149,20 +149,20 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu if message: status_text.text(f"{progress}% - {message}") else: - status_text.text(f"进度: {progress}%") + status_text.text(f"{tr('Progress')}: {progress}%") try: - with st.spinner("正在生成脚本..."): + with st.spinner(tr("Generating script...")): if not params.video_origin_path: - st.error("请先选择视频文件") + st.error(tr("Please select video file first")) return """ 1. 获取字幕 """ - update_progress(30, "正在解析字幕...") + update_progress(30, tr("Parsing subtitles...")) # 判断字幕文件是否存在 if not os.path.exists(subtitle_path): - st.error("字幕文件不存在") + st.error(tr("Subtitle file does not exist")) return """ @@ -176,7 +176,7 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu # 读取字幕文件内容(无论使用哪种实现都需要) subtitle_content = read_subtitle_text(subtitle_path).text if not subtitle_content: - st.error("字幕文件内容为空或无法读取") + st.error(tr("Subtitle file is empty or unreadable")) return try: @@ -203,7 +203,7 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu """ if analysis_result["status"] == "success": logger.info("字幕分析成功!") - update_progress(60, "正在生成文案...") + update_progress(60, tr("Generating narration copy...")) # 根据剧情生成解说文案 - 使用新的LLM服务架构 try: @@ -235,11 +235,11 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu logger.info(narration_result["narration_script"]) else: logger.info(f"\n解说文案生成失败: {narration_result['message']}") - st.error("生成脚本失败,请检查日志") + st.error(tr("Script generation failed check logs")) st.stop() else: 
logger.error(f"分析失败: {analysis_result['message']}") - st.error("生成脚本失败,请检查日志") + st.error(tr("Script generation failed check logs")) st.stop() """ @@ -253,35 +253,35 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu # 增强JSON解析,包含错误处理和修复 narration_dict = parse_and_fix_json(narration_script) if narration_dict is None: - st.error("生成的解说文案格式错误,无法解析为JSON") + st.error(tr("Generated narration JSON parse failed")) logger.error(f"JSON解析失败,原始内容: {narration_script}") st.stop() # 验证JSON结构 if 'items' not in narration_dict: - st.error("生成的解说文案缺少必要的'items'字段") + st.error(tr("Generated narration missing items field")) logger.error(f"JSON结构错误,缺少items字段: {narration_dict}") st.stop() script = json.dumps(narration_dict['items'], ensure_ascii=False, indent=2) if script is None: - st.error("生成脚本失败,请检查日志") + st.error(tr("Script generation failed check logs")) st.stop() logger.success(f"剪辑脚本生成完成") if isinstance(script, list): st.session_state['video_clip_json'] = script elif isinstance(script, str): st.session_state['video_clip_json'] = json.loads(script) - update_progress(90, "整理输出...") + update_progress(90, tr("Preparing output...")) time.sleep(0.1) progress_bar.progress(100) - status_text.text("脚本生成完成!") - st.success("视频脚本生成成功!") + status_text.text(tr("Script generation completed!")) + st.success(tr("Video script generated successfully")) except Exception as err: - st.error(f"生成过程中发生错误: {str(err)}") + st.error(f"{tr('Generation error')}: {str(err)}") logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}") finally: time.sleep(2) From e744960ac1d845d4731d416411d76939ba4cc79b Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 18:46:56 +0800 Subject: [PATCH 05/24] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E6=9C=AC?= =?UTF-8?q?=E5=9C=B0FunASR=E6=94=AF=E6=8C=81=E5=B9=B6=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E7=BD=91=E9=A1=B5=E7=AB=AF=E8=A7=86=E9=A2=91=E4=B8=8E=E5=AD=97?= =?UTF-8?q?=E5=B9=95=E5=B7=A5=E4=BD=9C=E6=B5=81?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新示例配置文件,添加本地FunASR后端配置项 - 重构fun_asr_subtitle服务,完整支持本地FunASR-Pack API调用 - 优化多语言翻译文件,更新界面相关译文 - 重写网页端视频选择组件,支持本地资源目录选择和直接上传 - 重构字幕转写UI,支持本地/在线百炼/直接上传三种模式 - 新增本地FunASR相关单元测试 --- app/services/fun_asr_subtitle.py | 296 ++++++++++++- .../test_fun_asr_subtitle_unittest.py | 110 ++++- config.example.toml | 9 +- webui/components/script_settings.py | 396 ++++++++++++------ webui/i18n/en.json | 37 +- webui/i18n/zh.json | 37 +- 6 files changed, 744 insertions(+), 141 deletions(-) diff --git a/app/services/fun_asr_subtitle.py b/app/services/fun_asr_subtitle.py index 7af2637..2567bc8 100644 --- a/app/services/fun_asr_subtitle.py +++ b/app/services/fun_asr_subtitle.py @@ -1,15 +1,17 @@ -"""Aliyun Bailian Fun-ASR subtitle transcription helpers. +"""Fun-ASR subtitle transcription helpers. -This module intentionally uses the REST API because the official Fun-ASR +The Bailian path intentionally uses the REST API because the official Fun-ASR recorded-file API supports temporary `oss://` resources only through REST. 
""" from __future__ import annotations import os +import shutil import time from dataclasses import dataclass from typing import Any, Optional +from urllib.parse import urljoin, urlparse, urlunparse import requests from loguru import logger @@ -21,6 +23,7 @@ UPLOAD_POLICY_URL = f"{DASHSCOPE_BASE_URL}/api/v1/uploads" TRANSCRIPTION_URL = f"{DASHSCOPE_BASE_URL}/api/v1/services/audio/asr/transcription" TASK_URL_TEMPLATE = f"{DASHSCOPE_BASE_URL}/api/v1/tasks/{{task_id}}" MODEL_NAME = "fun-asr" +LOCAL_FUN_ASR_API_URL = "http://127.0.0.1:7860" TERMINAL_FAILED_STATUSES = {"FAILED", "CANCELED", "UNKNOWN"} PUNCTUATION_BREAKS = set(",。!?;,.!?;") @@ -89,6 +92,85 @@ def _session_post(session, url: str, **kwargs): return session.post(url, **kwargs) +def _require_local_file(local_file: str) -> None: + if not os.path.isfile(local_file): + raise FunAsrError(f"待转写文件不存在: {local_file}") + + +def _normalize_local_api_url(api_url: str = "") -> str: + api_url = (api_url or LOCAL_FUN_ASR_API_URL).strip().rstrip("/") + if not api_url: + raise FunAsrError("请先填写本地 FunASR-Pack API 地址") + if "://" not in api_url: + api_url = f"http://{api_url}" + return api_url + + +def _local_base_url(api_url: str = "") -> str: + api_url = _normalize_local_api_url(api_url) + parsed = urlparse(api_url) + path = parsed.path.rstrip("/") + if path.endswith("/asr"): + path = path[:-4].rstrip("/") + return urlunparse(parsed._replace(path=path, params="", query="", fragment="")).rstrip("/") + + +def _local_asr_url(api_url: str = "") -> str: + api_url = _normalize_local_api_url(api_url) + if urlparse(api_url).path.rstrip("/").endswith("/asr"): + return api_url + return f"{api_url}/asr" + + +def _absolute_local_download_url(api_url: str, download_url: str) -> str: + download_url = (download_url or "").strip() + if not download_url: + return "" + if urlparse(download_url).scheme: + return download_url + return urljoin(f"{_local_base_url(api_url)}/", download_url) + + +def _raise_for_local_http(response: 
requests.Response, action: str) -> None: + status_code = getattr(response, "status_code", 200) + if status_code and status_code >= 400: + detail = "" + try: + data = response.json() + if isinstance(data, dict): + detail = str(data.get("detail") or "") + except Exception: + detail = "" + suffix = f": {detail}" if detail else "" + raise FunAsrError(f"{action}失败{suffix},请确认本地 FunASR-Pack 服务可用") + + try: + response.raise_for_status() + except Exception as exc: + raise FunAsrError(f"{action}失败,请确认本地 FunASR-Pack 服务可用") from exc + + +def _local_json(response: requests.Response, action: str) -> dict[str, Any]: + _raise_for_local_http(response, action) + try: + data = response.json() + except Exception as exc: + raise FunAsrError(f"{action}返回了无效 JSON") from exc + if not isinstance(data, dict): + raise FunAsrError(f"{action}返回格式无效") + return data + + +def _response_text(response: requests.Response) -> str: + text = getattr(response, "text", None) + if isinstance(text, str): + return text + content = getattr(response, "content", b"") + if isinstance(content, bytes): + return content.decode("utf-8") + return str(content) + + def request_upload_policy(api_key: str, model: str = MODEL_NAME, session=requests) -> UploadPolicy: """Request Bailian temporary-storage upload policy for the target model.""" api_key = _require_api_key(api_key) @@ -418,6 +500,216 @@ def write_srt_file(srt_content: str, subtitle_file: str = "") -> str: return subtitle_file +def copy_srt_file(source_file: str, subtitle_file: str = "") -> str: + """Copy an existing SRT file into NarratoAI's subtitle directory.""" + if not os.path.isfile(source_file): + raise FunAsrError(f"本地 FunASR-Pack 返回的字幕文件不存在: {source_file}") + if not subtitle_file: + subtitle_file = os.path.join(utils.subtitle_dir(), f"fun_asr_local_{int(time.time())}.srt") + parent = os.path.dirname(subtitle_file) + if parent: + os.makedirs(parent, exist_ok=True) + if os.path.abspath(source_file) != os.path.abspath(subtitle_file): + 
shutil.copyfile(source_file, subtitle_file) + return subtitle_file + + +def request_local_fun_asr_health(api_url: str = LOCAL_FUN_ASR_API_URL, session=requests) -> dict[str, Any]: + """Fetch FunASR-Pack health metadata from the local service.""" + response = _session_get(session, f"{_local_base_url(api_url)}/health", timeout=10) + return _local_json(response, "检查本地 FunASR-Pack 服务") + + +def request_local_fun_asr( + local_file: str, + api_url: str = LOCAL_FUN_ASR_API_URL, + hotword: str = "", + enable_spk: Optional[bool] = None, + timeout: float = 600.0, + session=requests, +) -> dict[str, Any]: + """Call the local FunASR-Pack `/asr` API and return its JSON result.""" + _require_local_file(local_file) + data: dict[str, str] = {} + if hotword.strip(): + data["hotword"] = hotword.strip() + if enable_spk is not None: + data["enable_spk"] = "true" if enable_spk else "false" + + with open(local_file, "rb") as file_obj: + files = {"file": (_safe_upload_name(local_file), file_obj)} + response = _session_post( + session, + _local_asr_url(api_url), + data=data, + files=files, + timeout=timeout, + ) + return _local_json(response, "调用本地 FunASR-Pack ASR API") + + +def download_local_srt( + download_url: str, + api_url: str = LOCAL_FUN_ASR_API_URL, + subtitle_file: str = "", + session=requests, +) -> str: + """Download an SRT exposed by FunASR-Pack and save it as a NarratoAI subtitle.""" + absolute_url = _absolute_local_download_url(api_url, download_url) + if not absolute_url: + raise FunAsrError("本地 FunASR-Pack 结果缺少 SRT 下载地址") + response = _session_get(session, absolute_url, timeout=60) + _raise_for_local_http(response, "下载本地 FunASR-Pack SRT") + srt_content = _response_text(response) + if not srt_content.strip(): + raise FunAsrError("本地 FunASR-Pack 返回了空 SRT") + return write_srt_file(srt_content, subtitle_file) + + +def _local_result_items(result_json: dict[str, Any]): + raw = result_json.get("raw") + if isinstance(raw, dict): + yield raw + elif isinstance(raw, list): + for 
item in raw: + if isinstance(item, dict): + yield item + elif result_json.get("text"): + yield result_json + + +def _blocks_from_local_timestamp(item: dict[str, Any], max_chars: int, max_duration: float) -> list[dict[str, Any]]: + text = str(item.get("text") or "").strip() + timestamps = item.get("timestamp") or [] + if not text or not isinstance(timestamps, list): + return [] + + non_space_chars = [char for char in text if char.strip()] + consume_punctuation = len(timestamps) >= len(non_space_chars) + blocks: list[dict[str, Any]] = [] + current: Optional[dict[str, Any]] = None + timestamp_index = 0 + last_end = 0.0 + max_duration_ms = max_duration * 1000 + + for char in text: + if not char.strip(): + continue + + is_punctuation = char in PUNCTUATION_BREAKS + consume_timestamp = consume_punctuation or not is_punctuation + if consume_timestamp and timestamp_index < len(timestamps): + pair = timestamps[timestamp_index] + timestamp_index += 1 + if not isinstance(pair, (list, tuple)) or len(pair) < 2: + continue + start_ms = _timestamp_ms(pair[0], "local.timestamp.start") + end_ms = _timestamp_ms(pair[1], "local.timestamp.end") + last_end = end_ms + else: + start_ms = last_end + end_ms = last_end if is_punctuation else last_end + 200 + last_end = end_ms + + if current is None: + current = {"start": start_ms, "end": end_ms, "text": char} + else: + should_split_before = ( + len(current["text"] + char) > max_chars + or (end_ms - current["start"]) > max_duration_ms + ) + if should_split_before: + _flush_block(blocks, current) + current = {"start": start_ms, "end": end_ms, "text": char} + else: + current["text"] += char + current["end"] = end_ms + + if current and is_punctuation: + _flush_block(blocks, current) + current = None + + if current: + _flush_block(blocks, current) + return blocks + + +def local_fun_asr_result_to_srt( + result_json: dict[str, Any], + max_chars: int = 20, + max_duration: float = 3.5, +) -> str: + """Convert a FunASR-Pack JSON response into SRT when 
the API SRT is unavailable.""" + blocks: list[dict[str, Any]] = [] + for item in _local_result_items(result_json): + item_blocks = _blocks_from_local_timestamp(item, max_chars, max_duration) + if not item_blocks: + text = str(item.get("text") or "").strip() + if text: + item_blocks = _blocks_from_sentence( + { + "begin_time": 0, + "end_time": max(1500, len(text) * 180), + "text": text, + }, + max_chars=max_chars, + ) + blocks.extend(item_blocks) + + if not blocks: + raise FunAsrError("本地 FunASR-Pack 转写结果为空:未找到可用字幕内容") + + lines = [] + for index, block in enumerate(blocks, start=1): + lines.append(_srt_block(index, block["start"], block["end"], block["text"])) + return "\n".join(lines).rstrip() + "\n" + + +def create_with_local_fun_asr( + local_file: str, + subtitle_file: str = "", + api_url: str = LOCAL_FUN_ASR_API_URL, + hotword: str = "", + enable_spk: Optional[bool] = None, + timeout: float = 600.0, + session=requests, +) -> Optional[str]: + """Create an SRT file through a locally running FunASR-Pack API.""" + try: + result_json = request_local_fun_asr( + local_file=local_file, + api_url=api_url, + hotword=hotword, + enable_spk=enable_spk, + timeout=timeout, + session=session, + ) + + srt_file = result_json.get("srt_file") + if isinstance(srt_file, str) and srt_file and os.path.isfile(srt_file): + output_file = copy_srt_file(srt_file, subtitle_file) + else: + downloads = result_json.get("downloads") or {} + download_url = downloads.get("srt") if isinstance(downloads, dict) else "" + if download_url: + output_file = download_local_srt( + download_url, + api_url=api_url, + subtitle_file=subtitle_file, + session=session, + ) + else: + srt_content = local_fun_asr_result_to_srt(result_json) + output_file = write_srt_file(srt_content, subtitle_file) + + logger.info(f"本地 FunASR-Pack 字幕文件已生成: {output_file}") + return output_file + except FunAsrError: + raise + except Exception as exc: + raise FunAsrError("本地 FunASR-Pack 字幕转写失败,请检查服务地址、文件或模型状态") from exc + + def 
create_with_fun_asr( local_file: str, subtitle_file: str = "", diff --git a/app/services/test_fun_asr_subtitle_unittest.py b/app/services/test_fun_asr_subtitle_unittest.py index 83062bd..d59550d 100644 --- a/app/services/test_fun_asr_subtitle_unittest.py +++ b/app/services/test_fun_asr_subtitle_unittest.py @@ -12,9 +12,11 @@ from app.services import fun_asr_subtitle as fasr class FakeResponse: - def __init__(self, payload=None, status_code=200): + def __init__(self, payload=None, status_code=200, text=None): self.payload = payload or {} self.status_code = status_code + self.text = text + self.content = text.encode("utf-8") if isinstance(text, str) else b"" def json(self): return self.payload @@ -375,6 +377,110 @@ class FunAsrServiceTests(unittest.TestCase): fasr.download_transcription_result("https://result.example/bad.json", session=MalformedDownloadSession({})) +class LocalFunAsrServiceTests(unittest.TestCase): + def test_request_local_fun_asr_posts_file_and_options(self): + class LocalSession: + def __init__(self): + self.calls = [] + + def post(self, url, **kwargs): + self.calls.append(("POST", url, kwargs)) + return FakeResponse({"text": "你好", "srt_file": "/tmp/out.srt"}) + + with tempfile.TemporaryDirectory() as tmp_dir: + local_file = Path(tmp_dir) / "audio.wav" + local_file.write_bytes(b"audio") + session = LocalSession() + + result = fasr.request_local_fun_asr( + str(local_file), + api_url="127.0.0.1:7860", + hotword="NarratoAI", + enable_spk=True, + timeout=123, + session=session, + ) + + self.assertEqual("你好", result["text"]) + self.assertEqual("POST", session.calls[0][0]) + self.assertEqual("http://127.0.0.1:7860/asr", session.calls[0][1]) + self.assertEqual({"hotword": "NarratoAI", "enable_spk": "true"}, session.calls[0][2]["data"]) + self.assertEqual(123, session.calls[0][2]["timeout"]) + self.assertIn("file", session.calls[0][2]["files"]) + + def test_create_with_local_fun_asr_copies_pack_srt_file(self): + class LocalSession: + def __init__(self, 
srt_file): + self.srt_file = srt_file + self.calls = [] + + def post(self, url, **kwargs): + self.calls.append(("POST", url, kwargs)) + return FakeResponse({"text": "你好", "srt_file": str(self.srt_file)}) + + with tempfile.TemporaryDirectory() as tmp_dir: + local_file = Path(tmp_dir) / "audio.wav" + local_file.write_bytes(b"audio") + pack_srt = Path(tmp_dir) / "pack.srt" + pack_srt.write_text("1\n00:00:00,000 --> 00:00:01,000\n你好\n", encoding="utf-8") + subtitle_file = Path(tmp_dir) / "out.srt" + + result_path = fasr.create_with_local_fun_asr( + str(local_file), + subtitle_file=str(subtitle_file), + api_url="http://127.0.0.1:7860", + session=LocalSession(pack_srt), + ) + + self.assertEqual(str(subtitle_file), result_path) + self.assertEqual(pack_srt.read_text(encoding="utf-8"), subtitle_file.read_text(encoding="utf-8")) + + def test_create_with_local_fun_asr_downloads_relative_srt(self): + class LocalSession: + def __init__(self): + self.calls = [] + + def post(self, url, **kwargs): + self.calls.append(("POST", url, kwargs)) + return FakeResponse({"text": "你好", "downloads": {"srt": "/download/result.srt"}}) + + def get(self, url, **kwargs): + self.calls.append(("GET", url, kwargs)) + return FakeResponse(text="1\n00:00:00,000 --> 00:00:01,000\n你好\n") + + with tempfile.TemporaryDirectory() as tmp_dir: + local_file = Path(tmp_dir) / "audio.wav" + local_file.write_bytes(b"audio") + subtitle_file = Path(tmp_dir) / "out.srt" + session = LocalSession() + + result_path = fasr.create_with_local_fun_asr( + str(local_file), + subtitle_file=str(subtitle_file), + api_url="http://127.0.0.1:7860/asr", + session=session, + ) + + self.assertEqual(str(subtitle_file), result_path) + self.assertEqual("http://127.0.0.1:7860/download/result.srt", session.calls[1][1]) + self.assertIn("你好", subtitle_file.read_text(encoding="utf-8")) + + def test_local_fun_asr_result_to_srt_uses_raw_timestamps(self): + result = { + "raw": [ + { + "text": "你好,世界。", + "timestamp": [[0, 300], [300, 600], [600, 
900], [900, 1200]], + } + ] + } + + srt = fasr.local_fun_asr_result_to_srt(result, max_chars=20) + + self.assertIn("00:00:00,000 --> 00:00:00,600\n你好,", srt) + self.assertIn("世界。", srt) + + class FunAsrConfigTests(unittest.TestCase): def test_save_config_persists_fun_asr_section(self): original_config_file = cfg.config_file @@ -395,6 +501,8 @@ class FunAsrConfigTests(unittest.TestCase): def test_config_example_fun_asr_section_parses(self): config_data = tomllib.loads(Path("config.example.toml").read_text(encoding="utf-8")) + self.assertEqual("local", config_data["fun_asr"]["backend"]) + self.assertEqual("http://127.0.0.1:7860", config_data["fun_asr"]["api_url"]) self.assertEqual("fun-asr", config_data["fun_asr"]["model"]) self.assertIn("api_key", config_data["fun_asr"]) diff --git a/config.example.toml b/config.example.toml index c503129..805610b 100644 --- a/config.example.toml +++ b/config.example.toml @@ -95,8 +95,13 @@ model_name = "qwen3-tts-flash" [fun_asr] - # 阿里百炼 Fun-ASR 字幕转录配置 - # 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥 + # Fun-ASR 字幕转录配置 + # backend = "local" 使用本地 FunASR-Pack API;backend = "bailian" 使用阿里百炼在线 fun-asr + backend = "local" + api_url = "http://127.0.0.1:7860" + hotword = "" + enable_spk = false + # 使用阿里百炼在线 fun-asr 时,访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取 API Key api_key = "" model = "fun-asr" diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 772906b..94a8910 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -56,12 +56,12 @@ def render_script_file(tr, params): MODE_SHORT = "short" MODE_SUMMARY = "summary" - # 模式选项映射 + # 模式选项映射,按工作流优先级展示 mode_options = { - tr("Select/Upload Script"): MODE_FILE, + tr("Short Drama Summary"): MODE_SUMMARY, tr("Auto Generate"): MODE_AUTO, tr("Short Generate"): MODE_SHORT, - tr("Short Drama Summary"): MODE_SUMMARY, + tr("Select/Upload Script"): MODE_FILE, } # 获取当前状态 @@ -80,8 
+80,7 @@ def render_script_file(tr, params): else: default_index = mode_keys.index(tr("Select/Upload Script")) - # 1. 渲染功能选择下拉框 - # 使用 segmented_control 替代 selectbox,提供更好的视觉体验 + # 1. 渲染功能选择组件 default_mode_label = mode_keys[default_index] default_mode = mode_options[default_mode_label] @@ -106,17 +105,16 @@ def render_script_file(tr, params): st.session_state.video_clip_json_path = new_mode params.video_clip_json_path = new_mode else: - # 如果用户取消选择(segmented_control 允许取消),恢复到默认或上一个状态 - # 这里我们强制保持当前状态,或者重置为默认 - st.session_state.script_mode_selection = default_mode_label + st.session_state.video_clip_json_path = default_mode + params.video_clip_json_path = default_mode # 渲染组件 - selected_mode_label = st.segmented_control( + selected_mode_label = st.selectbox( tr("Video Type"), options=mode_keys, + index=None, key="script_mode_selection", on_change=update_script_mode, - required=True ) # 处理旧状态为空的兜底情况 @@ -231,50 +229,115 @@ def render_script_file(tr, params): def render_video_file(tr, params): """渲染视频文件选择""" - video_list = [(tr("None"), ""), (tr("Upload Local Files"), "upload_local")] + source_options = { + tr("Select from resource directory"): "resource", + tr("Upload Local Files"): "upload", + } + source_labels = list(source_options.keys()) + default_source_label = source_labels[0] - # 获取已有视频文件 - for suffix in ["*.mp4", "*.mov", "*.avi", "*.mkv"]: - video_files = glob.glob(os.path.join(utils.video_dir(), suffix)) - for file in video_files: - display_name = file.replace(config.root_dir, "") - video_list.append((display_name, file)) + if ( + 'video_source_selection' not in st.session_state + or st.session_state['video_source_selection'] not in source_options + ): + st.session_state['video_source_selection'] = default_source_label - selected_video_index = st.selectbox( - tr("Video File"), - index=0, - options=range(len(video_list)), - format_func=lambda x: video_list[x][0] + current_source = st.session_state['video_source_selection'] + source_caption = ( + tr("Select a 
video from resource videos directory") + if source_options[current_source] == "resource" + else tr("Upload a new video file up to 2GB") ) + st.markdown(f"**{tr('Video Source')}** :gray[{source_caption}]") - video_path = video_list[selected_video_index][1] - st.session_state['video_origin_path'] = video_path - params.video_origin_path = video_path + source = st.selectbox( + tr("Video Source"), + options=source_labels, + index=None, + key="video_source_selection", + label_visibility="collapsed", + ) + if not source: + source = default_source_label - if video_path == "upload_local": - uploaded_file = st.file_uploader( - tr("Upload Local Files"), - type=["mp4", "mov", "avi", "flv", "mkv"], - accept_multiple_files=False, + if source_options[source] == "resource": + video_files = [] + for suffix in ["*.mp4", "*.mov", "*.avi", "*.flv", "*.mkv", "*.mpeg4"]: + video_files.extend(glob.glob(os.path.join(utils.video_dir(), suffix))) + + video_files = sorted(video_files, key=os.path.getctime, reverse=True) + saved_video_path = st.session_state.get('video_origin_path', '') + selected_video_path = st.session_state.get('resource_video_selection') + if selected_video_path not in video_files: + st.session_state['resource_video_selection'] = ( + saved_video_path if saved_video_path in video_files else None + ) + + def format_video_name(path): + return path.replace(config.root_dir, "") + + video_path = st.selectbox( + tr("Select Video"), + options=video_files, + index=None, + placeholder=tr("Choose a video file"), + format_func=format_video_name, + key="resource_video_selection", ) - if uploaded_file is not None: - safe_filename = os.path.basename(uploaded_file.name) - video_file_path = os.path.join(utils.video_dir(), safe_filename) - file_name, file_extension = os.path.splitext(safe_filename) + if video_path: + st.session_state['video_origin_path'] = video_path + params.video_origin_path = video_path + else: + st.session_state['video_origin_path'] = "" + params.video_origin_path = "" 
+ if not video_files: + st.info(tr("No video files found in resource videos directory")) + return - if os.path.exists(video_file_path): - timestamp = time.strftime("%Y%m%d%H%M%S") - file_name_with_timestamp = f"{file_name}_{timestamp}" - video_file_path = os.path.join(utils.video_dir(), file_name_with_timestamp + file_extension) + if source_options[source] == "upload": + uploaded_file = st.file_uploader( + tr("Upload Video"), + type=["mp4", "mov", "avi", "flv", "mkv", "mpeg4"], + accept_multiple_files=False, + key="video_file_uploader", + ) - with open(video_file_path, "wb") as f: - f.write(uploaded_file.read()) - st.success(tr("File Uploaded Successfully")) + if uploaded_file is None: + st.session_state['video_origin_path'] = "" + params.video_origin_path = "" + st.session_state['video_file_processed'] = False + st.session_state['uploaded_video_path'] = "" + st.session_state['uploaded_video_signature'] = "" + else: + uploaded_signature = f"{uploaded_file.name}:{uploaded_file.size}" + uploaded_video_path = st.session_state.get('uploaded_video_path', '') + is_processed = ( + st.session_state.get('video_file_processed', False) + and st.session_state.get('uploaded_video_signature') == uploaded_signature + and uploaded_video_path + ) + + if is_processed: + st.session_state['video_origin_path'] = uploaded_video_path + params.video_origin_path = uploaded_video_path + else: + safe_filename = os.path.basename(uploaded_file.name) + video_file_path = os.path.join(utils.video_dir(), safe_filename) + file_name, file_extension = os.path.splitext(safe_filename) + + if os.path.exists(video_file_path): + timestamp = time.strftime("%Y%m%d%H%M%S") + file_name_with_timestamp = f"{file_name}_{timestamp}" + video_file_path = os.path.join(utils.video_dir(), file_name_with_timestamp + file_extension) + + with open(video_file_path, "wb") as f: + f.write(uploaded_file.read()) st.session_state['video_origin_path'] = video_file_path params.video_origin_path = video_file_path - time.sleep(1) 
- st.rerun() + st.session_state['uploaded_video_path'] = video_file_path + st.session_state['uploaded_video_signature'] = uploaded_signature + st.session_state['video_file_processed'] = True def render_short_generate_options(tr): @@ -336,7 +399,18 @@ def short_drama_summary(tr): st.session_state['subtitle_file_processed'] = False render_fun_asr_transcription(tr) - + + # 名称输入框 + video_theme = st.text_input(tr("短剧名称")) + st.session_state['video_theme'] = video_theme + # 数字输入框 + temperature = st.slider("temperature", 0.0, 2.0, 0.7) + st.session_state['temperature'] = temperature + return video_theme + + +def render_subtitle_upload(tr): + """上传并保存用户提供的 SRT 字幕文件。""" subtitle_file = st.file_uploader( tr("上传字幕文件"), type=["srt"], @@ -401,102 +475,180 @@ def short_drama_summary(tr): except Exception as e: st.error(f"{tr('Upload failed')}: {str(e)}") - # 名称输入框 - video_theme = st.text_input(tr("短剧名称")) - st.session_state['video_theme'] = video_theme - # 数字输入框 - temperature = st.slider("temperature", 0.0, 2.0, 0.7) - st.session_state['temperature'] = temperature - return video_theme - def render_fun_asr_transcription(tr): - """使用阿里百炼 Fun-ASR 从本地音视频转写生成字幕。""" + """使用 Fun-ASR 从本地音视频转写生成字幕。""" def clear_fun_asr_subtitle_state(): st.session_state['subtitle_path'] = None st.session_state['subtitle_content'] = None st.session_state['subtitle_file_processed'] = False - with st.expander(tr("Ali Bailian Fun-ASR Subtitle Transcription"), expanded=False): - st.caption(tr("Fun-ASR upload caption")) - st.markdown( - f"{tr('API Key URL')}: " - "[https://bailian.console.aliyun.com/?tab=model#/api-key]" - "(https://bailian.console.aliyun.com/?tab=model#/api-key)" + from app.services import fun_asr_subtitle + + backend_options = { + tr("Local FunASR-Pack API"): "local", + tr("Ali Bailian Online Fun-ASR"): "bailian", + tr("上传字幕文件"): "upload", + } + saved_backend = str(config.fun_asr.get("backend", "")).strip().lower() + if saved_backend not in {"local", "bailian", "upload"}: + saved_backend = ( 
+ "bailian" + if config.fun_asr.get("api_key") and not config.fun_asr.get("api_url") + else "local" ) - api_key = st.text_input( - tr("Ali Bailian API Key"), - value=config.fun_asr.get("api_key", ""), - type="password", - help=tr("Ali Bailian API Key Help"), - key="fun_asr_api_key", - ) - uploaded_media = st.file_uploader( - tr("Upload media to transcribe"), - type=[ - "aac", "amr", "avi", "flac", "flv", "m4a", "mkv", "mov", - "mp3", "mp4", "mpeg", "ogg", "opus", "wav", "webm", "wma", "wmv", - ], - accept_multiple_files=False, - key="fun_asr_media_uploader", - ) + backend_values = list(backend_options.values()) + backend_labels = list(backend_options.keys()) + backend = saved_backend + api_key = "" + api_url = config.fun_asr.get("api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL) + hotword = config.fun_asr.get("hotword", "") + enable_spk = bool(config.fun_asr.get("enable_spk", False)) + media_path = st.session_state.get('video_origin_path', '') - if st.button(tr("Transcribe subtitles"), key="fun_asr_transcribe"): - if not api_key.strip(): - clear_fun_asr_subtitle_state() - st.error(tr("Please enter Ali Bailian API Key")) - return - if uploaded_media is None: - clear_fun_asr_subtitle_state() - st.error(tr("Please upload media to transcribe")) - return + subtitle_cols = st.columns([3, 2], vertical_alignment="top") - try: - clear_fun_asr_subtitle_state() - from app.services import fun_asr_subtitle + with subtitle_cols[0]: + with st.expander(tr("Ali Bailian Fun-ASR Subtitle Transcription"), expanded=False): + backend_label = st.radio( + tr("Subtitle Processing Method"), + options=backend_labels, + index=backend_values.index(saved_backend), + horizontal=True, + key="fun_asr_backend", + ) + backend = backend_options[backend_label] - config.fun_asr["api_key"] = api_key.strip() - config.fun_asr["model"] = "fun-asr" - config.save_config() + if backend == "upload": + render_subtitle_upload(tr) + elif backend == "local": + st.caption(tr("Local Fun-ASR upload caption")) + 
api_url = st.text_input( + tr("Local FunASR-Pack API URL"), + value=api_url, + help=tr("Local FunASR-Pack API URL Help"), + key="fun_asr_api_url", + ) + hotword = st.text_input( + tr("Fun-ASR Hotword"), + value=hotword, + help=tr("Fun-ASR Hotword Help"), + key="fun_asr_hotword", + ) + enable_spk = st.checkbox( + tr("Enable speaker diarization"), + value=enable_spk, + help=tr("Enable speaker diarization Help"), + key="fun_asr_enable_spk", + ) + else: + st.caption(tr("Fun-ASR upload caption")) + st.markdown( + f"{tr('API Key URL')}: " + "[https://bailian.console.aliyun.com/?tab=model#/api-key]" + "(https://bailian.console.aliyun.com/?tab=model#/api-key)" + ) - temp_dir = utils.temp_dir("fun_asr") - safe_filename = os.path.basename(uploaded_media.name) - media_path = os.path.join(temp_dir, safe_filename) - file_name, file_extension = os.path.splitext(safe_filename) - if os.path.exists(media_path): - timestamp = time.strftime("%Y%m%d%H%M%S") - media_path = os.path.join(temp_dir, f"{file_name}_{timestamp}{file_extension}") + api_key = st.text_input( + tr("Ali Bailian API Key"), + value=config.fun_asr.get("api_key", ""), + type="password", + help=tr("Ali Bailian API Key Help"), + key="fun_asr_api_key", + ) - with open(media_path, "wb") as f: - f.write(uploaded_media.getbuffer()) - - subtitle_name = f"{os.path.splitext(os.path.basename(media_path))[0]}_fun_asr.srt" - subtitle_path = os.path.join(utils.subtitle_dir(), subtitle_name) - - with st.spinner(tr("Transcribing with Fun-ASR...")): - generated_path = fun_asr_subtitle.create_with_fun_asr( - local_file=media_path, - subtitle_file=subtitle_path, - api_key=api_key.strip(), + if backend != "upload": + if media_path: + st.info( + tr("Using selected video for subtitle transcription").format( + file=os.path.basename(media_path) + ) ) + else: + st.warning(tr("Please select or upload a video first")) - if not generated_path or not os.path.exists(generated_path): - clear_fun_asr_subtitle_state() - st.error(tr("Fun-ASR failed 
without subtitle file")) - return + can_transcribe = backend != "upload" and bool(media_path) + with subtitle_cols[1]: + transcribe_clicked = st.button( + tr("Transcribe subtitles"), + key="fun_asr_transcribe", + disabled=not can_transcribe, + use_container_width=True, + ) - with open(generated_path, "r", encoding="utf-8") as f: - subtitle_content = f.read() + if not transcribe_clicked: + return - st.session_state['subtitle_path'] = generated_path - st.session_state['subtitle_content'] = subtitle_content - st.session_state['subtitle_file_processed'] = True - st.success(tr("Subtitle transcription succeeded").format(file=os.path.basename(generated_path))) - except Exception as e: - clear_fun_asr_subtitle_state() - logger.error(f"Fun-ASR 字幕转写失败: {traceback.format_exc()}") - st.error(f"{tr('Fun-ASR transcription failed')}: {str(e)}") + if backend == "bailian" and not api_key.strip(): + clear_fun_asr_subtitle_state() + st.error(tr("Please enter Ali Bailian API Key")) + return + if backend == "local" and not str(api_url).strip(): + clear_fun_asr_subtitle_state() + st.error(tr("Please enter local FunASR-Pack API URL")) + return + if not media_path or not os.path.exists(media_path): + clear_fun_asr_subtitle_state() + st.error(tr("Selected video file does not exist")) + return + + try: + clear_fun_asr_subtitle_state() + + config.fun_asr["backend"] = backend + config.fun_asr["api_url"] = str(api_url).strip() + config.fun_asr["api_key"] = api_key.strip() + config.fun_asr["hotword"] = str(hotword).strip() + config.fun_asr["enable_spk"] = bool(enable_spk) + config.fun_asr["model"] = "fun-asr" + config.save_config() + + subtitle_name = f"{os.path.splitext(os.path.basename(media_path))[0]}_fun_asr.srt" + subtitle_path = os.path.join(utils.subtitle_dir(), subtitle_name) + + spinner_text = ( + tr("Transcribing with local FunASR-Pack...") + if backend == "local" + else tr("Transcribing with Fun-ASR...") + ) + with st.spinner(spinner_text): + if backend == "local": + generated_path = 
fun_asr_subtitle.create_with_local_fun_asr( + local_file=media_path, + subtitle_file=subtitle_path, + api_url=str(api_url).strip(), + hotword=str(hotword).strip(), + enable_spk=bool(enable_spk), + ) + else: + generated_path = fun_asr_subtitle.create_with_fun_asr( + local_file=media_path, + subtitle_file=subtitle_path, + api_key=api_key.strip(), + ) + + if not generated_path or not os.path.exists(generated_path): + clear_fun_asr_subtitle_state() + st.error(tr("Fun-ASR failed without subtitle file")) + return + + with open(generated_path, "r", encoding="utf-8") as f: + subtitle_content = f.read() + + st.session_state['subtitle_path'] = generated_path + st.session_state['subtitle_content'] = subtitle_content + st.session_state['subtitle_file_processed'] = True + success_placeholder = st.empty() + success_placeholder.success( + tr("Subtitle transcription succeeded").format(file=os.path.basename(generated_path)) + ) + time.sleep(3) + success_placeholder.empty() + except Exception as e: + clear_fun_asr_subtitle_state() + logger.error(f"Fun-ASR 字幕转写失败: {traceback.format_exc()}") + st.error(f"{tr('Fun-ASR transcription failed')}: {str(e)}") def render_script_buttons(tr, params): diff --git a/webui/i18n/en.json b/webui/i18n/en.json index bfab52e..cfa122b 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -8,11 +8,11 @@ "Script Files": "Script Files", "Generate Video Script and Keywords": "Click to use AI to generate **Video Script** and **Video Keywords** based on the **subject**", "Auto Detect": "Auto Detect", - "Auto Generate": "Auto Generate", + "Auto Generate": "Frame Analysis", "Video Script": "Video Script (:blue[①Optional, use AI to generate ②Proper punctuation helps in generating subtitles])", "Save Script": "Save Script", "Crop Video": "Crop Video", - "Video File": "Video File (:blue[1️⃣Supports uploading video files (limit 2G) 2️⃣For large files, it is recommended to directly import them into the ./resource/videos directory])", + "Video File": "Video 
File", "Plot Description": "Plot Description (:blue[Can be obtained from https://www.tvmao.com/])", "Generate Video Keywords": "Click to use AI to generate **Video Keywords** based on the **script**", "Please Enter the Video Subject": "Please enter the video script first", @@ -84,6 +84,13 @@ "Synthesizing Voice": "Synthesizing voice, please wait...", "TTS Provider": "TTS Provider", "Hide Log": "Hide Log", + "Select from resource directory": "Select from resource directory", + "Select a video from resource videos directory": "Select a video from the ./resource/videos directory", + "Upload a new video file up to 2GB": "Upload a new video file, up to 2GB", + "Select Video": "Select Video", + "Choose a video file": "Choose a video file", + "Upload Video": "Upload Video", + "No video files found in resource videos directory": "No video files found in the ./resource/videos directory", "Upload Local Files": "Upload Local Files", "File Uploaded Successfully": "File Uploaded Successfully", "Frame Interval (seconds)": "Frame Interval (seconds)", @@ -172,8 +179,8 @@ "Batch Size": "Batch Size", "Batch Size (More keyframes consume more tokens)": "Batch Size (smaller batches consume more tokens)", "Short Drama Summary": "Short Drama Summary", - "Video Type": "Video Type", - "Select/Upload Script": "Select/Upload Script", + "Video Type": "Creation Type", + "Select/Upload Script": "Custom Script", "原生Gemini模型连接成功": "Native Gemini model connection succeeded", "原生Gemini模型连接失败": "Native Gemini model connection failed", "OpenAI兼容Gemini代理连接成功": "OpenAI-compatible Gemini proxy connection succeeded", @@ -181,7 +188,7 @@ "Connection failed": "Connection failed", "自定义片段": "Custom Clips", "设置需要生成的短视频片段数量": "Set the number of short video clips to generate", - "上传字幕文件": "Upload Subtitle File", + "上传字幕文件": "Upload SRT", "清除已上传字幕": "Clear Uploaded Subtitle", "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)": "Unable to read the subtitle file. Please check the file encoding. 
Supported encodings: UTF-8, UTF-16, GBK, GB2312.", "字幕文件内容似乎为空,请检查文件": "The subtitle file appears to be empty. Please check the file.", @@ -289,15 +296,31 @@ "Encoding": "Encoding", "Size": "Size", "Characters": "characters", - "Ali Bailian Fun-ASR Subtitle Transcription": "Ali Bailian Fun-ASR Subtitle Transcription", - "Fun-ASR upload caption": "After uploading a local audio/video file, it will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.", + "Ali Bailian Fun-ASR Subtitle Transcription": "Subtitle Processing", + "Subtitle Processing Method": "Subtitle Processing Method", + "Fun-ASR Backend": "Fun-ASR Backend", + "Local FunASR-Pack API": "Local", + "Ali Bailian Online Fun-ASR": "Online", + "Local Fun-ASR upload caption": "The current video above will be converted to SRT subtitles through the locally running FunASR-Pack API.", + "Fun-ASR upload caption": "The current video above will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.", + "Local FunASR-Pack API URL": "Local FunASR-Pack API URL", + "Local FunASR-Pack API URL Help": "For example, http://127.0.0.1:7860. A full /asr endpoint URL is also supported.", + "Fun-ASR Hotword": "Hotword", + "Fun-ASR Hotword Help": "Optional hotwords passed to the local FunASR-Pack API.", + "Enable speaker diarization": "Enable speaker diarization", + "Enable speaker diarization Help": "Requires the local FunASR-Pack service to enable and load the spk model.", "API Key URL": "API Key URL", "Ali Bailian API Key": "Ali Bailian API Key", "Ali Bailian API Key Help": "Enter your Ali Bailian API Key. 
After saving, it will be written to the local config.toml file.", "Upload media to transcribe": "Upload audio/video to transcribe", + "Using selected video for subtitle transcription": "Using current video for subtitle transcription: {file}", + "Please select or upload a video first": "Please select or upload a video file above first", + "Selected video file does not exist": "The selected video file does not exist. Please select or upload it again", "Transcribe subtitles": "Transcribe Subtitles", "Please enter Ali Bailian API Key": "Please enter the Ali Bailian API Key first", + "Please enter local FunASR-Pack API URL": "Please enter the local FunASR-Pack API URL first", "Please upload media to transcribe": "Please upload the audio or video file to transcribe first", + "Transcribing with local FunASR-Pack...": "Transcribing subtitles with local FunASR-Pack, please wait...", "Transcribing with Fun-ASR...": "Transcribing subtitles with Ali Bailian Fun-ASR, please wait...", "Fun-ASR failed without subtitle file": "Fun-ASR transcription failed: no subtitle file was generated", "Subtitle transcription succeeded": "Subtitle transcription succeeded: {file}", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 4c732cc..602b616 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -11,7 +11,7 @@ "Video Theme": "视频主题", "Generation Prompt": "自定义提示词", "Save Script": "保存脚本", - "Video File": "视频文件(:blue[1️⃣支持上传视频文件(限制2G) 2️⃣大文件建议直接导入 ./resource/videos 目录])", + "Video File": "视频文件", "Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])", "Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键】", "Please Enter the Video Subject": "请先填写视频文案", @@ -80,6 +80,13 @@ "Synthesizing Voice": "语音合成中,请稍候...", "TTS Provider": "语音合成提供商", "Hide Log": "隐藏日志", + "Select from resource directory": "从资源目录选择", + "Select a video from resource videos directory": "选择 ./resource/videos 目录中的视频", + "Upload a new video file up to 2GB": "上传一个新的视频文件,限制 2GB", + "Select Video": "选择视频", + "Choose 
a video file": "选择一个视频文件", + "Upload Video": "上传视频", + "No video files found in resource videos directory": "未在 ./resource/videos 目录中找到视频文件", "Upload Local Files": "上传本地文件", "File Uploaded Successfully": "文件上传成功", "timestamp": "时间戳", @@ -156,14 +163,14 @@ "Generate Short Video Script": "AI生成短剧混剪脚本", "Adjust the volume of the original audio": "调整原始音频的音量", "Original Volume": "视频音量", - "Auto Generate": "逐帧解说", + "Auto Generate": "逐帧分析", "Frame Interval (seconds)": "帧间隔 (秒)", "Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)", "Batch Size": "批处理大小", "Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多", "Short Drama Summary": "短剧解说", - "Video Type": "视频类型", - "Select/Upload Script": "选择/上传脚本", + "Video Type": "创作类型", + "Select/Upload Script": "自定义脚本", "Script loaded successfully": "脚本加载成功", "Failed to load script": "加载脚本失败", "Failed to save script": "保存脚本失败", @@ -271,15 +278,31 @@ "Encoding": "编码", "Size": "大小", "Characters": "字符", - "Ali Bailian Fun-ASR Subtitle Transcription": "阿里百炼 Fun-ASR 字幕转录", - "Fun-ASR upload caption": "上传本地音频/视频后,将自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。", + "Ali Bailian Fun-ASR Subtitle Transcription": "字幕处理", + "Subtitle Processing Method": "字幕处理方式", + "Fun-ASR Backend": "Fun-ASR 后端", + "Local FunASR-Pack API": "本地转写", + "Ali Bailian Online Fun-ASR": "在线转写", + "Local Fun-ASR upload caption": "将使用上方当前视频,通过本机运行的 FunASR-Pack API 生成 SRT 字幕。", + "Fun-ASR upload caption": "将使用上方当前视频,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。", + "Local FunASR-Pack API URL": "本地 FunASR-Pack API 地址", + "Local FunASR-Pack API URL Help": "例如 http://127.0.0.1:7860;也可以直接填到 /asr 的完整地址。", + "Fun-ASR Hotword": "热词", + "Fun-ASR Hotword Help": "可选,传给本地 FunASR-Pack 的热词参数。", + "Enable speaker diarization": "启用说话人分段", + "Enable speaker diarization Help": "需要本地 FunASR-Pack 已启用并加载 spk 模型。", "API Key URL": "API Key 获取地址", "Ali Bailian API Key": "阿里百炼 API Key", "Ali Bailian API Key Help": "请输入你自己的阿里百炼 API Key;保存配置后会写入本地 
config.toml", "Upload media to transcribe": "上传需要转录的音频/视频", + "Using selected video for subtitle transcription": "将使用当前视频生成字幕: {file}", + "Please select or upload a video first": "请先在上方选择或上传视频文件", + "Selected video file does not exist": "当前视频文件不存在,请重新选择或上传", "Transcribe subtitles": "转写生成字幕", "Please enter Ali Bailian API Key": "请先输入阿里百炼 API Key", + "Please enter local FunASR-Pack API URL": "请先输入本地 FunASR-Pack API 地址", "Please upload media to transcribe": "请先上传需要转录的音频或视频文件", + "Transcribing with local FunASR-Pack...": "正在使用本地 FunASR-Pack 转写字幕,请稍候...", "Transcribing with Fun-ASR...": "正在使用阿里百炼 Fun-ASR 转写字幕,请稍候...", "Fun-ASR failed without subtitle file": "Fun-ASR 转写失败:未生成字幕文件", "Subtitle transcription succeeded": "字幕转写成功: {file}", @@ -357,7 +380,7 @@ "Voice synthesis successful": "✅ 语音合成成功!", "Voice synthesis failed": "❌ 语音合成失败,请检查配置", "SoulVoice pitch not supported": "ℹ️ SoulVoice 引擎不支持音调调节", - "上传字幕文件": "上传字幕文件", + "上传字幕文件": "上传字幕", "清除已上传字幕": "清除已上传字幕", "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)": "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)", "字幕文件内容似乎为空,请检查文件": "字幕文件内容似乎为空,请检查文件", From e6d15fe24699936778320fad84343afc8db59e5a Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 19:31:35 +0800 Subject: [PATCH 06/24] =?UTF-8?q?feat(webui):=20=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E7=9F=AD=E5=89=A7=E5=89=A7=E6=83=85=E5=88=86=E6=9E=90=E3=80=81?= =?UTF-8?q?=E5=8F=AF=E8=A7=86=E5=8C=96=E8=84=9A=E6=9C=AC=E7=BC=96=E8=BE=91?= =?UTF-8?q?=E5=99=A8=E4=B8=8E=E9=80=9A=E7=94=A8=E7=94=9F=E6=88=90=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 抽离通用生成参数设置组件,统一管理temperature等LLM生成参数 - 新增短剧字幕剧情分析功能,支持一键分析与手动编辑分析结果 - 重构短剧脚本生成逻辑,支持传入预先生成的剧情分析内容 - 新增可视化视频脚本表格编辑器,支持增删编辑行与原始JSON预览 - 优化多语言翻译、UI交互细节与字幕相关提示文案 --- webui/components/basic_settings.py | 11 ++ webui/components/script_settings.py | 265 +++++++++++++++++++++++--- webui/i18n/en.json | 19 +- webui/i18n/zh.json | 17 ++ 
webui/tools/generate_short_summary.py | 97 ++++++++-- 5 files changed, 361 insertions(+), 48 deletions(-) diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index 842a500..39d9904 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -167,6 +167,17 @@ def render_basic_settings(tr): with right_config_panel: render_text_llm_settings(tr) # 文案生成模型设置 + render_generation_settings(tr) + + +def render_generation_settings(tr): + """渲染通用生成参数。""" + st.divider() + st.subheader(tr("Generation Settings")) + if 'temperature' not in st.session_state: + st.session_state['temperature'] = 0.7 + st.slider("temperature", 0.0, 2.0, key="temperature") + def render_language_settings(tr): st.subheader(tr("Proxy Settings")) diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 94a8910..3c91c44 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -1,18 +1,23 @@ import os import glob import json +import math import time import traceback +import pandas as pd import streamlit as st from loguru import logger from app.config import config from app.models.schema import VideoClipParams -from app.services.subtitle_text import decode_subtitle_bytes +from app.services.subtitle_text import decode_subtitle_bytes, read_subtitle_text from app.utils import utils, check_script from webui.tools.generate_script_docu import generate_script_docu from webui.tools.generate_script_short import generate_script_short -from webui.tools.generate_short_summary import generate_script_short_sunmmary +from webui.tools.generate_short_summary import analyze_short_drama_plot, generate_script_short_sunmmary + + +SCRIPT_TABLE_BASE_COLUMNS = ["_id", "timestamp", "picture", "narration", "OST"] def render_script_panel(tr): @@ -77,8 +82,10 @@ def render_script_file(tr, params): default_index = mode_keys.index(tr("Short Generate")) elif current_path == "summary": default_index = 
mode_keys.index(tr("Short Drama Summary")) - else: + elif current_path: default_index = mode_keys.index(tr("Select/Upload Script")) + else: + default_index = 0 # 1. 渲染功能选择组件 default_mode_label = mode_keys[default_index] @@ -230,16 +237,21 @@ def render_script_file(tr, params): def render_video_file(tr, params): """渲染视频文件选择""" source_options = { - tr("Select from resource directory"): "resource", tr("Upload Local Files"): "upload", + tr("Select from resource directory"): "resource", } source_labels = list(source_options.keys()) default_source_label = source_labels[0] + source_default_version = "upload_first_v1" - if ( - 'video_source_selection' not in st.session_state - or st.session_state['video_source_selection'] not in source_options - ): + if st.session_state.get('_video_source_default_version') != source_default_version: + if ( + st.session_state.get('video_source_selection') not in source_options + or not st.session_state.get('video_origin_path') + ): + st.session_state['video_source_selection'] = default_source_label + st.session_state['_video_source_default_version'] = source_default_version + elif st.session_state.get('video_source_selection') not in source_options: st.session_state['video_source_selection'] = default_source_label current_source = st.session_state['video_source_selection'] @@ -250,12 +262,13 @@ def render_video_file(tr, params): ) st.markdown(f"**{tr('Video Source')}** :gray[{source_caption}]") - source = st.selectbox( + source = st.pills( tr("Video Source"), options=source_labels, - index=None, + selection_mode="single", key="video_source_selection", label_visibility="collapsed", + width="stretch", ) if not source: source = default_source_label @@ -399,23 +412,84 @@ def short_drama_summary(tr): st.session_state['subtitle_file_processed'] = False render_fun_asr_transcription(tr) + render_subtitle_preview(tr) - # 名称输入框 - video_theme = st.text_input(tr("短剧名称")) + current_subtitle_path = st.session_state.get('subtitle_path', '') + 
plot_analysis_source = st.session_state.get('short_drama_plot_analysis_subtitle_path') + if plot_analysis_source and plot_analysis_source != current_subtitle_path: + st.session_state['short_drama_plot_analysis'] = "" + st.session_state['short_drama_plot_analysis_subtitle_path'] = "" + + name_cols = st.columns([4, 1.2], vertical_alignment="bottom") + with name_cols[0]: + video_theme = st.text_input(tr("短剧名称")) + with name_cols[1]: + analyze_plot_clicked = st.button( + tr("剧情理解"), + key="short_drama_plot_analysis_button", + disabled=not current_subtitle_path, + use_container_width=True, + ) st.session_state['video_theme'] = video_theme - # 数字输入框 - temperature = st.slider("temperature", 0.0, 2.0, 0.7) - st.session_state['temperature'] = temperature + + if analyze_plot_clicked: + with st.spinner(tr("Analyzing plot...")): + plot_analysis = analyze_short_drama_plot( + current_subtitle_path, + st.session_state.get('temperature', 0.7), + tr, + subtitle_content=st.session_state.get('subtitle_content', ''), + ) + if plot_analysis: + st.session_state['short_drama_plot_analysis'] = plot_analysis + st.session_state['short_drama_plot_analysis_subtitle_path'] = current_subtitle_path + st.success(tr("Plot analysis completed")) + + if st.session_state.get('short_drama_plot_analysis'): + st.text_area( + tr("剧情理解结果"), + key="short_drama_plot_analysis", + height=240, + ) + return video_theme +def render_subtitle_preview(tr): + """渲染可折叠的当前字幕预览;没有字幕时提示用户先转写或上传。""" + subtitle_path = st.session_state.get('subtitle_path', '') + subtitle_content = st.session_state.get('subtitle_content', '') + + if subtitle_path and not subtitle_content and os.path.exists(subtitle_path): + subtitle_content = read_subtitle_text(subtitle_path).text + st.session_state['subtitle_content'] = subtitle_content + + with st.expander(tr("Subtitle Preview"), expanded=False): + if not subtitle_path or not subtitle_content: + st.info(tr("Please transcribe or upload subtitles first")) + return + + st.text_area( + 
tr("Subtitle Preview"), + key="subtitle_content", + height=180, + label_visibility="collapsed", + ) + + def render_subtitle_upload(tr): """上传并保存用户提供的 SRT 字幕文件。""" + subtitle_dir_label = utils.subtitle_dir().replace(config.root_dir, ".") + st.markdown( + f"**{tr('上传字幕文件')}** " + f":gray[{tr('Transcribed subtitles storage hint').format(path=subtitle_dir_label)}]" + ) subtitle_file = st.file_uploader( tr("上传字幕文件"), type=["srt"], accept_multiple_files=False, - key="subtitle_file_uploader" # 添加唯一key + key="subtitle_file_uploader", # 添加唯一key + label_visibility="collapsed", ) # 显示当前已上传的字幕文件路径 @@ -476,6 +550,141 @@ def render_subtitle_upload(tr): st.error(f"{tr('Upload failed')}: {str(e)}") +def _is_blank_table_value(value): + if value is None: + return True + if isinstance(value, float) and math.isnan(value): + return True + if isinstance(value, str) and not value.strip(): + return True + return False + + +def _ordered_script_columns(script_rows): + columns = [] + for column in SCRIPT_TABLE_BASE_COLUMNS: + columns.append(column) + + for row in script_rows: + if not isinstance(row, dict): + continue + for column in row.keys(): + if column not in columns: + columns.append(column) + + return columns + + +def _script_json_to_table(script_data): + if not isinstance(script_data, list): + script_data = [] + + if not script_data: + return pd.DataFrame(columns=SCRIPT_TABLE_BASE_COLUMNS) + + if not all(isinstance(item, dict) for item in script_data): + rows = [ + {"value": json.dumps(item, ensure_ascii=False)} + for item in script_data + ] + return pd.DataFrame(rows, columns=["value"]) + + columns = _ordered_script_columns(script_data) + return pd.DataFrame(script_data, columns=columns) + + +def _normalize_script_table_value(column, value): + if _is_blank_table_value(value): + return "" + + if column in {"_id", "OST"}: + try: + return int(value) + except (TypeError, ValueError): + return value + + return value + + +def _script_table_to_json(edited_data): + if 
isinstance(edited_data, pd.DataFrame): + records = edited_data.to_dict("records") + elif isinstance(edited_data, list): + records = edited_data + else: + records = pd.DataFrame(edited_data).to_dict("records") + + script_data = [] + for row in records: + if not isinstance(row, dict): + continue + if all(_is_blank_table_value(value) for value in row.values()): + continue + + cleaned_row = {} + for column, value in row.items(): + if not column: + continue + normalized_value = _normalize_script_table_value(column, value) + if _is_blank_table_value(normalized_value) and column not in SCRIPT_TABLE_BASE_COLUMNS: + continue + cleaned_row[column] = normalized_value + + if cleaned_row: + script_data.append(cleaned_row) + + return json.dumps(script_data, indent=2, ensure_ascii=False) + + +def render_video_script_editor(tr): + """使用弹窗和表格编辑视频脚本 JSON。""" + @st.dialog(tr("Video Script"), width="large") + def video_script_dialog(): + script_data = st.session_state.get('video_clip_json', []) + table_data = _script_json_to_table(script_data) + column_order = list(table_data.columns) + + st.caption(tr("Video script table help")) + edited_table = st.data_editor( + table_data, + key="video_script_table_editor", + hide_index=True, + num_rows="dynamic", + use_container_width=True, + height=520, + row_height=72, + column_order=column_order, + column_config={ + "_id": st.column_config.NumberColumn(tr("Script Column ID"), step=1, format="%d", width=52), + "timestamp": st.column_config.TextColumn(tr("Script Column Timestamp"), width=200), + "picture": st.column_config.TextColumn(tr("Script Column Picture"), width=320), + "narration": st.column_config.TextColumn(tr("Script Column Narration"), width=480), + "OST": st.column_config.NumberColumn( + tr("Script Column OST"), + min_value=0, + max_value=2, + step=1, + format="%d", + width=52, + ), + }, + ) + + video_clip_json_details = _script_table_to_json(edited_table) + with st.expander(tr("Raw JSON Preview"), expanded=False): + 
st.code(video_clip_json_details, language="json") + + if st.button(tr("Save Script"), key="save_script_from_dialog", use_container_width=True): + save_script_with_validation(tr, video_clip_json_details) + + script_data = st.session_state.get('video_clip_json', []) + script_count = len(script_data) if isinstance(script_data, list) else 0 + st.markdown(f"**{tr('Video Script')}** :gray[{tr('Video script row count').format(count=script_count)}]") + + if st.button(tr("Edit Video Script"), key="open_video_script_editor", use_container_width=True): + video_script_dialog() + + def render_fun_asr_transcription(tr): """使用 Fun-ASR 从本地音视频转写生成字幕。""" def clear_fun_asr_subtitle_state(): @@ -681,20 +890,22 @@ def render_script_buttons(tr, params): subtitle_path = st.session_state.get('subtitle_path') video_theme = st.session_state.get('video_theme') temperature = st.session_state.get('temperature') - generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature, tr) + plot_analysis = "" + if st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path: + plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + generate_script_short_sunmmary( + params, + subtitle_path, + video_theme, + temperature, + tr, + plot_analysis=plot_analysis, + subtitle_content=st.session_state.get('subtitle_content', ''), + ) else: load_script(tr, script_path) - # 视频脚本编辑区 - video_clip_json_details = st.text_area( - tr("Video Script"), - value=json.dumps(st.session_state.get('video_clip_json', []), indent=2, ensure_ascii=False), - height=500 - ) - - # 操作按钮行 - 合并格式检查和保存功能 - if st.button(tr("Save Script"), key="save_script", use_container_width=True): - save_script_with_validation(tr, video_clip_json_details) + render_video_script_editor(tr) def load_script(tr, script_path): diff --git a/webui/i18n/en.json b/webui/i18n/en.json index cfa122b..dbce928 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -9,7 +9,17 @@ "Generate Video Script and 
Keywords": "Click to use AI to generate **Video Script** and **Video Keywords** based on the **subject**", "Auto Detect": "Auto Detect", "Auto Generate": "Frame Analysis", - "Video Script": "Video Script (:blue[①Optional, use AI to generate ②Proper punctuation helps in generating subtitles])", + "Video Script": "Video Script", + "Edit Video Script": "View/Edit Video Script", + "Video script row count": "{count} script rows", + "Video script table help": "Edit the full script JSON as a table. You can add or delete rows; saving will validate and write the script file again.", + "Raw JSON Preview": "Raw JSON Preview", + "Script Column ID": "ID", + "Script Column Timestamp": "Timestamp", + "Script Column Picture": "Picture", + "Script Column Narration": "Narration", + "Script Column OST": "Mark", + "Generation Settings": "Generation Settings", "Save Script": "Save Script", "Crop Video": "Crop Video", "Video File": "Video File", @@ -324,6 +334,13 @@ "Transcribing with Fun-ASR...": "Transcribing subtitles with Ali Bailian Fun-ASR, please wait...", "Fun-ASR failed without subtitle file": "Fun-ASR transcription failed: no subtitle file was generated", "Subtitle transcription succeeded": "Subtitle transcription succeeded: {file}", + "Transcribed subtitles storage hint": "Previously transcribed subtitles are saved in {path}; drag a file from that folder to upload", + "剧情理解": "Plot Analysis", + "剧情理解结果": "Plot Analysis Result", + "Analyzing plot...": "Analyzing plot...", + "Plot analysis completed": "Plot analysis completed", + "Please generate or upload subtitles first": "Please transcribe or upload subtitles first", + "Please transcribe or upload subtitles first": "Please transcribe or upload subtitles first", "Fun-ASR transcription failed": "Fun-ASR transcription failed", "Validating script format...": "Validating script format...", "Script format validation failed": "Script format validation failed", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 
602b616..33eb74a 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -10,6 +10,7 @@ "Auto Detect": "自动检测", "Video Theme": "视频主题", "Generation Prompt": "自定义提示词", + "Generation Settings": "生成参数", "Save Script": "保存脚本", "Video File": "视频文件", "Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])", @@ -104,6 +105,15 @@ "Failed to Save Script": "保存脚本失败", "Script saved successfully": "脚本保存成功", "Video Script": "视频脚本", + "Edit Video Script": "查看/编辑视频脚本", + "Video script row count": "共 {count} 条脚本", + "Video script table help": "在表格中编辑完整脚本 JSON。可新增、删除行;保存时会重新校验并写入脚本文件。", + "Raw JSON Preview": "原始 JSON 预览", + "Script Column ID": "序号", + "Script Column Timestamp": "时间戳", + "Script Column Picture": "画面描述", + "Script Column Narration": "解说台词", + "Script Column OST": "标记", "Video Quality": "视频质量", "Custom prompt for LLM, leave empty to use default prompt": "自定义提示词,留空则使用默认提示词", "Proxy Settings": "代理设置", @@ -306,6 +316,13 @@ "Transcribing with Fun-ASR...": "正在使用阿里百炼 Fun-ASR 转写字幕,请稍候...", "Fun-ASR failed without subtitle file": "Fun-ASR 转写失败:未生成字幕文件", "Subtitle transcription succeeded": "字幕转写成功: {file}", + "Transcribed subtitles storage hint": "之前转录生成的字幕保存在 {path},可从该目录拖入上传", + "剧情理解": "剧情理解", + "剧情理解结果": "剧情理解结果", + "Analyzing plot...": "正在理解剧情...", + "Plot analysis completed": "剧情理解完成", + "Please generate or upload subtitles first": "请先转写或上传字幕", + "Please transcribe or upload subtitles first": "请先转写或上传字幕", "Fun-ASR transcription failed": "Fun-ASR 字幕转写失败", "Validating script format...": "正在验证脚本格式...", "Script format validation failed": "脚本格式验证失败", diff --git a/webui/tools/generate_short_summary.py b/webui/tools/generate_short_summary.py index fe2d223..eb42361 100644 --- a/webui/tools/generate_short_summary.py +++ b/webui/tools/generate_short_summary.py @@ -135,7 +135,58 @@ def parse_and_fix_json(json_string): return None -def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature, tr=lambda key: key): +def 
analyze_short_drama_plot(subtitle_path, temperature, tr=lambda key: key, subtitle_content=None): + """仅执行短剧字幕剧情理解,返回可编辑的剧情分析文本。""" + if not subtitle_path: + st.error(tr("Please generate or upload subtitles first")) + return None + if not os.path.exists(subtitle_path): + st.error(tr("Subtitle file does not exist")) + return None + + text_provider = config.app.get('text_llm_provider', 'gemini').lower() + text_api_key = config.app.get(f'text_{text_provider}_api_key') + text_model = config.app.get(f'text_{text_provider}_model_name') + text_base_url = config.app.get(f'text_{text_provider}_base_url') + + subtitle_content = str(subtitle_content or "").strip() or read_subtitle_text(subtitle_path).text + if not subtitle_content: + st.error(tr("Subtitle file is empty or unreadable")) + return None + + try: + logger.info("使用新的LLM服务架构进行字幕分析") + analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) + analysis_result = analyzer.analyze_subtitle(subtitle_content) + except Exception as e: + logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") + analysis_result = analyze_subtitle( + subtitle_content=subtitle_content, + api_key=text_api_key, + model=text_model, + base_url=text_base_url, + save_result=True, + temperature=temperature, + provider=text_provider + ) + + if analysis_result["status"] != "success": + logger.error(f"分析失败: {analysis_result['message']}") + st.error(tr("Script generation failed check logs")) + return None + + return analysis_result["analysis"] + + +def generate_script_short_sunmmary( + params, + subtitle_path, + video_theme, + temperature, + tr=lambda key: key, + plot_analysis=None, + subtitle_content=None, +): """ 生成 短剧解说 视频脚本 要求: 提供高质量短剧字幕 @@ -174,30 +225,36 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu text_base_url = config.app.get(f'text_{text_provider}_base_url') # 读取字幕文件内容(无论使用哪种实现都需要) - subtitle_content = read_subtitle_text(subtitle_path).text + subtitle_content = str(subtitle_content 
or "").strip() or read_subtitle_text(subtitle_path).text if not subtitle_content: st.error(tr("Subtitle file is empty or unreadable")) return - try: - # 优先使用新的LLM服务架构 - logger.info("使用新的LLM服务架构进行字幕分析") - analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) + analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) + if plot_analysis and str(plot_analysis).strip(): + logger.info("使用用户编辑后的剧情理解结果生成解说文案") + analysis_result = { + "status": "success", + "analysis": str(plot_analysis).strip(), + } + else: + try: + # 优先使用新的LLM服务架构 + logger.info("使用新的LLM服务架构进行字幕分析") + analysis_result = analyzer.analyze_subtitle(subtitle_content) - analysis_result = analyzer.analyze_subtitle(subtitle_content) - - except Exception as e: - logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") - # 回退到旧的实现 - analysis_result = analyze_subtitle( - subtitle_file_path=subtitle_path, - api_key=text_api_key, - model=text_model, - base_url=text_base_url, - save_result=True, - temperature=temperature, - provider=text_provider - ) + except Exception as e: + logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") + # 回退到旧的实现 + analysis_result = analyze_subtitle( + subtitle_content=subtitle_content, + api_key=text_api_key, + model=text_model, + base_url=text_base_url, + save_result=True, + temperature=temperature, + provider=text_provider + ) """ 3. 
根据剧情生成解说文案 """ From 0bd001ce332638f80fe02bf59046989d74da76d6 Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 23:15:11 +0800 Subject: [PATCH 07/24] =?UTF-8?q?feat(webui,=20llm,=20subtitle):=20?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=AD=97=E5=B9=95=E6=A0=A1=E5=87=86=E3=80=81?= =?UTF-8?q?=E5=A4=9A=E8=A7=86=E9=A2=91=E6=94=AF=E6=8C=81=E4=B8=8ELLM?= =?UTF-8?q?=E7=94=9F=E6=88=90=E5=8F=82=E6=95=B0=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加字幕校准服务,支持通过LLM校对SRT格式字幕文件,支持批量处理 - 为视频参数模型新增video_origin_paths字段,支持多视频上传与批量处理 - 为OpenAI兼容LLM提供商添加temperature、top_p、max_tokens和thinking_level参数配置支持 - 重构WebUI模型设置页面,将通用生成参数配置拆分到各模型的独立配置项中 - 更新示例配置文件与默认配置,新增对应参数的默认值 - 完善多语言国际化文案,添加批量操作与字幕校准相关翻译 - 添加相关单元测试以覆盖新功能与配置项 --- app/config/defaults.py | 16 + app/config/test_config_bootstrap_unittest.py | 4 + app/models/schema.py | 2 +- .../llm/openai_compatible_provider.py | 70 ++- .../llm/test_openai_compat_unittest.py | 55 ++- app/services/subtitle_corrector.py | 231 ++++++++++ .../test_subtitle_corrector_unittest.py | 100 +++++ config.example.toml | 8 + webui/components/basic_settings.py | 158 +++++-- webui/components/script_settings.py | 398 ++++++++++++++---- webui/i18n/en.json | 35 +- webui/i18n/zh.json | 49 ++- 12 files changed, 983 insertions(+), 143 deletions(-) create mode 100644 app/services/subtitle_corrector.py create mode 100644 app/services/test_subtitle_corrector_unittest.py diff --git a/app/config/defaults.py b/app/config/defaults.py index 9a686f2..a001978 100644 --- a/app/config/defaults.py +++ b/app/config/defaults.py @@ -11,6 +11,21 @@ DEFAULT_VISION_OPENAI_MODEL_NAME = "Qwen/Qwen3.5-122B-A10B" DEFAULT_TEXT_LLM_PROVIDER = DEFAULT_OPENAI_COMPATIBLE_PROVIDER DEFAULT_TEXT_OPENAI_MODEL_NAME = "Pro/zai-org/GLM-5" +DEFAULT_LLM_GENERATION_CONFIG = { + "temperature": 1.0, + "top_p": 0.95, + "max_tokens": 65536, + "thinking_level": "auto", +} + +DEFAULT_LLM_THINKING_LEVELS = ["auto", "off", "low", "medium", "high"] 
+ +DEFAULT_LLM_GENERATION_APP_CONFIG = { + f"{model_type}_openai_{param_name}": value + for model_type in ("vision", "text") + for param_name, value in DEFAULT_LLM_GENERATION_CONFIG.items() +} + DEFAULT_LLM_APP_CONFIG = { "vision_llm_provider": DEFAULT_VISION_LLM_PROVIDER, "vision_openai_model_name": DEFAULT_VISION_OPENAI_MODEL_NAME, @@ -21,6 +36,7 @@ DEFAULT_LLM_APP_CONFIG = { "text_openai_api_key": "", "text_openai_base_url": DEFAULT_OPENAI_COMPATIBLE_BASE_URL, } +DEFAULT_LLM_APP_CONFIG.update(DEFAULT_LLM_GENERATION_APP_CONFIG) def build_default_app_config(app_config: dict | None = None) -> dict: diff --git a/app/config/test_config_bootstrap_unittest.py b/app/config/test_config_bootstrap_unittest.py index 720a934..8398fea 100644 --- a/app/config/test_config_bootstrap_unittest.py +++ b/app/config/test_config_bootstrap_unittest.py @@ -53,9 +53,13 @@ hide_config = true self.assertEqual("openai", config_data["app"]["vision_llm_provider"]) self.assertEqual("Qwen/Qwen3.5-122B-A10B", config_data["app"]["vision_openai_model_name"]) self.assertEqual("https://api.siliconflow.cn/v1", config_data["app"]["vision_openai_base_url"]) + self.assertEqual(1.0, config_data["app"]["vision_openai_temperature"]) + self.assertEqual(0.95, config_data["app"]["vision_openai_top_p"]) self.assertEqual("openai", config_data["app"]["text_llm_provider"]) self.assertEqual("Pro/zai-org/GLM-5", config_data["app"]["text_openai_model_name"]) self.assertEqual("https://api.siliconflow.cn/v1", config_data["app"]["text_openai_base_url"]) + self.assertEqual(1.0, config_data["app"]["text_openai_temperature"]) + self.assertEqual(0.95, config_data["app"]["text_openai_top_p"]) self.assertEqual("Qwen/Qwen3.5-122B-A10B", saved_config["app"]["vision_openai_model_name"]) self.assertEqual("Pro/zai-org/GLM-5", saved_config["app"]["text_openai_model_name"]) self.assertTrue(saved_config["app"]["hide_config"]) diff --git a/app/models/schema.py b/app/models/schema.py index d22d03d..a41b1e1 100644 --- 
a/app/models/schema.py +++ b/app/models/schema.py @@ -164,6 +164,7 @@ class VideoClipParams(BaseModel): video_clip_json: Optional[list] = Field(default=[], description="LLM 生成的视频剪辑脚本内容") video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径") video_origin_path: Optional[str] = Field(default="", description="原视频路径") + video_origin_paths: Optional[List[str]] = Field(default=[], description="原视频路径列表") video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例") video_language: Optional[str] = Field(default="zh-CN", description="视频语言") @@ -206,4 +207,3 @@ class SubtitlePosition(str, Enum): TOP = "top" CENTER = "center" BOTTOM = "bottom" - diff --git a/app/services/llm/openai_compatible_provider.py b/app/services/llm/openai_compatible_provider.py index b91c6dc..9a2b183 100644 --- a/app/services/llm/openai_compatible_provider.py +++ b/app/services/llm/openai_compatible_provider.py @@ -22,7 +22,7 @@ from openai import ( ) from app.config import config -from app.config.defaults import normalize_openai_compatible_model_name +from app.config.defaults import DEFAULT_LLM_GENERATION_CONFIG, normalize_openai_compatible_model_name from .base import TextModelProvider, VisionModelProvider from .exceptions import APICallError, AuthenticationError, ContentFilterError, RateLimitError @@ -68,18 +68,59 @@ class _OpenAICompatibleBase: # SDK client 按请求参数动态构建,这里无需初始化全局状态。 pass + def _generation_config_value(self, model_type: str, param_name: str, override: Any = None) -> Any: + if override is not None: + return override + return config.app.get( + f"{model_type}_openai_{param_name}", + DEFAULT_LLM_GENERATION_CONFIG[param_name], + ) + + def _build_chat_completion_options( + self, + model_type: str, + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + **kwargs, + ) -> Dict[str, Any]: + """Build common OpenAI-compatible generation options from config and overrides.""" + options: Dict[str, Any] = { + 
"temperature": float(self._generation_config_value(model_type, "temperature", temperature)), + } + + top_p = float(self._generation_config_value(model_type, "top_p", kwargs.get("top_p"))) + options["top_p"] = top_p + + configured_max_tokens = self._generation_config_value(model_type, "max_tokens", max_tokens) + if configured_max_tokens is not None and int(configured_max_tokens) > 0: + options["max_tokens"] = int(configured_max_tokens) + + extra_body: Dict[str, Any] = {} + + thinking_level = str( + self._generation_config_value(model_type, "thinking_level", kwargs.get("thinking_level")) or "auto" + ) + if thinking_level in {"low", "medium", "high"}: + extra_body["reasoning_effort"] = thinking_level + + if extra_body: + options["extra_body"] = extra_body + + return options + def _build_client( self, api_key_override: Optional[str] = None, base_url_override: Optional[str] = None, timeout_override: Optional[float] = None, + max_retries_override: Optional[int] = None, ) -> AsyncOpenAI: """按请求构建 AsyncOpenAI 客户端,支持动态覆盖 api_key / base_url。""" api_key = api_key_override or self.api_key base_url = base_url_override or self.base_url or None timeout_seconds: float = timeout_override or config.app.get("llm_text_timeout", 180) - max_retries: int = config.app.get("llm_max_retries", 3) + max_retries: int = max_retries_override or config.app.get("llm_max_retries", 3) return AsyncOpenAI( api_key=api_key, @@ -147,11 +188,17 @@ class OpenAICompatibleVisionProvider(_OpenAICompatibleBase, VisionModelProvider) ) try: + generation_overrides = dict(kwargs) + completion_options = self._build_chat_completion_options( + "vision", + temperature=generation_overrides.pop("temperature", None), + max_tokens=generation_overrides.pop("max_tokens", None), + **generation_overrides, + ) response = await client.chat.completions.create( model=model_name, messages=messages, - temperature=kwargs.get("temperature", 1.0), - max_tokens=kwargs.get("max_tokens", 4000), + **completion_options, ) if 
response.choices and response.choices[0].message and response.choices[0].message.content: return response.choices[0].message.content @@ -204,13 +251,22 @@ class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider): timeout_override=config.app.get("llm_text_timeout", 180), ) + temperature_override = kwargs.pop("temperature", None) + if temperature_override is None and temperature != 1.0: + temperature_override = temperature + completion_kwargs: Dict[str, Any] = { "model": model_name, "messages": messages, - "temperature": temperature, } - if max_tokens: - completion_kwargs["max_tokens"] = max_tokens + completion_kwargs.update( + self._build_chat_completion_options( + "text", + temperature=temperature_override, + max_tokens=kwargs.pop("max_tokens", max_tokens), + **kwargs, + ) + ) if response_format == "json": completion_kwargs["response_format"] = {"type": "json_object"} diff --git a/app/services/llm/test_openai_compat_unittest.py b/app/services/llm/test_openai_compat_unittest.py index acef31a..14b3ab1 100644 --- a/app/services/llm/test_openai_compat_unittest.py +++ b/app/services/llm/test_openai_compat_unittest.py @@ -8,7 +8,7 @@ from app.config import config from app.services.llm.base import TextModelProvider from app.services.llm.manager import LLMServiceManager from app.services.llm.migration_adapter import LegacyLLMAdapter, VisionAnalyzerAdapter -from app.services.llm.openai_compatible_provider import OpenAICompatibleVisionProvider +from app.services.llm.openai_compatible_provider import OpenAICompatibleTextProvider, OpenAICompatibleVisionProvider from app.services.llm.providers import register_all_providers @@ -116,6 +116,59 @@ class OpenAICompatVisionConcurrencyTests(unittest.IsolatedAsyncioTestCase): self.assertEqual(2, max_in_flight) +class OpenAICompatGenerationOptionTests(unittest.TestCase): + def setUp(self): + self._original_app = dict(config.app) + + def tearDown(self): + config.app.clear() + config.app.update(self._original_app) + + 
def test_build_options_uses_generation_defaults(self): + provider = OpenAICompatibleTextProvider(api_key="k", model_name="m") + for key in ( + "text_openai_temperature", + "text_openai_top_p", + "text_openai_max_tokens", + "text_openai_thinking_level", + ): + config.app.pop(key, None) + + options = provider._build_chat_completion_options("text") + + self.assertEqual(1.0, options["temperature"]) + self.assertEqual(0.95, options["top_p"]) + self.assertEqual(65536, options["max_tokens"]) + self.assertNotIn("extra_body", options) + + def test_build_options_uses_per_model_generation_config(self): + provider = OpenAICompatibleTextProvider(api_key="k", model_name="m") + config.app.update( + { + "text_openai_temperature": 0.3, + "text_openai_top_p": 0.8, + "text_openai_max_tokens": 2048, + "text_openai_thinking_level": "high", + } + ) + + options = provider._build_chat_completion_options("text") + + self.assertEqual(0.3, options["temperature"]) + self.assertEqual(0.8, options["top_p"]) + self.assertEqual(2048, options["max_tokens"]) + self.assertEqual({"reasoning_effort": "high"}, options["extra_body"]) + + def test_explicit_generation_options_override_config(self): + provider = OpenAICompatibleTextProvider(api_key="k", model_name="m") + config.app["text_openai_temperature"] = 0.3 + + options = provider._build_chat_completion_options("text", temperature=0.9, max_tokens=512) + + self.assertEqual(0.9, options["temperature"]) + self.assertEqual(512, options["max_tokens"]) + + class ExplicitVisionAdapterSettingsTests(unittest.IsolatedAsyncioTestCase): class _CapturingVisionProvider: last_init: tuple[str, str, str | None] | None = None diff --git a/app/services/subtitle_corrector.py b/app/services/subtitle_corrector.py new file mode 100644 index 0000000..5f80512 --- /dev/null +++ b/app/services/subtitle_corrector.py @@ -0,0 +1,231 @@ +"""LLM-powered SRT subtitle correction.""" + +from __future__ import annotations + +import json +import os +import re +from dataclasses import 
dataclass +from typing import Any + +from loguru import logger + +from app.services.llm.manager import LLMServiceManager +from app.services.llm.migration_adapter import _run_async_safely +from app.services.llm.unified_service import UnifiedLLMService +from app.services.subtitle_text import has_timecodes, normalize_subtitle_text, read_subtitle_text +from app.utils import utils + + +class SubtitleCorrectionError(RuntimeError): + """Raised when subtitle correction cannot produce a valid SRT.""" + + +_TIME_LINE_RE = re.compile( + r"^\s*\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3}(?:\s+.*)?$" +) +_JSON_BLOCK_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE) + + +@dataclass(frozen=True) +class SubtitleBlock: + order: int + index_line: str + time_line: str + text: str + + +def _ensure_llm_providers_registered() -> None: + if LLMServiceManager.is_registered(): + return + from app.services.llm.providers import register_all_providers + + register_all_providers() + + +def parse_srt_blocks(srt_content: str) -> list[SubtitleBlock]: + normalized = normalize_subtitle_text(srt_content) + if not normalized or not has_timecodes(normalized): + raise SubtitleCorrectionError("字幕内容为空或未检测到有效 SRT 时间轴") + + blocks: list[SubtitleBlock] = [] + raw_blocks = re.split(r"\n\s*\n", normalized) + for raw_block in raw_blocks: + lines = [line.rstrip() for line in raw_block.splitlines() if line.strip()] + if not lines: + continue + + if len(lines) >= 2 and _TIME_LINE_RE.match(lines[1]): + index_line = lines[0].strip() + time_line = lines[1].strip() + text = "\n".join(lines[2:]).strip() + elif _TIME_LINE_RE.match(lines[0]): + index_line = str(len(blocks) + 1) + time_line = lines[0].strip() + text = "\n".join(lines[1:]).strip() + else: + raise SubtitleCorrectionError(f"无法解析字幕块: {raw_block[:80]}") + + blocks.append( + SubtitleBlock( + order=len(blocks) + 1, + index_line=index_line, + time_line=time_line, + text=text, + ) + ) + + if not blocks: + raise 
SubtitleCorrectionError("字幕内容为空或未检测到有效字幕块") + return blocks + + +def _build_correction_prompt(blocks: list[SubtitleBlock]) -> str: + payload = [ + { + "id": block.order, + "time": block.time_line, + "text": block.text, + } + for block in blocks + ] + return f""" +请校准以下 SRT 字幕文本中的明显语音识别错误。字幕可能是中文、英文、日文、韩文或其他语言,也可能包含多语言混合内容。 + +校准要求: +1. 先结合全部字幕内容识别原语言和语境,保持原语言输出;多语言混合内容也要保持原有语言混合方式。 +2. 只纠正明显的 ASR 错字、拼写错误、同音或近音误识别、词形误识别、专有名词前后不一致。 +3. 不要润色、扩写、改写句意,不要翻译,不要增删剧情信息。 +4. 不要修改时间轴、序号、条目数量或条目顺序。 +5. 不确定的内容保持原样。 +6. 保留必要的说话人标记、标点和换行。 + +只输出严格 JSON,不要输出 Markdown 或解释文字。格式必须为: +{{"items":[{{"id":1,"text":"校准后的字幕文本"}}]}} + +待校准字幕条目: +{json.dumps(payload, ensure_ascii=False, indent=2)} +""".strip() + + +def _extract_json_text(raw_output: str) -> str: + text = str(raw_output or "").strip() + block_match = _JSON_BLOCK_RE.search(text) + if block_match: + return block_match.group(1).strip() + + if not text.startswith(("{", "[")): + starts = [pos for pos in (text.find("{"), text.find("[")) if pos >= 0] + if starts: + start = min(starts) + end = max(text.rfind("}"), text.rfind("]")) + if end > start: + return text[start:end + 1] + return text + + +def _parse_corrections(raw_output: str, expected_ids: set[int]) -> dict[int, str]: + json_text = _extract_json_text(raw_output) + try: + data = json.loads(json_text) + except json.JSONDecodeError as exc: + raise SubtitleCorrectionError("LLM 未返回有效 JSON 字幕校准结果") from exc + + if isinstance(data, dict) and "items" in data: + items = data["items"] + elif isinstance(data, list): + items = data + elif isinstance(data, dict): + items = [{"id": key, "text": value} for key, value in data.items()] + else: + raise SubtitleCorrectionError("LLM 字幕校准结果格式无效") + + corrections: dict[int, str] = {} + if not isinstance(items, list): + raise SubtitleCorrectionError("LLM 字幕校准结果缺少 items 列表") + + for item in items: + if not isinstance(item, dict): + continue + try: + item_id = int(item.get("id")) + except (TypeError, ValueError): + continue + if item_id in 
expected_ids: + corrections[item_id] = str(item.get("text") or "").strip() + + missing_ids = sorted(expected_ids - set(corrections.keys())) + if missing_ids: + raise SubtitleCorrectionError(f"LLM 字幕校准结果缺少字幕条目: {missing_ids[:10]}") + return corrections + + +def _render_srt(blocks: list[SubtitleBlock], corrections: dict[int, str]) -> str: + rendered_blocks = [] + for block in blocks: + corrected_text = corrections.get(block.order, "").strip() or block.text + rendered_blocks.append(f"{block.index_line}\n{block.time_line}\n{corrected_text}") + return "\n\n".join(rendered_blocks).rstrip() + "\n" + + +def correct_srt_content( + srt_content: str, + *, + provider: str = "", + api_key: str = "", + base_url: str = "", + temperature: float = 0.1, +) -> str: + blocks = parse_srt_blocks(srt_content) + _ensure_llm_providers_registered() + + logger.info(f"开始校准字幕,共 {len(blocks)} 条") + prompt = _build_correction_prompt(blocks) + raw_output = _run_async_safely( + UnifiedLLMService.generate_text, + prompt=prompt, + system_prompt="你是一位专业的多语言字幕校对员,擅长修正 ASR 语音识别造成的明显错字、拼写错误、同音或近音误识别,同时严格保留字幕结构和原语言。", + provider=provider, + temperature=temperature, + response_format="json", + api_key=api_key, + api_base=base_url, + ) + corrections = _parse_corrections(raw_output, {block.order for block in blocks}) + corrected_srt = _render_srt(blocks, corrections) + logger.info("字幕校准完成") + return corrected_srt + + +def write_srt_file(srt_content: str, subtitle_file: str = "") -> str: + if not subtitle_file: + subtitle_file = os.path.join(utils.subtitle_dir(), "subtitle_corrected.srt") + parent = os.path.dirname(subtitle_file) + if parent: + os.makedirs(parent, exist_ok=True) + with open(subtitle_file, "w", encoding="utf-8") as f: + f.write(srt_content) + return subtitle_file + + +def correct_subtitle_file( + subtitle_file: str, + output_file: str = "", + *, + provider: str = "", + api_key: str = "", + base_url: str = "", + temperature: float = 0.1, +) -> str: + if not subtitle_file or not 
os.path.isfile(subtitle_file): + raise SubtitleCorrectionError(f"字幕文件不存在: {subtitle_file}") + + decoded = read_subtitle_text(subtitle_file) + corrected_srt = correct_srt_content( + decoded.text, + provider=provider, + api_key=api_key, + base_url=base_url, + temperature=temperature, + ) + return write_srt_file(corrected_srt, output_file) diff --git a/app/services/test_subtitle_corrector_unittest.py b/app/services/test_subtitle_corrector_unittest.py new file mode 100644 index 0000000..9afda81 --- /dev/null +++ b/app/services/test_subtitle_corrector_unittest.py @@ -0,0 +1,100 @@ +import json +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +from app.services import subtitle_corrector as corrector + + +SAMPLE_SRT = """1 +00:00:01,000 --> 00:00:03,000 +今天我们来看张三的顾是 + +2 +00:00:04,000 --> 00:00:06,000 +他来到北精找李四 +""" + + +class SubtitleCorrectorTests(unittest.TestCase): + def test_correct_srt_content_preserves_timecodes_and_rebuilds_text(self): + llm_output = { + "items": [ + {"id": 1, "text": "今天我们来看张三的故事"}, + {"id": 2, "text": "他来到北京找李四"}, + ] + } + + with ( + mock.patch("app.services.subtitle_corrector._ensure_llm_providers_registered"), + mock.patch( + "app.services.subtitle_corrector._run_async_safely", + return_value=json.dumps(llm_output, ensure_ascii=False), + ) as run_llm, + ): + corrected = corrector.correct_srt_content( + SAMPLE_SRT, + provider="openai", + api_key="sk-test", + base_url="https://llm.example/v1", + ) + + self.assertIn("00:00:01,000 --> 00:00:03,000", corrected) + self.assertIn("今天我们来看张三的故事", corrected) + self.assertIn("他来到北京找李四", corrected) + self.assertNotIn("顾是", corrected) + + call_kwargs = run_llm.call_args.kwargs + self.assertEqual("openai", call_kwargs["provider"]) + self.assertEqual("sk-test", call_kwargs["api_key"]) + self.assertEqual("https://llm.example/v1", call_kwargs["api_base"]) + self.assertEqual("json", call_kwargs["response_format"]) + self.assertIn("多语言字幕校对员", call_kwargs["system_prompt"]) 
+ self.assertIn("保持原语言", call_kwargs["prompt"]) + + def test_correct_srt_content_rejects_missing_items(self): + llm_output = {"items": [{"id": 1, "text": "今天我们来看张三的故事"}]} + + with ( + mock.patch("app.services.subtitle_corrector._ensure_llm_providers_registered"), + mock.patch( + "app.services.subtitle_corrector._run_async_safely", + return_value=json.dumps(llm_output, ensure_ascii=False), + ), + ): + with self.assertRaises(corrector.SubtitleCorrectionError): + corrector.correct_srt_content(SAMPLE_SRT, provider="openai") + + def test_correct_subtitle_file_writes_corrected_srt(self): + llm_output = { + "items": [ + {"id": 1, "text": "今天我们来看张三的故事"}, + {"id": 2, "text": "他来到北京找李四"}, + ] + } + + with tempfile.TemporaryDirectory() as tmp_dir: + input_file = Path(tmp_dir) / "input.srt" + output_file = Path(tmp_dir) / "output.srt" + input_file.write_text(SAMPLE_SRT, encoding="utf-8") + + with ( + mock.patch("app.services.subtitle_corrector._ensure_llm_providers_registered"), + mock.patch( + "app.services.subtitle_corrector._run_async_safely", + return_value=json.dumps(llm_output, ensure_ascii=False), + ), + ): + result_path = corrector.correct_subtitle_file( + str(input_file), + str(output_file), + provider="openai", + ) + + self.assertEqual(str(output_file), result_path) + self.assertIn("北京", output_file.read_text(encoding="utf-8")) + + +if __name__ == "__main__": + unittest.main() diff --git a/config.example.toml b/config.example.toml index 805610b..2df60dc 100644 --- a/config.example.toml +++ b/config.example.toml @@ -25,6 +25,10 @@ vision_openai_model_name = "Qwen/Qwen3.5-122B-A10B" vision_openai_api_key = "" # 填入对应 provider 的 API key vision_openai_base_url = "https://api.siliconflow.cn/v1" # 可选:自定义 API base URL(官方 OpenAI 可留空) + vision_openai_temperature = 1.0 + vision_openai_top_p = 0.95 + vision_openai_max_tokens = 65536 + vision_openai_thinking_level = "auto" # auto/off/low/medium/high # ===== 文本模型配置 ===== text_llm_provider = "openai" @@ -40,6 +44,10 @@ 
text_openai_model_name = "Pro/zai-org/GLM-5" text_openai_api_key = "" # 填入对应 provider 的 API key text_openai_base_url = "https://api.siliconflow.cn/v1" # 可选:自定义 API base URL(官方 OpenAI 可留空) + text_openai_temperature = 1.0 + text_openai_top_p = 0.95 + text_openai_max_tokens = 65536 + text_openai_thinking_level = "auto" # auto/off/low/medium/high # ===== API Keys 参考 ===== # 主流 LLM Providers API Key 获取地址: diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index 39d9904..a8185bc 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -4,6 +4,8 @@ import streamlit as st import os from app.config import config from app.config.defaults import ( + DEFAULT_LLM_GENERATION_CONFIG, + DEFAULT_LLM_THINKING_LEVELS, DEFAULT_OPENAI_COMPATIBLE_BASE_URL, DEFAULT_OPENAI_COMPATIBLE_PROVIDER, DEFAULT_TEXT_LLM_PROVIDER, @@ -87,7 +89,7 @@ def validate_openai_compatible_model_name(model_name: str, model_type: str) -> t Args: model_name: 模型名称,应为 provider/model 格式 - model_type: 模型类型(如"视频分析"、"文案生成") + model_type: 模型类型(如"视觉分析"、"文案生成") Returns: (是否有效, 错误消息) @@ -149,6 +151,104 @@ def update_app_config_if_changed(key: str, value) -> bool: return True +def render_openai_compatible_protocol_field(tr, label_key: str, key: str) -> None: + """Render the fixed OpenAI-compatible protocol as a non-selectable field.""" + st.text_input( + tr(label_key), + value=tr("OpenAI compatible protocol"), + help=tr("OpenAI compatible protocol help"), + disabled=True, + key=key, + ) + + +def get_generation_config_value(model_prefix: str, param_name: str): + """Read a per-model generation parameter with a shared default.""" + config_key = f"{model_prefix}_openai_{param_name}" + if config_key in config.app: + return config.app.get(config_key) + + if model_prefix == "text" and param_name == "temperature": + return st.session_state.get("temperature", DEFAULT_LLM_GENERATION_CONFIG[param_name]) + + return DEFAULT_LLM_GENERATION_CONFIG[param_name] + + +def 
render_llm_generation_settings(tr, model_prefix: str) -> dict: + """Render generation parameters directly below a model's Base URL.""" + st.markdown(f"**{tr('Generation Settings')}**") + + row1 = st.columns(2) + with row1[0]: + temperature = st.slider( + tr("Sampling Temperature"), + min_value=0.0, + max_value=2.0, + value=float(get_generation_config_value(model_prefix, "temperature")), + step=0.05, + help=tr("Sampling Temperature Help"), + key=f"{model_prefix}_openai_temperature_input", + ) + with row1[1]: + top_p = st.slider( + tr("Top P"), + min_value=0.0, + max_value=1.0, + value=float(get_generation_config_value(model_prefix, "top_p")), + step=0.05, + help=tr("Top P Help"), + key=f"{model_prefix}_openai_top_p_input", + ) + + row2 = st.columns(2) + with row2[0]: + max_tokens = st.number_input( + tr("Max Output Tokens"), + min_value=0, + max_value=200000, + value=int(get_generation_config_value(model_prefix, "max_tokens")), + step=256, + help=tr("Max Output Tokens Help"), + key=f"{model_prefix}_openai_max_tokens_input", + ) + with row2[1]: + current_thinking_level = str(get_generation_config_value(model_prefix, "thinking_level") or "auto") + if current_thinking_level not in DEFAULT_LLM_THINKING_LEVELS: + current_thinking_level = "auto" + + thinking_level = st.selectbox( + tr("Thinking Level"), + options=DEFAULT_LLM_THINKING_LEVELS, + index=DEFAULT_LLM_THINKING_LEVELS.index(current_thinking_level), + format_func=lambda level: tr(f"Thinking Level {level.title()}"), + help=tr("Thinking Level Help"), + key=f"{model_prefix}_openai_thinking_level_input", + ) + + params = { + "temperature": round(float(temperature), 2), + "top_p": round(float(top_p), 2), + "max_tokens": int(max_tokens), + "thinking_level": thinking_level, + } + + if model_prefix == "text": + st.session_state["temperature"] = params["temperature"] + + return params + + +def save_llm_generation_settings(model_prefix: str, params: dict) -> bool: + """Persist per-model generation parameters in app 
config.""" + changed = False + for param_name, value in params.items(): + config_key = f"{model_prefix}_openai_{param_name}" + changed |= update_app_config_if_changed(config_key, value) + st.session_state[config_key] = value + + return changed + + def render_basic_settings(tr): """渲染基础设置面板""" with st.expander(tr("Basic Settings"), expanded=False): @@ -162,20 +262,18 @@ def render_basic_settings(tr): render_proxy_settings(tr) with middle_config_panel: - render_vision_llm_settings(tr) # 视频分析模型设置 + render_vision_llm_settings(tr) # 视觉分析模型设置 with right_config_panel: render_text_llm_settings(tr) # 文案生成模型设置 - render_generation_settings(tr) - def render_generation_settings(tr): """渲染通用生成参数。""" st.divider() st.subheader(tr("Generation Settings")) if 'temperature' not in st.session_state: - st.session_state['temperature'] = 0.7 + st.session_state['temperature'] = DEFAULT_LLM_GENERATION_CONFIG["temperature"] st.slider("temperature", 0.0, 2.0, key="temperature") @@ -455,7 +553,7 @@ def test_openai_compatible_text_model(api_key: str, base_url: str, model_name: s return False, f"连接失败: {error_msg}" def render_vision_llm_settings(tr): - """渲染视频分析模型设置(OpenAI 兼容 统一配置)""" + """渲染视觉分析模型设置(OpenAI 兼容 统一配置)""" st.subheader(tr("Vision Model Settings")) # 固定使用 OpenAI 兼容 提供商 @@ -467,23 +565,20 @@ def render_vision_llm_settings(tr): vision_base_url = config.app.get("vision_openai_base_url", DEFAULT_OPENAI_COMPATIBLE_BASE_URL) # 固定 provider 为 openai,模型输入框保留完整模型名称 - current_provider, current_model = get_openai_compatible_ui_values( + _current_provider, current_model = get_openai_compatible_ui_values( full_vision_model_name, DEFAULT_VISION_OPENAI_MODEL_NAME, provider=DEFAULT_VISION_LLM_PROVIDER, ) - - # 定义支持的 provider 列表 - OPENAI_COMPATIBLE_PROVIDERS = ["openai"] + selected_provider = DEFAULT_VISION_LLM_PROVIDER # 渲染配置输入框 col1, col2 = st.columns([1, 2]) with col1: - selected_provider = st.selectbox( - tr("Vision Model Provider"), - options=OPENAI_COMPATIBLE_PROVIDERS, - 
index=OPENAI_COMPATIBLE_PROVIDERS.index(current_provider) if current_provider in OPENAI_COMPATIBLE_PROVIDERS else 0, - key="vision_provider_select" + render_openai_compatible_protocol_field( + tr, + "Vision Model Provider", + key="vision_openai_protocol_display", ) with col2: @@ -532,6 +627,8 @@ def render_vision_llm_settings(tr): info_example = vision_placeholder or "https://your-openai-compatible-endpoint/v1" st.info(tr("Please fill OpenAI compatible gateway").format(example=info_example)) + vision_generation_params = render_llm_generation_settings(tr, "vision") + # 添加测试连接按钮 if st.button(tr("Test Connection"), key="test_vision_connection"): test_errors = [] @@ -559,7 +656,7 @@ def render_vision_llm_settings(tr): st.error(message) except Exception as e: st.error(f"{tr('Connection test error')}: {str(e)}") - logger.error(f"OpenAI 兼容 视频分析模型连接测试失败: {str(e)}") + logger.error(f"OpenAI 兼容 视觉分析模型连接测试失败: {str(e)}") # 验证和保存配置 validation_errors = [] @@ -568,7 +665,7 @@ def render_vision_llm_settings(tr): # 验证模型名称 if st_vision_model_name: # 这里的验证逻辑可能需要微调,因为我们现在是自动组合的 - is_valid, error_msg = validate_openai_compatible_model_name(st_vision_model_name, "视频分析") + is_valid, error_msg = validate_openai_compatible_model_name(st_vision_model_name, "视觉分析") if is_valid: config_changed |= update_app_config_if_changed( "vision_openai_model_name", @@ -580,7 +677,7 @@ def render_vision_llm_settings(tr): # 验证 API 密钥 if st_vision_api_key: - is_valid, error_msg = validate_api_key(st_vision_api_key, "视频分析") + is_valid, error_msg = validate_api_key(st_vision_api_key, "视觉分析") if is_valid: config_changed |= update_app_config_if_changed( "vision_openai_api_key", @@ -592,7 +689,7 @@ def render_vision_llm_settings(tr): # 验证 Base URL(可选) if st_vision_base_url: - is_valid, error_msg = validate_base_url(st_vision_base_url, "视频分析") + is_valid, error_msg = validate_base_url(st_vision_base_url, "视觉分析") if is_valid: config_changed |= update_app_config_if_changed( "vision_openai_base_url", @@ -602,6 +699,8 
@@ def render_vision_llm_settings(tr): else: validation_errors.append(error_msg) + config_changed |= save_llm_generation_settings("vision", vision_generation_params) + # 显示验证错误 show_config_validation_errors(validation_errors) @@ -615,7 +714,7 @@ def render_vision_llm_settings(tr): st.success(tr("Vision model config saved")) except Exception as e: st.error(f"{tr('Failed to save config')}: {str(e)}") - logger.error(f"保存视频分析配置失败: {str(e)}") + logger.error(f"保存视觉分析配置失败: {str(e)}") def test_text_model_connection(api_key, base_url, model_name, provider, tr): @@ -734,23 +833,20 @@ def render_text_llm_settings(tr): text_base_url = config.app.get("text_openai_base_url", DEFAULT_OPENAI_COMPATIBLE_BASE_URL) # 固定 provider 为 openai,模型输入框保留完整模型名称 - current_provider, current_model = get_openai_compatible_ui_values( + _current_provider, current_model = get_openai_compatible_ui_values( full_text_model_name, DEFAULT_TEXT_OPENAI_MODEL_NAME, provider=DEFAULT_TEXT_LLM_PROVIDER, ) - - # 定义支持的 provider 列表 - OPENAI_COMPATIBLE_PROVIDERS = ["openai"] + selected_provider = DEFAULT_TEXT_LLM_PROVIDER # 渲染配置输入框 col1, col2 = st.columns([1, 2]) with col1: - selected_provider = st.selectbox( - tr("Text Model Provider"), - options=OPENAI_COMPATIBLE_PROVIDERS, - index=OPENAI_COMPATIBLE_PROVIDERS.index(current_provider) if current_provider in OPENAI_COMPATIBLE_PROVIDERS else 0, - key="text_provider_select" + render_openai_compatible_protocol_field( + tr, + "Text Model Provider", + key="text_openai_protocol_display", ) with col2: @@ -801,6 +897,8 @@ def render_text_llm_settings(tr): info_example = text_placeholder or "https://your-openai-compatible-endpoint/v1" st.info(tr("Please fill OpenAI compatible gateway").format(example=info_example)) + text_generation_params = render_llm_generation_settings(tr, "text") + # 添加测试连接按钮 if st.button(tr("Test Connection"), key="test_text_connection"): test_errors = [] @@ -870,6 +968,8 @@ def render_text_llm_settings(tr): else: 
text_validation_errors.append(error_msg) + text_config_changed |= save_llm_generation_settings("text", text_generation_params) + # 显示验证错误 show_config_validation_errors(text_validation_errors) diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 3c91c44..9b03457 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -18,6 +18,114 @@ from webui.tools.generate_short_summary import analyze_short_drama_plot, generat SCRIPT_TABLE_BASE_COLUMNS = ["_id", "timestamp", "picture", "narration", "OST"] +VIDEO_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"] +VIDEO_GLOB_PATTERNS = [f"*.{suffix}" for suffix in VIDEO_UPLOAD_TYPES] + + +def _normalize_video_paths(paths): + if isinstance(paths, str): + paths = [paths] + if not paths: + return [] + + normalized_paths = [] + seen = set() + for path in paths: + if not isinstance(path, str): + continue + path = path.strip() + if not path or path in seen: + continue + normalized_paths.append(path) + seen.add(path) + return normalized_paths + + +def _set_video_origin_state(paths, params=None): + video_paths = _normalize_video_paths(paths) + first_video_path = video_paths[0] if video_paths else "" + st.session_state['video_origin_paths'] = video_paths + st.session_state['video_origin_path'] = first_video_path + if params is not None: + params.video_origin_path = first_video_path + params.video_origin_paths = video_paths + + +def _selected_video_paths(): + video_paths = _normalize_video_paths(st.session_state.get('video_origin_paths', [])) + if not video_paths: + video_paths = _normalize_video_paths(st.session_state.get('video_origin_path', '')) + return video_paths + + +def _uploaded_files_signature(uploaded_files): + return "|".join(f"{uploaded_file.name}:{uploaded_file.size}" for uploaded_file in uploaded_files) + + +def _unique_file_path(directory, filename): + safe_filename = os.path.basename(filename).strip() + if not safe_filename: + 
safe_filename = f"video_{int(time.time())}.mp4" + + os.makedirs(directory, exist_ok=True) + file_name, file_extension = os.path.splitext(safe_filename) + candidate_path = os.path.join(directory, safe_filename) + if not os.path.exists(candidate_path): + return candidate_path + + timestamp = time.strftime("%Y%m%d%H%M%S") + counter = 1 + while True: + suffix = f"_{timestamp}" if counter == 1 else f"_{timestamp}_{counter}" + candidate_path = os.path.join(directory, f"{file_name}{suffix}{file_extension}") + if not os.path.exists(candidate_path): + return candidate_path + counter += 1 + + +def _format_file_list_for_display(paths, max_items=3): + file_names = [os.path.basename(path) for path in _normalize_video_paths(paths)] + if len(file_names) <= max_items: + return ", ".join(file_names) + visible_names = ", ".join(file_names[:max_items]) + return f"{visible_names} +{len(file_names) - max_items}" + + +def _read_subtitle_file(path): + try: + return read_subtitle_text(path).text + except Exception: + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +def _build_combined_subtitle_content(subtitle_paths): + sections = [] + subtitle_contents = {} + for subtitle_path in subtitle_paths: + if not subtitle_path or not os.path.exists(subtitle_path): + continue + content = _read_subtitle_file(subtitle_path) + subtitle_contents[subtitle_path] = content + sections.append(f"# {os.path.basename(subtitle_path)}\n{content}".strip()) + return "\n\n".join(sections), subtitle_contents + + +def _selected_subtitle_paths(): + subtitle_paths = _normalize_video_paths(st.session_state.get('subtitle_paths', [])) + if not subtitle_paths: + subtitle_paths = _normalize_video_paths(st.session_state.get('subtitle_path', '')) + return subtitle_paths + + +def _set_subtitle_state(subtitle_paths): + subtitle_paths = _normalize_video_paths(subtitle_paths) + subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths) + st.session_state['subtitle_path'] = 
subtitle_paths[0] if subtitle_paths else None + st.session_state['subtitle_paths'] = subtitle_paths + st.session_state['subtitle_content'] = subtitle_content if subtitle_content else None + st.session_state['subtitle_contents'] = subtitle_contents + st.session_state['subtitle_file_processed'] = bool(subtitle_paths) def render_script_panel(tr): @@ -242,12 +350,12 @@ def render_video_file(tr, params): } source_labels = list(source_options.keys()) default_source_label = source_labels[0] - source_default_version = "upload_first_v1" + source_default_version = "upload_first_v2" if st.session_state.get('_video_source_default_version') != source_default_version: if ( st.session_state.get('video_source_selection') not in source_options - or not st.session_state.get('video_origin_path') + or not _selected_video_paths() ): st.session_state['video_source_selection'] = default_source_label st.session_state['_video_source_default_version'] = source_default_version @@ -258,7 +366,7 @@ def render_video_file(tr, params): source_caption = ( tr("Select a video from resource videos directory") if source_options[current_source] == "resource" - else tr("Upload a new video file up to 2GB") + else tr("Upload new video files up to 2GB each") ) st.markdown(f"**{tr('Video Source')}** :gray[{source_caption}]") @@ -275,7 +383,7 @@ def render_video_file(tr, params): if source_options[source] == "resource": video_files = [] - for suffix in ["*.mp4", "*.mov", "*.avi", "*.flv", "*.mkv", "*.mpeg4"]: + for suffix in VIDEO_GLOB_PATTERNS: video_files.extend(glob.glob(os.path.join(utils.video_dir(), suffix))) video_files = sorted(video_files, key=os.path.getctime, reverse=True) @@ -299,59 +407,62 @@ def render_video_file(tr, params): ) if video_path: - st.session_state['video_origin_path'] = video_path - params.video_origin_path = video_path + _set_video_origin_state([video_path], params) else: - st.session_state['video_origin_path'] = "" - params.video_origin_path = "" + _set_video_origin_state([], 
params) if not video_files: st.info(tr("No video files found in resource videos directory")) return if source_options[source] == "upload": - uploaded_file = st.file_uploader( + uploaded_files = st.file_uploader( tr("Upload Video"), - type=["mp4", "mov", "avi", "flv", "mkv", "mpeg4"], - accept_multiple_files=False, + type=VIDEO_UPLOAD_TYPES, + accept_multiple_files=True, key="video_file_uploader", ) - if uploaded_file is None: - st.session_state['video_origin_path'] = "" - params.video_origin_path = "" + if not uploaded_files: + _set_video_origin_state([], params) st.session_state['video_file_processed'] = False st.session_state['uploaded_video_path'] = "" + st.session_state['uploaded_video_paths'] = [] st.session_state['uploaded_video_signature'] = "" else: - uploaded_signature = f"{uploaded_file.name}:{uploaded_file.size}" - uploaded_video_path = st.session_state.get('uploaded_video_path', '') + uploaded_signature = _uploaded_files_signature(uploaded_files) + uploaded_video_paths = _normalize_video_paths(st.session_state.get('uploaded_video_paths', [])) is_processed = ( st.session_state.get('video_file_processed', False) and st.session_state.get('uploaded_video_signature') == uploaded_signature - and uploaded_video_path + and uploaded_video_paths + and all(os.path.exists(path) for path in uploaded_video_paths) ) if is_processed: - st.session_state['video_origin_path'] = uploaded_video_path - params.video_origin_path = uploaded_video_path + _set_video_origin_state(uploaded_video_paths, params) else: - safe_filename = os.path.basename(uploaded_file.name) - video_file_path = os.path.join(utils.video_dir(), safe_filename) - file_name, file_extension = os.path.splitext(safe_filename) + video_paths = [] + for uploaded_file in uploaded_files: + video_file_path = _unique_file_path(utils.video_dir(), uploaded_file.name) + with open(video_file_path, "wb") as f: + f.write(uploaded_file.read()) + video_paths.append(video_file_path) - if os.path.exists(video_file_path): - 
timestamp = time.strftime("%Y%m%d%H%M%S") - file_name_with_timestamp = f"{file_name}_{timestamp}" - video_file_path = os.path.join(utils.video_dir(), file_name_with_timestamp + file_extension) - - with open(video_file_path, "wb") as f: - f.write(uploaded_file.read()) - st.session_state['video_origin_path'] = video_file_path - params.video_origin_path = video_file_path - st.session_state['uploaded_video_path'] = video_file_path + _set_video_origin_state(video_paths, params) + st.session_state['uploaded_video_path'] = video_paths[0] if video_paths else "" + st.session_state['uploaded_video_paths'] = video_paths st.session_state['uploaded_video_signature'] = uploaded_signature st.session_state['video_file_processed'] = True + current_video_paths = _selected_video_paths() + if current_video_paths: + st.info( + tr("Selected videos for processing").format( + count=len(current_video_paths), + files=_format_file_list_for_display(current_video_paths), + ) + ) + def render_short_generate_options(tr): """ @@ -457,18 +568,38 @@ def short_drama_summary(tr): def render_subtitle_preview(tr): """渲染可折叠的当前字幕预览;没有字幕时提示用户先转写或上传。""" - subtitle_path = st.session_state.get('subtitle_path', '') + subtitle_paths = _selected_subtitle_paths() subtitle_content = st.session_state.get('subtitle_content', '') + subtitle_contents = st.session_state.get('subtitle_contents', {}) + if not isinstance(subtitle_contents, dict): + subtitle_contents = {} - if subtitle_path and not subtitle_content and os.path.exists(subtitle_path): - subtitle_content = read_subtitle_text(subtitle_path).text + if subtitle_paths and (not subtitle_content or not subtitle_contents): + subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths) st.session_state['subtitle_content'] = subtitle_content + st.session_state['subtitle_contents'] = subtitle_contents with st.expander(tr("Subtitle Preview"), expanded=False): - if not subtitle_path or not subtitle_content: + if not subtitle_paths or not 
subtitle_content: st.info(tr("Please transcribe or upload subtitles first")) return + if len(subtitle_paths) > 1: + for index, path in enumerate(subtitle_paths, start=1): + content = subtitle_contents.get(path, "") + if not content and os.path.exists(path): + content = _read_subtitle_file(path) + st.markdown(f"**{index}. {os.path.basename(path)}**") + st.text_area( + tr("Subtitle Preview"), + value=content, + height=180, + label_visibility="collapsed", + disabled=True, + key=f"subtitle_content_preview_{index}", + ) + return + st.text_area( tr("Subtitle Preview"), key="subtitle_content", @@ -496,9 +627,7 @@ def render_subtitle_upload(tr): if 'subtitle_path' in st.session_state and st.session_state['subtitle_path']: st.info(tr("Uploaded subtitle").format(file=os.path.basename(st.session_state['subtitle_path']))) if st.button(tr("清除已上传字幕")): - st.session_state['subtitle_path'] = None - st.session_state['subtitle_content'] = None - st.session_state['subtitle_file_processed'] = False + _set_subtitle_state([]) st.rerun() # 只有当有文件上传且尚未处理时才执行处理逻辑 @@ -539,9 +668,7 @@ def render_subtitle_upload(tr): f"({tr('Encoding')}: {detected_encoding.upper()}, " f"{tr('Size')}: {len(script_content)} {tr('Characters')})" ) - st.session_state['subtitle_path'] = script_file_path - st.session_state['subtitle_content'] = script_content - st.session_state['subtitle_file_processed'] = True # 标记已处理 + _set_subtitle_state([script_file_path]) # 避免使用rerun,使用更新状态的方式 # st.rerun() @@ -688,9 +815,7 @@ def render_video_script_editor(tr): def render_fun_asr_transcription(tr): """使用 Fun-ASR 从本地音视频转写生成字幕。""" def clear_fun_asr_subtitle_state(): - st.session_state['subtitle_path'] = None - st.session_state['subtitle_content'] = None - st.session_state['subtitle_file_processed'] = False + _set_subtitle_state([]) from app.services import fun_asr_subtitle @@ -714,7 +839,7 @@ def render_fun_asr_transcription(tr): api_url = config.fun_asr.get("api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL) hotword = 
config.fun_asr.get("hotword", "") enable_spk = bool(config.fun_asr.get("enable_spk", False)) - media_path = st.session_state.get('video_origin_path', '') + media_paths = _selected_video_paths() subtitle_cols = st.columns([3, 2], vertical_alignment="top") @@ -768,23 +893,92 @@ def render_fun_asr_transcription(tr): ) if backend != "upload": - if media_path: - st.info( - tr("Using selected video for subtitle transcription").format( - file=os.path.basename(media_path) + if media_paths: + if len(media_paths) == 1: + st.info( + tr("Using selected video for subtitle transcription").format( + file=os.path.basename(media_paths[0]) + ) + ) + else: + st.info( + tr("Using selected videos for subtitle transcription").format( + count=len(media_paths), + files=_format_file_list_for_display(media_paths), + ) ) - ) else: st.warning(tr("Please select or upload a video first")) - can_transcribe = backend != "upload" and bool(media_path) + # 上传字幕面板会在本轮渲染中更新 session_state,这里重新读取一次,保证按钮状态同步。 + subtitle_paths = _selected_subtitle_paths() + can_transcribe = backend != "upload" and bool(media_paths) + can_correct_subtitles = bool(subtitle_paths) with subtitle_cols[1]: - transcribe_clicked = st.button( - tr("Transcribe subtitles"), - key="fun_asr_transcribe", - disabled=not can_transcribe, - use_container_width=True, - ) + action_cols = st.columns(2) + with action_cols[0]: + transcribe_clicked = st.button( + tr("Transcribe subtitles"), + key="fun_asr_transcribe", + disabled=not can_transcribe, + use_container_width=True, + ) + with action_cols[1]: + correct_clicked = st.button( + tr("Calibrate subtitles"), + key="subtitle_correct", + disabled=not can_correct_subtitles, + use_container_width=True, + ) + + if correct_clicked: + from app.services import subtitle_corrector + + text_provider = config.app.get('text_llm_provider', 'openai').lower() + text_api_key = config.app.get(f'text_{text_provider}_api_key') + text_base_url = config.app.get(f'text_{text_provider}_base_url') + + corrected_paths 
= [] + try: + spinner_text = tr("Calibrating subtitles...") + with st.spinner(spinner_text): + progress_bar = st.progress(0) if len(subtitle_paths) > 1 else None + for index, subtitle_path in enumerate(subtitle_paths, start=1): + subtitle_name = f"{os.path.splitext(os.path.basename(subtitle_path))[0]}_corrected.srt" + output_path = _unique_file_path(utils.subtitle_dir(), subtitle_name) + corrected_path = subtitle_corrector.correct_subtitle_file( + subtitle_file=subtitle_path, + output_file=output_path, + provider=text_provider, + api_key=text_api_key, + base_url=text_base_url, + ) + corrected_paths.append(corrected_path) + if progress_bar: + progress_bar.progress(index / len(subtitle_paths)) + + if progress_bar: + progress_bar.empty() + + _set_subtitle_state(corrected_paths) + success_placeholder = st.empty() + if len(corrected_paths) == 1: + success_placeholder.success( + tr("Subtitle calibration succeeded").format(file=os.path.basename(corrected_paths[0])) + ) + else: + success_placeholder.success( + tr("Subtitle calibration succeeded for multiple files").format( + count=len(corrected_paths), + files=_format_file_list_for_display(corrected_paths), + ) + ) + time.sleep(3) + success_placeholder.empty() + except Exception as e: + logger.error(f"字幕校准失败: {traceback.format_exc()}") + st.error(f"{tr('Subtitle calibration failed')}: {str(e)}") + return if not transcribe_clicked: return @@ -797,9 +991,17 @@ def render_fun_asr_transcription(tr): clear_fun_asr_subtitle_state() st.error(tr("Please enter local FunASR-Pack API URL")) return - if not media_path or not os.path.exists(media_path): + missing_paths = [path for path in media_paths if not os.path.exists(path)] + if not media_paths or missing_paths: clear_fun_asr_subtitle_state() - st.error(tr("Selected video file does not exist")) + if missing_paths: + st.error( + tr("Selected video files do not exist").format( + files=_format_file_list_for_display(missing_paths) + ) + ) + else: + st.error(tr("Selected video file 
does not exist")) return try: @@ -813,47 +1015,70 @@ def render_fun_asr_transcription(tr): config.fun_asr["model"] = "fun-asr" config.save_config() - subtitle_name = f"{os.path.splitext(os.path.basename(media_path))[0]}_fun_asr.srt" - subtitle_path = os.path.join(utils.subtitle_dir(), subtitle_name) - spinner_text = ( tr("Transcribing with local FunASR-Pack...") if backend == "local" else tr("Transcribing with Fun-ASR...") ) with st.spinner(spinner_text): - if backend == "local": - generated_path = fun_asr_subtitle.create_with_local_fun_asr( - local_file=media_path, - subtitle_file=subtitle_path, - api_url=str(api_url).strip(), - hotword=str(hotword).strip(), - enable_spk=bool(enable_spk), - ) - else: - generated_path = fun_asr_subtitle.create_with_fun_asr( - local_file=media_path, - subtitle_file=subtitle_path, - api_key=api_key.strip(), - ) + progress_bar = st.progress(0) if len(media_paths) > 1 else None + generated_paths = [] + for index, media_path in enumerate(media_paths, start=1): + subtitle_name = f"{os.path.splitext(os.path.basename(media_path))[0]}_fun_asr.srt" + subtitle_path = _unique_file_path(utils.subtitle_dir(), subtitle_name) - if not generated_path or not os.path.exists(generated_path): + if backend == "local": + generated_path = fun_asr_subtitle.create_with_local_fun_asr( + local_file=media_path, + subtitle_file=subtitle_path, + api_url=str(api_url).strip(), + hotword=str(hotword).strip(), + enable_spk=bool(enable_spk), + ) + else: + generated_path = fun_asr_subtitle.create_with_fun_asr( + local_file=media_path, + subtitle_file=subtitle_path, + api_key=api_key.strip(), + ) + + if not generated_path or not os.path.exists(generated_path): + raise RuntimeError(tr("Fun-ASR failed without subtitle file")) + + generated_paths.append(generated_path) + if progress_bar: + progress_bar.progress(index / len(media_paths)) + + if progress_bar: + progress_bar.empty() + + if not generated_paths: clear_fun_asr_subtitle_state() st.error(tr("Fun-ASR failed 
without subtitle file")) return - with open(generated_path, "r", encoding="utf-8") as f: - subtitle_content = f.read() + subtitle_content, subtitle_contents = _build_combined_subtitle_content(generated_paths) + if not subtitle_content.strip(): + clear_fun_asr_subtitle_state() + st.error(tr("Fun-ASR failed without subtitle file")) + return - st.session_state['subtitle_path'] = generated_path - st.session_state['subtitle_content'] = subtitle_content - st.session_state['subtitle_file_processed'] = True + _set_subtitle_state(generated_paths) success_placeholder = st.empty() - success_placeholder.success( - tr("Subtitle transcription succeeded").format(file=os.path.basename(generated_path)) - ) + if len(generated_paths) == 1: + success_placeholder.success( + tr("Subtitle transcription succeeded").format(file=os.path.basename(generated_paths[0])) + ) + else: + success_placeholder.success( + tr("Subtitle transcription succeeded for multiple files").format( + count=len(generated_paths), + files=_format_file_list_for_display(generated_paths), + ) + ) time.sleep(3) success_placeholder.empty() + st.rerun() except Exception as e: clear_fun_asr_subtitle_state() logger.error(f"Fun-ASR 字幕转写失败: {traceback.format_exc()}") @@ -1007,6 +1232,7 @@ def get_script_params(): 'video_language': st.session_state.get('video_language', ''), 'video_clip_json_path': st.session_state.get('video_clip_json_path', ''), 'video_origin_path': st.session_state.get('video_origin_path', ''), + 'video_origin_paths': _selected_video_paths(), 'video_name': st.session_state.get('video_name', ''), 'video_plot': st.session_state.get('video_plot', '') } diff --git a/webui/i18n/en.json b/webui/i18n/en.json index dbce928..fe53cbc 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -97,12 +97,14 @@ "Select from resource directory": "Select from resource directory", "Select a video from resource videos directory": "Select a video from the ./resource/videos directory", "Upload a new video file up to 2GB": 
"Upload a new video file, up to 2GB", + "Upload new video files up to 2GB each": "Upload one or more video files, up to 2GB each", "Select Video": "Select Video", "Choose a video file": "Choose a video file", "Upload Video": "Upload Video", "No video files found in resource videos directory": "No video files found in the ./resource/videos directory", "Upload Local Files": "Upload Local Files", "File Uploaded Successfully": "File Uploaded Successfully", + "Selected videos for processing": "Selected {count} video(s): {files}", "Frame Interval (seconds)": "Frame Interval (seconds)", "Generate Video Script": "Generate Video Script", "Video Theme": "Video Theme", @@ -131,17 +133,28 @@ "HTTP_PROXY": "HTTP Proxy", "HTTPs_PROXY": "HTTPS Proxy", "Vision Model Settings": "Vision Model Settings", - "Vision Model Provider": "Vision Model Provider", + "Vision Model Provider": "API Protocol", "Vision API Key": "Vision API Key", "Vision Base URL": "Vision Base URL", "Vision Model Name": "Vision Model Name", "Text Generation Model Settings": "Text Generation Model Settings", "LLM Model Name": "LLM Model Name", "LLM Model API Key": "LLM Model API Key", - "Text Model Provider": "Text Model Provider", + "Text Model Provider": "API Protocol", "Text API Key": "Text API Key", "Text Base URL": "Text Base URL", "Text Model Name": "Text Model Name", + "Top P": "Top P", + "Top K": "Top K", + "Max Output Tokens": "Max Output Tokens", + "Max Output Tokens Help": "Maximum generated output length. 0 uses the provider default.", + "Thinking Level": "Thinking Level", + "Thinking Level Help": "Controls reasoning effort. 
Auto sends no extra thinking parameter; low/medium/high tries reasoning_effort.", + "Thinking Level Auto": "Auto", + "Thinking Level Off": "Off", + "Thinking Level Low": "Low", + "Thinking Level Medium": "Medium", + "Thinking Level High": "High", "Skip the first few seconds": "Skip the first few seconds", "Difference threshold": "Difference Threshold", "Vision processing batch size": "Vision Processing Batch Size", @@ -283,14 +296,16 @@ "Jianying Draft Settings": "Jianying Draft Settings", "Jianying Draft Folder Path": "Jianying Draft Folder Path", "Jianying Draft Folder Path Help": "Jianying draft folder path, for example: C:\\Users\\Username\\Documents\\JianyingPro Drafts", - "Custom API endpoint help": "Custom API endpoint (optional). Required when using a self-hosted or third-party proxy.", + "Custom API endpoint help": "OpenAI-compatible endpoint URL. Use a full /v1 URL for third-party or self-hosted gateways; leave empty for the official OpenAI API.", "Recommended API endpoint": "Recommended endpoint", - "OpenAI compatible gateway help": "{model_type} uses an OpenAI-compatible gateway provider, so a complete endpoint URL is required.", + "OpenAI compatible gateway help": "{model_type} uses an OpenAI-compatible API, so a complete endpoint URL is required.", "Vision model": "Vision model", "Text model": "Text model", "Model Name Input Help": "Enter the full model name.\n\nCommon examples:", - "OpenAI compatible providers help": "Supports common OpenAI-compatible gateways such as OpenAI, DeepSeek, OpenRouter, and SiliconFlow.", - "Provider API Key Help": "API key for the selected provider.\n\nWhere to get one:", + "OpenAI compatible providers help": "The vendor is not limited here; OpenAI, DeepSeek, OpenRouter, SiliconFlow, or a self-hosted gateway all work as long as the endpoint is OpenAI-compatible.", + "OpenAI compatible protocol": "OpenAI-compatible", + "OpenAI compatible protocol help": "This does not require the official OpenAI model; any service that 
supports the OpenAI Chat Completions compatible API can be used.", + "Provider API Key Help": "API key for the model service.\n\nCommon places to get one:", "Please fill OpenAI compatible gateway": "Please fill in the OpenAI-compatible gateway URL above, for example: {example}", "Please enter API key": "Please enter the API key first", "Please enter model name": "Please enter the model name first", @@ -324,9 +339,12 @@ "Ali Bailian API Key Help": "Enter your Ali Bailian API Key. After saving, it will be written to the local config.toml file.", "Upload media to transcribe": "Upload audio/video to transcribe", "Using selected video for subtitle transcription": "Using current video for subtitle transcription: {file}", + "Using selected videos for subtitle transcription": "Using {count} current videos for subtitle transcription: {files}", "Please select or upload a video first": "Please select or upload a video file above first", "Selected video file does not exist": "The selected video file does not exist. Please select or upload it again", + "Selected video files do not exist": "These selected video files do not exist. 
Please select or upload them again: {files}", "Transcribe subtitles": "Transcribe Subtitles", + "Calibrate subtitles": "Calibrate Subtitles", "Please enter Ali Bailian API Key": "Please enter the Ali Bailian API Key first", "Please enter local FunASR-Pack API URL": "Please enter the local FunASR-Pack API URL first", "Please upload media to transcribe": "Please upload the audio or video file to transcribe first", @@ -334,6 +352,11 @@ "Transcribing with Fun-ASR...": "Transcribing subtitles with Ali Bailian Fun-ASR, please wait...", "Fun-ASR failed without subtitle file": "Fun-ASR transcription failed: no subtitle file was generated", "Subtitle transcription succeeded": "Subtitle transcription succeeded: {file}", + "Subtitle transcription succeeded for multiple files": "Subtitle transcription succeeded for {count} files: {files}", + "Calibrating subtitles...": "Calibrating subtitles with the LLM, please wait...", + "Subtitle calibration succeeded": "Subtitle calibration succeeded: {file}", + "Subtitle calibration succeeded for multiple files": "Subtitle calibration succeeded for {count} files: {files}", + "Subtitle calibration failed": "Subtitle calibration failed", "Transcribed subtitles storage hint": "Previously transcribed subtitles are saved in {path}; drag a file from that folder to upload", "剧情理解": "Plot Analysis", "剧情理解结果": "Plot Analysis Result", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 33eb74a..27328ad 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -84,12 +84,14 @@ "Select from resource directory": "从资源目录选择", "Select a video from resource videos directory": "选择 ./resource/videos 目录中的视频", "Upload a new video file up to 2GB": "上传一个新的视频文件,限制 2GB", + "Upload new video files up to 2GB each": "上传一个或多个视频文件,单个文件限制 2GB", "Select Video": "选择视频", "Choose a video file": "选择一个视频文件", "Upload Video": "上传视频", "No video files found in resource videos directory": "未在 ./resource/videos 目录中找到视频文件", "Upload Local Files": "上传本地文件", "File Uploaded 
Successfully": "文件上传成功", + "Selected videos for processing": "已选择 {count} 个视频: {files}", "timestamp": "时间戳", "Picture description": "图片描述", "Narration": "视频文案", @@ -119,18 +121,29 @@ "Proxy Settings": "代理设置", "HTTP_PROXY": "HTTP 代理", "HTTPs_PROXY": "HTTPS 代理", - "Vision Model Settings": "视频分析模型设置", - "Vision Model Provider": "视频分析模型提供商", - "Vision API Key": "视频分析 API 密钥", - "Vision Base URL": "视频分析接口地址", - "Vision Model Name": "视频分析模型名称", + "Vision Model Settings": "视觉分析模型设置", + "Vision Model Provider": "接口规范", + "Vision API Key": "视觉分析 API 密钥", + "Vision Base URL": "视觉分析接口地址", + "Vision Model Name": "视觉分析模型名称", "Text Generation Model Settings": "文案生成模型设置", "LLM Model Name": "大语言模型名称", "LLM Model API Key": "大语言模型 API 密钥", - "Text Model Provider": "文案生成模型提供商", + "Text Model Provider": "接口规范", "Text API Key": "文案生成 API 密钥", "Text Base URL": "文案生成接口地址", "Text Model Name": "文案生成模型名称", + "Top P": "Top P", + "Top K": "Top K", + "Max Output Tokens": "最大输出 Token", + "Max Output Tokens Help": "单次生成的最大输出长度,0 表示使用服务端默认值", + "Thinking Level": "思考等级", + "Thinking Level Help": "控制推理/思考强度。自动表示不额外发送思考参数,低/中/高会尝试传递 reasoning_effort", + "Thinking Level Auto": "自动", + "Thinking Level Off": "关闭", + "Thinking Level Low": "低", + "Thinking Level Medium": "中", + "Thinking Level High": "高", "Account ID": "账户 ID", "Skip the first few seconds": "跳过开头多少秒", "Difference threshold": "差异阈值", @@ -265,19 +278,21 @@ "Jianying Draft Settings": "剪映草稿设置", "Jianying Draft Folder Path": "剪映草稿文件夹路径", "Jianying Draft Folder Path Help": "剪映草稿文件夹路径,例如:C:\\Users\\用户名\\Documents\\JianyingPro Drafts", - "Custom API endpoint help": "自定义 API 端点(可选),当使用自建或第三方代理时需要填写", + "Custom API endpoint help": "OpenAI 兼容接口地址。使用第三方或自建网关时填写完整 /v1 地址;使用 OpenAI 官方接口可留空。", "Recommended API endpoint": "推荐接口地址", - "OpenAI compatible gateway help": "{model_type} 选择的提供商基于 OpenAI 兼容网关,必须填写完整的接口地址。", - "Vision model": "视频分析模型", + "OpenAI compatible gateway help": "{model_type} 使用 OpenAI 兼容接口,请填写完整的接口地址。", + "Vision model": "视觉分析模型", 
"Text model": "文案生成模型", "Model Name Input Help": "输入完整模型名称\n\n常用示例:", - "OpenAI compatible providers help": "支持常见 OpenAI 兼容网关(如 OpenAI/DeepSeek/OpenRouter/SiliconFlow)", - "Provider API Key Help": "对应 provider 的 API 密钥\n\n获取地址:", + "OpenAI compatible providers help": "这里不限定模型厂商;OpenAI、DeepSeek、OpenRouter、SiliconFlow 或自建网关均可,只需提供兼容 OpenAI 的接口地址和模型名称。", + "OpenAI compatible protocol": "OpenAI 兼容", + "OpenAI compatible protocol help": "不是限定 OpenAI 官方模型;只要模型服务支持 OpenAI Chat Completions 兼容接口即可。", + "Provider API Key Help": "模型服务的 API 密钥\n\n常见获取地址:", "Please fill OpenAI compatible gateway": "请在上方填写 OpenAI 兼容网关地址,例如:{example}", "Please enter API key": "请先输入 API 密钥", "Please enter model name": "请先输入模型名称", "Connection test error": "测试连接时发生错误", - "Vision model config saved": "视频分析模型配置已保存(OpenAI 兼容)", + "Vision model config saved": "视觉分析模型配置已保存(OpenAI 兼容)", "Text model config saved": "文案生成模型配置已保存(OpenAI 兼容)", "Failed to save config": "保存配置失败", "Custom Position (% from top)": "自定义位置(距顶部百分比)", @@ -306,9 +321,12 @@ "Ali Bailian API Key Help": "请输入你自己的阿里百炼 API Key;保存配置后会写入本地 config.toml", "Upload media to transcribe": "上传需要转录的音频/视频", "Using selected video for subtitle transcription": "将使用当前视频生成字幕: {file}", + "Using selected videos for subtitle transcription": "将使用当前 {count} 个视频生成字幕: {files}", "Please select or upload a video first": "请先在上方选择或上传视频文件", "Selected video file does not exist": "当前视频文件不存在,请重新选择或上传", - "Transcribe subtitles": "转写生成字幕", + "Selected video files do not exist": "以下视频文件不存在,请重新选择或上传: {files}", + "Transcribe subtitles": "转录字幕", + "Calibrate subtitles": "校准字幕", "Please enter Ali Bailian API Key": "请先输入阿里百炼 API Key", "Please enter local FunASR-Pack API URL": "请先输入本地 FunASR-Pack API 地址", "Please upload media to transcribe": "请先上传需要转录的音频或视频文件", @@ -316,6 +334,11 @@ "Transcribing with Fun-ASR...": "正在使用阿里百炼 Fun-ASR 转写字幕,请稍候...", "Fun-ASR failed without subtitle file": "Fun-ASR 转写失败:未生成字幕文件", "Subtitle transcription succeeded": "字幕转写成功: {file}", + "Subtitle 
transcription succeeded for multiple files": "字幕转写成功,共 {count} 个文件: {files}", + "Calibrating subtitles...": "正在使用大模型校准字幕,请稍候...", + "Subtitle calibration succeeded": "字幕校准成功: {file}", + "Subtitle calibration succeeded for multiple files": "字幕校准成功,共 {count} 个文件: {files}", + "Subtitle calibration failed": "字幕校准失败", "Transcribed subtitles storage hint": "之前转录生成的字幕保存在 {path},可从该目录拖入上传", "剧情理解": "剧情理解", "剧情理解结果": "剧情理解结果", From 5b2487e879bfc01277ccc2a5c189bd6b9e4737f8 Mon Sep 17 00:00:00 2001 From: viccy Date: Fri, 5 Jun 2026 23:52:31 +0800 Subject: [PATCH 08/24] =?UTF-8?q?feat(indextts2,=20webui):=20=E5=AE=8C?= =?UTF-8?q?=E5=96=84=20IndexTTS2=20=E6=94=AF=E6=8C=81=EF=BC=8C=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E5=8F=82=E8=80=83=E9=9F=B3=E9=A2=91=E9=80=89=E6=8B=A9?= =?UTF-8?q?=E4=B8=8E=E9=A2=84=E8=A7=88=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修改内容包括: - 修正缺失参考音频的错误提示文本 - 更新示例配置文件,新增参考音频来源配置项并将 IndexTTS2 设为默认TTS引擎 - 更新语音服务模块的注释与文档字符串 - 新增多语言适配文案,支持新UI的所有提示内容 - 重构 IndexTTS2 设置页面:支持从资源目录选择音频、上传本地音频、预览音频效果 - 调整TTS引擎选项的排序与默认选中项 --- app/services/jianying_task.py | 2 +- app/services/voice.py | 4 +- config.example.toml | 7 +- webui/components/audio_settings.py | 291 +++++++++++++++++++++++++---- webui/i18n/en.json | 16 +- webui/i18n/zh.json | 16 +- 6 files changed, 290 insertions(+), 46 deletions(-) diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index f06d2f0..282cf47 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -71,7 +71,7 @@ def _normalize_indextts2_reference_audio(params: VideoClipParams) -> None: logger.info(f"IndexTTS2 使用配置中的参考音频: {configured_ref}") return - raise ValueError("IndexTTS2 参考音频不存在,请在音频设置中上传或填写有效的参考音频路径") + raise ValueError("IndexTTS2 参考音频不存在,请在音频设置中上传或选择有效的参考音频") def start_export_jianying_draft(task_id: str, params: VideoClipParams): diff --git a/app/services/voice.py b/app/services/voice.py index e6c94f7..58cd1c9 100644 --- 
a/app/services/voice.py +++ b/app/services/voice.py @@ -2236,7 +2236,7 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. Args: text: 要转换的文本 - voice_name: 参考音频路径(格式:indextts2:path/to/audio.wav) + voice_name: 参考音频文件(格式:indextts2:path/to/audio.wav) voice_file: 输出音频文件路径 speed: 语音速度(此引擎暂不支持速度调节) @@ -2253,7 +2253,7 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. num_beams = config.indextts2.get("num_beams", 3) repetition_penalty = config.indextts2.get("repetition_penalty", 10.0) - # 解析参考音频路径 + # 解析参考音频文件 reference_audio_path = parse_indextts2_voice(voice_name) if not reference_audio_path or not os.path.exists(reference_audio_path): diff --git a/config.example.toml b/config.example.toml index 2df60dc..1b2730d 100644 --- a/config.example.toml +++ b/config.example.toml @@ -120,7 +120,8 @@ # 默认 API 地址(本地部署) api_url = "http://127.0.0.1:8081/tts" - # 默认参考音频路径(可选) + # 默认参考音频(可选) + reference_audio_source = "resource" # reference_audio = "/path/to/reference_audio.wav" # 推理模式:普通推理 / 快速推理 @@ -151,8 +152,8 @@ silence_duration = 0.125 [ui] - # TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen, doubaotts) - tts_engine = "edge_tts" + # TTS引擎选择 (indextts2, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) + tts_engine = "indextts2" # Edge TTS 配置 edge_voice_name = "zh-CN-XiaoyiNeural-Female" diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index 862457d..121cadb 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -1,5 +1,6 @@ import streamlit as st import os +import shutil from uuid import uuid4 from app.config import config from app.services import voice @@ -8,6 +9,35 @@ from app.utils import utils from webui.utils.cache import get_songs_cache +INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/mp3" +INDEXTTS2_REFERENCE_AUDIO_COPY_SUBDIR = "indextts2_refs" +INDEXTTS2_REFERENCE_AUDIO_MAP = [ + 
("yingshijieshuo-zh-male.mp3", "影视解说", "Film Narration"), + ("maikeashe-zh-male.mp3", "麦克阿瑟", "Macintosh"), + ("dong-yuhui-zh-male.mp3", "董宇辉", "Dong Yuhui"), + ("fangzhenren-ad-fake-news-zh-male.mp3", "仿真人", "Realistic Human"), + ("fengyin-jilupian-jieshuo-zh-male.mp3", "风吟纪录片解说", "Fengyin Documentary Narration"), + ("guwo-dianying-jieshuo-zh-male.mp3", "顾我电影解说", "Guwo Film Narration"), + ("jia-xiaojun-final-zh-male.mp3", "贾小军", "Jia Xiaojun"), + ("junshi-zh-male.mp3", "军事解说", "Military Narration"), + ("qi-tongwei-v2-zh-male.mp3", "祁同伟", "Qi Tongwei"), + ("saima-niang-mambo-oye-zh-female.mp3", "赛马娘曼波欧耶版", "Uma Musume Mambo Oye Version"), + ("shejian-shangde-zhongguo-zh-male.mp3", "舌尖上的中国", "A Bite of China"), + ("xiaoming-jianmo-zh-male.mp3", "小明剑魔", "Xiaoming Sword Demon"), + ("xin-youxi-jieshuo-zh-male.mp3", "新游戏解说", "New Game Narration"), + ("xinzhong-zhicheng-zh-male.mp3", "心中之城", "City in the Heart"), + ("alex-chikna-en-male.mp3", "亚历克斯", "Alex Chikna"), + ("alle-en-unknown.mp3", "艾莉", "ALLE"), + ("calm-normal-en-unknown.mp3", "沉稳男声", "Calm Normal"), + ("donald-j-trump-noise-reduction-en-male.mp3", "唐纳德·特朗普", "Donald J. 
Trump"), + ("elite-en-unknown.mp3", "精英男声", "ELITE"), + ("horror-en-unknown.mp3", "惊悚男声", "Horror"), + ("meiqu-kelong-en-unknown.mp3", "美式男声", "US Clone"), + ("sarah-en-female.mp3", "莎拉", "Sarah"), +] +INDEXTTS2_REFERENCE_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg") + + def get_soulvoice_voices(): """获取 SoulVoice 语音列表""" # 检查是否配置了 SoulVoice API key @@ -22,12 +52,12 @@ def get_soulvoice_voices(): def get_tts_engine_options(tr=lambda key: key): """获取TTS引擎选项""" return { + "indextts2": "IndexTTS2", "edge_tts": "Edge TTS", - "azure_speech": "Azure Speech Services", - "tencent_tts": tr("Tencent Cloud TTS"), "qwen3_tts": tr("Tongyi Qwen3 TTS"), - "indextts2": tr("IndexTTS2 Voice Clone"), - "doubaotts": tr("Doubao TTS") + "tencent_tts": tr("Tencent Cloud TTS"), + "doubaotts": tr("Doubao TTS"), + "azure_speech": "Azure Speech Services" } @@ -59,9 +89,9 @@ def get_tts_engine_descriptions(tr=lambda key: key): "registration": "https://dashscope.aliyuncs.com/" }, "indextts2": { - "title": tr("IndexTTS2 Voice Clone"), + "title": "IndexTTS2", "features": tr("IndexTTS2 features"), - "use_case": tr("IndexTTS2 download link"), + "use_case": tr("IndexTTS2 use case"), "registration": None }, "doubaotts": { @@ -73,6 +103,143 @@ def get_tts_engine_descriptions(tr=lambda key: key): } +def infer_indextts2_reference_audio_language(filename): + """根据文件名推断参考音频语言""" + lower_filename = filename.lower() + if "-zh-" in lower_filename: + return "zh" + if "-en-" in lower_filename: + return "en" + return "unknown" + + +def get_indextts2_reference_audio_options(): + """获取本地 IndexTTS2 参考音频选项""" + options = [] + mapped_files = set() + + for filename, zh_name, en_name in INDEXTTS2_REFERENCE_AUDIO_MAP: + audio_path = os.path.join(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR, filename) + if os.path.isfile(audio_path): + options.append({ + "filename": filename, + "path": audio_path, + "zh": zh_name, + "en": en_name, + "language": infer_indextts2_reference_audio_language(filename), + }) + 
mapped_files.add(filename) + + if os.path.isdir(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR): + for filename in sorted(os.listdir(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR)): + if filename in mapped_files: + continue + if not filename.lower().endswith(INDEXTTS2_REFERENCE_AUDIO_EXTENSIONS): + continue + audio_path = os.path.join(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR, filename) + if not os.path.isfile(audio_path): + continue + fallback_name = os.path.splitext(filename)[0] + options.append({ + "filename": filename, + "path": audio_path, + "zh": fallback_name, + "en": fallback_name, + "language": infer_indextts2_reference_audio_language(filename), + }) + + return options + + +def format_indextts2_reference_audio_option(option): + """格式化 IndexTTS2 参考音频下拉显示名""" + zh_name = option.get("zh", "") + en_name = option.get("en", "") + language = option.get("language", "unknown") + ui_language = str(st.session_state.get("ui_language", "zh-CN")).lower() + + if ui_language.startswith("en"): + display_name = en_name or zh_name or option.get("filename", "") + language_labels = { + "zh": "Chinese", + "en": "English", + } + else: + display_name = zh_name or en_name or option.get("filename", "") + language_labels = { + "zh": "中文", + "en": "英文", + } + + language_label = language_labels.get(language) + if not language_label: + return display_name + + return f"{display_name} ({language_label})" + + +def get_indextts2_reference_audio_index(options, saved_reference_audio): + """根据已保存的参考音频文件匹配下拉选项索引""" + if not options: + return 0 + + saved_filename = os.path.basename(saved_reference_audio or "") + for index, option in enumerate(options): + if option["filename"] == saved_filename: + return index + + return 0 + + +def copy_indextts2_reference_audio(source_path): + """复制一份参考音频到项目存储目录,并返回复制后的路径""" + if not source_path or not os.path.isfile(source_path): + return "" + + target_dir = utils.storage_dir(INDEXTTS2_REFERENCE_AUDIO_COPY_SUBDIR, create=True) + target_path = os.path.join(target_dir, 
os.path.basename(source_path)) + + if os.path.abspath(source_path) == os.path.abspath(target_path): + return target_path + + should_copy = True + if os.path.exists(target_path): + should_copy = os.path.getsize(source_path) != os.path.getsize(target_path) + + if should_copy: + shutil.copy2(source_path, target_path) + + return target_path + + +def get_audio_mime_type(audio_path): + """根据音频文件扩展名返回 MIME 类型""" + extension = os.path.splitext(audio_path or "")[1].lower() + if extension == ".wav": + return "audio/wav" + if extension == ".ogg": + return "audio/ogg" + if extension == ".m4a": + return "audio/mp4" + if extension == ".aac": + return "audio/aac" + return "audio/mp3" + + +def render_reference_audio_preview_button(reference_audio, key, tr): + """渲染参考音频试听按钮""" + can_preview = bool(reference_audio and os.path.isfile(reference_audio)) + if st.button( + " ", + key=key, + icon=":material/play_arrow:", + help=tr("Preview Reference Audio Help"), + disabled=not can_preview, + use_container_width=True, + ): + st.session_state["indextts2_reference_audio_preview_path"] = reference_audio + + def is_valid_azure_voice_name(voice_name: str) -> bool: """检查是否为有效的Azure音色名称格式""" if not voice_name or not isinstance(voice_name, str): @@ -109,11 +276,11 @@ def render_tts_settings(tr): engine_descriptions = get_tts_engine_descriptions(tr) # 获取保存的TTS引擎设置 - saved_tts_engine = config.ui.get("tts_engine", "edge_tts") + saved_tts_engine = config.ui.get("tts_engine", "indextts2") # 确保保存的引擎在可用选项中 if saved_tts_engine not in engine_options: - saved_tts_engine = "edge_tts" + saved_tts_engine = "indextts2" # TTS引擎选择下拉框 selected_engine = st.selectbox( @@ -566,8 +733,6 @@ def render_qwen3_tts_settings(tr): def render_indextts2_tts_settings(tr): """渲染 IndexTTS2 TTS 设置""" - import os - # API 地址配置 api_url = st.text_input( tr("API URL"), @@ -575,29 +740,90 @@ def render_indextts2_tts_settings(tr): help=tr("IndexTTS2 API URL Help") ) - # 参考音频文件路径 - reference_audio = st.text_input( - tr("Reference Audio 
Path"), - value=config.indextts2.get("reference_audio", ""), - help=tr("Reference Audio Path Help") + saved_reference_audio = config.indextts2.get("reference_audio", "") + reference_audio_source_options = { + tr("Select from Resource Directory"): "resource", + tr("Upload Reference Audio"): "upload", + } + reference_audio_source_labels = list(reference_audio_source_options.keys()) + saved_reference_audio_source = config.indextts2.get("reference_audio_source", "resource") + if saved_reference_audio_source not in reference_audio_source_options.values(): + saved_reference_audio_source = "resource" + default_reference_audio_source_label = next( + label + for label, source_value in reference_audio_source_options.items() + if source_value == saved_reference_audio_source ) - - # 文件上传功能 - uploaded_file = st.file_uploader( - tr("Upload Reference Audio File"), - type=["wav", "mp3"], - help=tr("Upload Reference Audio Help") + + st.markdown(f"**{tr('Reference Audio Path')}**") + reference_audio_source_label = st.pills( + tr("Reference Audio Source"), + options=reference_audio_source_labels, + selection_mode="single", + default=default_reference_audio_source_label, + key="indextts2_reference_audio_source_selection", + help=tr("Reference Audio Source Help"), + label_visibility="collapsed", + width="stretch", ) - - if uploaded_file is not None: - # 保存上传的文件 - import tempfile - temp_dir = tempfile.gettempdir() - audio_path = os.path.join(temp_dir, f"indextts2_ref_{uploaded_file.name}") - with open(audio_path, "wb") as f: - f.write(uploaded_file.getbuffer()) - reference_audio = audio_path - st.success(tr("Audio uploaded").format(path=audio_path)) + if not reference_audio_source_label: + reference_audio_source_label = default_reference_audio_source_label + reference_audio_source = reference_audio_source_options[reference_audio_source_label] + + reference_audio = saved_reference_audio + reference_audio_options = get_indextts2_reference_audio_options() + if reference_audio_source == 
"resource" and reference_audio_options: + selected_audio_index = get_indextts2_reference_audio_index(reference_audio_options, saved_reference_audio) + select_col, preview_col = st.columns([5, 1]) + with select_col: + selected_audio_option = reference_audio_options[st.selectbox( + tr("Reference Audio Path"), + options=range(len(reference_audio_options)), + index=selected_audio_index, + format_func=lambda x: format_indextts2_reference_audio_option(reference_audio_options[x]), + help=tr("Reference Audio Path Help"), + label_visibility="collapsed" + )] + reference_audio = copy_indextts2_reference_audio(selected_audio_option["path"]) + with preview_col: + render_reference_audio_preview_button( + reference_audio, + "indextts2_resource_reference_audio_preview", + tr, + ) + elif reference_audio_source == "resource": + st.warning(tr("No Reference Audio Resources Found")) + + if reference_audio_source == "upload": + if saved_reference_audio_source != "upload": + reference_audio = "" + upload_col, preview_col = st.columns([5, 1]) + with upload_col: + uploaded_file = st.file_uploader( + tr("Upload Reference Audio File"), + type=["wav", "mp3"], + help=tr("Upload Reference Audio Help"), + label_visibility="collapsed" + ) + + if uploaded_file is not None: + target_dir = utils.storage_dir(INDEXTTS2_REFERENCE_AUDIO_COPY_SUBDIR, create=True) + audio_path = os.path.join(target_dir, f"uploaded_{uploaded_file.name}") + with open(audio_path, "wb") as f: + f.write(uploaded_file.getbuffer()) + reference_audio = audio_path + st.success(tr("Audio uploaded").format(path=audio_path)) + with preview_col: + render_reference_audio_preview_button( + reference_audio, + "indextts2_upload_reference_audio_preview", + tr, + ) + + preview_audio_path = st.session_state.get("indextts2_reference_audio_preview_path", "") + if preview_audio_path == reference_audio and os.path.isfile(preview_audio_path): + with open(preview_audio_path, "rb") as audio_file: + st.audio(audio_file.read(), 
format=get_audio_mime_type(preview_audio_path)) # 推理模式 infer_mode_options = [ @@ -676,6 +902,7 @@ def render_indextts2_tts_settings(tr): # 保存配置 config.indextts2["api_url"] = api_url + config.indextts2["reference_audio_source"] = reference_audio_source config.indextts2["reference_audio"] = reference_audio config.indextts2["infer_mode"] = infer_mode config.indextts2["temperature"] = temperature @@ -1175,5 +1402,5 @@ def get_audio_params(): 'bgm_type': st.session_state.get('bgm_type', 'random'), 'bgm_file': st.session_state.get('bgm_file', ''), 'bgm_volume': st.session_state.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME), - 'tts_engine': st.session_state.get('tts_engine', "edge_tts"), + 'tts_engine': st.session_state.get('tts_engine', "indextts2"), } diff --git a/webui/i18n/en.json b/webui/i18n/en.json index fe53cbc..f912355 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -235,7 +235,8 @@ "Tencent Cloud TTS use case": "Personal and enterprise users who need stable Chinese speech synthesis", "Tongyi Qwen3 TTS features": "Alibaba Cloud Tongyi Qwen speech synthesis with high-quality voices and multiple voice options.", "High-quality Chinese speech synthesis use case": "Users who need high-quality Chinese speech synthesis", - "IndexTTS2 features": "Zero-shot voice cloning. Upload a reference audio file to synthesize speech with a matching voice. Requires local or private deployment.", + "IndexTTS2 features": "A locally or privately deployed voice-cloning engine. Choose a resource audio file or upload a reference audio file, then synthesize narration in that voice.", + "IndexTTS2 use case": "Best for fixed narrator voices, character dubbing, or generating multiple videos with the same voice. Start the IndexTTS2 API service before use. 
Deployment package: https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS2 download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5", "Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.", "Select TTS Engine": "Select TTS Engine", @@ -387,9 +388,16 @@ "Select Qwen3 TTS Voice": "Select a Qwen3 TTS voice", "API URL": "API URL", "IndexTTS2 API URL Help": "IndexTTS2 API service URL", - "Reference Audio Path": "Reference Audio Path", - "Reference Audio Path Help": "Reference audio file path for voice cloning (WAV format, 3-10 seconds recommended)", - "Upload Reference Audio File": "Or Upload Reference Audio File", + "Reference Audio Source": "Reference Audio Source", + "Reference Audio Source Help": "Choose a reference audio from the resource directory or upload a new one.", + "Select from Resource Directory": "Select from Resource Directory", + "Upload Reference Audio": "Upload Reference Audio", + "Reference Audio Path": "Reference Audio", + "Reference Audio Path Help": "Choose the reference audio for voice cloning (WAV/MP3, 3-10 seconds recommended)", + "No Reference Audio Resources Found": "No reference audio resources found. 
Please upload a reference audio file.", + "Preview Reference Audio": "Preview", + "Preview Reference Audio Help": "Play the selected reference audio.", + "Upload Reference Audio File": "Upload Reference Audio File", "Upload Reference Audio Help": "Upload a clear audio clip for voice cloning", "Audio uploaded": "✅ Audio uploaded: {path}", "Inference Mode": "Inference Mode", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 27328ad..75c9721 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -216,7 +216,8 @@ "Tencent Cloud TTS use case": "个人和企业用户,需要稳定的中文语音合成", "Tongyi Qwen3 TTS features": "阿里云通义千问语音合成,音质优秀,支持多种音色", "High-quality Chinese speech synthesis use case": "需要高质量中文语音合成的用户", - "IndexTTS2 features": "零样本语音克隆,上传参考音频即可合成相同音色的语音,需要本地或私有部署", + "IndexTTS2 features": "本地/私有部署的语音克隆引擎。选择资源目录音频或上传参考音频后,可按该音色合成旁白。", + "IndexTTS2 use case": "适合需要固定旁白音色、角色配音或批量生成同一音色视频的场景。使用前请先启动 IndexTTS2 API 服务;部署包下载:https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS2 download link": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", "Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", "Select TTS Engine": "选择 TTS 引擎", @@ -369,9 +370,16 @@ "Select Qwen3 TTS Voice": "选择 Qwen3 TTS 音色", "API URL": "API 地址", "IndexTTS2 API URL Help": "IndexTTS2 API 服务地址", - "Reference Audio Path": "参考音频路径", - "Reference Audio Path Help": "用于语音克隆的参考音频文件路径(WAV 格式,建议 3-10 秒)", - "Upload Reference Audio File": "或上传参考音频文件", + "Reference Audio Source": "参考音频来源", + "Reference Audio Source Help": "选择从资源目录选择参考音频,或上传新的参考音频", + "Select from Resource Directory": "从资源目录选择", + "Upload Reference Audio": "上传参考音频", + "Reference Audio Path": "参考音频", + "Reference Audio Path Help": "选择用于语音克隆的参考音频(WAV/MP3 格式,建议 3-10 秒)", + "No Reference Audio Resources Found": "未找到资源目录中的参考音频,请上传参考音频文件", + "Preview Reference Audio": "试听", + "Preview Reference Audio Help": "播放当前参考音频", + "Upload Reference Audio File": "上传参考音频文件", "Upload Reference Audio Help": "上传一段清晰的音频用于语音克隆", "Audio uploaded": "✅ 音频已上传: {path}", "Inference Mode": 
"推理模式", From 33c17c26365ae2b337cf7e76e042e548623367f7 Mon Sep 17 00:00:00 2001 From: viccy Date: Sat, 6 Jun 2026 01:08:35 +0800 Subject: [PATCH 09/24] =?UTF-8?q?feat(subtitle,=20asr,=20bgm):=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E5=AD=97=E5=B9=95=E9=81=AE=E7=BD=A9=E3=80=81=E8=87=AA?= =?UTF-8?q?=E5=8A=A8=E8=BD=AC=E5=BD=95=E5=8A=9F=E8=83=BD=EF=BC=8C=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E8=83=8C=E6=99=AF=E9=9F=B3=E4=B9=90=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增字幕遮罩功能,可在烧录新字幕前遮盖原视频自带的字幕区域,支持横屏/竖屏自定义配置与预览调试 - 新增自动字幕转录功能,支持本地FunASR和阿里百炼在线转写,在最终视频合并完成后自动生成并压入成片字幕 - 重构背景音乐设置面板,新增从资源目录选择BGM、上传本地BGM文件的功能,新增BGM试听预览,优化交互流程 - 更新配置示例文件、数据Schema与多语言翻译文件,完善前后端参数传递逻辑 --- app/models/schema.py | 19 + app/services/generate_video.py | 202 +++++++++- app/services/task.py | 187 ++++++++- config.example.toml | 16 + webui/components/audio_settings.py | 239 +++++++++++- webui/components/subtitle_settings.py | 542 ++++++++++++++++++++++++-- webui/i18n/en.json | 46 ++- webui/i18n/zh.json | 46 ++- 8 files changed, 1239 insertions(+), 58 deletions(-) diff --git a/app/models/schema.py b/app/models/schema.py index a41b1e1..45d3658 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -183,6 +183,25 @@ class VideoClipParams(BaseModel): bgm_file: Optional[str] = Field(default="", description="背景音乐文件") subtitle_enabled: bool = True + subtitle_mask_enabled: bool = False + subtitle_mask_landscape_x_percent: float = 10.0 + subtitle_mask_landscape_y_percent: float = 78.0 + subtitle_mask_landscape_width_percent: float = 80.0 + subtitle_mask_landscape_height_percent: float = 14.0 + subtitle_mask_landscape_blur_radius: int = 18 + subtitle_mask_landscape_opacity_percent: int = 82 + subtitle_mask_portrait_x_percent: float = 8.0 + subtitle_mask_portrait_y_percent: float = 79.0 + subtitle_mask_portrait_width_percent: float = 84.0 + subtitle_mask_portrait_height_percent: float = 16.0 + subtitle_mask_portrait_blur_radius: int = 26 + 
subtitle_mask_portrait_opacity_percent: int = 84 + subtitle_auto_transcribe_enabled: bool = False + subtitle_auto_transcribe_backend: str = "local" + subtitle_auto_transcribe_api_url: str = "" + subtitle_auto_transcribe_api_key: str = "" + subtitle_auto_transcribe_hotword: str = "" + subtitle_auto_transcribe_enable_spk: bool = False font_name: str = "SimHei" # 默认使用黑体 font_size: int = 36 text_fore_color: str = "white" # 文本前景色 diff --git a/app/services/generate_video.py b/app/services/generate_video.py index 2eb633f..f5b808a 100644 --- a/app/services/generate_video.py +++ b/app/services/generate_video.py @@ -13,6 +13,7 @@ import traceback import tempfile from typing import Optional, Dict, Any from loguru import logger +import numpy as np from moviepy import ( VideoFileClip, AudioFileClip, @@ -22,13 +23,207 @@ from moviepy import ( afx ) from moviepy.video.tools.subtitles import SubtitlesClip -from PIL import ImageFont +from PIL import ImageFont, Image, ImageDraw, ImageEnhance, ImageFilter from app.utils import utils from app.models.schema import AudioVolumeDefaults from app.services.audio_normalizer import AudioNormalizer, normalize_audio_for_mixing +SUBTITLE_MASK_DEFAULTS = { + "landscape": { + "x_percent": 10.0, + "y_percent": 78.0, + "width_percent": 80.0, + "height_percent": 14.0, + "blur_radius": 18, + "opacity_percent": 82, + }, + "portrait": { + "x_percent": 8.0, + "y_percent": 79.0, + "width_percent": 84.0, + "height_percent": 16.0, + "blur_radius": 26, + "opacity_percent": 84, + }, +} + + +def _clamp(value, minimum, maximum): + return min(max(value, minimum), maximum) + + +def _get_numeric_option(options, key, default, integer=False): + try: + value = float(options.get(key, default)) + except (TypeError, ValueError): + value = float(default) + return int(round(value)) if integer else value + + +def _get_subtitle_mask_region_options(options, orientation): + defaults = SUBTITLE_MASK_DEFAULTS[orientation] + prefix = f"subtitle_mask_{orientation}_" + + x_percent 
= _clamp(_get_numeric_option(options, f"{prefix}x_percent", defaults["x_percent"]), 0, 99) + y_percent = _clamp(_get_numeric_option(options, f"{prefix}y_percent", defaults["y_percent"]), 0, 99) + width_percent = _clamp( + _get_numeric_option(options, f"{prefix}width_percent", defaults["width_percent"]), + 2, + 100 - x_percent, + ) + height_percent = _clamp( + _get_numeric_option(options, f"{prefix}height_percent", defaults["height_percent"]), + 2, + 100 - y_percent, + ) + blur_radius = _clamp( + _get_numeric_option(options, f"{prefix}blur_radius", defaults["blur_radius"], integer=True), + 0, + 200, + ) + opacity_percent = _clamp( + _get_numeric_option(options, f"{prefix}opacity_percent", defaults["opacity_percent"], integer=True), + 0, + 100, + ) + + return { + "x_percent": x_percent, + "y_percent": y_percent, + "width_percent": width_percent, + "height_percent": height_percent, + "blur_radius": blur_radius, + "opacity_percent": opacity_percent, + } + + +def _resolve_subtitle_mask_region(video_width, video_height, options): + orientation = "portrait" if video_height > video_width else "landscape" + region = _get_subtitle_mask_region_options(options, orientation) + + x = _clamp(round(video_width * region["x_percent"] / 100), 0, max(0, video_width - 2)) + y = _clamp(round(video_height * region["y_percent"] / 100), 0, max(0, video_height - 2)) + width = _clamp(round(video_width * region["width_percent"] / 100), 2, max(2, video_width - x)) + height = _clamp(round(video_height * region["height_percent"] / 100), 2, max(2, video_height - y)) + + base_height = 1920 if orientation == "portrait" else 1080 + blur_radius = ( + 0 + if region["blur_radius"] == 0 + else max(1, round(region["blur_radius"] * (video_height / base_height))) + ) + corner_radius = max(8, round(min(height * 0.32, blur_radius * 1.4 or height * 0.24))) + feather = max(6, round(max(blur_radius * 0.85, 8))) + padding = blur_radius + padded_x = max(0, x - padding) + padded_y = max(0, y - padding) + 
padded_width = _clamp(width + padding * 2, 2, video_width - padded_x) + padded_height = _clamp(height + padding * 2, 2, video_height - padded_y) + + return { + "orientation": orientation, + "x": int(x), + "y": int(y), + "width": int(width), + "height": int(height), + "blur_radius": int(blur_radius), + "opacity": _clamp(region["opacity_percent"] / 100, 0, 1), + "corner_radius": int(corner_radius), + "feather": int(feather), + "padded_x": int(padded_x), + "padded_y": int(padded_y), + "padded_width": int(padded_width), + "padded_height": int(padded_height), + } + + +def _build_subtitle_mask_alpha(region): + alpha = Image.new("L", (region["padded_width"], region["padded_height"]), 0) + draw = ImageDraw.Draw(alpha) + left = region["x"] - region["padded_x"] + top = region["y"] - region["padded_y"] + right = left + region["width"] + bottom = top + region["height"] + draw.rounded_rectangle( + (left, top, right, bottom), + radius=region["corner_radius"], + fill=255, + ) + if region["feather"] > 0: + alpha = alpha.filter(ImageFilter.GaussianBlur(radius=max(1, region["feather"] / 2))) + return alpha + + +def apply_subtitle_mask(video_clip, options): + """Apply a Speclip-style blurred subtitle mask before subtitle burn-in.""" + if not options.get("subtitle_mask_enabled", False): + return video_clip + + video_width, video_height = video_clip.size + region = _resolve_subtitle_mask_region(video_width, video_height, options) + logger.info( + "字幕遮罩已启用: " + f"{region['orientation']} x={region['x']} y={region['y']} " + f"w={region['width']} h={region['height']} blur={region['blur_radius']}" + ) + + alpha = _build_subtitle_mask_alpha(region) + tint_alpha = _clamp(round((0.05 + region["opacity"] * 0.07) * 100) / 100, 0.05, 0.14) + blur_sigma = ( + max(4, round(region["blur_radius"] * (0.9 + region["opacity"] * 0.35))) + if region["blur_radius"] > 0 + else 0 + ) + brightness = 1.0 + 0.03 + region["opacity"] * 0.04 + contrast = 0.975 - region["opacity"] * 0.035 + saturation = 1.0 + 
region["opacity"] * 0.03 + obliterate_width = max(24, round(region["padded_width"] * 0.12)) + obliterate_height = max(12, round(region["padded_height"] * 0.18)) + + def mask_frame(get_frame, t): + frame = np.asarray(get_frame(t)) + if frame.dtype != np.uint8: + frame = np.clip(frame, 0, 255).astype(np.uint8) + image = Image.fromarray(frame).convert("RGB") + crop_box = ( + region["padded_x"], + region["padded_y"], + region["padded_x"] + region["padded_width"], + region["padded_y"] + region["padded_height"], + ) + mask_image = image.crop(crop_box) + mask_image = mask_image.resize( + (obliterate_width, obliterate_height), + Image.Resampling.BICUBIC, + ).resize( + (region["padded_width"], region["padded_height"]), + Image.Resampling.LANCZOS, + ) + + if blur_sigma > 0: + mask_image = mask_image.filter(ImageFilter.GaussianBlur(radius=blur_sigma)) + mask_image = mask_image.filter(ImageFilter.BoxBlur(4)) + mask_image = ImageEnhance.Brightness(mask_image).enhance(brightness) + mask_image = ImageEnhance.Contrast(mask_image).enhance(contrast) + mask_image = ImageEnhance.Color(mask_image).enhance(saturation) + + blurred = mask_image.convert("RGBA") + blurred.putalpha(alpha) + + tint = Image.new("RGBA", blurred.size, (255, 255, 255, 0)) + tint_alpha_mask = alpha.point(lambda value: int(value * tint_alpha)) + tint.putalpha(tint_alpha_mask) + masked_region = Image.alpha_composite(blurred, tint) + + output = image.convert("RGBA") + output.alpha_composite(masked_region, dest=(region["padded_x"], region["padded_y"])) + return np.asarray(output.convert("RGB")) + + return video_clip.transform(mask_frame) + + def is_valid_subtitle_file(subtitle_path: str) -> bool: """ 检查字幕文件是否有效 @@ -121,6 +316,7 @@ def merge_materials( threads = options.get('threads', 2) fps = options.get('fps', 30) subtitle_enabled = options.get('subtitle_enabled', True) + subtitle_mask_enabled = bool(options.get('subtitle_mask_enabled', False)) # 配置日志 - 便于调试问题 logger.info(f"音量配置详情:") @@ -130,6 +326,7 @@ def 
merge_materials( logger.info(f" - 是否保留原声: {keep_original_audio}") logger.info(f"字幕配置详情:") logger.info(f" - 是否启用字幕: {subtitle_enabled}") + logger.info(f" - 是否启用字幕遮罩: {subtitle_mask_enabled}") logger.info(f" - 字幕文件路径: {subtitle_path}") # 音量参数验证 @@ -279,6 +476,9 @@ def merge_materials( # 处理视频尺寸 video_width, video_height = video_clip.size + + if subtitle_enabled and subtitle_mask_enabled: + video_clip = apply_subtitle_mask(video_clip, options) # 字幕处理函数 def create_text_clip(subtitle_item): diff --git a/app/services/task.py b/app/services/task.py index d1da33e..0b6b138 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -15,6 +15,121 @@ from app.services import state as sm from app.utils import utils +def _is_auto_transcription_enabled(params: VideoClipParams) -> bool: + return bool( + getattr(params, "subtitle_enabled", True) + and getattr(params, "subtitle_auto_transcribe_enabled", False) + ) + + +def _get_auto_transcription_backend(params: VideoClipParams) -> str: + backend = str(getattr(params, "subtitle_auto_transcribe_backend", "") or "").strip().lower() + if backend not in {"local", "bailian"}: + backend = "local" + return backend + + +def _build_subtitle_mask_options(params: VideoClipParams, enabled=None) -> dict: + mask_configured = bool( + getattr(params, "subtitle_enabled", True) + and getattr(params, "subtitle_mask_enabled", False) + ) + mask_enabled = mask_configured if enabled is None else mask_configured and enabled + return { + 'subtitle_mask_enabled': mask_enabled, + 'subtitle_mask_landscape_x_percent': getattr(params, "subtitle_mask_landscape_x_percent", 10.0), + 'subtitle_mask_landscape_y_percent': getattr(params, "subtitle_mask_landscape_y_percent", 78.0), + 'subtitle_mask_landscape_width_percent': getattr(params, "subtitle_mask_landscape_width_percent", 80.0), + 'subtitle_mask_landscape_height_percent': getattr(params, "subtitle_mask_landscape_height_percent", 14.0), + 'subtitle_mask_landscape_blur_radius': getattr(params, 
"subtitle_mask_landscape_blur_radius", 18), + 'subtitle_mask_landscape_opacity_percent': getattr(params, "subtitle_mask_landscape_opacity_percent", 82), + 'subtitle_mask_portrait_x_percent': getattr(params, "subtitle_mask_portrait_x_percent", 8.0), + 'subtitle_mask_portrait_y_percent': getattr(params, "subtitle_mask_portrait_y_percent", 79.0), + 'subtitle_mask_portrait_width_percent': getattr(params, "subtitle_mask_portrait_width_percent", 84.0), + 'subtitle_mask_portrait_height_percent': getattr(params, "subtitle_mask_portrait_height_percent", 16.0), + 'subtitle_mask_portrait_blur_radius': getattr(params, "subtitle_mask_portrait_blur_radius", 26), + 'subtitle_mask_portrait_opacity_percent': getattr(params, "subtitle_mask_portrait_opacity_percent", 84), + } + + +def _transcribe_final_video(task_id: str, video_path: str, params: VideoClipParams) -> str: + """Transcribe the fully merged video into an SRT file.""" + from app.services import fun_asr_subtitle + + if not video_path or not path.exists(video_path): + raise FileNotFoundError(f"自动转录视频不存在: {video_path}") + + backend = _get_auto_transcription_backend(params) + subtitle_file = path.join(utils.task_dir(task_id), "auto_transcribed_final.srt") + logger.info(f"开始自动转录最终视频: {video_path}, backend={backend}") + + if backend == "local": + api_url = str( + getattr(params, "subtitle_auto_transcribe_api_url", "") + or config.fun_asr.get("api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL) + ).strip() + if not api_url: + raise ValueError("请先输入本地 FunASR-Pack API 地址") + + generated_path = fun_asr_subtitle.create_with_local_fun_asr( + local_file=video_path, + subtitle_file=subtitle_file, + api_url=api_url, + hotword=str(getattr(params, "subtitle_auto_transcribe_hotword", "") or "").strip(), + enable_spk=bool(getattr(params, "subtitle_auto_transcribe_enable_spk", False)), + ) + else: + api_key = str( + getattr(params, "subtitle_auto_transcribe_api_key", "") + or config.fun_asr.get("api_key", "") + ).strip() + if not api_key: + 
raise ValueError("请先输入阿里百炼 API Key") + + generated_path = fun_asr_subtitle.create_with_fun_asr( + local_file=video_path, + subtitle_file=subtitle_file, + api_key=api_key, + ) + + if not generated_path or not path.exists(generated_path): + raise RuntimeError("自动转录失败:未生成字幕文件") + + logger.info(f"自动转录字幕生成成功: {generated_path}") + return generated_path + + +def _merge_auto_transcribed_subtitles( + source_video_path: str, + output_video_path: str, + subtitle_path: str, + params: VideoClipParams, +) -> str: + subtitle_options = { + 'voice_volume': 1.0, + 'bgm_volume': 0.0, + 'original_audio_volume': 1.0, + 'keep_original_audio': True, + 'subtitle_enabled': True, + 'subtitle_font': params.font_name, + 'subtitle_font_size': params.font_size, + 'subtitle_color': params.text_fore_color, + 'subtitle_bg_color': None, + 'subtitle_position': params.subtitle_position, + 'custom_position': params.custom_position, + 'threads': params.n_threads, + **_build_subtitle_mask_options(params, enabled=True), + } + return generate_video.merge_materials( + video_path=source_video_path, + audio_path="", + subtitle_path=subtitle_path, + bgm_path="", + output_path=output_video_path, + options=subtitle_options + ) + + def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict = None): """ 后台任务(统一视频裁剪处理)- 优化版本 @@ -200,10 +315,19 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di 6. 合并字幕/BGM/配音/视频 """ output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") - logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}") + auto_transcription_enabled = _is_auto_transcription_enabled(params) + merge_output_video_path = ( + path.join(utils.task_dir(task_id), "combined_without_auto_subtitles.mp4") + if auto_transcription_enabled + else output_video_path + ) + logger.info(f"\n\n## 6. 
最后一步: 合并字幕/BGM/配音/视频 -> {merge_output_video_path}") # bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3' - bgm_path = utils.get_bgm_file() + bgm_path = utils.get_bgm_file( + bgm_type=getattr(params, "bgm_type", "random"), + bgm_file=getattr(params, "bgm_file", ""), + ) # 获取优化的音量配置 optimized_volumes = get_recommended_volumes_for_content('mixed') @@ -232,24 +356,39 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di 'bgm_volume': final_bgm_volume, # 背景音乐音量(优化后) 'original_audio_volume': final_original_volume, # 视频原声音量(优化后) 'keep_original_audio': True, # 是否保留原声 - 'subtitle_enabled': params.subtitle_enabled, # 是否启用字幕 - 修复字幕开关bug + 'subtitle_enabled': params.subtitle_enabled and not auto_transcription_enabled, 'subtitle_font': params.font_name, # 这里使用相对字体路径,会自动在 font_dir() 目录下查找 'subtitle_font_size': params.font_size, 'subtitle_color': params.text_fore_color, 'subtitle_bg_color': None, # 直接使用None表示透明背景 'subtitle_position': params.subtitle_position, 'custom_position': params.custom_position, - 'threads': params.n_threads + 'threads': params.n_threads, + **_build_subtitle_mask_options(params, enabled=not auto_transcription_enabled), } generate_video.merge_materials( video_path=combined_video_path, audio_path=merged_audio_path, subtitle_path=merged_subtitle_path, bgm_path=bgm_path, - output_path=output_video_path, + output_path=merge_output_video_path, options=options ) + auto_subtitle_path = "" + if auto_transcription_enabled: + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=90) + logger.info("\n\n## 7. 自动转录最终视频字幕") + auto_subtitle_path = _transcribe_final_video(task_id, merge_output_video_path, params) + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=95) + logger.info(f"\n\n## 8. 
压入自动转录字幕 -> {output_video_path}") + _merge_auto_transcribed_subtitles( + source_video_path=merge_output_video_path, + output_video_path=output_video_path, + subtitle_path=auto_subtitle_path, + params=params, + ) + final_video_paths.append(output_video_path) combined_video_paths.append(combined_video_path) @@ -259,6 +398,8 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di "videos": final_video_paths, "combined_videos": combined_video_paths } + if auto_subtitle_path: + kwargs["subtitles"] = [auto_subtitle_path] sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs) return kwargs @@ -416,9 +557,18 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): 6. 合并字幕/BGM/配音/视频 """ output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") - logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}") + auto_transcription_enabled = _is_auto_transcription_enabled(params) + merge_output_video_path = ( + path.join(utils.task_dir(task_id), "combined_without_auto_subtitles.mp4") + if auto_transcription_enabled + else output_video_path + ) + logger.info(f"\n\n## 6. 
最后一步: 合并字幕/BGM/配音/视频 -> {merge_output_video_path}") - bgm_path = utils.get_bgm_file() + bgm_path = utils.get_bgm_file( + bgm_type=getattr(params, "bgm_type", "random"), + bgm_file=getattr(params, "bgm_file", ""), + ) # 获取优化的音量配置 optimized_volumes = get_recommended_volumes_for_content('mixed') @@ -446,24 +596,39 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): 'bgm_volume': final_bgm_volume, 'original_audio_volume': final_original_volume, 'keep_original_audio': True, - 'subtitle_enabled': params.subtitle_enabled, + 'subtitle_enabled': params.subtitle_enabled and not auto_transcription_enabled, 'subtitle_font': params.font_name, 'subtitle_font_size': params.font_size, 'subtitle_color': params.text_fore_color, 'subtitle_bg_color': None, 'subtitle_position': params.subtitle_position, 'custom_position': params.custom_position, - 'threads': params.n_threads + 'threads': params.n_threads, + **_build_subtitle_mask_options(params, enabled=not auto_transcription_enabled), } generate_video.merge_materials( video_path=combined_video_path, audio_path=merged_audio_path, subtitle_path=merged_subtitle_path, bgm_path=bgm_path, - output_path=output_video_path, + output_path=merge_output_video_path, options=options ) + auto_subtitle_path = "" + if auto_transcription_enabled: + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=90) + logger.info("\n\n## 7. 自动转录最终视频字幕") + auto_subtitle_path = _transcribe_final_video(task_id, merge_output_video_path, params) + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=95) + logger.info(f"\n\n## 8. 
压入自动转录字幕 -> {output_video_path}") + _merge_auto_transcribed_subtitles( + source_video_path=merge_output_video_path, + output_video_path=output_video_path, + subtitle_path=auto_subtitle_path, + params=params, + ) + final_video_paths.append(output_video_path) combined_video_paths.append(combined_video_path) @@ -473,6 +638,8 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): "videos": final_video_paths, "combined_videos": combined_video_paths } + if auto_subtitle_path: + kwargs["subtitles"] = [auto_subtitle_path] sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs) return kwargs diff --git a/config.example.toml b/config.example.toml index 1b2730d..89217eb 100644 --- a/config.example.toml +++ b/config.example.toml @@ -105,6 +105,7 @@ [fun_asr] # Fun-ASR 字幕转录配置 # backend = "local" 使用本地 FunASR-Pack API;backend = "bailian" 使用阿里百炼在线 fun-asr + auto_transcribe_enabled = false backend = "local" api_url = "http://127.0.0.1:7860" hotword = "" @@ -171,6 +172,21 @@ doubaotts_voice_type = "BV700_V2_streaming" doubaotts_rate = 1.0 + # 字幕遮罩配置:用于在烧录新字幕前遮盖原视频自带字幕 + subtitle_mask_enabled = false + subtitle_mask_landscape_x_percent = 10 + subtitle_mask_landscape_y_percent = 78 + subtitle_mask_landscape_width_percent = 80 + subtitle_mask_landscape_height_percent = 14 + subtitle_mask_landscape_blur_radius = 18 + subtitle_mask_landscape_opacity_percent = 82 + subtitle_mask_portrait_x_percent = 8 + subtitle_mask_portrait_y_percent = 79 + subtitle_mask_portrait_width_percent = 84 + subtitle_mask_portrait_height_percent = 16 + subtitle_mask_portrait_blur_radius = 26 + subtitle_mask_portrait_opacity_percent = 84 + ########################################## # 代理和网络配置 ########################################## diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index 121cadb..43b48fd 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -1,12 +1,12 @@ import streamlit as st 
import os import shutil +import json from uuid import uuid4 from app.config import config from app.services import voice from app.models.schema import AudioVolumeDefaults from app.utils import utils -from webui.utils.cache import get_songs_cache INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/mp3" @@ -36,6 +36,10 @@ INDEXTTS2_REFERENCE_AUDIO_MAP = [ ("sarah-en-female.mp3", "莎拉", "Sarah"), ] INDEXTTS2_REFERENCE_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg") +BGM_RESOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/bgms-safe" +BGM_TRACKS_JSON = os.path.join(BGM_RESOURCE_DIR, "tracks.json") +BGM_UPLOAD_SUBDIR = "uploaded_bgms" +BGM_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg") def get_soulvoice_voices(): @@ -212,11 +216,106 @@ def copy_indextts2_reference_audio(source_path): return target_path +def load_bgm_tracks_metadata(): + """读取 BGM 资源描述信息。""" + if not os.path.isfile(BGM_TRACKS_JSON): + return {} + + try: + with open(BGM_TRACKS_JSON, "r", encoding="utf-8") as f: + tracks = json.load(f) + except (OSError, json.JSONDecodeError): + return {} + + if not isinstance(tracks, list): + return {} + + metadata = {} + for track in tracks: + if not isinstance(track, dict): + continue + filename = track.get("fileName") + if filename: + metadata[filename] = track + + return metadata + + +def get_bgm_resource_options(): + """获取 BGM 资源目录中的音频选项。""" + options = [] + metadata = load_bgm_tracks_metadata() + added_files = set() + + for filename, track in metadata.items(): + audio_path = os.path.join(BGM_RESOURCE_DIR, filename) + if not os.path.isfile(audio_path): + continue + + options.append({ + "filename": filename, + "path": audio_path, + "title": track.get("title") or os.path.splitext(filename)[0], + "style": track.get("style", ""), + "category": track.get("category", ""), + }) + added_files.add(filename) + + if os.path.isdir(BGM_RESOURCE_DIR): + for filename in sorted(os.listdir(BGM_RESOURCE_DIR)): + if 
filename in added_files: + continue + if not filename.lower().endswith(BGM_AUDIO_EXTENSIONS): + continue + + audio_path = os.path.join(BGM_RESOURCE_DIR, filename) + if not os.path.isfile(audio_path): + continue + + options.append({ + "filename": filename, + "path": audio_path, + "title": os.path.splitext(filename)[0], + "style": "", + "category": "", + }) + + return options + + +def format_bgm_resource_option(option): + """格式化 BGM 资源下拉显示名。""" + title = option.get("title") or os.path.splitext(option.get("filename", ""))[0] + style = option.get("style", "") + category = option.get("category", "") + + if style: + return f"{title} ({style})" + if category: + return f"{title} ({category})" + return title + + +def get_bgm_resource_index(options, saved_bgm_file): + """根据已保存的 BGM 文件匹配下拉选项索引。""" + if not options: + return 0 + + saved_filename = os.path.basename(saved_bgm_file or "") + for index, option in enumerate(options): + if option["filename"] == saved_filename: + return index + + return 0 + + def get_audio_mime_type(audio_path): """根据音频文件扩展名返回 MIME 类型""" extension = os.path.splitext(audio_path or "")[1].lower() if extension == ".wav": return "audio/wav" + if extension == ".flac": + return "audio/flac" if extension == ".ogg": return "audio/ogg" if extension == ".m4a": @@ -240,6 +339,20 @@ def render_reference_audio_preview_button(reference_audio, key, tr): st.session_state["indextts2_reference_audio_preview_path"] = reference_audio +def render_bgm_preview_button(bgm_file, key, tr): + """渲染 BGM 试听按钮。""" + can_preview = bool(bgm_file and os.path.isfile(bgm_file)) + if st.button( + " ", + key=key, + icon=":material/play_arrow:", + help=tr("Preview Background Music Help"), + disabled=not can_preview, + use_container_width=True, + ): + st.session_state["bgm_preview_path"] = bgm_file + + def is_valid_azure_voice_name(voice_name: str) -> bool: """检查是否为有效的Azure音色名称格式""" if not voice_name or not isinstance(voice_name, str): @@ -262,7 +375,13 @@ def render_audio_panel(tr): # 
渲染TTS设置 render_tts_settings(tr) - # 渲染背景音乐设置 + # 背景音乐独立成框,放在音频设置下方 + render_bgm_panel(tr) + + +def render_bgm_panel(tr): + """渲染背景音乐设置面板""" + with st.container(border=True): render_bgm_settings(tr) @@ -1356,29 +1475,106 @@ def render_voice_preview(tr, voice_name): def render_bgm_settings(tr): """渲染背景音乐设置""" - # 背景音乐选项 - bgm_options = [ - (tr("No Background Music"), ""), - (tr("Random Background Music"), "random"), - (tr("Custom Background Music"), "custom"), - ] + saved_bgm_file = st.session_state.get('bgm_file', '') + saved_bgm_source = st.session_state.get('bgm_source', 'resource') + if st.session_state.get('bgm_type') == "": + saved_bgm_source = "none" - selected_index = st.selectbox( - tr("Background Music"), - index=1, - options=range(len(bgm_options)), - format_func=lambda x: bgm_options[x][0], + bgm_source_options = { + tr("Select from Resource Directory"): "resource", + tr("Upload Background Music"): "upload", + tr("No Background Music"): "none", + } + if saved_bgm_source not in bgm_source_options.values(): + saved_bgm_source = "resource" + + default_bgm_source_label = next( + label + for label, source_value in bgm_source_options.items() + if source_value == saved_bgm_source ) - # 获取选择的背景音乐类型 - bgm_type = bgm_options[selected_index][1] - st.session_state['bgm_type'] = bgm_type + st.markdown(f"**{tr('Background Music')}**") + bgm_source_label = st.pills( + tr("Background Music Source"), + options=list(bgm_source_options.keys()), + selection_mode="single", + default=default_bgm_source_label, + key="bgm_source_selection", + help=tr("Background Music Source Help"), + label_visibility="collapsed", + width="stretch", + ) + if not bgm_source_label: + bgm_source_label = default_bgm_source_label - # 自定义背景音乐处理 - if bgm_type == "custom": - custom_bgm_file = st.text_input(tr("Custom Background Music File")) - if custom_bgm_file and os.path.exists(custom_bgm_file): - st.session_state['bgm_file'] = custom_bgm_file + bgm_source = bgm_source_options[bgm_source_label] + 
bgm_file = "" + bgm_name = "" + + if bgm_source == "resource": + bgm_options = get_bgm_resource_options() + if bgm_options: + selected_bgm_index = get_bgm_resource_index(bgm_options, saved_bgm_file) + select_col, preview_col = st.columns([5, 1]) + with select_col: + selected_bgm_option = bgm_options[st.selectbox( + tr("Background Music"), + options=range(len(bgm_options)), + index=selected_bgm_index, + format_func=lambda x: format_bgm_resource_option(bgm_options[x]), + help=tr("Background Music Path Help"), + label_visibility="collapsed" + )] + bgm_file = selected_bgm_option["path"] + bgm_name = selected_bgm_option["title"] + with preview_col: + render_bgm_preview_button( + bgm_file, + "resource_bgm_preview", + tr, + ) + else: + st.warning(tr("No Background Music Resources Found")) + + if bgm_source == "upload": + if st.session_state.get('bgm_source') != "upload": + saved_bgm_file = "" + bgm_file = saved_bgm_file if saved_bgm_file and os.path.isfile(saved_bgm_file) else "" + bgm_name = os.path.splitext(os.path.basename(bgm_file))[0] if bgm_file else "" + upload_col, preview_col = st.columns([5, 1]) + with upload_col: + uploaded_file = st.file_uploader( + tr("Upload Background Music File"), + type=[extension.lstrip(".") for extension in BGM_AUDIO_EXTENSIONS], + help=tr("Upload Background Music Help"), + label_visibility="collapsed" + ) + + if uploaded_file is not None: + target_dir = utils.storage_dir(BGM_UPLOAD_SUBDIR, create=True) + bgm_file = os.path.join(target_dir, f"uploaded_{uploaded_file.name}") + with open(bgm_file, "wb") as f: + f.write(uploaded_file.getbuffer()) + bgm_name = os.path.splitext(uploaded_file.name)[0] + st.success(tr("Background Music uploaded").format(path=bgm_file)) + with preview_col: + render_bgm_preview_button( + bgm_file, + "upload_bgm_preview", + tr, + ) + + preview_bgm_path = st.session_state.get("bgm_preview_path", "") + if preview_bgm_path == bgm_file and os.path.isfile(preview_bgm_path): + with open(preview_bgm_path, "rb") as 
audio_file: + st.audio(audio_file.read(), format=get_audio_mime_type(preview_bgm_path)) + + bgm_type = "" if bgm_source == "none" or not bgm_file else "custom" + st.session_state['bgm_source'] = bgm_source + st.session_state['bgm_type'] = bgm_type + st.session_state['bgm_file'] = bgm_file if bgm_type else "" + st.session_state['bgm_name'] = bgm_name if bgm_type else "" # 背景音乐音量 - 使用统一的默认值 bgm_volume = st.slider( @@ -1399,6 +1595,7 @@ def get_audio_params(): 'voice_volume': st.session_state.get('voice_volume', AudioVolumeDefaults.VOICE_VOLUME), 'voice_rate': st.session_state.get('voice_rate', 1.0), 'voice_pitch': st.session_state.get('voice_pitch', 1.0), + 'bgm_name': st.session_state.get('bgm_name', ''), 'bgm_type': st.session_state.get('bgm_type', 'random'), 'bgm_file': st.session_state.get('bgm_file', ''), 'bgm_volume': st.session_state.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME), diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py index 41fac95..ac3793b 100644 --- a/webui/components/subtitle_settings.py +++ b/webui/components/subtitle_settings.py @@ -1,47 +1,507 @@ - -from loguru import logger import streamlit as st from app.config import config +from app.utils import utils from webui.utils.cache import get_fonts_cache +import hashlib import os +SUBTITLE_MASK_DEFAULTS = { + "landscape": { + "x_percent": 10, + "y_percent": 78, + "width_percent": 80, + "height_percent": 14, + "blur_radius": 18, + "opacity_percent": 82, + }, + "portrait": { + "x_percent": 8, + "y_percent": 79, + "width_percent": 84, + "height_percent": 16, + "blur_radius": 26, + "opacity_percent": 84, + }, +} + + +VIDEO_PREVIEW_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"] + + def render_subtitle_panel(tr): """渲染字幕设置面板""" with st.container(border=True): st.write(tr("Subtitle Settings")) - st.info(tr("Subtitle TTS support notice")) - # 检查是否选择了 SoulVoice qwen3_tts引擎 - from app.services import voice - # current_voice = 
st.session_state.get('voice_name', '') tts_engine = config.ui.get('tts_engine', '') is_disabled_subtitle = is_disabled_subtitle_settings(tts_engine) if is_disabled_subtitle: - # SoulVoice 引擎时显示禁用提示 st.warning(tr("TTS engine does not support precise subtitles").format(engine=tts_engine)) - st.info(tr("Manual subtitle editing recommendation")) - # 强制禁用字幕 - st.session_state['subtitle_enabled'] = False + enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True) + st.session_state['subtitle_enabled'] = enable_subtitles - # 显示禁用状态的复选框 - st.checkbox( - tr("Enable Subtitles"), - value=False, - disabled=True, - help=tr("Disabled subtitles help") - ) + if enable_subtitles: + render_subtitle_mask_settings(tr) + render_auto_transcription_settings(tr) + render_font_settings(tr) + render_position_settings(tr) + render_style_settings(tr) else: - # 其他引擎正常显示字幕选项 - enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True) - st.session_state['subtitle_enabled'] = enable_subtitles + st.session_state['subtitle_mask_enabled'] = False + config.ui["subtitle_mask_enabled"] = False + st.session_state['subtitle_auto_transcribe_enabled'] = False + config.fun_asr["auto_transcribe_enabled"] = False - if enable_subtitles: - render_font_settings(tr) - render_position_settings(tr) - render_style_settings(tr) + +def _subtitle_mask_key(orientation, field): + return f"subtitle_mask_{orientation}_{field}" + + +def _get_subtitle_mask_value(orientation, field): + key = _subtitle_mask_key(orientation, field) + return config.ui.get(key, SUBTITLE_MASK_DEFAULTS[orientation][field]) + + +def _set_subtitle_mask_value(orientation, field, value): + key = _subtitle_mask_key(orientation, field) + config.ui[key] = value + st.session_state[key] = value + + +def _format_preview_time(seconds): + seconds = max(0.0, float(seconds or 0)) + minutes = int(seconds // 60) + remaining_seconds = seconds - minutes * 60 + return f"{minutes:02d}:{remaining_seconds:04.1f}" + + +def 
_get_current_preview_video_path(): + uploaded_path = st.session_state.get("subtitle_mask_preview_video_path") + if uploaded_path and os.path.exists(uploaded_path): + return uploaded_path + + video_path = st.session_state.get("video_origin_path", "") + if isinstance(video_path, str) and video_path and os.path.exists(video_path): + return video_path + + video_paths = st.session_state.get("video_origin_paths", []) + if isinstance(video_paths, list): + for path in video_paths: + if isinstance(path, str) and path and os.path.exists(path): + return path + + return "" + + +def _save_subtitle_mask_preview_video(uploaded_file): + if uploaded_file is None: + return "" + + signature = f"{uploaded_file.name}:{uploaded_file.size}" + existing_signature = st.session_state.get("subtitle_mask_preview_upload_signature") + existing_path = st.session_state.get("subtitle_mask_preview_video_path", "") + if signature == existing_signature and existing_path and os.path.exists(existing_path): + return existing_path + + target_dir = utils.temp_dir("subtitle_mask_preview") + safe_name = os.path.basename(uploaded_file.name).strip() or "preview.mp4" + digest = hashlib.md5(signature.encode("utf-8")).hexdigest()[:10] + preview_path = os.path.join(target_dir, f"{digest}_{safe_name}") + + with open(preview_path, "wb") as f: + f.write(uploaded_file.getbuffer()) + + st.session_state["subtitle_mask_preview_upload_signature"] = signature + st.session_state["subtitle_mask_preview_video_path"] = preview_path + return preview_path + + +def _video_mtime(video_path): + try: + return os.path.getmtime(video_path) + except OSError: + return 0 + + +@st.cache_data(show_spinner=False) +def _probe_subtitle_mask_preview_video(video_path, mtime): + from moviepy import VideoFileClip + + clip = VideoFileClip(video_path) + try: + return { + "duration": float(clip.duration or 0), + "width": int(clip.w), + "height": int(clip.h), + } + finally: + clip.close() + + +@st.cache_data(show_spinner=False) +def 
_extract_subtitle_mask_preview_frame(video_path, timestamp, mtime): + import numpy as np + from moviepy import VideoFileClip + + clip = VideoFileClip(video_path) + try: + safe_time = min(max(float(timestamp or 0), 0.0), max(float(clip.duration or 0), 0.0)) + frame = np.asarray(clip.get_frame(safe_time)) + if frame.dtype != np.uint8: + frame = np.clip(frame, 0, 255).astype(np.uint8) + return frame + finally: + clip.close() + + +def _build_subtitle_mask_preview_options(): + options = {"subtitle_mask_enabled": True} + for orientation in ("landscape", "portrait"): + for field in ("x_percent", "y_percent", "width_percent", "height_percent", "blur_radius", "opacity_percent"): + options[_subtitle_mask_key(orientation, field)] = _get_subtitle_mask_value(orientation, field) + return options + + +def _draw_subtitle_mask_preview(frame): + from PIL import Image, ImageDraw + from app.services.generate_video import _resolve_subtitle_mask_region + + image = Image.fromarray(frame).convert("RGBA") + region = _resolve_subtitle_mask_region(image.width, image.height, _build_subtitle_mask_preview_options()) + + overlay = Image.new("RGBA", image.size, (0, 0, 0, 0)) + draw = ImageDraw.Draw(overlay) + rect = ( + region["x"], + region["y"], + region["x"] + region["width"], + region["y"] + region["height"], + ) + draw.rounded_rectangle( + rect, + radius=region["corner_radius"], + fill=(0, 0, 0, 96), + outline=(255, 75, 85, 235), + width=max(2, round(min(image.width, image.height) * 0.004)), + ) + image.alpha_composite(overlay) + return image.convert("RGB"), region + + +def _resize_subtitle_mask_preview_image(image, max_width=520, max_height=360): + image = image.copy() + image.thumbnail((max_width, max_height)) + return image + + +def _render_subtitle_mask_preview(tr): + st.subheader(tr("Subtitle Mask Preview")) + + uploaded_path = st.session_state.get("subtitle_mask_preview_video_path", "") + if uploaded_path and os.path.exists(uploaded_path): + preview_cols = st.columns([0.68, 0.32], 
vertical_alignment="center") + with preview_cols[0]: + st.caption( + tr("Using Subtitle Mask Preview Video").format( + file=os.path.basename(uploaded_path) + ) + ) + with preview_cols[1]: + if st.button( + tr("Change Subtitle Mask Preview Video"), + key="change_subtitle_mask_preview_video", + use_container_width=True, + ): + st.session_state.pop("subtitle_mask_preview_video_path", None) + st.session_state.pop("subtitle_mask_preview_upload_signature", None) + st.rerun(scope="fragment") + else: + uploaded_file = st.file_uploader( + tr("Upload Subtitle Mask Preview Video"), + type=VIDEO_PREVIEW_UPLOAD_TYPES, + key="subtitle_mask_preview_video_uploader", + help=tr("Upload Subtitle Mask Preview Video Help"), + ) + uploaded_path = _save_subtitle_mask_preview_video(uploaded_file) + if uploaded_path: + st.rerun(scope="fragment") + + preview_video_path = uploaded_path or _get_current_preview_video_path() + + if not preview_video_path: + st.info(tr("Subtitle Mask Preview Empty")) + return + + try: + mtime = _video_mtime(preview_video_path) + video_info = _probe_subtitle_mask_preview_video(preview_video_path, mtime) + duration = max(0.0, video_info["duration"]) + if duration <= 0: + st.warning(tr("Subtitle Mask Preview Failed")) + return + + selected_time = st.slider( + tr("Subtitle Mask Preview Timeline"), + min_value=0.0, + max_value=duration, + value=min(float(st.session_state.get("subtitle_mask_preview_time", 0.0)), duration), + step=0.1, + format="%.1f", + key="subtitle_mask_preview_time", + help=tr("Subtitle Mask Preview Timeline Help"), + ) + frame = _extract_subtitle_mask_preview_frame(preview_video_path, selected_time, mtime) + preview_image, region = _draw_subtitle_mask_preview(frame) + preview_image = _resize_subtitle_mask_preview_image(preview_image, max_width=420, max_height=280) + st.image( + preview_image, + caption=tr("Subtitle Mask Preview Frame Caption").format( + time=_format_preview_time(selected_time), + orientation=tr("Portrait") if region["orientation"] 
== "portrait" else tr("Landscape"), + ), + ) + except Exception: + st.warning(tr("Subtitle Mask Preview Failed")) + + +def _render_subtitle_mask_region_controls(tr, orientation): + x_percent = st.slider( + tr("Subtitle Mask Left"), + min_value=0, + max_value=99, + value=int(_get_subtitle_mask_value(orientation, "x_percent")), + help=tr("Subtitle Mask Left Help"), + key=f"{orientation}_subtitle_mask_x_percent", + ) + _set_subtitle_mask_value(orientation, "x_percent", x_percent) + + y_percent = st.slider( + tr("Subtitle Mask Top"), + min_value=0, + max_value=99, + value=int(_get_subtitle_mask_value(orientation, "y_percent")), + help=tr("Subtitle Mask Top Help"), + key=f"{orientation}_subtitle_mask_y_percent", + ) + _set_subtitle_mask_value(orientation, "y_percent", y_percent) + + max_width = max(2, 100 - x_percent) + width_widget_key = f"{orientation}_subtitle_mask_width_percent" + if st.session_state.get(width_widget_key, 2) < 2: + st.session_state[width_widget_key] = 2 + if st.session_state.get(width_widget_key, 0) > max_width: + st.session_state[width_widget_key] = max_width + width_percent = st.slider( + tr("Subtitle Mask Width"), + min_value=2, + max_value=max_width, + value=min(int(_get_subtitle_mask_value(orientation, "width_percent")), max_width), + help=tr("Subtitle Mask Width Help"), + key=width_widget_key, + ) + _set_subtitle_mask_value(orientation, "width_percent", width_percent) + + max_height = max(2, 100 - y_percent) + height_widget_key = f"{orientation}_subtitle_mask_height_percent" + if st.session_state.get(height_widget_key, 2) < 2: + st.session_state[height_widget_key] = 2 + if st.session_state.get(height_widget_key, 0) > max_height: + st.session_state[height_widget_key] = max_height + height_percent = st.slider( + tr("Subtitle Mask Height"), + min_value=2, + max_value=max_height, + value=min(int(_get_subtitle_mask_value(orientation, "height_percent")), max_height), + help=tr("Subtitle Mask Height Help"), + key=height_widget_key, + ) + 
_set_subtitle_mask_value(orientation, "height_percent", height_percent) + + blur_radius = st.slider( + tr("Subtitle Mask Blur Radius"), + min_value=0, + max_value=200, + value=int(_get_subtitle_mask_value(orientation, "blur_radius")), + help=tr("Subtitle Mask Blur Radius Help"), + key=f"{orientation}_subtitle_mask_blur_radius", + ) + _set_subtitle_mask_value(orientation, "blur_radius", blur_radius) + + opacity_percent = st.slider( + tr("Subtitle Mask Opacity"), + min_value=0, + max_value=100, + value=int(_get_subtitle_mask_value(orientation, "opacity_percent")), + help=tr("Subtitle Mask Opacity Help"), + key=f"{orientation}_subtitle_mask_opacity_percent", + ) + _set_subtitle_mask_value(orientation, "opacity_percent", opacity_percent) + + +def _render_subtitle_mask_dialog(tr): + @st.dialog(tr("Subtitle Mask Settings"), width="large") + def subtitle_mask_dialog(): + preview_col, settings_col = st.columns([1, 1], vertical_alignment="top") + + with settings_col: + st.caption(tr("Subtitle Mask Settings Caption")) + st.caption(tr("Subtitle Mask Preview Caption")) + landscape_tab, portrait_tab = st.tabs([ + tr("Landscape Subtitle Mask"), + tr("Portrait Subtitle Mask"), + ]) + with landscape_tab: + _render_subtitle_mask_region_controls(tr, "landscape") + with portrait_tab: + _render_subtitle_mask_region_controls(tr, "portrait") + + with preview_col: + _render_subtitle_mask_preview(tr) + + if st.button(tr("Save Subtitle Mask Settings"), type="primary", use_container_width=True): + config.save_config() + st.rerun() + + subtitle_mask_dialog() + + +def render_subtitle_mask_settings(tr): + """渲染原字幕遮罩设置。""" + mask_enabled = st.checkbox( + tr("Enable Subtitle Mask"), + value=bool(config.ui.get("subtitle_mask_enabled", False)), + help=tr("Enable Subtitle Mask Help"), + key="subtitle_mask_enabled_checkbox", + ) + st.session_state['subtitle_mask_enabled'] = mask_enabled + config.ui["subtitle_mask_enabled"] = mask_enabled + + if not mask_enabled: + return + + button_col, summary_col 
= st.columns([0.35, 0.65], vertical_alignment="center") + with button_col: + if st.button(tr("Set Subtitle Mask"), key="set_subtitle_mask", use_container_width=True): + _render_subtitle_mask_dialog(tr) + with summary_col: + st.caption( + tr("Subtitle Mask Summary").format( + landscape_x=_get_subtitle_mask_value("landscape", "x_percent"), + landscape_y=_get_subtitle_mask_value("landscape", "y_percent"), + landscape_width=_get_subtitle_mask_value("landscape", "width_percent"), + landscape_height=_get_subtitle_mask_value("landscape", "height_percent"), + portrait_x=_get_subtitle_mask_value("portrait", "x_percent"), + portrait_y=_get_subtitle_mask_value("portrait", "y_percent"), + portrait_width=_get_subtitle_mask_value("portrait", "width_percent"), + portrait_height=_get_subtitle_mask_value("portrait", "height_percent"), + ) + ) + + +def _get_saved_auto_transcribe_backend(): + saved_backend = str(config.fun_asr.get("backend", "")).strip().lower() + if saved_backend not in {"local", "bailian"}: + saved_backend = ( + "bailian" + if config.fun_asr.get("api_key") and not config.fun_asr.get("api_url") + else "local" + ) + return saved_backend + + +def render_auto_transcription_settings(tr): + """渲染最终视频自动转录设置。""" + from app.services import fun_asr_subtitle + + auto_transcribe_enabled = st.checkbox( + tr("Enable Auto Transcription"), + value=bool(config.fun_asr.get("auto_transcribe_enabled", False)), + help=tr("Enable Auto Transcription Help"), + key="subtitle_auto_transcribe_enabled_checkbox", + ) + st.session_state['subtitle_auto_transcribe_enabled'] = auto_transcribe_enabled + config.fun_asr["auto_transcribe_enabled"] = auto_transcribe_enabled + + backend = _get_saved_auto_transcribe_backend() + api_url = config.fun_asr.get("api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL) + hotword = config.fun_asr.get("hotword", "") + enable_spk = bool(config.fun_asr.get("enable_spk", False)) + api_key = config.fun_asr.get("api_key", "") + + if not auto_transcribe_enabled: + 
st.session_state['subtitle_auto_transcribe_backend'] = backend + st.session_state['subtitle_auto_transcribe_api_url'] = api_url + st.session_state['subtitle_auto_transcribe_hotword'] = hotword + st.session_state['subtitle_auto_transcribe_enable_spk'] = enable_spk + st.session_state['subtitle_auto_transcribe_api_key'] = api_key + return + + backend_options = { + tr("Local FunASR-Pack API"): "local", + tr("Ali Bailian Online Fun-ASR"): "bailian", + } + backend_values = list(backend_options.values()) + backend_labels = list(backend_options.keys()) + + backend_label = st.radio( + tr("Subtitle Processing Method"), + options=backend_labels, + index=backend_values.index(backend), + horizontal=True, + key="subtitle_auto_transcribe_backend_radio", + ) + backend = backend_options[backend_label] + + if backend == "local": + st.caption(tr("Auto Transcription Local Caption")) + api_url = st.text_input( + tr("Local FunASR-Pack API URL"), + value=api_url, + help=tr("Local FunASR-Pack API URL Help"), + key="subtitle_auto_transcribe_api_url_input", + ) + hotword = st.text_input( + tr("Fun-ASR Hotword"), + value=hotword, + help=tr("Fun-ASR Hotword Help"), + key="subtitle_auto_transcribe_hotword_input", + ) + enable_spk = st.checkbox( + tr("Enable speaker diarization"), + value=enable_spk, + help=tr("Enable speaker diarization Help"), + key="subtitle_auto_transcribe_enable_spk_checkbox", + ) + else: + st.caption(tr("Auto Transcription Online Caption")) + st.markdown( + f"{tr('API Key URL')}: " + "[https://bailian.console.aliyun.com/?tab=model#/api-key]" + "(https://bailian.console.aliyun.com/?tab=model#/api-key)" + ) + api_key = st.text_input( + tr("Ali Bailian API Key"), + value=api_key, + type="password", + help=tr("Ali Bailian API Key Help"), + key="subtitle_auto_transcribe_api_key_input", + ) + + config.fun_asr["backend"] = backend + config.fun_asr["api_url"] = str(api_url).strip() + config.fun_asr["api_key"] = str(api_key).strip() + config.fun_asr["hotword"] = 
str(hotword).strip() + config.fun_asr["enable_spk"] = bool(enable_spk) + config.fun_asr["model"] = "fun-asr" + + st.session_state['subtitle_auto_transcribe_backend'] = backend + st.session_state['subtitle_auto_transcribe_api_url'] = str(api_url).strip() + st.session_state['subtitle_auto_transcribe_api_key'] = str(api_key).strip() + st.session_state['subtitle_auto_transcribe_hotword'] = str(hotword).strip() + st.session_state['subtitle_auto_transcribe_enable_spk'] = bool(enable_spk) def render_font_settings(tr): @@ -154,6 +614,40 @@ def get_subtitle_params(): font_name = st.session_state.get('font_name') or "SimHei" return { 'subtitle_enabled': st.session_state.get('subtitle_enabled', True), + 'subtitle_mask_enabled': st.session_state.get('subtitle_mask_enabled', False), + 'subtitle_mask_landscape_x_percent': _get_subtitle_mask_value("landscape", "x_percent"), + 'subtitle_mask_landscape_y_percent': _get_subtitle_mask_value("landscape", "y_percent"), + 'subtitle_mask_landscape_width_percent': _get_subtitle_mask_value("landscape", "width_percent"), + 'subtitle_mask_landscape_height_percent': _get_subtitle_mask_value("landscape", "height_percent"), + 'subtitle_mask_landscape_blur_radius': _get_subtitle_mask_value("landscape", "blur_radius"), + 'subtitle_mask_landscape_opacity_percent': _get_subtitle_mask_value("landscape", "opacity_percent"), + 'subtitle_mask_portrait_x_percent': _get_subtitle_mask_value("portrait", "x_percent"), + 'subtitle_mask_portrait_y_percent': _get_subtitle_mask_value("portrait", "y_percent"), + 'subtitle_mask_portrait_width_percent': _get_subtitle_mask_value("portrait", "width_percent"), + 'subtitle_mask_portrait_height_percent': _get_subtitle_mask_value("portrait", "height_percent"), + 'subtitle_mask_portrait_blur_radius': _get_subtitle_mask_value("portrait", "blur_radius"), + 'subtitle_mask_portrait_opacity_percent': _get_subtitle_mask_value("portrait", "opacity_percent"), + 'subtitle_auto_transcribe_enabled': 
st.session_state.get('subtitle_auto_transcribe_enabled', False), + 'subtitle_auto_transcribe_backend': st.session_state.get( + 'subtitle_auto_transcribe_backend', + _get_saved_auto_transcribe_backend() + ), + 'subtitle_auto_transcribe_api_url': st.session_state.get( + 'subtitle_auto_transcribe_api_url', + config.fun_asr.get("api_url", "") + ), + 'subtitle_auto_transcribe_api_key': st.session_state.get( + 'subtitle_auto_transcribe_api_key', + config.fun_asr.get("api_key", "") + ), + 'subtitle_auto_transcribe_hotword': st.session_state.get( + 'subtitle_auto_transcribe_hotword', + config.fun_asr.get("hotword", "") + ), + 'subtitle_auto_transcribe_enable_spk': st.session_state.get( + 'subtitle_auto_transcribe_enable_spk', + bool(config.fun_asr.get("enable_spk", False)) + ), 'font_name': font_name, 'font_size': st.session_state.get('font_size', 60), 'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'), diff --git a/webui/i18n/en.json b/webui/i18n/en.json index f912355..5c7b5b1 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -51,9 +51,52 @@ "Random Background Music": "Random Background Music", "Custom Background Music": "Custom Background Music", "Custom Background Music File": "Please enter the file path of the custom background music", + "Background Music Source": "Background Music Source", + "Background Music Source Help": "Choose background music from the resource directory, upload a new file, or disable background music.", + "Upload Background Music": "Upload Background Music", + "Background Music Path Help": "Choose the background music used for video synthesis.", + "No Background Music Resources Found": "No background music resources found. 
Please upload a background music file.", + "Preview Background Music Help": "Play the selected background music.", + "Upload Background Music File": "Upload Background Music File", + "Upload Background Music Help": "Upload an audio file to use as background music.", + "Background Music uploaded": "✅ Background music uploaded: {path}", "Background Music Volume": "Background Music Volume (0.2 represents 20%, background sound should not be too loud)", "Subtitle Settings": "**Subtitle Settings**", "Enable Subtitles": "Enable Subtitles (If unchecked, the following settings will not take effect)", + "Enable Subtitle Mask": "Enable Subtitle Mask", + "Enable Subtitle Mask Help": "Before burning in new subtitles, cover the original subtitle area with a soft blurred mask.", + "Set Subtitle Mask": "Set Subtitle Mask", + "Subtitle Mask Summary": "Landscape {landscape_x}%/{landscape_y}% · {landscape_width}%×{landscape_height}%; portrait {portrait_x}%/{portrait_y}% · {portrait_width}%×{portrait_height}%", + "Subtitle Mask Settings": "Subtitle Mask Settings", + "Subtitle Mask Settings Caption": "Save landscape and portrait mask regions as frame percentages. 
The mask is applied before new subtitles are burned in.", + "Landscape Subtitle Mask": "Landscape Mask", + "Portrait Subtitle Mask": "Portrait Mask", + "Save Subtitle Mask Settings": "Save Subtitle Mask Settings", + "Subtitle Mask Left": "Left Position", + "Subtitle Mask Left Help": "Mask distance from the left edge as a frame percentage.", + "Subtitle Mask Top": "Top Position", + "Subtitle Mask Top Help": "Mask distance from the top edge as a frame percentage.", + "Subtitle Mask Width": "Mask Width", + "Subtitle Mask Width Help": "Width of the covered mask region as a frame percentage.", + "Subtitle Mask Height": "Mask Height", + "Subtitle Mask Height Help": "Height of the covered mask region as a frame percentage.", + "Subtitle Mask Blur Radius": "Blur Radius", + "Subtitle Mask Blur Radius Help": "Blur strength for the mask background and edge.", + "Subtitle Mask Opacity": "Mask Strength", + "Subtitle Mask Opacity Help": "Mask blend strength. Higher values cover source subtitles more strongly.", + "Subtitle Mask Preview": "Source Subtitle Mask Preview", + "Subtitle Mask Preview Caption": "Upload a source video for preview, or use the currently selected source video. Uploaded files here are only used for mask preview.", + "Upload Subtitle Mask Preview Video": "Upload Preview Source Video", + "Upload Subtitle Mask Preview Video Help": "Only used for previewing the mask in this dialog. 
It will not replace the source video used for generation.", + "Using Subtitle Mask Preview Video": "Preview video: {file}", + "Change Subtitle Mask Preview Video": "Change Video", + "Subtitle Mask Preview Empty": "Upload a preview video, or select a source video above first.", + "Subtitle Mask Preview Timeline": "Preview Timeline (seconds)", + "Subtitle Mask Preview Timeline Help": "Drag to a frame where the source subtitles appear, then fine-tune the mask region.", + "Subtitle Mask Preview Frame Caption": "{time} · {orientation} · red outline shows the current mask region", + "Subtitle Mask Preview Failed": "Unable to read this video preview. Please try another video file.", + "Enable Auto Transcription": "Enable Auto Transcription", + "Enable Auto Transcription Help": "After the final video is merged, transcribe the whole video into subtitles and burn them into the output.", "Font": "Subtitle Font", "Position": "Subtitle Position", "Top": "Top", @@ -219,7 +262,6 @@ "短剧名称": "Short Drama Name", "生成短剧解说脚本": "Generate Short Drama Narration Script", "请输入视频脚本": "Please enter the video script", - "Subtitle TTS support notice": "💡 Note: currently only the **edge-tts** engine supports automatic subtitle generation. Other TTS engines are not supported yet.", "TTS engine does not support precise subtitles": "⚠️ {engine} does not support precise subtitle generation", "Manual subtitle editing recommendation": "💡 We recommend adding subtitles manually in a professional editor such as CapCut or Premiere Pro.", "Disabled subtitles help": "This TTS engine does not support subtitle generation. 
Please use another TTS engine.", @@ -329,6 +371,8 @@ "Ali Bailian Online Fun-ASR": "Online", "Local Fun-ASR upload caption": "The current video above will be converted to SRT subtitles through the locally running FunASR-Pack API.", "Fun-ASR upload caption": "The current video above will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.", + "Auto Transcription Local Caption": "After the final video is merged, it will be converted to SRT subtitles through the locally running FunASR-Pack API.", + "Auto Transcription Online Caption": "After the final video is merged, it will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.", "Local FunASR-Pack API URL": "Local FunASR-Pack API URL", "Local FunASR-Pack API URL Help": "For example, http://127.0.0.1:7860. A full /asr endpoint URL is also supported.", "Fun-ASR Hotword": "Hotword", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 75c9721..29945e0 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -41,9 +41,52 @@ "Random Background Music": "随机背景音乐", "Custom Background Music": "自定义背景音乐", "Custom Background Music File": "请输入自定义背景音乐的文件路径", + "Background Music Source": "背景音乐来源", + "Background Music Source Help": "选择资源目录中的背景音乐、上传新的背景音乐,或关闭背景音乐", + "Upload Background Music": "上传背景音乐", + "Background Music Path Help": "选择用于视频合成的背景音乐", + "No Background Music Resources Found": "未找到资源目录中的背景音乐,请上传背景音乐文件", + "Preview Background Music Help": "播放当前背景音乐", + "Upload Background Music File": "上传背景音乐文件", + "Upload Background Music Help": "上传一个音频文件作为背景音乐", + "Background Music uploaded": "✅ 背景音乐已上传: {path}", "Background Music Volume": "背景音乐音量(0.2表示20%,背景声音不宜过高)", "Subtitle Settings": "**字幕设置**", "Enable Subtitles": "启用字幕(若取消勾选,下面的设置都将不生效)", + "Enable Subtitle Mask": "启用字幕遮罩", + "Enable Subtitle Mask Help": "开启后会在烧录新字幕前,先用模糊遮罩覆盖原视频自带字幕区域", + "Set Subtitle Mask": "设置字幕遮罩", + "Subtitle Mask Summary": "横屏 {landscape_x}%/{landscape_y}% · 
{landscape_width}%×{landscape_height}%;竖屏 {portrait_x}%/{portrait_y}% · {portrait_width}%×{portrait_height}%", + "Subtitle Mask Settings": "字幕遮罩设置", + "Subtitle Mask Settings Caption": "按画面百分比保存横屏和竖屏遮罩区域;生成视频时会先叠加柔化遮罩,再烧录新字幕。", + "Landscape Subtitle Mask": "横屏遮罩", + "Portrait Subtitle Mask": "竖屏遮罩", + "Save Subtitle Mask Settings": "保存字幕遮罩设置", + "Subtitle Mask Left": "左侧位置", + "Subtitle Mask Left Help": "遮罩距离画面左侧的百分比", + "Subtitle Mask Top": "顶部位置", + "Subtitle Mask Top Help": "遮罩距离画面顶部的百分比", + "Subtitle Mask Width": "遮罩宽度", + "Subtitle Mask Width Help": "遮罩覆盖区域的宽度百分比", + "Subtitle Mask Height": "遮罩高度", + "Subtitle Mask Height Help": "遮罩覆盖区域的高度百分比", + "Subtitle Mask Blur Radius": "模糊半径", + "Subtitle Mask Blur Radius Help": "遮罩边缘和背景的模糊强度", + "Subtitle Mask Opacity": "遮罩强度", + "Subtitle Mask Opacity Help": "遮罩融合强度,数值越高越容易遮住原字幕", + "Subtitle Mask Preview": "原字幕遮罩预览", + "Subtitle Mask Preview Caption": "可上传一段原视频作为预览,也可直接使用当前已选择的原视频;上传内容仅用于预览遮罩位置。", + "Upload Subtitle Mask Preview Video": "上传预览原视频", + "Upload Subtitle Mask Preview Video Help": "仅用于在弹窗中预览遮罩,不会替换生成视频使用的原视频", + "Using Subtitle Mask Preview Video": "当前预览视频: {file}", + "Change Subtitle Mask Preview Video": "更换视频", + "Subtitle Mask Preview Empty": "请上传预览视频,或先在上方选择原视频", + "Subtitle Mask Preview Timeline": "预览时间轴(秒)", + "Subtitle Mask Preview Timeline Help": "拖动到原字幕出现的画面,方便微调遮罩区域", + "Subtitle Mask Preview Frame Caption": "{time} · {orientation} · 红框为当前遮罩覆盖区域", + "Subtitle Mask Preview Failed": "无法读取该视频预览,请尝试更换视频文件", + "Enable Auto Transcription": "启用自动转录", + "Enable Auto Transcription Help": "开启后会在最终视频合并完成后,对整条视频转录生成字幕并压入成片", "Font": "字幕字体", "Position": "字幕位置", "Top": "顶部", @@ -200,7 +243,6 @@ "QwenVL model returned invalid response": "QwenVL 模型返回了无效响应", "Testing connection...": "正在测试连接...", "Connection failed": "连接失败", - "Subtitle TTS support notice": "💡 提示:目前仅 **edge-tts** 引擎支持自动生成字幕,其他 TTS 引擎暂不支持。", "TTS engine does not support precise subtitles": "⚠️ {engine} 不支持精确字幕生成", "Manual subtitle editing 
recommendation": "💡 建议使用专业剪辑工具(如剪映、PR 等)手动添加字幕", "Disabled subtitles help": "当前 TTS 引擎不支持字幕生成,请使用其他 TTS 引擎", @@ -311,6 +353,8 @@ "Ali Bailian Online Fun-ASR": "在线转写", "Local Fun-ASR upload caption": "将使用上方当前视频,通过本机运行的 FunASR-Pack API 生成 SRT 字幕。", "Fun-ASR upload caption": "将使用上方当前视频,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。", + "Auto Transcription Local Caption": "将在最终视频合并完成后,通过本机运行的 FunASR-Pack API 生成 SRT 字幕。", + "Auto Transcription Online Caption": "将在最终视频合并完成后,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。", "Local FunASR-Pack API URL": "本地 FunASR-Pack API 地址", "Local FunASR-Pack API URL Help": "例如 http://127.0.0.1:7860;也可以直接填到 /asr 的完整地址。", "Fun-ASR Hotword": "热词", From 5a9775d62dfff6474d0e23321a4e7aed1551bce1 Mon Sep 17 00:00:00 2001 From: viccy Date: Sat, 6 Jun 2026 12:01:36 +0800 Subject: [PATCH 10/24] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=E6=A8=AA?= =?UTF-8?q?=E7=AB=96=E5=B1=8F=E8=87=AA=E5=AE=9A=E4=B9=89=E5=AD=97=E5=B9=95?= =?UTF-8?q?=E4=BD=8D=E7=BD=AE=EF=BC=8C=E9=87=8D=E6=9E=84=E5=89=AA=E6=98=A0?= =?UTF-8?q?=E5=AF=BC=E5=87=BA=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增横竖屏分别的字幕垂直位置配置,默认值分别为85%和82% - 更新WebUI字幕设置界面,新增独立的横屏/竖屏字幕位置标签页,在预览画面中添加蓝线标注当前字幕位置 - 重构剪映草稿导出逻辑,将相关代码抽离至独立模块,移除requirements.txt中的pyJianYingDraft直接依赖 - 优化媒体时长处理逻辑,新增时长缓存和自动裁剪处理,添加完整的单元测试覆盖 - 更新配置示例文件、数据Schema定义和中英多语言翻译文件 --- app/models/schema.py | 2 + app/services/generate_video.py | 18 +- app/services/jianying_draft_builder.py | 1452 +++++++++++++++++++ app/services/jianying_task.py | 158 +- app/services/task.py | 2 + app/services/test_jianying_task_unittest.py | 120 +- config.example.toml | 2 + requirements.txt | 3 - webui/components/subtitle_settings.py | 62 +- webui/i18n/en.json | 6 +- webui/i18n/zh.json | 6 +- 11 files changed, 1724 insertions(+), 107 deletions(-) create mode 100644 app/services/jianying_draft_builder.py diff --git a/app/models/schema.py b/app/models/schema.py index 45d3658..e0547e7 100644 --- a/app/models/schema.py +++ 
b/app/models/schema.py @@ -196,6 +196,8 @@ class VideoClipParams(BaseModel): subtitle_mask_portrait_height_percent: float = 16.0 subtitle_mask_portrait_blur_radius: int = 26 subtitle_mask_portrait_opacity_percent: int = 84 + subtitle_position_landscape_y_percent: float = 85.0 + subtitle_position_portrait_y_percent: float = 82.0 subtitle_auto_transcribe_enabled: bool = False subtitle_auto_transcribe_backend: str = "local" subtitle_auto_transcribe_api_url: str = "" diff --git a/app/services/generate_video.py b/app/services/generate_video.py index f5b808a..d66166b 100644 --- a/app/services/generate_video.py +++ b/app/services/generate_video.py @@ -224,6 +224,14 @@ def apply_subtitle_mask(video_clip, options): return video_clip.transform(mask_frame) +def _resolve_orientation_subtitle_y_percent(video_width, video_height, options): + orientation = "portrait" if video_height > video_width else "landscape" + key = f"subtitle_position_{orientation}_y_percent" + if key not in options: + return None + return _clamp(_get_numeric_option(options, key, 85 if orientation == "landscape" else 82), 0, 99) + + def is_valid_subtitle_file(subtitle_path: str) -> bool: """ 检查字幕文件是否有效 @@ -476,6 +484,7 @@ def merge_materials( # 处理视频尺寸 video_width, video_height = video_clip.size + orientation_subtitle_y_percent = _resolve_orientation_subtitle_y_percent(video_width, video_height, options) if subtitle_enabled and subtitle_mask_enabled: video_clip = apply_subtitle_mask(video_clip, options) @@ -525,7 +534,14 @@ def merge_materials( _clip = _clip.with_duration(duration) # 设置字幕位置 - if subtitle_position == "bottom": + if orientation_subtitle_y_percent is not None: + margin = 10 + max_y = video_height - _clip.h - margin + min_y = margin + custom_y = (video_height - _clip.h) * (orientation_subtitle_y_percent / 100) + custom_y = max(min_y, min(custom_y, max_y)) + _clip = _clip.with_position(("center", custom_y)) + elif subtitle_position == "bottom": _clip = _clip.with_position(("center", video_height 
* 0.95 - _clip.h)) elif subtitle_position == "top": _clip = _clip.with_position(("center", video_height * 0.05)) diff --git a/app/services/jianying_draft_builder.py b/app/services/jianying_draft_builder.py new file mode 100644 index 0000000..3f00422 --- /dev/null +++ b/app/services/jianying_draft_builder.py @@ -0,0 +1,1452 @@ +import json +import os +import re +import shutil +import subprocess +import time +import uuid +from typing import Any, Dict, List, Optional, Set, Tuple + +from loguru import logger + +from app.models.schema import VideoClipParams + + +MICROSECONDS = 1_000_000 +DRAFT_PATH_PLACEHOLDER = "##_draftpath_placeholder_0E685133-18CE-45ED-8CB8-2904A212EC80_##" +DRAFT_PATH_PLACEHOLDER_PATTERN = re.compile(r"^##_draftpath_placeholder_[^#]+_##/") + +MATERIAL_COLLECTION_KEYS = [ + "ai_translates", + "audio_balances", + "audio_effects", + "audio_fades", + "audio_pannings", + "audio_pitch_shifts", + "audio_track_indexes", + "audios", + "beats", + "canvases", + "chromas", + "color_curves", + "common_mask", + "digital_human_model_dressing", + "digital_humans", + "drafts", + "effects", + "flowers", + "green_screens", + "handwrites", + "hsl", + "hsl_curves", + "images", + "log_color_wheels", + "loudnesses", + "manual_beautys", + "manual_deformations", + "material_animations", + "material_colors", + "multi_language_refs", + "placeholder_infos", + "placeholders", + "plugin_effects", + "primary_color_wheels", + "realtime_denoises", + "shapes", + "smart_crops", + "smart_relights", + "sound_channel_mappings", + "speeds", + "stickers", + "tail_leaders", + "text_templates", + "texts", + "time_marks", + "transitions", + "video_effects", + "video_radius", + "video_shadows", + "video_strokes", + "video_trackings", + "videos", + "vocal_beautifys", + "vocal_separations", +] + +DRAFT_PACKAGE_DIRECTORIES = [ + "qr_upload", + "matting", + "common_attachment", + "Resources/audioAlg", + "Resources/digitalHuman", + "Resources/restore_lut", + "Resources/videoAlg", + "subdraft", + 
"adjust_mask", + "assets/audio", + "assets/video", + "smart_crop", +] + +DEFAULT_DRAFT_COVER_BYTES = bytes([ + 0xFF, 0xD8, 0xFF, 0xDB, 0x00, 0x43, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC0, 0x00, 0x0B, 0x08, 0x00, 0x01, 0x00, + 0x01, 0x01, 0x01, 0x11, 0x00, 0xFF, 0xC4, 0x00, 0x14, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0xFF, 0xC4, 0x00, 0x14, 0x10, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0xFF, 0xDA, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0x7F, + 0xFF, 0xD9, +]) + + +def _write_json_file(file_path: str, data: Dict[str, Any]) -> None: + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, separators=(",", ":")) + + +def _floor_duration_to_milliseconds(duration: float) -> float: + return int(max(duration, 0.0) * 1000) / 1000.0 + + +def _seconds_to_microseconds(seconds: float) -> int: + return int(round(max(seconds, 0.0) * MICROSECONDS)) + + +def _get_media_duration_ffprobe(media_file: str) -> float: + cmd = [ + "ffprobe", + "-v", "error", + "-show_entries", "format=duration", + "-of", "csv=p=0", + media_file, + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return float(result.stdout.strip()) + + +def _get_cached_media_duration(media_file: str, duration_cache: Dict[str, float]) -> float: + if media_file not in duration_cache: + duration_cache[media_file] = _floor_duration_to_milliseconds( + _get_media_duration_ffprobe(media_file) + ) + return duration_cache[media_file] + + +def 
_clamp_duration_to_media( + requested_duration: float, + media_file: str, + duration_cache: Dict[str, float], + media_label: str, + source_start_time: float = 0.0, +) -> float: + requested_duration = _floor_duration_to_milliseconds(requested_duration) + actual_duration = _get_cached_media_duration(media_file, duration_cache) + available_duration = _floor_duration_to_milliseconds( + max(actual_duration - max(source_start_time, 0.0), 0.0) + ) + safe_duration = min(requested_duration, available_duration) + + logger.info( + f"{media_label}实际时长: {actual_duration:.6f}秒, " + f"可用时长: {available_duration:.6f}秒, 请求时长: {requested_duration:.3f}秒" + ) + if safe_duration < requested_duration: + logger.warning( + f"{media_label}短于脚本时长,已将剪映片段时长从 " + f"{requested_duration:.3f}秒 调整为 {safe_duration:.3f}秒" + ) + return safe_duration + + +def _get_video_metadata_ffprobe( + media_file: str, + metadata_cache: Dict[str, Tuple[int, int, int]], +) -> Tuple[int, int, int]: + if media_file in metadata_cache: + return metadata_cache[media_file] + + try: + cmd = [ + "ffprobe", + "-v", "error", + "-show_entries", "stream=width,height:format=duration", + "-of", "json", + media_file, + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + info = json.loads(result.stdout or "{}") + stream = next( + ( + item for item in info.get("streams", []) + if item.get("width") and item.get("height") + ), + {}, + ) + duration = _floor_duration_to_milliseconds( + float(info.get("format", {}).get("duration") or 0.0) + ) + width = int(stream.get("width") or 1920) + height = int(stream.get("height") or 1080) + metadata_cache[media_file] = (_seconds_to_microseconds(duration), width, height) + except Exception as e: + logger.warning(f"读取视频元信息失败,将使用默认分辨率: {media_file}, {e}") + duration = _floor_duration_to_milliseconds(_get_media_duration_ffprobe(media_file)) + metadata_cache[media_file] = (_seconds_to_microseconds(duration), 1920, 1080) + + return metadata_cache[media_file] + + +def 
_format_draft_uuid(draft_id: str) -> str: + compact = draft_id.replace("-", "") + if not re.fullmatch(r"[a-fA-F0-9]{32}", compact): + return draft_id + return "-".join([ + compact[0:8], + compact[8:12], + compact[12:16], + compact[16:20], + compact[20:32], + ]).upper() + + +def _detect_platform(draft_root_path: str) -> str: + return "windows" if re.match(r"^(?:[a-zA-Z]:[\\/]|\\\\)", draft_root_path) else "mac" + + +def _create_platform_info(draft_root_path: str) -> Dict[str, Any]: + return { + "app_id": 3704, + "app_source": "lv", + "app_version": "10.6.0", + "device_id": "", + "hard_disk_id": "", + "mac_address": "", + "os": _detect_platform(draft_root_path), + "os_version": "", + } + + +def _default_function_assistant_info() -> Dict[str, Any]: + return { + "audio_noise_segid_list": [], + "auto_adjust": False, + "auto_adjust_fixed": False, + "auto_adjust_fixed_value": 50.0, + "auto_adjust_segid_list": [], + "auto_caption": False, + "auto_caption_segid_list": [], + "auto_caption_template_id": "", + "caption_opt": False, + "caption_opt_segid_list": [], + "color_correction": False, + "color_correction_fixed": False, + "color_correction_fixed_value": 50.0, + "color_correction_segid_list": [], + "deflicker_segid_list": [], + "enhance_quality": False, + "enhance_quality_fixed": False, + "enhance_quality_segid_list": [], + "enhance_voice_segid_list": [], + "enhande_voice": False, + "enhande_voice_fixed": False, + "eye_correction": False, + "eye_correction_segid_list": [], + "fixed_rec_applied": False, + "fps": {"den": 1, "num": 0}, + "normalize_loudness": False, + "normalize_loudness_audio_denoise_segid_list": [], + "normalize_loudness_fixed": False, + "normalize_loudness_segid_list": [], + "retouch": False, + "retouch_fixed": False, + "retouch_segid_list": [], + "smart_rec_applied": False, + "smart_segid_list": [], + "smooth_slow_motion": False, + "smooth_slow_motion_fixed": False, + "video_noise_segid_list": [], + } + + +def _safe_file_name(file_path: str, fallback: 
str) -> str: + name = os.path.basename(file_path) or fallback + name = re.sub(r'[<>:"|?*\x00-\x1f/\\]+', "_", name).strip(" ._") + return name or fallback + + +def _normalize_asset_path(file_path: Optional[str], fallback: str) -> str: + normalized = (file_path or "").replace("\\", "/").lstrip("./") + without_draft_placeholder = DRAFT_PATH_PLACEHOLDER_PATTERN.sub("", normalized) + if without_draft_placeholder.startswith("assets/"): + return without_draft_placeholder + assets_index = without_draft_placeholder.rfind("/assets/") + if assets_index >= 0: + return without_draft_placeholder[assets_index + 1:] + return fallback + + +def _to_draft_material_path(relative_path: str) -> str: + return f"{DRAFT_PATH_PLACEHOLDER}/{relative_path}" + + +def _unique_relative_asset_path( + directory: str, + file_name: str, + used_paths: Set[str], +) -> str: + base_name, ext = os.path.splitext(file_name) + candidate_name = file_name + counter = 2 + while True: + relative_path = f"{directory}/{candidate_name}" + if relative_path not in used_paths: + used_paths.add(relative_path) + return relative_path + candidate_name = f"{base_name}_{counter}{ext}" + counter += 1 + + +def _copy_asset_into_draft(source_file: str, draft_path: str, relative_path: str) -> None: + destination = os.path.join(draft_path, *relative_path.split("/")) + os.makedirs(os.path.dirname(destination), exist_ok=True) + if os.path.abspath(source_file) != os.path.abspath(destination): + shutil.copy2(source_file, destination) + + +def _register_asset( + source_file: str, + draft_path: str, + asset_dir: str, + fallback_name: str, + used_paths: Set[str], + asset_path_cache: Dict[str, str], +) -> str: + source_key = os.path.abspath(source_file) + if source_key in asset_path_cache: + return asset_path_cache[source_key] + + file_name = _safe_file_name(source_file, fallback_name) + relative_path = _unique_relative_asset_path(asset_dir, file_name, used_paths) + _copy_asset_into_draft(source_file, draft_path, relative_path) + 
asset_path_cache[source_key] = relative_path + return relative_path + + +def _create_unique_draft_path(drafts_root: str, draft_name: str) -> Tuple[str, str]: + folder_base = _safe_file_name(draft_name, f"NarratoAI_{int(time.time())}") + folder_name = folder_base + counter = 2 + while os.path.exists(os.path.join(drafts_root, folder_name)): + folder_name = f"{folder_base}_{counter}" + counter += 1 + return folder_name, os.path.join(drafts_root, folder_name) + + +def _create_material_collections() -> Dict[str, List[Any]]: + return {key: [] for key in MATERIAL_COLLECTION_KEYS} + + +def _create_draft_template( + draft_id: str, + draft_name: str, + draft_root_path: str, + width: int = 1920, + height: int = 1080, +) -> Dict[str, Any]: + now_us = int(time.time() * MICROSECONDS) + platform_info = _create_platform_info(draft_root_path) + return { + "canvas_config": {"height": height, "ratio": "original", "width": width}, + "color_space": 0, + "config": { + "adjust_max_index": 1, + "attachment_info": [], + "combination_max_index": 1, + "export_range": None, + "extract_audio_last_index": 1, + "lyrics_recognition_id": "", + "lyrics_sync": True, + "lyrics_taskinfo": [], + "maintrack_adsorb": True, + "material_save_mode": 0, + "multi_language_current": "none", + "multi_language_list": [], + "multi_language_main": "none", + "multi_language_mode": "none", + "original_sound_last_index": 1, + "record_audio_last_index": 1, + "sticker_max_index": 1, + "subtitle_keywords_config": None, + "subtitle_recognition_id": "", + "subtitle_sync": True, + "subtitle_taskinfo": [], + "system_font_list": [], + "video_mute": False, + "zoom_info_params": None, + }, + "cover": None, + "create_time": now_us, + "duration": 0, + "extra_info": None, + "fps": 30.0, + "free_render_index_mode_on": False, + "group_container": None, + "id": draft_id, + "keyframe_graph_list": [], + "keyframes": { + "adjusts": [], + "audios": [], + "effects": [], + "filters": [], + "handwrites": [], + "stickers": [], + "texts": 
[], + "videos": [], + }, + "last_modified_platform": platform_info, + "materials": _create_material_collections(), + "mutable_config": None, + "name": draft_name, + "new_version": "169.0.0", + "relationships": [], + "render_index_track_mode_on": True, + "retouch_cover": None, + "source": "default", + "static_cover_image_path": "", + "time_marks": None, + "tracks": [], + "update_time": now_us, + "version": 360000, + } + + +def _create_track(track_type: str, name: str) -> Dict[str, Any]: + return { + "attribute": 0, + "flag": 0, + "id": uuid.uuid4().hex, + "is_default_name": True, + "name": name, + "segments": [], + "type": track_type, + } + + +def _create_video_material( + relative_path: str, + duration_us: int, + width: int, + height: int, +) -> Dict[str, Any]: + return { + "id": uuid.uuid4().hex, + "path": relative_path, + "type": "video", + "duration": duration_us, + "width": width, + "height": height, + "material_name": os.path.basename(relative_path), + "create_time": int(time.time() * MICROSECONDS), + "crop": { + "lower_left_x": 0.0, + "lower_left_y": 1.0, + "lower_right_x": 1.0, + "lower_right_y": 1.0, + "upper_left_x": 0.0, + "upper_left_y": 0.0, + "upper_right_x": 1.0, + "upper_right_y": 0.0, + }, + "extra_type_option": 0, + "source_platform": 0, + } + + +def _create_audio_material(relative_path: str, duration_us: int) -> Dict[str, Any]: + material_id = uuid.uuid4().hex + return { + "app_id": 0, + "category_id": "", + "category_name": "local", + "check_flag": 1, + "copyright_limit_type": "none", + "duration": duration_us, + "effect_id": "", + "formula_id": "", + "id": material_id, + "intensifies_path": "", + "is_ai_clone_tone": False, + "is_text_edit_overdub": False, + "is_ugc": False, + "local_material_id": material_id, + "music_id": material_id, + "name": os.path.basename(relative_path), + "path": relative_path, + "remote_url": "", + "query": "", + "request_id": "", + "resource_id": "", + "search_id": "", + "source_from": "", + "source_platform": 0, + 
"team_id": "", + "text_id": "", + "tone_category_id": "", + "tone_category_name": "", + "tone_effect_id": "", + "tone_effect_name": "", + "tone_platform": "", + "tone_second_category_id": "", + "tone_second_category_name": "", + "tone_speaker": "", + "tone_type": "", + "type": "extract_music", + "video_id": "", + "wave_points": [], + } + + +def _create_video_segment( + material_id: str, + source_start_us: int, + duration_us: int, + target_start_us: int, + volume: float, +) -> Dict[str, Any]: + return { + "id": uuid.uuid4().hex, + "material_id": material_id, + "target_timerange": {"start": target_start_us, "duration": duration_us}, + "source_timerange": {"start": source_start_us, "duration": duration_us}, + "speed": 1.0, + "volume": volume, + "enable_adjust": True, + "enable_color_curves": True, + "enable_color_match_adjust": False, + "enable_color_wheels": True, + "enable_lut": True, + "enable_smart_color_adjust": False, + "extra_material_refs": [], + "hdr_settings": {"intensity": 1.0, "mode": 1, "nits": 1000}, + "uniform_scale": {"on": True, "value": 1.0}, + "clip": { + "alpha": 1.0, + "flip": {"horizontal": False, "vertical": False}, + "rotation": 0.0, + "scale": {"x": 1.0, "y": 1.0}, + "transform": {"x": 0.0, "y": 0.0}, + }, + "common_keyframes": [], + } + + +def _create_audio_segment( + material_id: str, + duration_us: int, + target_start_us: int, + volume: float, +) -> Dict[str, Any]: + return { + "id": uuid.uuid4().hex, + "material_id": material_id, + "target_timerange": {"start": target_start_us, "duration": duration_us}, + "source_timerange": {"start": 0, "duration": duration_us}, + "speed": 1.0, + "volume": volume, + "extra_material_refs": [], + "clip": None, + "hdr_settings": None, + "uniform_scale": None, + "common_keyframes": [], + } + + +def _normalize_video_material(material: Dict[str, Any]) -> Dict[str, Any]: + fallback_path = f"assets/video/{material.get('material_name') or 'source.mp4'}" + result = { + "aigc_history_id": "", + "aigc_item_id": "", + 
"aigc_type": "none", + "audio_fade": None, + "beauty_body_auto_preset": None, + "beauty_body_preset_id": "", + "beauty_face_auto_preset": None, + "beauty_face_auto_preset_infos": [], + "beauty_face_preset_infos": [], + "cartoon_path": "", + "category_id": "", + "category_name": "local", + "check_flag": 65535, + "content_feature_info": None, + "corner_pin": None, + "crop_ratio": "free", + "crop_scale": 1.0, + "formula_id": "", + "freeze": None, + "has_audio": True, + "has_sound_separated": False, + "intensifies_audio_path": "", + "intensifies_path": "", + "is_ai_generate_content": False, + "is_copyright": False, + "is_set_beauty_mode": False, + "is_text_edit_overdub": False, + "is_unified_beauty_mode": False, + "live_photo_cover_path": "", + "live_photo_timestamp": 0, + "local_id": "", + "local_material_from": 0, + "local_material_id": "", + "material_id": "", + "material_url": "", + "matting": None, + "media_path": "", + "multi_camera_info": None, + "object_locked": None, + "origin_material_id": "", + "picture_from": "none", + "picture_set_category_id": "", + "picture_set_category_name": "", + "request_id": "", + "reverse_intensifies_path": "", + "reverse_path": "", + "smart_match_info": None, + "smart_motion": None, + "source": 0, + "stable": None, + "surface_trackings": None, + "team_id": "", + "unique_id": "", + "video_algorithm": None, + "video_mask_shadow": None, + "video_mask_stroke": None, + } + result.update(material) + result["path"] = _to_draft_material_path( + _normalize_asset_path(material.get("path"), fallback_path) + ) + result["type"] = "video" + return result + + +def _normalize_audio_material(material: Dict[str, Any]) -> Dict[str, Any]: + fallback_path = f"assets/audio/{material.get('name') or 'audio.mp3'}" + result = { + "ai_music_enter_from": "", + "ai_music_generate_scene": "", + "ai_music_type": 0, + "aigc_history_id": "", + "aigc_item_id": "", + "app_id": 0, + "category_id": "", + "category_name": "local", + "check_flag": 1, + 
"cloned_model_type": "", + "copyright_limit_type": "none", + "effect_id": "", + "formula_id": "", + "intensifies_path": "", + "is_ai_clone_tone": False, + "is_ai_clone_tone_post": False, + "is_text_edit_overdub": False, + "is_ugc": False, + "lyric_type": 0, + "mock_tone_speaker": "", + "moyin_emotion": "", + "music_source": "", + "pgc_id": "", + "pgc_name": "", + "query": "", + "request_id": "", + "resource_id": "", + "search_id": "", + "similiar_music_info": None, + "sound_separate_type": 0, + "source_from": "", + "source_platform": 0, + "team_id": "", + "text_id": "", + "third_resource_id": "", + "tone_category_id": "", + "tone_category_name": "", + "tone_effect_id": "", + "tone_effect_name": "", + "tone_emotion_name_key": "", + "tone_emotion_role": "", + "tone_emotion_scale": 0, + "tone_emotion_selection": "", + "tone_emotion_style": "", + "tone_platform": "", + "tone_second_category_id": "", + "tone_second_category_name": "", + "tone_speaker": "", + "tone_type": "", + "tts_benefit_info": None, + "tts_generate_scene": 0, + "tts_task_id": "", + "unique_id": "", + "video_id": "", + "wave_points": [], + } + result.update(material) + result["path"] = _to_draft_material_path( + _normalize_asset_path(material.get("path"), fallback_path) + ) + result["type"] = "extract_music" + return result + + +def _normalize_materials(draft: Dict[str, Any]) -> Dict[str, List[Any]]: + source = draft.get("materials", {}) + materials = { + key: source.get(key, []) if isinstance(source.get(key, []), list) else [] + for key in MATERIAL_COLLECTION_KEYS + } + materials["videos"] = [_normalize_video_material(item) for item in source.get("videos", [])] + materials["audios"] = [_normalize_audio_material(item) for item in source.get("audios", [])] + return materials + + +def _create_responsive_layout() -> Dict[str, Any]: + return { + "enable": False, + "horizontal_pos_layout": 0, + "size_layout": 0, + "target_follow": "", + "vertical_pos_layout": 0, + } + + +def _normalize_segment( + segment: 
Dict[str, Any], + track_type: str, + track_index: int, + track_attribute: int, +) -> Dict[str, Any]: + is_video = track_type == "video" + result = { + "caption_info": None, + "cartoon": False, + "color_correct_alg_result": "", + "common_keyframes": [], + "desc": "", + "digital_human_template_group_id": "", + "enable_adjust": is_video, + "enable_adjust_mask": is_video, + "enable_color_adjust_pro": False, + "enable_color_correct_adjust": False, + "enable_color_curves": True, + "enable_color_match_adjust": False, + "enable_color_wheels": True, + "enable_hsl": is_video, + "enable_hsl_curves": True, + "enable_lut": is_video, + "enable_mask_shadow": False, + "enable_mask_stroke": False, + "enable_smart_color_adjust": False, + "enable_video_mask": True, + "extra_material_refs": [], + "group_id": "", + "hdr_settings": segment.get("hdr_settings"), + "intensifies_audio": False, + "is_loop": False, + "is_placeholder": False, + "is_tone_modify": False, + "keyframe_refs": [], + "last_nonzero_volume": 1.0, + "lyric_keyframes": None, + "raw_segment_id": "", + "render_index": 0, + "render_timerange": {"duration": 0, "start": 0}, + "responsive_layout": _create_responsive_layout(), + "reverse": False, + "source": "segmentsourcenormal", + "state": 0, + "template_id": "", + "template_scene": "default", + "uniform_scale": {"on": True, "value": 1.0}, + "visible": True, + } + result.update(segment) + result["track_attribute"] = track_attribute + result["track_render_index"] = track_index + return result + + +def _normalize_tracks(draft: Dict[str, Any]) -> List[Dict[str, Any]]: + tracks = [] + for index, track in enumerate(draft.get("tracks", [])): + track_copy = dict(track) + track_attribute = int(track_copy.get("attribute", 0) or 0) + track_copy["segments"] = [ + _normalize_segment(segment, track_copy.get("type", ""), index, track_attribute) + for segment in track.get("segments", []) + ] + tracks.append(track_copy) + return tracks + + +def _create_draft_info( + draft: Dict[str, Any], + 
draft_name: str, + draft_root_path: str, + new_version: str = "169.0.0", +) -> Dict[str, Any]: + info = json.loads(json.dumps(draft, ensure_ascii=False)) + canvas_config = info.get("canvas_config", {}) + platform_info = _create_platform_info(draft_root_path) + info.update({ + "canvas_config": { + "background": canvas_config.get("background"), + "height": canvas_config.get("height", 1080), + "ratio": canvas_config.get("ratio", "original"), + "width": canvas_config.get("width", 1920), + }, + "draft_type": "video", + "function_assistant_info": _default_function_assistant_info(), + "is_drop_frame_timecode": False, + "last_modified_platform": platform_info, + "lyrics_effects": [], + "materials": _normalize_materials(info), + "name": draft_name, + "new_version": new_version, + "path": "", + "platform": platform_info, + "render_index_track_mode_on": True, + "smart_ads_info": {"draft_url": "", "page_from": "", "routine": ""}, + "tracks": _normalize_tracks(info), + "uneven_animation_template_info": { + "composition": "", + "content": "", + "order": "", + "sub_template_info_list": [], + }, + }) + return info + + +def _create_empty_template(draft: Dict[str, Any], draft_root_path: str) -> Dict[str, Any]: + empty_draft = json.loads(json.dumps(draft, ensure_ascii=False)) + empty_draft["canvas_config"] = { + "background": None, + "height": 0, + "ratio": "original", + "width": 0, + } + empty_draft["color_space"] = -1 + empty_draft["duration"] = 0 + empty_draft["keyframes"] = { + "adjusts": [], + "audios": [], + "effects": [], + "filters": [], + "handwrites": [], + "stickers": [], + "texts": [], + "videos": [], + } + empty_draft["materials"] = _create_material_collections() + empty_draft["tracks"] = [] + return _create_draft_info(empty_draft, "", draft_root_path, "75.0.0") + + +def _create_draft_material_index_item( + material: Dict[str, Any], + file_name: str, + metetype: str, + width: int, + height: int, +) -> Dict[str, Any]: + duration = int(material.get("duration", 0) or 0) + 
return { + "ai_group_type": "", + "create_time": -1, + "duration": duration, + "enter_from": 0, + "extra_info": file_name, + "file_Path": material.get("path", ""), + "height": height, + "id": material.get("id", ""), + "import_time": -1, + "import_time_ms": -1, + "item_source": 1, + "md5": "", + "metetype": metetype, + "roughcut_time_range": {"duration": duration, "start": 0}, + "sub_time_range": {"duration": -1, "start": -1}, + "type": 0, + "width": width, + } + + +def _create_draft_material_index(draft: Dict[str, Any]) -> List[Dict[str, Any]]: + items: List[Dict[str, Any]] = [] + for video in draft.get("materials", {}).get("videos", []): + relative_path = _normalize_asset_path( + video.get("path"), + f"assets/video/{video.get('material_name') or 'source.mp4'}", + ) + items.append(_create_draft_material_index_item( + {**video, "path": _to_draft_material_path(relative_path)}, + os.path.basename(relative_path), + "video", + int(video.get("width", 0) or 0), + int(video.get("height", 0) or 0), + )) + for audio in draft.get("materials", {}).get("audios", []): + relative_path = _normalize_asset_path( + audio.get("path"), + f"assets/audio/{audio.get('name') or 'audio.mp3'}", + ) + items.append(_create_draft_material_index_item( + {**audio, "path": _to_draft_material_path(relative_path)}, + os.path.basename(relative_path), + "music", + 0, + 0, + )) + return items + + +def _create_meta_info( + draft: Dict[str, Any], + draft_name: str, + draft_id: str, + draft_root_path: str, + draft_path: str, + asset_size: int, +) -> Dict[str, Any]: + return { + "cloud_draft_cover": False, + "cloud_draft_sync": False, + "cloud_package_completed_time": "", + "draft_cloud_capcut_purchase_info": "", + "draft_cloud_last_action_download": False, + "draft_cloud_package_type": "", + "draft_cloud_purchase_info": "", + "draft_cloud_template_id": "", + "draft_cloud_tutorial_info": "", + "draft_cloud_videocut_purchase_info": "", + "draft_cover": "draft_cover.jpg", + "draft_deeplink_url": "", + 
"draft_enterprise_info": { + "draft_enterprise_extra": "", + "draft_enterprise_id": "", + "draft_enterprise_name": "", + "enterprise_material": [], + }, + "draft_fold_path": draft_path, + "draft_id": _format_draft_uuid(draft_id), + "draft_is_ae_produce": False, + "draft_is_ai_packaging_used": False, + "draft_is_ai_shorts": False, + "draft_is_ai_translate": False, + "draft_is_article_video_draft": False, + "draft_is_cloud_temp_draft": False, + "draft_is_from_deeplink": "false", + "draft_is_invisible": False, + "draft_is_pippit_draft": False, + "draft_is_web_article_video": False, + "draft_materials": [ + {"type": 0, "value": _create_draft_material_index(draft)}, + {"type": 1, "value": []}, + {"type": 2, "value": []}, + {"type": 3, "value": []}, + {"type": 6, "value": []}, + {"type": 7, "value": []}, + {"type": 8, "value": []}, + ], + "draft_materials_copied_info": [], + "draft_name": draft_name, + "draft_need_rename_folder": False, + "draft_new_version": "", + "draft_removable_storage_device": "", + "draft_root_path": draft_root_path, + "draft_segment_extra_info": [], + "draft_timeline_materials_size_": asset_size, + "draft_type": "", + "draft_web_article_video_enter_from": "", + "tm_draft_cloud_completed": "", + "tm_draft_cloud_entry_id": -1, + "tm_draft_cloud_modified": 0, + "tm_draft_cloud_parent_entry_id": -1, + "tm_draft_cloud_space_id": -1, + "tm_draft_cloud_user_id": -1, + "tm_draft_create": draft.get("create_time", 0), + "tm_draft_modified": draft.get("update_time", 0), + "tm_draft_removed": 0, + "tm_duration": draft.get("duration", 0), + } + + +def _create_root_meta_entry( + draft: Dict[str, Any], + draft_name: str, + draft_id: str, + draft_root_path: str, + draft_path: str, + asset_size: int, +) -> Dict[str, Any]: + return { + "cloud_draft_cover": False, + "cloud_draft_sync": False, + "draft_cloud_last_action_download": False, + "draft_cloud_purchase_info": "", + "draft_cloud_template_id": "", + "draft_cloud_tutorial_info": "", + 
"draft_cloud_videocut_purchase_info": "", + "draft_cover": os.path.join(draft_path, "draft_cover.jpg"), + "draft_fold_path": draft_path, + "draft_id": _format_draft_uuid(draft_id), + "draft_is_ai_shorts": False, + "draft_is_cloud_temp_draft": False, + "draft_is_invisible": False, + "draft_is_web_article_video": False, + "draft_json_file": os.path.join(draft_path, "draft_info.json"), + "draft_name": draft_name, + "draft_new_version": "", + "draft_root_path": draft_root_path, + "draft_timeline_materials_size": asset_size, + "draft_type": "", + "draft_web_article_video_enter_from": "", + "streaming_edit_draft_ready": True, + "tm_draft_cloud_completed": "", + "tm_draft_cloud_entry_id": -1, + "tm_draft_cloud_modified": 0, + "tm_draft_cloud_parent_entry_id": -1, + "tm_draft_cloud_space_id": -1, + "tm_draft_cloud_user_id": -1, + "tm_draft_create": draft.get("create_time", 0), + "tm_draft_modified": draft.get("update_time", 0), + "tm_draft_removed": 0, + "tm_duration": draft.get("duration", 0), + } + + +def _merge_root_meta_info( + existing_value: Any, + next_entry: Dict[str, Any], + root_path: str, +) -> Dict[str, Any]: + existing = existing_value if isinstance(existing_value, dict) else {} + existing_store = existing.get("all_draft_store") + if not isinstance(existing_store, list): + existing_store = [] + + all_draft_store = [ + next_entry, + *[ + entry for entry in existing_store + if ( + isinstance(entry, dict) + and entry.get("draft_id") != next_entry.get("draft_id") + and entry.get("draft_fold_path") != next_entry.get("draft_fold_path") + and entry.get("draft_name") != next_entry.get("draft_name") + ) + ], + ] + return { + "all_draft_store": all_draft_store, + "draft_ids": existing.get("draft_ids") if isinstance(existing.get("draft_ids"), int) else 1, + "root_path": existing.get("root_path") or root_path, + } + + +def _create_draft_settings(draft: Dict[str, Any], draft_root_path: str) -> str: + created_at = round(int(draft.get("create_time", 0) or 0) / MICROSECONDS) 
+ updated_at = round(int(draft.get("update_time", 0) or 0) / MICROSECONDS) + return "\n".join([ + "[General]", + f"cloud_last_modify_platform={_detect_platform(draft_root_path)}", + f"draft_create_time={created_at}", + f"draft_last_edit_time={updated_at}", + "real_edit_keys=1", + "real_edit_seconds=0", + "", + ]) + + +def _create_reference_line_attachment() -> Dict[str, Any]: + return { + "reference_lines_config": { + "horizontal_lines": [], + "is_lock": False, + "is_visible": False, + "vertical_lines": [], + }, + "safe_area_type": 0, + } + + +def _create_editing_attachment() -> Dict[str, Any]: + return { + "editing_draft": { + "ai_remove_filter_words": { + "enter_source": "", + "right_id": "", + }, + "ai_shorts_info": { + "report_params": "", + "type": 0, + }, + "cover_extra_info": { + "draft_id": "", + "position": 0, + "select_segment_id": "", + "select_segment_source_start": 0, + "select_segment_target_start": 0, + "type": 1, + }, + "crop_info_extra": { + "crop_mirror_type": 0, + "crop_rotate": 0, + "crop_rotate_total": 0, + }, + "digital_human_template_to_video_info": { + "has_upload_material": False, + "template_type": 0, + }, + "draft_used_recommend_function": "", + "edit_type": 0, + "eye_correct_enabled_multi_face_time": 0, + "has_adjusted_render_layer": False, + "image_ai_chat_info": { + "before_chat_edit": False, + "draft_modify_time": 0, + "generate_type": "", + "keyword_content": "", + "keyword_type": "", + "message_id": "", + "model_name": "", + "need_restore": False, + "picture_id": "", + "prompt_content": "", + "prompt_from": "", + "sugs_info": [], + }, + "is_open_expand_player": False, + "is_template_text_ai_generate": False, + "is_use_adjust": False, + "is_use_ai_expand": False, + "is_use_ai_remove": False, + "is_use_ai_video": False, + "is_use_audio_separation": False, + "is_use_chroma_key": False, + "is_use_curve_speed": False, + "is_use_digital_human": False, + "is_use_edit_multi_camera": False, + "is_use_lip_sync": False, + "is_use_lock_object": 
False, + "is_use_loudness_unify": False, + "is_use_noise_reduction": False, + "is_use_one_click_beauty": False, + "is_use_one_click_ultra_hd": False, + "is_use_retouch_face": False, + "is_use_smart_adjust_color": False, + "is_use_smart_body_beautify": False, + "is_use_smart_motion": False, + "is_use_subtitle_recognition": False, + "is_use_text_to_audio": False, + "material_edit_session": { + "material_edit_info": [], + "session_id": "", + "session_time": 0, + }, + "paste_segment_list": [], + "profile_entrance_type": "", + "publish_enter_from": "", + "publish_type": "", + "single_function_type": 0, + "text_convert_case_types": [], + "version": "1.0.0", + "video_recording_create_draft": "", + } + } + + +def _create_draft_virtual_store(draft: Dict[str, Any]) -> Dict[str, Any]: + materials = [ + *draft.get("materials", {}).get("videos", []), + *draft.get("materials", {}).get("audios", []), + ] + return { + "draft_materials": [], + "draft_virtual_store": [ + { + "type": 0, + "value": [ + { + "creation_time": 0, + "display_name": "", + "filter_type": 0, + "id": "", + "import_time": 0, + "import_time_us": 0, + "sort_sub_type": 0, + "sort_type": 0, + "subdraft_filter_type": 0, + } + ], + }, + { + "type": 1, + "value": [ + {"child_id": material.get("id", ""), "parent_id": ""} + for material in materials + ], + }, + {"type": 2, "value": []}, + ], + } + + +def _write_root_meta_info(draft_root_path: str, root_meta_entry: Dict[str, Any]) -> None: + root_meta_path = os.path.join(draft_root_path, "root_meta_info.json") + existing_value: Any = {} + if os.path.exists(root_meta_path): + try: + with open(root_meta_path, "r", encoding="utf-8") as f: + existing_value = json.load(f) + except Exception as e: + logger.warning(f"读取 root_meta_info.json 失败,将重建索引: {e}") + + _write_json_file( + root_meta_path, + _merge_root_meta_info(existing_value, root_meta_entry, draft_root_path), + ) + + +def _write_plaintext_draft_files( + draft_root_path: str, + draft_path: str, + draft_name: str, + 
draft_id: str, + draft: Dict[str, Any], + asset_size: int, +) -> None: + draft_info = _create_draft_info(draft, draft_name, draft_root_path) + empty_template = _create_empty_template(draft, draft_root_path) + + _write_json_file( + os.path.join(draft_path, "draft_meta_info.json"), + _create_meta_info(draft, draft_name, draft_id, draft_root_path, draft_path, asset_size), + ) + with open(os.path.join(draft_path, "draft_settings"), "w", encoding="utf-8") as f: + f.write(_create_draft_settings(draft, draft_root_path)) + with open(os.path.join(draft_path, "draft_cover.jpg"), "wb") as f: + f.write(DEFAULT_DRAFT_COVER_BYTES) + _write_json_file(os.path.join(draft_path, "draft_info.json"), draft_info) + _write_json_file(os.path.join(draft_path, "template-2.tmp"), draft_info) + _write_json_file(os.path.join(draft_path, "template.tmp"), empty_template) + _write_json_file( + os.path.join(draft_path, "common_attachment", "attachment_pc_timeline.json"), + _create_reference_line_attachment(), + ) + _write_json_file(os.path.join(draft_path, "common_attachment", "attachment_action_scene.json"), {}) + _write_json_file(os.path.join(draft_path, "common_attachment", "attachment_script_video.json"), {}) + _write_json_file(os.path.join(draft_path, "common_attachment", "attachment_gen_ai_info.json"), {}) + _write_json_file(os.path.join(draft_path, "attachment_editing.json"), _create_editing_attachment()) + _write_json_file( + os.path.join(draft_path, "draft_agency_config.json"), + { + "is_auto_agency_enabled": False, + "is_auto_agency_popup": False, + "is_single_agency_mode": False, + "marterials": None, + "use_converter": False, + "video_resolution": draft.get("canvas_config", {}).get("height", 1080), + }, + ) + with open(os.path.join(draft_path, "draft_biz_config.json"), "w", encoding="utf-8"): + pass + _write_json_file( + os.path.join(draft_path, "draft_virtual_store.json"), + _create_draft_virtual_store(draft), + ) + _write_json_file( + os.path.join(draft_path, 
"performance_opt_info.json"), + {"manual_cancle_precombine_segs": None, "need_auto_precombine_segs": None}, + ) + _write_json_file( + os.path.join(draft_path, "timeline_layout.json"), + { + "activeTimeline": draft_id, + "dockItems": [ + { + "dockIndex": 0, + "ratio": 1, + "timelineIds": [draft_id], + "timelineNames": ["时间线01"], + } + ], + "layoutOrientation": 1, + }, + ) + _write_root_meta_info( + draft_root_path, + _create_root_meta_entry(draft, draft_name, draft_id, draft_root_path, draft_path, asset_size), + ) + + +def _resolve_item_audio_file(item: Dict[str, Any], output_dir: str) -> str: + audio_file = "" + timestamp = item.get("timestamp", "") + if timestamp: + audio_file = os.path.join(output_dir, f"audio_{timestamp.replace(':', '_')}.mp3") + + item_audio_file = item.get("audio", "") + if item_audio_file and os.path.exists(item_audio_file): + audio_file = item_audio_file + + return audio_file + + +def write_plaintext_jianying_draft( + jianying_draft_path: str, + draft_name: str, + new_script_list: List[Dict[str, Any]], + params: VideoClipParams, + output_dir: str, +) -> Tuple[str, str]: + os.makedirs(jianying_draft_path, exist_ok=True) + + display_name = draft_name or f"NarratoAI_{int(time.time())}" + folder_name, draft_path = _create_unique_draft_path(jianying_draft_path, display_name) + os.makedirs(draft_path, exist_ok=False) + for rel_dir in DRAFT_PACKAGE_DIRECTORIES: + os.makedirs(os.path.join(draft_path, *rel_dir.split("/")), exist_ok=True) + + draft_id = uuid.uuid4().hex + draft = _create_draft_template(draft_id, display_name, jianying_draft_path) + video_track = _create_track("video", "视频轨道") + audio_track = _create_track("audio", "音频轨道") + draft["tracks"] = [video_track, audio_track] + + duration_cache: Dict[str, float] = {} + metadata_cache: Dict[str, Tuple[int, int, int]] = {} + used_asset_paths: Set[str] = set() + asset_path_cache: Dict[str, str] = {} + current_time_us = 0 + + for item in new_script_list: + start_time = 
float(item.get("start_time", 0.0) or 0.0) + requested_duration = float(item.get("duration", 0.0) or 0.0) + timestamp = item.get("timestamp", "") + ost = int(item.get("OST", 0) or 0) + + logger.info( + f"处理片段: OST={ost}, start_time={start_time}, " + f"duration={requested_duration}, timestamp={timestamp}" + ) + + video_file = item.get("video", "") + use_clipped_video = bool(video_file and os.path.exists(video_file)) + if not use_clipped_video: + video_file = params.video_origin_path + + if not video_file or not os.path.exists(video_file): + logger.warning(f"视频素材不存在,跳过片段: {video_file or timestamp}") + continue + + source_start_time = 0.0 if use_clipped_video else start_time + video_duration = _clamp_duration_to_media( + requested_duration, + video_file, + duration_cache, + "视频素材" if use_clipped_video else "原始视频素材", + source_start_time=source_start_time, + ) + + audio_file = _resolve_item_audio_file(item, output_dir) + audio_duration = None + if ost in [0, 2] and audio_file and os.path.exists(audio_file): + audio_duration = _get_cached_media_duration(audio_file, duration_cache) + logger.info( + f"音频文件实际时长: {audio_duration:.6f}秒, 视频片段时长: {video_duration:.3f}秒" + ) + + segment_duration = min( + video_duration, + audio_duration if audio_duration is not None else video_duration, + ) + segment_duration = _floor_duration_to_milliseconds(segment_duration) + if segment_duration <= 0: + logger.warning(f"片段时长无效,跳过: {timestamp}") + continue + + segment_duration_us = _seconds_to_microseconds(segment_duration) + video_material_duration_us, width, height = _get_video_metadata_ffprobe(video_file, metadata_cache) + video_relative_path = _register_asset( + video_file, + draft_path, + "assets/video", + f"video_{len(video_track['segments']) + 1}.mp4", + used_asset_paths, + asset_path_cache, + ) + video_material = _create_video_material(video_relative_path, video_material_duration_us, width, height) + draft["materials"]["videos"].append(video_material) + 
video_track["segments"].append(_create_video_segment( + video_material["id"], + _seconds_to_microseconds(_floor_duration_to_milliseconds(source_start_time)), + segment_duration_us, + current_time_us, + float(getattr(params, "original_volume", 1.0) or 1.0), + )) + + if ost in [0, 2] and audio_file and os.path.exists(audio_file): + audio_material_duration_us = _seconds_to_microseconds( + _get_cached_media_duration(audio_file, duration_cache) + ) + audio_relative_path = _register_asset( + audio_file, + draft_path, + "assets/audio", + f"audio_{len(audio_track['segments']) + 1}.mp3", + used_asset_paths, + asset_path_cache, + ) + audio_material = _create_audio_material(audio_relative_path, audio_material_duration_us) + draft["materials"]["audios"].append(audio_material) + audio_track["segments"].append(_create_audio_segment( + audio_material["id"], + segment_duration_us, + current_time_us, + float(getattr(params, "tts_volume", 1.0) or 1.0), + )) + elif ost in [0, 2]: + logger.warning(f"音频文件不存在: {audio_file}") + + current_time_us += segment_duration_us + + if not video_track["segments"]: + raise ValueError("没有可写入剪映草稿的视频片段") + + first_video = draft["materials"]["videos"][0] + draft["canvas_config"]["width"] = int(first_video.get("width", 1920) or 1920) + draft["canvas_config"]["height"] = int(first_video.get("height", 1080) or 1080) + draft["duration"] = current_time_us + draft["update_time"] = int(time.time() * MICROSECONDS) + + asset_size = sum( + os.path.getsize(source_file) + for source_file in asset_path_cache.keys() + if os.path.exists(source_file) + ) + _write_plaintext_draft_files( + jianying_draft_path, + draft_path, + display_name, + draft_id, + draft, + asset_size, + ) + + logger.info(f"剪映明文草稿包已写入: {draft_path} (folder={folder_name})") + return draft_path, display_name diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index 282cf47..e988435 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -3,25 +3,27 @@ 
import os import subprocess import time from os import path +from typing import Dict from loguru import logger from app.config import config from app.models import const from app.models.schema import VideoClipParams from app.services import voice, clip_video, update_script +from app.services.jianying_draft_builder import write_plaintext_jianying_draft from app.services import state as sm from app.utils import utils -def get_audio_duration_ffprobe(audio_file: str) -> float: +def get_media_duration_ffprobe(media_file: str) -> float: """ - 使用ffprobe获取音频文件的精确时长(秒) + 使用ffprobe获取媒体文件的精确时长(秒) Args: - audio_file: 音频文件路径 + media_file: 媒体文件路径 Returns: - float: 音频时长(秒),精确到微秒 + float: 媒体时长(秒),精确到微秒 """ try: cmd = [ @@ -29,20 +31,24 @@ def get_audio_duration_ffprobe(audio_file: str) -> float: '-v', 'error', '-show_entries', 'format=duration', '-of', 'csv=p=0', - audio_file + media_file ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) duration = float(result.stdout.strip()) - logger.debug(f"使用ffprobe获取音频时长: {duration:.6f}秒") + logger.debug(f"使用ffprobe获取媒体时长: {duration:.6f}秒, 文件: {media_file}") return duration except subprocess.CalledProcessError as e: logger.error(f"ffprobe执行失败: {e.stderr}") raise except Exception as e: - logger.error(f"获取音频时长失败: {str(e)}") + logger.error(f"获取媒体时长失败: {str(e)}") raise +def get_audio_duration_ffprobe(audio_file: str) -> float: + return get_media_duration_ffprobe(audio_file) + + def _strip_indextts2_prefix(voice_name: str) -> str: prefix = "indextts2:" if voice_name.startswith(prefix): @@ -54,6 +60,45 @@ def _floor_duration_to_milliseconds(duration: float) -> float: return int(duration * 1000) / 1000.0 +def _format_seconds_for_trange(seconds: float) -> str: + return f"{seconds:.3f}s" + + +def _get_cached_media_duration(media_file: str, duration_cache: Dict[str, float]) -> float: + if media_file not in duration_cache: + duration_cache[media_file] = _floor_duration_to_milliseconds( + get_media_duration_ffprobe(media_file) + 
) + return duration_cache[media_file] + + +def _clamp_duration_to_media( + requested_duration: float, + media_file: str, + duration_cache: Dict[str, float], + media_label: str, + source_start_time: float = 0.0, +) -> float: + requested_duration = _floor_duration_to_milliseconds(max(requested_duration, 0.0)) + actual_duration = _get_cached_media_duration(media_file, duration_cache) + available_duration = _floor_duration_to_milliseconds( + max(actual_duration - max(source_start_time, 0.0), 0.0) + ) + safe_duration = min(requested_duration, available_duration) + + logger.info( + f"{media_label}实际时长: {actual_duration:.6f}秒, " + f"可用时长: {available_duration:.6f}秒, 请求时长: {requested_duration:.3f}秒" + ) + if safe_duration < requested_duration: + logger.warning( + f"{media_label}短于脚本时长,已将剪映片段时长从 " + f"{requested_duration:.3f}秒 调整为 {safe_duration:.3f}秒" + ) + + return safe_duration + + def _normalize_indextts2_reference_audio(params: VideoClipParams) -> None: """Ensure IndexTTS2 uses the configured reference audio instead of a stale UI voice.""" if params.tts_engine != "indextts2": @@ -158,103 +203,26 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): logger.info("\n\n## 4. 
导出到剪映草稿") try: - import pyJianYingDraft - from pyJianYingDraft import DraftFolder, VideoSegment, AudioSegment, trange, TrackType jianying_draft_path = config.ui.get("jianying_draft_path", "") if not jianying_draft_path: raise ValueError("剪映草稿路径未配置") - # 创建DraftFolder实例 - draft_folder = DraftFolder(jianying_draft_path) - # 使用从参数中获取的草稿名称,如果为空则使用默认名称 draft_name = getattr(params, 'draft_name', "") logger.debug(f"从params获取的草稿名称: '{draft_name}' (类型: {type(draft_name)})") if not draft_name: draft_name = f"NarratoAI_{int(time.time())}" logger.debug(f"使用默认草稿名称: '{draft_name}'") - - # 创建新草稿 - script = draft_folder.create_draft(draft_name, 1920, 1080) - - # 添加视频轨道和音频轨道 - script.add_track(TrackType.video, '视频轨道') - script.add_track(TrackType.audio, '音频轨道') - - # 处理脚本数据 - current_time = 0 + output_dir = utils.task_dir(task_id) - - for item in new_script_list: - # 获取时间信息 - start_time = float(item.get('start_time', 0.0)) - duration = float(item.get('duration', 0.0)) - timestamp = item.get('timestamp', '') - - logger.info(f"处理片段: OST={item['OST']}, start_time={start_time}, duration={duration}, timestamp={timestamp}") - - # 生成音频文件路径 - audio_file = "" - if timestamp: - timestamp_formatted = timestamp.replace(':', '_') - audio_file = os.path.join( - output_dir, - f"audio_{timestamp_formatted}.mp3" - ) - - # 检查是否有裁剪后的视频文件 - video_file = item.get('video', '') - if video_file and not os.path.exists(video_file): - video_file = "" - - # 添加视频片段 - if video_file: - # 使用裁剪后的视频文件 - # 对于裁剪后的视频,target_timerange的第二个参数是持续时间 - video_segment = VideoSegment( - video_file, - trange(f"{current_time}s", f"{duration}s") - ) - else: - # 使用原始视频文件 - # source_timerange是从原始视频中截取的部分 - # target_timerange是片段在时间轴上的位置 - video_segment = VideoSegment( - params.video_origin_path, - trange(f"{current_time}s", f"{duration}s"), - source_timerange=trange(f"{start_time}s", f"{duration}s") - ) - script.add_segment(video_segment, '视频轨道') - - # 处理音频 - if item['OST'] in [0, 2]: # 需要TTS的片段 - if os.path.exists(audio_file): - # 
使用ffprobe获取精确的音频时长,避免因TTS引擎差异导致时长不匹配 - actual_audio_duration = get_audio_duration_ffprobe(audio_file) - actual_audio_duration = _floor_duration_to_milliseconds(actual_audio_duration) - logger.info(f"音频文件实际时长: {actual_audio_duration:.6f}秒, 脚本时长(视频): {duration:.3f}秒") - - # 使用音频实际时长和视频时长中的较小值,确保不超过素材时长 - # 当TTS语速调整时,音频可能比视频长或短,取较小值可以避免超出素材 - safe_duration = min(actual_audio_duration, duration) - logger.info(f"使用时长: {safe_duration:.6f}秒 (取音频和视频时长的较小值)") - - audio_segment = AudioSegment( - audio_file, - trange(f"{current_time}s", f"{safe_duration}s") - ) - script.add_segment(audio_segment, '音频轨道') - else: - logger.warning(f"音频文件不存在: {audio_file}") - # OST=1的片段保留原声,不需要添加额外音频 - - # 更新当前时间 - current_time += duration - - # 保存草稿 - script.save() - - draft_path = os.path.join(jianying_draft_path, draft_name) + + draft_path, draft_name = write_plaintext_jianying_draft( + jianying_draft_path=jianying_draft_path, + draft_name=draft_name, + new_script_list=new_script_list, + params=params, + output_dir=output_dir, + ) logger.success(f"成功导出到剪映草稿: {draft_name}") logger.info(f"草稿已保存到: {draft_path}") @@ -263,10 +231,6 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, draft_path=draft_path, draft_name=draft_name) return {"draft_path": draft_path, "draft_name": draft_name} - - except ImportError as e: - logger.error(f"导入pyJianYingDraft失败: {e}") - raise ImportError(f"pyJianYingDraft库导入失败: {e}\n请确保已正确安装该库") except Exception as e: logger.error(f"导出到剪映草稿失败: {e}") import traceback diff --git a/app/services/task.py b/app/services/task.py index 0b6b138..bf8c45b 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -49,6 +49,8 @@ def _build_subtitle_mask_options(params: VideoClipParams, enabled=None) -> dict: 'subtitle_mask_portrait_height_percent': getattr(params, "subtitle_mask_portrait_height_percent", 16.0), 'subtitle_mask_portrait_blur_radius': getattr(params, 
"subtitle_mask_portrait_blur_radius", 26), 'subtitle_mask_portrait_opacity_percent': getattr(params, "subtitle_mask_portrait_opacity_percent", 84), + 'subtitle_position_landscape_y_percent': getattr(params, "subtitle_position_landscape_y_percent", 85.0), + 'subtitle_position_portrait_y_percent': getattr(params, "subtitle_position_portrait_y_percent", 82.0), } diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py index 24a87fe..5e25478 100644 --- a/app/services/test_jianying_task_unittest.py +++ b/app/services/test_jianying_task_unittest.py @@ -1,10 +1,14 @@ +import json import tempfile import unittest from pathlib import Path from unittest.mock import patch from app.models.schema import VideoClipParams -from app.services import jianying_task +from app.services import jianying_draft_builder, jianying_task + + +DraftPathPlaceholder = "##_draftpath_placeholder_0E685133-18CE-45ED-8CB8-2904A212EC80_##" class JianyingTaskTests(unittest.TestCase): @@ -38,6 +42,120 @@ class JianyingTaskTests(unittest.TestCase): self.assertAlmostEqual(6.997, jianying_task._floor_duration_to_milliseconds(6.997333)) self.assertAlmostEqual(7.0, jianying_task._floor_duration_to_milliseconds(7.000999)) + def test_clamp_duration_to_media_uses_actual_media_duration(self): + duration_cache = {} + + with patch.object(jianying_task, "get_media_duration_ffprobe", return_value=4.2809): + duration = jianying_task._clamp_duration_to_media( + requested_duration=4.31, + media_file="/tmp/clip.mp4", + duration_cache=duration_cache, + media_label="视频素材", + ) + + self.assertAlmostEqual(4.28, duration) + + def test_clamp_duration_to_media_respects_source_start_time(self): + duration_cache = {} + + with patch.object(jianying_task, "get_media_duration_ffprobe", return_value=10.0): + duration = jianying_task._clamp_duration_to_media( + requested_duration=4.0, + media_file="/tmp/original.mp4", + duration_cache=duration_cache, + media_label="原始视频素材", + 
source_start_time=8.5, + ) + + self.assertAlmostEqual(1.5, duration) + + def test_format_seconds_for_trange_uses_millisecond_precision(self): + self.assertEqual("4.280s", jianying_task._format_seconds_for_trange(4.28)) + + def test_write_plaintext_jianying_draft_creates_root_package(self): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) / "drafts" + output_dir = Path(temp_dir) / "task" + root_path.mkdir() + output_dir.mkdir() + video_path = output_dir / "clip:01.mp4" + audio_path = output_dir / "audio_00_00_00,000-00_00_04,310.mp3" + video_path.write_bytes(b"fake video") + audio_path.write_bytes(b"fake audio") + + params = VideoClipParams( + video_origin_path=str(video_path), + original_volume=0.4, + tts_volume=0.9, + ) + script = [ + { + "OST": 0, + "start_time": 0.0, + "duration": 4.31, + "timestamp": "00:00:00,000-00:00:04,310", + "video": str(video_path), + "audio": str(audio_path), + } + ] + + def fake_duration(file_path): + return 4.2809 if file_path == str(video_path) else 5.0 + + with ( + patch.object(jianying_draft_builder, "_get_media_duration_ffprobe", side_effect=fake_duration), + patch.object( + jianying_draft_builder, + "_get_video_metadata_ffprobe", + return_value=(4_280_000, 720, 1280), + ), + ): + draft_path, draft_name = jianying_draft_builder.write_plaintext_jianying_draft( + str(root_path), + "NarratoAI_test", + script, + params, + str(output_dir), + ) + + draft_dir = Path(draft_path) + self.assertEqual("NarratoAI_test", draft_name) + self.assertTrue((draft_dir / "draft_info.json").exists()) + self.assertTrue((draft_dir / "template-2.tmp").exists()) + self.assertTrue((draft_dir / "template.tmp").exists()) + self.assertTrue((draft_dir / "draft_cover.jpg").exists()) + self.assertFalse((draft_dir / "draft_content_legacy.json").exists()) + self.assertFalse((draft_dir / "Timelines" / "project.json").exists()) + self.assertTrue((draft_dir / "assets" / "video" / "clip_01.mp4").exists()) + self.assertTrue((draft_dir / 
"assets" / "audio" / audio_path.name).exists()) + + draft_info = json.loads((draft_dir / "draft_info.json").read_text(encoding="utf-8")) + self.assertEqual("169.0.0", draft_info["new_version"]) + self.assertEqual("NarratoAI_test", draft_info["name"]) + self.assertEqual(54, len(draft_info["materials"])) + self.assertEqual( + f"{DraftPathPlaceholder}/assets/video/clip_01.mp4", + draft_info["materials"]["videos"][0]["path"], + ) + self.assertEqual( + f"{DraftPathPlaceholder}/assets/audio/{audio_path.name}", + draft_info["materials"]["audios"][0]["path"], + ) + self.assertEqual(4_280_000, draft_info["tracks"][0]["segments"][0]["source_timerange"]["duration"]) + self.assertEqual(4_280_000, draft_info["tracks"][1]["segments"][0]["source_timerange"]["duration"]) + + attachment_editing = json.loads((draft_dir / "attachment_editing.json").read_text(encoding="utf-8")) + self.assertEqual("1.0.0", attachment_editing["editing_draft"]["version"]) + self.assertFalse(attachment_editing["editing_draft"]["is_use_audio_separation"]) + + empty_template = json.loads((draft_dir / "template.tmp").read_text(encoding="utf-8")) + self.assertEqual("75.0.0", empty_template["new_version"]) + self.assertEqual([], empty_template["tracks"]) + + root_meta = json.loads((root_path / "root_meta_info.json").read_text(encoding="utf-8")) + self.assertEqual("NarratoAI_test", root_meta["all_draft_store"][0]["draft_name"]) + self.assertEqual(str(draft_dir / "draft_info.json"), root_meta["all_draft_store"][0]["draft_json_file"]) + if __name__ == "__main__": unittest.main() diff --git a/config.example.toml b/config.example.toml index 89217eb..3e81c08 100644 --- a/config.example.toml +++ b/config.example.toml @@ -186,6 +186,8 @@ subtitle_mask_portrait_height_percent = 16 subtitle_mask_portrait_blur_radius = 26 subtitle_mask_portrait_opacity_percent = 84 + subtitle_position_landscape_y_percent = 85 + subtitle_position_portrait_y_percent = 82 ########################################## # 代理和网络配置 diff --git 
a/requirements.txt b/requirements.txt index 799282c..12def4d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,6 +35,3 @@ tenacity>=9.0.0 # torch>=2.0.0 # torchvision>=0.15.0 # torchaudio>=2.0.0 - -# 剪映草稿导出依赖 -pyJianYingDraft>=0.1.0 diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py index ac3793b..f719d5e 100644 --- a/webui/components/subtitle_settings.py +++ b/webui/components/subtitle_settings.py @@ -25,6 +25,15 @@ SUBTITLE_MASK_DEFAULTS = { }, } +SUBTITLE_POSITION_DEFAULTS = { + "landscape": { + "y_percent": 85, + }, + "portrait": { + "y_percent": 82, + }, +} + VIDEO_PREVIEW_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"] @@ -71,6 +80,21 @@ def _set_subtitle_mask_value(orientation, field, value): st.session_state[key] = value +def _subtitle_position_key(orientation, field): + return f"subtitle_position_{orientation}_{field}" + + +def _get_orientation_subtitle_position_value(orientation, field): + key = _subtitle_position_key(orientation, field) + return config.ui.get(key, SUBTITLE_POSITION_DEFAULTS[orientation][field]) + + +def _set_orientation_subtitle_position_value(orientation, field, value): + key = _subtitle_position_key(orientation, field) + config.ui[key] = value + st.session_state[key] = value + + def _format_preview_time(seconds): seconds = max(0.0, float(seconds or 0)) minutes = int(seconds // 60) @@ -162,6 +186,10 @@ def _build_subtitle_mask_preview_options(): for orientation in ("landscape", "portrait"): for field in ("x_percent", "y_percent", "width_percent", "height_percent", "blur_radius", "opacity_percent"): options[_subtitle_mask_key(orientation, field)] = _get_subtitle_mask_value(orientation, field) + options[_subtitle_position_key(orientation, "y_percent")] = _get_orientation_subtitle_position_value( + orientation, + "y_percent", + ) return options @@ -187,6 +215,14 @@ def _draw_subtitle_mask_preview(frame): outline=(255, 75, 85, 235), width=max(2, round(min(image.width, 
image.height) * 0.004)), ) + subtitle_y_percent = _get_orientation_subtitle_position_value(region["orientation"], "y_percent") + subtitle_y = round((image.height - 1) * subtitle_y_percent / 100) + line_width = max(2, round(min(image.width, image.height) * 0.004)) + draw.line( + (0, subtitle_y, image.width, subtitle_y), + fill=(59, 130, 246, 220), + width=line_width, + ) image.alpha_composite(overlay) return image.convert("RGB"), region @@ -341,6 +377,18 @@ def _render_subtitle_mask_region_controls(tr, orientation): _set_subtitle_mask_value(orientation, "opacity_percent", opacity_percent) +def _render_subtitle_position_controls(tr, orientation): + y_percent = st.slider( + tr("Subtitle Burn Position"), + min_value=0, + max_value=99, + value=int(_get_orientation_subtitle_position_value(orientation, "y_percent")), + help=tr("Subtitle Burn Position Help"), + key=f"{orientation}_subtitle_burn_y_percent", + ) + _set_orientation_subtitle_position_value(orientation, "y_percent", y_percent) + + def _render_subtitle_mask_dialog(tr): @st.dialog(tr("Subtitle Mask Settings"), width="large") def subtitle_mask_dialog(): @@ -349,14 +397,20 @@ def _render_subtitle_mask_dialog(tr): with settings_col: st.caption(tr("Subtitle Mask Settings Caption")) st.caption(tr("Subtitle Mask Preview Caption")) - landscape_tab, portrait_tab = st.tabs([ + landscape_mask_tab, portrait_mask_tab, landscape_position_tab, portrait_position_tab = st.tabs([ tr("Landscape Subtitle Mask"), tr("Portrait Subtitle Mask"), + tr("Landscape Subtitle Position"), + tr("Portrait Subtitle Position"), ]) - with landscape_tab: + with landscape_mask_tab: _render_subtitle_mask_region_controls(tr, "landscape") - with portrait_tab: + with portrait_mask_tab: _render_subtitle_mask_region_controls(tr, "portrait") + with landscape_position_tab: + _render_subtitle_position_controls(tr, "landscape") + with portrait_position_tab: + _render_subtitle_position_controls(tr, "portrait") with preview_col: 
_render_subtitle_mask_preview(tr) @@ -627,6 +681,8 @@ def get_subtitle_params(): 'subtitle_mask_portrait_height_percent': _get_subtitle_mask_value("portrait", "height_percent"), 'subtitle_mask_portrait_blur_radius': _get_subtitle_mask_value("portrait", "blur_radius"), 'subtitle_mask_portrait_opacity_percent': _get_subtitle_mask_value("portrait", "opacity_percent"), + 'subtitle_position_landscape_y_percent': _get_orientation_subtitle_position_value("landscape", "y_percent"), + 'subtitle_position_portrait_y_percent': _get_orientation_subtitle_position_value("portrait", "y_percent"), 'subtitle_auto_transcribe_enabled': st.session_state.get('subtitle_auto_transcribe_enabled', False), 'subtitle_auto_transcribe_backend': st.session_state.get( 'subtitle_auto_transcribe_backend', diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 5c7b5b1..654185c 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -71,6 +71,8 @@ "Subtitle Mask Settings Caption": "Save landscape and portrait mask regions as frame percentages. The mask is applied before new subtitles are burned in.", "Landscape Subtitle Mask": "Landscape Mask", "Portrait Subtitle Mask": "Portrait Mask", + "Landscape Subtitle Position": "Landscape Subtitle Position", + "Portrait Subtitle Position": "Portrait Subtitle Position", "Save Subtitle Mask Settings": "Save Subtitle Mask Settings", "Subtitle Mask Left": "Left Position", "Subtitle Mask Left Help": "Mask distance from the left edge as a frame percentage.", @@ -84,6 +86,8 @@ "Subtitle Mask Blur Radius Help": "Blur strength for the mask background and edge.", "Subtitle Mask Opacity": "Mask Strength", "Subtitle Mask Opacity Help": "Mask blend strength. Higher values cover source subtitles more strongly.", + "Subtitle Burn Position": "Subtitle Position", + "Subtitle Burn Position Help": "New subtitle distance from the top edge as a frame percentage. 
The blue line in preview shows this position.", "Subtitle Mask Preview": "Source Subtitle Mask Preview", "Subtitle Mask Preview Caption": "Upload a source video for preview, or use the currently selected source video. Uploaded files here are only used for mask preview.", "Upload Subtitle Mask Preview Video": "Upload Preview Source Video", @@ -93,7 +97,7 @@ "Subtitle Mask Preview Empty": "Upload a preview video, or select a source video above first.", "Subtitle Mask Preview Timeline": "Preview Timeline (seconds)", "Subtitle Mask Preview Timeline Help": "Drag to a frame where the source subtitles appear, then fine-tune the mask region.", - "Subtitle Mask Preview Frame Caption": "{time} · {orientation} · red outline shows the current mask region", + "Subtitle Mask Preview Frame Caption": "{time} · {orientation} · red outline is the mask, blue line is the subtitle position", "Subtitle Mask Preview Failed": "Unable to read this video preview. Please try another video file.", "Enable Auto Transcription": "Enable Auto Transcription", "Enable Auto Transcription Help": "After the final video is merged, transcribe the whole video into subtitles and burn them into the output.", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 29945e0..1fc60f0 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -61,6 +61,8 @@ "Subtitle Mask Settings Caption": "按画面百分比保存横屏和竖屏遮罩区域;生成视频时会先叠加柔化遮罩,再烧录新字幕。", "Landscape Subtitle Mask": "横屏遮罩", "Portrait Subtitle Mask": "竖屏遮罩", + "Landscape Subtitle Position": "横屏字幕位置", + "Portrait Subtitle Position": "竖屏字幕位置", "Save Subtitle Mask Settings": "保存字幕遮罩设置", "Subtitle Mask Left": "左侧位置", "Subtitle Mask Left Help": "遮罩距离画面左侧的百分比", @@ -74,6 +76,8 @@ "Subtitle Mask Blur Radius Help": "遮罩边缘和背景的模糊强度", "Subtitle Mask Opacity": "遮罩强度", "Subtitle Mask Opacity Help": "遮罩融合强度,数值越高越容易遮住原字幕", + "Subtitle Burn Position": "字幕位置", + "Subtitle Burn Position Help": "新字幕距离画面顶部的百分比;预览中的蓝线表示当前字幕位置", "Subtitle Mask Preview": "原字幕遮罩预览", "Subtitle Mask Preview 
Caption": "可上传一段原视频作为预览,也可直接使用当前已选择的原视频;上传内容仅用于预览遮罩位置。", "Upload Subtitle Mask Preview Video": "上传预览原视频", @@ -83,7 +87,7 @@ "Subtitle Mask Preview Empty": "请上传预览视频,或先在上方选择原视频", "Subtitle Mask Preview Timeline": "预览时间轴(秒)", "Subtitle Mask Preview Timeline Help": "拖动到原字幕出现的画面,方便微调遮罩区域", - "Subtitle Mask Preview Frame Caption": "{time} · {orientation} · 红框为当前遮罩覆盖区域", + "Subtitle Mask Preview Frame Caption": "{time} · {orientation} · 红框为遮罩区域,蓝线为字幕位置", "Subtitle Mask Preview Failed": "无法读取该视频预览,请尝试更换视频文件", "Enable Auto Transcription": "启用自动转录", "Enable Auto Transcription Help": "开启后会在最终视频合并完成后,对整条视频转录生成字幕并压入成片", From a2645aebd381c5e75d8f57ec83abb34fc9a260b7 Mon Sep 17 00:00:00 2001 From: viccy Date: Sat, 6 Jun 2026 12:43:57 +0800 Subject: [PATCH 11/24] =?UTF-8?q?feat(webui):=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=89=AA=E6=98=A0=E8=8D=89=E7=A8=BF=E5=AF=BC=E5=87=BA=E7=9A=84?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E4=BD=93=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新streamlit依赖至1.57.0以支持原生弹窗组件 - 重构剪映导出逻辑,使用原生弹窗替代旧的内联表单 - 新增带样式的导出确认面板并补充多语言翻译 - 简化导出状态渲染与会话状态管理逻辑 --- README.md | 2 +- app/config/config.py | 41 +++- app/config/test_config_bootstrap_unittest.py | 15 ++ app/services/jianying_task.py | 28 ++- app/services/test_jianying_task_unittest.py | 37 +-- app/services/voice.py | 70 +++--- config.example.toml | 8 +- requirements.txt | 2 +- webui.py | 241 +++++++++++++------ webui/components/audio_settings.py | 144 +++++------ webui/i18n/en.json | 18 +- webui/i18n/zh.json | 18 +- 12 files changed, 398 insertions(+), 226 deletions(-) diff --git a/README.md b/README.md index 60afdda..7edab9b 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ NarratoAI 是一款自动化影视解说工具,基于 LLM 实现文案撰写 - 2026.04.27 发布新版本 0.7.9,新增 **Fun-ASR一键转录字幕** - 2026.04.03 发布新版本 0.7.8,重构纪录片逐帧分析链路,统一共享服务并优化抽帧、缓存、视觉并发与文案生成流程 - 2026.03.27 发布新版本 0.7.7,出于安全考虑,已移除 LiteLLM 依赖,统一使用 OpenAI 兼容请求链路 -- 2025.11.20 发布新版本 0.7.5,新增 
[IndexTTS2](https://github.com/index-tts/index-tts) 语音克隆支持 +- 2025.11.20 发布新版本 0.7.5,新增 [IndexTTS-1.5](https://github.com/index-tts/index-tts) 语音克隆支持 - 2025.10.15 发布新版本 0.7.3,升级大模型供应商管理能力 - 2025.09.10 发布新版本 0.7.2,新增腾讯云tts - 2025.08.18 发布新版本 0.7.1,支持 **语音克隆** 和 最新大模型 diff --git a/app/config/config.py b/app/config/config.py index e157f94..ac44df1 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -9,6 +9,36 @@ from app.config.defaults import build_default_app_config, merge_missing_app_defa root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) config_file = f"{root_dir}/config.toml" version_file = f"{root_dir}/project_version" +INDEXTTS_ENGINE = "indextts" +INDEXTTS_LEGACY_ENGINE = "indextts2" +INDEXTTS_DISPLAY_NAME = "IndexTTS-1.5" +INDEXTTS_VOICE_PREFIX = f"{INDEXTTS_ENGINE}:" +INDEXTTS_LEGACY_VOICE_PREFIX = f"{INDEXTTS_LEGACY_ENGINE}:" + + +def normalize_tts_engine_name(tts_engine: str) -> str: + if tts_engine == INDEXTTS_LEGACY_ENGINE: + return INDEXTTS_ENGINE + return tts_engine + + +def normalize_indextts_voice_prefix(voice_name: str) -> str: + if isinstance(voice_name, str) and voice_name.startswith(INDEXTTS_LEGACY_VOICE_PREFIX): + return f"{INDEXTTS_VOICE_PREFIX}{voice_name[len(INDEXTTS_LEGACY_VOICE_PREFIX):]}" + return voice_name + + +def migrate_indextts_config(config_data): + if "indextts" not in config_data and INDEXTTS_LEGACY_ENGINE in config_data: + config_data["indextts"] = config_data[INDEXTTS_LEGACY_ENGINE] + + ui_config = config_data.get("ui") + if isinstance(ui_config, dict): + if "tts_engine" in ui_config: + ui_config["tts_engine"] = normalize_tts_engine_name(ui_config.get("tts_engine", "")) + if "voice_name" in ui_config: + ui_config["voice_name"] = normalize_indextts_voice_prefix(ui_config.get("voice_name", "")) + return config_data def get_version_from_file(): @@ -32,13 +62,13 @@ def load_config(): _config_ = build_default_config() write_config_file(_config_) logger.info("create config.toml with 
shared defaults") - return _config_ + return migrate_indextts_config(_config_) logger.info(f"load config from file: {config_file}") _config_ = load_toml_file(config_file) _config_["app"] = merge_missing_app_defaults(_config_.get("app", {})) - return _config_ + return migrate_indextts_config(_config_) def load_toml_file(file_path): @@ -60,7 +90,7 @@ def build_default_config(): config_data = load_toml_file(example_file) config_data["app"] = build_default_app_config(config_data.get("app", {})) - return config_data + return migrate_indextts_config(config_data) def write_config_file(config_data): @@ -82,7 +112,8 @@ def save_config(): _cfg["ui"] = ui _cfg["tts_qwen"] = tts_qwen _cfg["fun_asr"] = fun_asr - _cfg["indextts2"] = indextts2 + _cfg["indextts"] = indextts + _cfg.pop(INDEXTTS_LEGACY_ENGINE, None) _cfg["doubaotts"] = doubaotts f.write(toml.dumps(_cfg)) @@ -98,7 +129,7 @@ ui = _cfg.get("ui", {}) frames = _cfg.get("frames", {}) tts_qwen = _cfg.get("tts_qwen", {}) fun_asr = _cfg.get("fun_asr", {}) -indextts2 = _cfg.get("indextts2", {}) +indextts = _cfg.get("indextts", _cfg.get(INDEXTTS_LEGACY_ENGINE, {})) doubaotts = _cfg.get("doubaotts", {}) hostname = socket.gethostname() diff --git a/app/config/test_config_bootstrap_unittest.py b/app/config/test_config_bootstrap_unittest.py index 8398fea..691b6da 100644 --- a/app/config/test_config_bootstrap_unittest.py +++ b/app/config/test_config_bootstrap_unittest.py @@ -64,6 +64,21 @@ hide_config = true self.assertEqual("Pro/zai-org/GLM-5", saved_config["app"]["text_openai_model_name"]) self.assertTrue(saved_config["app"]["hide_config"]) + def test_indextts_legacy_config_is_migrated(self): + migrated = cfg.migrate_indextts_config( + { + "indextts2": {"api_url": "http://127.0.0.1:8081/tts"}, + "ui": { + "tts_engine": "indextts2", + "voice_name": "indextts2:/tmp/reference.wav", + }, + } + ) + + self.assertEqual("http://127.0.0.1:8081/tts", migrated["indextts"]["api_url"]) + self.assertEqual("indextts", 
migrated["ui"]["tts_engine"]) + self.assertEqual("indextts:/tmp/reference.wav", migrated["ui"]["voice_name"]) + class OpenAICompatibleModelDefaultsTests(unittest.TestCase): def test_ui_keeps_full_model_name_and_openai_provider(self): diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index e988435..dd9db81 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -49,8 +49,9 @@ def get_audio_duration_ffprobe(audio_file: str) -> float: return get_media_duration_ffprobe(audio_file) -def _strip_indextts2_prefix(voice_name: str) -> str: - prefix = "indextts2:" +def _strip_indextts_prefix(voice_name: str) -> str: + voice_name = config.normalize_indextts_voice_prefix(voice_name or "") + prefix = config.INDEXTTS_VOICE_PREFIX if voice_name.startswith(prefix): return voice_name[len(prefix):] return voice_name @@ -99,24 +100,25 @@ def _clamp_duration_to_media( return safe_duration -def _normalize_indextts2_reference_audio(params: VideoClipParams) -> None: - """Ensure IndexTTS2 uses the configured reference audio instead of a stale UI voice.""" - if params.tts_engine != "indextts2": +def _normalize_indextts_reference_audio(params: VideoClipParams) -> None: + """Ensure IndexTTS-1.5 uses the configured reference audio instead of a stale UI voice.""" + params.tts_engine = config.normalize_tts_engine_name(params.tts_engine) + if params.tts_engine != config.INDEXTTS_ENGINE: return - candidate = _strip_indextts2_prefix(getattr(params, "voice_name", "") or "") + candidate = _strip_indextts_prefix(getattr(params, "voice_name", "") or "") if candidate and os.path.isfile(candidate): - params.voice_name = f"indextts2:{candidate}" - logger.info(f"IndexTTS2 使用参考音频: {candidate}") + params.voice_name = f"{config.INDEXTTS_VOICE_PREFIX}{candidate}" + logger.info(f"IndexTTS-1.5 使用参考音频: {candidate}") return - configured_ref = _strip_indextts2_prefix(config.indextts2.get("reference_audio", "") or "") + configured_ref = 
_strip_indextts_prefix(config.indextts.get("reference_audio", "") or "") if configured_ref and os.path.isfile(configured_ref): - params.voice_name = f"indextts2:{configured_ref}" - logger.info(f"IndexTTS2 使用配置中的参考音频: {configured_ref}") + params.voice_name = f"{config.INDEXTTS_VOICE_PREFIX}{configured_ref}" + logger.info(f"IndexTTS-1.5 使用配置中的参考音频: {configured_ref}") return - raise ValueError("IndexTTS2 参考音频不存在,请在音频设置中上传或选择有效的参考音频") + raise ValueError("IndexTTS-1.5 参考音频不存在,请在音频设置中上传或选择有效的参考音频") def start_export_jianying_draft(task_id: str, params: VideoClipParams): @@ -159,7 +161,7 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): 2. 使用 TTS 生成音频素材 """ logger.info("\n\n## 2. 根据OST设置生成音频列表") - _normalize_indextts2_reference_audio(params) + _normalize_indextts_reference_audio(params) tts_segments = [ segment for segment in list_script if segment['OST'] in [0, 2] diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py index 5e25478..c073d3f 100644 --- a/app/services/test_jianying_task_unittest.py +++ b/app/services/test_jianying_task_unittest.py @@ -12,31 +12,40 @@ DraftPathPlaceholder = "##_draftpath_placeholder_0E685133-18CE-45ED-8CB8-2904A21 class JianyingTaskTests(unittest.TestCase): - def test_normalize_indextts2_uses_valid_param_reference(self): + def test_normalize_indextts_uses_valid_param_reference(self): with tempfile.NamedTemporaryFile(suffix=".wav") as ref: - params = VideoClipParams(tts_engine="indextts2", voice_name=ref.name) + params = VideoClipParams(tts_engine="indextts", voice_name=ref.name) - jianying_task._normalize_indextts2_reference_audio(params) + jianying_task._normalize_indextts_reference_audio(params) - self.assertEqual(f"indextts2:{ref.name}", params.voice_name) + self.assertEqual(f"indextts:{ref.name}", params.voice_name) - def test_normalize_indextts2_uses_config_reference_when_param_is_stale(self): + def 
test_normalize_indextts_uses_config_reference_when_param_is_stale(self): with tempfile.TemporaryDirectory() as temp_dir: ref_path = Path(temp_dir) / "reference.wav" ref_path.write_bytes(b"fake wav") - params = VideoClipParams(tts_engine="indextts2", voice_name="zh-CN-YunjianNeural") + params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural") - with patch.dict(jianying_task.config.indextts2, {"reference_audio": str(ref_path)}, clear=False): - jianying_task._normalize_indextts2_reference_audio(params) + with patch.dict(jianying_task.config.indextts, {"reference_audio": str(ref_path)}, clear=False): + jianying_task._normalize_indextts_reference_audio(params) - self.assertEqual(f"indextts2:{ref_path}", params.voice_name) + self.assertEqual(f"indextts:{ref_path}", params.voice_name) - def test_normalize_indextts2_requires_existing_reference_audio(self): - params = VideoClipParams(tts_engine="indextts2", voice_name="zh-CN-YunjianNeural") + def test_normalize_indextts_accepts_legacy_engine_and_prefix(self): + with tempfile.NamedTemporaryFile(suffix=".wav") as ref: + params = VideoClipParams(tts_engine="indextts2", voice_name=f"indextts2:{ref.name}") - with patch.dict(jianying_task.config.indextts2, {"reference_audio": ""}, clear=False): - with self.assertRaisesRegex(ValueError, "IndexTTS2 参考音频不存在"): - jianying_task._normalize_indextts2_reference_audio(params) + jianying_task._normalize_indextts_reference_audio(params) + + self.assertEqual("indextts", params.tts_engine) + self.assertEqual(f"indextts:{ref.name}", params.voice_name) + + def test_normalize_indextts_requires_existing_reference_audio(self): + params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural") + + with patch.dict(jianying_task.config.indextts, {"reference_audio": ""}, clear=False): + with self.assertRaisesRegex(ValueError, "IndexTTS-1.5 参考音频不存在"): + jianying_task._normalize_indextts_reference_audio(params) def test_floor_duration_to_milliseconds(self): 
self.assertAlmostEqual(6.997, jianying_task._floor_duration_to_milliseconds(6.997333)) diff --git a/app/services/voice.py b/app/services/voice.py index 58cd1c9..38d70ee 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1263,6 +1263,8 @@ def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. def tts( text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str, tts_engine: str ) -> Union[SubMaker, None]: + tts_engine = config.normalize_tts_engine_name(tts_engine) + voice_name = config.normalize_indextts_voice_prefix(voice_name) logger.info(f"使用 TTS 引擎: '{tts_engine}', 语音: '{voice_name}'") if tts_engine == "tencent_tts": @@ -1288,9 +1290,9 @@ def tts( logger.info("分发到 Edge TTS") return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file) - if tts_engine == "indextts2": - logger.info("分发到 IndexTTS2") - return indextts2_tts(text, voice_name, voice_file, speed=voice_rate) + if tts_engine == config.INDEXTTS_ENGINE: + logger.info("分发到 IndexTTS-1.5") + return indextts_tts(text, voice_name, voice_file, speed=voice_rate) if tts_engine == "doubaotts": logger.info("分发到豆包语音 TTS") @@ -1772,7 +1774,8 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f :param tts_engine: TTS 引擎 :return: 生成的音频文件列表 """ - voice_name = parse_voice_name(voice_name) + tts_engine = config.normalize_tts_engine_name(tts_engine) + voice_name = config.normalize_indextts_voice_prefix(parse_voice_name(voice_name)) output_dir = utils.task_dir(task_id) tts_results = [] @@ -1800,8 +1803,8 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"或者使用其他 tts 引擎") continue else: - # SoulVoice、Qwen3、IndexTTS2、豆包语音 引擎不生成字幕文件 - if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2" or tts_engine == "doubaotts": + # SoulVoice、Qwen3、IndexTTS-1.5、豆包语音 引擎不生成字幕文件 + if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == 
config.INDEXTTS_ENGINE or tts_engine == "doubaotts": # 获取实际音频文件的时长 duration = get_audio_duration_from_file(audio_file) if duration <= 0: @@ -2219,24 +2222,25 @@ def parse_soulvoice_voice(voice_name: str) -> str: return voice_name -def parse_indextts2_voice(voice_name: str) -> str: +def parse_indextts_voice(voice_name: str) -> str: """ - 解析 IndexTTS2 语音名称 - 支持格式:indextts2:reference_audio_path + 解析 IndexTTS-1.5 语音名称 + 支持格式:indextts:reference_audio_path 返回参考音频文件路径 """ - if voice_name.startswith("indextts2:"): - return voice_name[10:] # 移除 "indextts2:" 前缀 + voice_name = config.normalize_indextts_voice_prefix(voice_name) + if voice_name.startswith(config.INDEXTTS_VOICE_PREFIX): + return voice_name[len(config.INDEXTTS_VOICE_PREFIX):] return voice_name -def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: +def indextts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: """ - 使用 IndexTTS2 API 进行零样本语音克隆 + 使用 IndexTTS-1.5 API 进行零样本语音克隆 Args: text: 要转换的文本 - voice_name: 参考音频文件(格式:indextts2:path/to/audio.wav) + voice_name: 参考音频文件(格式:indextts:path/to/audio.wav) voice_file: 输出音频文件路径 speed: 语音速度(此引擎暂不支持速度调节) @@ -2244,20 +2248,20 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. 
SubMaker: 包含时间戳信息的字幕制作器,失败时返回 None """ # 获取配置 - api_url = config.indextts2.get("api_url", "http://192.168.3.6:8081/tts") - infer_mode = config.indextts2.get("infer_mode", "普通推理") - temperature = config.indextts2.get("temperature", 1.0) - top_p = config.indextts2.get("top_p", 0.8) - top_k = config.indextts2.get("top_k", 30) - do_sample = config.indextts2.get("do_sample", True) - num_beams = config.indextts2.get("num_beams", 3) - repetition_penalty = config.indextts2.get("repetition_penalty", 10.0) + api_url = config.indextts.get("api_url", "http://192.168.3.6:8081/tts") + infer_mode = config.indextts.get("infer_mode", "普通推理") + temperature = config.indextts.get("temperature", 1.0) + top_p = config.indextts.get("top_p", 0.8) + top_k = config.indextts.get("top_k", 30) + do_sample = config.indextts.get("do_sample", True) + num_beams = config.indextts.get("num_beams", 3) + repetition_penalty = config.indextts.get("repetition_penalty", 10.0) # 解析参考音频文件 - reference_audio_path = parse_indextts2_voice(voice_name) + reference_audio_path = parse_indextts_voice(voice_name) if not reference_audio_path or not os.path.exists(reference_audio_path): - logger.error(f"IndexTTS2 参考音频文件不存在: {reference_audio_path}") + logger.error(f"IndexTTS-1.5 参考音频文件不存在: {reference_audio_path}") return None # 准备请求数据 @@ -2279,7 +2283,7 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. # 重试机制 for attempt in range(3): try: - logger.info(f"第 {attempt + 1} 次调用 IndexTTS2 API") + logger.info(f"第 {attempt + 1} 次调用 IndexTTS-1.5 API") # 设置代理 proxies = {} @@ -2295,7 +2299,7 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. files=files, data=data, proxies=proxies, - timeout=120 # IndexTTS2 推理可能需要较长时间 + timeout=120 # IndexTTS-1.5 推理可能需要较长时间 ) if response.status_code == 200: @@ -2303,9 +2307,9 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. 
with open(voice_file, 'wb') as f: f.write(response.content) - logger.info(f"IndexTTS2 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节") + logger.info(f"IndexTTS-1.5 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节") - # IndexTTS2 不支持精确字幕生成,返回简单的 SubMaker 对象 + # IndexTTS-1.5 不支持精确字幕生成,返回简单的 SubMaker 对象 sub_maker = new_sub_maker() # 估算音频时长(基于文本长度) estimated_duration_ms = max(1000, int(len(text) * 200)) @@ -2314,14 +2318,14 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. return sub_maker else: - logger.error(f"IndexTTS2 API 调用失败: {response.status_code} - {response.text}") + logger.error(f"IndexTTS-1.5 API 调用失败: {response.status_code} - {response.text}") except requests.exceptions.Timeout: - logger.error(f"IndexTTS2 API 调用超时 (尝试 {attempt + 1}/3)") + logger.error(f"IndexTTS-1.5 API 调用超时 (尝试 {attempt + 1}/3)") except requests.exceptions.RequestException as e: - logger.error(f"IndexTTS2 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)") + logger.error(f"IndexTTS-1.5 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)") except Exception as e: - logger.error(f"IndexTTS2 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)") + logger.error(f"IndexTTS-1.5 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)") finally: # 确保关闭文件 try: @@ -2338,5 +2342,5 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1. 
except: pass - logger.error("IndexTTS2 TTS 生成失败,已达到最大重试次数") + logger.error("IndexTTS-1.5 TTS 生成失败,已达到最大重试次数") return None diff --git a/config.example.toml b/config.example.toml index 3e81c08..652ffb0 100644 --- a/config.example.toml +++ b/config.example.toml @@ -114,8 +114,8 @@ api_key = "" model = "fun-asr" -[indextts2] - # IndexTTS2 语音克隆配置 +[indextts] + # IndexTTS-1.5 语音克隆配置 # 这是一个开源的零样本语音克隆项目,需要自行部署 # 项目地址:https://github.com/index-tts/index-tts # 默认 API 地址(本地部署) @@ -153,8 +153,8 @@ silence_duration = 0.125 [ui] - # TTS引擎选择 (indextts2, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) - tts_engine = "indextts2" + # TTS引擎选择 (indextts, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) + tts_engine = "indextts" # Edge TTS 配置 edge_voice_name = "zh-CN-XiaoyiNeural-Female" diff --git a/requirements.txt b/requirements.txt index 12def4d..c6011de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ requests>=2.32.0 moviepy==2.1.1 edge-tts==7.2.7 -streamlit>=1.45.0 +streamlit>=1.57.0 watchdog==6.0.0 loguru>=0.7.3 tomli>=2.2.1 diff --git a/webui.py b/webui.py index 7c70ad7..f57d240 100644 --- a/webui.py +++ b/webui.py @@ -2,6 +2,7 @@ import streamlit as st import os import sys import time +from html import escape from loguru import logger from app.config import config from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \ @@ -232,10 +233,10 @@ def get_voice_name_for_tts_engine(tts_engine: str) -> str: return f"tencent:{config.ui.get('tencent_voice_type', '101001')}" if tts_engine == 'qwen3_tts': return f"qwen3:{config.ui.get('qwen_voice_type', 'Cherry')}" - if tts_engine == 'indextts2': - reference_audio = config.indextts2.get('reference_audio', '') + if config.normalize_tts_engine_name(tts_engine) == config.INDEXTTS_ENGINE: + reference_audio = config.indextts.get('reference_audio', '') if reference_audio: - return f"indextts2:{reference_audio}" + return 
f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}" return config.ui.get('voice_name', '') if tts_engine == 'doubaotts': return config.ui.get('doubaotts_voice_type', 'BV700_streaming') @@ -247,7 +248,7 @@ def get_voice_name_for_tts_engine(tts_engine: str) -> str: return config.ui.get('voice_name', config.ui.get('edge_voice_name', 'zh-CN-XiaoxiaoNeural-Female')) -def get_jianying_export_params() -> VideoClipParams: +def get_jianying_export_params(draft_name=None) -> VideoClipParams: """获取导出到剪映草稿的参数""" tts_engine = st.session_state.get('tts_engine', config.ui.get('tts_engine', 'edge_tts')) voice_name = get_voice_name_for_tts_engine(tts_engine) @@ -272,20 +273,178 @@ def get_jianying_export_params() -> VideoClipParams: tts_volume=st.session_state.get('tts_volume', 1.0), original_volume=st.session_state.get('original_volume', 0.7), bgm_volume=st.session_state.get('bgm_volume', 0.3), - draft_name=st.session_state.get('draft_name_input', f"NarratoAI_{int(time.time())}") + draft_name=( + draft_name + if draft_name is not None + else st.session_state.get('draft_name_input', f"NarratoAI_{int(time.time())}") + ) ) +def _render_jianying_export_status(): + """渲染剪映导出的结果提示。""" + result = st.session_state.get('jianying_export_result') + error = st.session_state.get('jianying_export_error') + + if result: + st.success(tr("Jianying draft exported successfully").format(name=result['draft_name'])) + st.info(tr("Draft saved to").format(path=result['draft_path'])) + elif error: + st.error(f"{tr('Failed to export Jianying draft')}: {error}") + + +def _render_jianying_export_dialog(): + """使用弹窗确认剪映草稿名称。""" + import uuid + from loguru import logger + + @st.dialog(tr("Export to Jianying Draft"), width="small") + def jianying_export_dialog(): + jianying_draft_path = config.ui.get("jianying_draft_path", "") + dialog_title = escape(tr("Jianying export dialog title")) + dialog_description = escape(tr("Jianying export dialog description")) + destination_label = escape(tr("Jianying export 
destination")) + destination_path = escape(jianying_draft_path or "-") + + st.markdown( + f""" + +
+
📤
+
+
{dialog_title}
+
{dialog_description}
+
+
+
+ {destination_label} + {destination_path} +
+ """, + unsafe_allow_html=True, + ) + + draft_name = st.text_input( + tr("Jianying draft name"), + key="draft_name_input", + placeholder="NarratoAI_", + ) + + error = st.session_state.get('jianying_export_error') + if error: + st.error(f"{tr('Failed to export Jianying draft')}: {error}") + + cancel_col, confirm_col = st.columns(2) + with cancel_col: + if st.button(tr("Cancel"), key="cancel_export", use_container_width=True): + st.session_state['jianying_export_error'] = None + st.rerun() + + with confirm_col: + if st.button(tr("Confirm Export"), key="confirm_export", type="primary", use_container_width=True): + draft_name = (draft_name or "").strip() + if not draft_name: + st.error(tr("Please enter draft name")) + return + + # 创建任务ID + task_id = str(uuid.uuid4()) + st.session_state['task_id'] = task_id + + # 构建参数 + try: + params = get_jianying_export_params(draft_name) + except Exception as e: + logger.error(f"构建参数失败: {e}") + st.session_state['jianying_export_error'] = f"{tr('Failed to build parameters')}: {e}" + st.error(st.session_state['jianying_export_error']) + return + + with st.spinner(tr("Exporting to Jianying draft...")): + try: + from app.services import jianying_task + + # 调用导出到剪映草稿的任务 + result = jianying_task.start_export_jianying_draft(task_id, params) + + # 记录日志 + logger.info(f"成功导出到剪映草稿: {result['draft_name']}") + logger.info(f"草稿已保存到: {result['draft_path']}") + + # 保存结果到session state + st.session_state['jianying_export_result'] = result + st.session_state['jianying_export_error'] = None + st.rerun() + except Exception as e: + logger.error(f"导出到剪映草稿失败: {e}") + import traceback + logger.error(f"错误详情: {traceback.format_exc()}") + st.session_state['jianying_export_error'] = str(e) + st.session_state['jianying_export_result'] = None + st.error(f"{tr('Failed to export Jianying draft')}: {e}") + + jianying_export_dialog() + + def render_export_jianying_button(): """渲染导出到剪映草稿按钮和处理逻辑""" import os import time - import uuid - from loguru import logger # 
初始化session state - if 'show_jianying_export_form' not in st.session_state: - st.session_state['show_jianying_export_form'] = False if 'jianying_export_result' not in st.session_state: st.session_state['jianying_export_result'] = None if 'jianying_export_error' not in st.session_state: @@ -310,70 +469,12 @@ def render_export_jianying_button(): st.error(tr("Jianying draft folder does not exist").format(path=jianying_draft_path)) return - # 显示导出表单 - st.session_state['show_jianying_export_form'] = True st.session_state['jianying_export_result'] = None st.session_state['jianying_export_error'] = None + st.session_state['draft_name_input'] = f"NarratoAI_{int(time.time())}" + _render_jianying_export_dialog() - # 显示导出表单 - if st.session_state['show_jianying_export_form']: - st.markdown("---") - st.subheader(tr("Export to Jianying Draft")) - - draft_name = st.text_input( - tr("Please enter Jianying draft name"), - value=f"NarratoAI_{int(time.time())}", - key="draft_name_input" - ) - - if st.button(tr("Confirm Export"), key="confirm_export"): - if not draft_name: - st.error(tr("Please enter draft name")) - return - - # 创建任务ID - task_id = str(uuid.uuid4()) - st.session_state['task_id'] = task_id - - # 构建参数 - try: - params = get_jianying_export_params() - except Exception as e: - logger.error(f"构建参数失败: {e}") - st.error(f"{tr('Failed to build parameters')}: {e}") - return - - with st.spinner(tr("Exporting to Jianying draft...")): - try: - from app.services import jianying_task - - # 调用导出到剪映草稿的任务 - result = jianying_task.start_export_jianying_draft(task_id, params) - - # 记录日志 - logger.info(f"成功导出到剪映草稿: {result['draft_name']}") - logger.info(f"草稿已保存到: {result['draft_path']}") - - # 保存结果到session state - st.session_state['jianying_export_result'] = result - st.session_state['jianying_export_error'] = None - st.session_state['show_jianying_export_form'] = False - - st.success(tr("Jianying draft exported successfully").format(name=result['draft_name'])) - st.info(tr("Draft saved 
to").format(path=result['draft_path'])) - except Exception as e: - logger.error(f"导出到剪映草稿失败: {e}") - import traceback - logger.error(f"错误详情: {traceback.format_exc()}") - st.session_state['jianying_export_error'] = str(e) - st.session_state['jianying_export_result'] = None - st.error(f"{tr('Failed to export Jianying draft')}: {e}") - - if st.button(tr("Cancel"), key="cancel_export"): - st.session_state['show_jianying_export_form'] = False - st.session_state['jianying_export_result'] = None - st.session_state['jianying_export_error'] = None - st.rerun() + _render_jianying_export_status() diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index 43b48fd..5b12cdd 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -9,9 +9,9 @@ from app.models.schema import AudioVolumeDefaults from app.utils import utils -INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/mp3" -INDEXTTS2_REFERENCE_AUDIO_COPY_SUBDIR = "indextts2_refs" -INDEXTTS2_REFERENCE_AUDIO_MAP = [ +INDEXTTS_REFERENCE_AUDIO_SOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/mp3" +INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR = "indextts_refs" +INDEXTTS_REFERENCE_AUDIO_MAP = [ ("yingshijieshuo-zh-male.mp3", "影视解说", "Film Narration"), ("maikeashe-zh-male.mp3", "麦克阿瑟", "Macintosh"), ("dong-yuhui-zh-male.mp3", "董宇辉", "Dong Yuhui"), @@ -35,7 +35,7 @@ INDEXTTS2_REFERENCE_AUDIO_MAP = [ ("meiqu-kelong-en-unknown.mp3", "美式男声", "US Clone"), ("sarah-en-female.mp3", "莎拉", "Sarah"), ] -INDEXTTS2_REFERENCE_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg") +INDEXTTS_REFERENCE_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg") BGM_RESOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/bgms-safe" BGM_TRACKS_JSON = os.path.join(BGM_RESOURCE_DIR, "tracks.json") BGM_UPLOAD_SUBDIR = "uploaded_bgms" @@ -56,7 +56,7 @@ def get_soulvoice_voices(): def get_tts_engine_options(tr=lambda key: key): """获取TTS引擎选项""" return { 
- "indextts2": "IndexTTS2", + config.INDEXTTS_ENGINE: config.INDEXTTS_DISPLAY_NAME, "edge_tts": "Edge TTS", "qwen3_tts": tr("Tongyi Qwen3 TTS"), "tencent_tts": tr("Tencent Cloud TTS"), @@ -92,10 +92,10 @@ def get_tts_engine_descriptions(tr=lambda key: key): "use_case": tr("High-quality Chinese speech synthesis use case"), "registration": "https://dashscope.aliyuncs.com/" }, - "indextts2": { - "title": "IndexTTS2", - "features": tr("IndexTTS2 features"), - "use_case": tr("IndexTTS2 use case"), + config.INDEXTTS_ENGINE: { + "title": config.INDEXTTS_DISPLAY_NAME, + "features": tr("IndexTTS features"), + "use_case": tr("IndexTTS use case"), "registration": None }, "doubaotts": { @@ -107,7 +107,7 @@ def get_tts_engine_descriptions(tr=lambda key: key): } -def infer_indextts2_reference_audio_language(filename): +def infer_indextts_reference_audio_language(filename): """根据文件名推断参考音频语言""" lower_filename = filename.lower() if "-zh-" in lower_filename: @@ -117,30 +117,30 @@ def infer_indextts2_reference_audio_language(filename): return "unknown" -def get_indextts2_reference_audio_options(): - """获取本地 IndexTTS2 参考音频选项""" +def get_indextts_reference_audio_options(): + """获取本地 IndexTTS-1.5 参考音频选项""" options = [] mapped_files = set() - for filename, zh_name, en_name in INDEXTTS2_REFERENCE_AUDIO_MAP: - audio_path = os.path.join(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR, filename) + for filename, zh_name, en_name in INDEXTTS_REFERENCE_AUDIO_MAP: + audio_path = os.path.join(INDEXTTS_REFERENCE_AUDIO_SOURCE_DIR, filename) if os.path.isfile(audio_path): options.append({ "filename": filename, "path": audio_path, "zh": zh_name, "en": en_name, - "language": infer_indextts2_reference_audio_language(filename), + "language": infer_indextts_reference_audio_language(filename), }) mapped_files.add(filename) - if os.path.isdir(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR): - for filename in sorted(os.listdir(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR)): + if os.path.isdir(INDEXTTS_REFERENCE_AUDIO_SOURCE_DIR): + for 
filename in sorted(os.listdir(INDEXTTS_REFERENCE_AUDIO_SOURCE_DIR)): if filename in mapped_files: continue - if not filename.lower().endswith(INDEXTTS2_REFERENCE_AUDIO_EXTENSIONS): + if not filename.lower().endswith(INDEXTTS_REFERENCE_AUDIO_EXTENSIONS): continue - audio_path = os.path.join(INDEXTTS2_REFERENCE_AUDIO_SOURCE_DIR, filename) + audio_path = os.path.join(INDEXTTS_REFERENCE_AUDIO_SOURCE_DIR, filename) if not os.path.isfile(audio_path): continue fallback_name = os.path.splitext(filename)[0] @@ -149,14 +149,14 @@ def get_indextts2_reference_audio_options(): "path": audio_path, "zh": fallback_name, "en": fallback_name, - "language": infer_indextts2_reference_audio_language(filename), + "language": infer_indextts_reference_audio_language(filename), }) return options -def format_indextts2_reference_audio_option(option): - """格式化 IndexTTS2 参考音频下拉显示名""" +def format_indextts_reference_audio_option(option): + """格式化 IndexTTS-1.5 参考音频下拉显示名""" zh_name = option.get("zh", "") en_name = option.get("en", "") language = option.get("language", "unknown") @@ -182,7 +182,7 @@ def format_indextts2_reference_audio_option(option): return f"{display_name} ({language_label})" -def get_indextts2_reference_audio_index(options, saved_reference_audio): +def get_indextts_reference_audio_index(options, saved_reference_audio): """根据已保存的参考音频文件匹配下拉选项索引""" if not options: return 0 @@ -195,12 +195,12 @@ def get_indextts2_reference_audio_index(options, saved_reference_audio): return 0 -def copy_indextts2_reference_audio(source_path): +def copy_indextts_reference_audio(source_path): """复制一份参考音频到项目存储目录,并返回复制后的路径""" if not source_path or not os.path.isfile(source_path): return "" - target_dir = utils.storage_dir(INDEXTTS2_REFERENCE_AUDIO_COPY_SUBDIR, create=True) + target_dir = utils.storage_dir(INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR, create=True) target_path = os.path.join(target_dir, os.path.basename(source_path)) if os.path.abspath(source_path) == os.path.abspath(target_path): @@ -336,7 +336,7 
@@ def render_reference_audio_preview_button(reference_audio, key, tr): disabled=not can_preview, use_container_width=True, ): - st.session_state["indextts2_reference_audio_preview_path"] = reference_audio + st.session_state["indextts_reference_audio_preview_path"] = reference_audio def render_bgm_preview_button(bgm_file, key, tr): @@ -395,11 +395,13 @@ def render_tts_settings(tr): engine_descriptions = get_tts_engine_descriptions(tr) # 获取保存的TTS引擎设置 - saved_tts_engine = config.ui.get("tts_engine", "indextts2") + saved_tts_engine = config.normalize_tts_engine_name( + config.ui.get("tts_engine", config.INDEXTTS_ENGINE) + ) # 确保保存的引擎在可用选项中 if saved_tts_engine not in engine_options: - saved_tts_engine = "indextts2" + saved_tts_engine = config.INDEXTTS_ENGINE # TTS引擎选择下拉框 selected_engine = st.selectbox( @@ -438,8 +440,8 @@ def render_tts_settings(tr): render_tencent_tts_settings(tr) elif selected_engine == "qwen3_tts": render_qwen3_tts_settings(tr) - elif selected_engine == "indextts2": - render_indextts2_tts_settings(tr) + elif selected_engine == config.INDEXTTS_ENGINE: + render_indextts_tts_settings(tr) elif selected_engine == "doubaotts": render_doubaotts_settings(tr) @@ -850,22 +852,22 @@ def render_qwen3_tts_settings(tr): config.ui["voice_name"] = voice_type #兼容性 -def render_indextts2_tts_settings(tr): - """渲染 IndexTTS2 TTS 设置""" +def render_indextts_tts_settings(tr): + """渲染 IndexTTS-1.5 TTS 设置""" # API 地址配置 api_url = st.text_input( tr("API URL"), - value=config.indextts2.get("api_url", "http://127.0.0.1:8081/tts"), - help=tr("IndexTTS2 API URL Help") + value=config.indextts.get("api_url", "http://127.0.0.1:8081/tts"), + help=tr("IndexTTS API URL Help") ) - saved_reference_audio = config.indextts2.get("reference_audio", "") + saved_reference_audio = config.indextts.get("reference_audio", "") reference_audio_source_options = { tr("Select from Resource Directory"): "resource", tr("Upload Reference Audio"): "upload", } reference_audio_source_labels = 
list(reference_audio_source_options.keys()) - saved_reference_audio_source = config.indextts2.get("reference_audio_source", "resource") + saved_reference_audio_source = config.indextts.get("reference_audio_source", "resource") if saved_reference_audio_source not in reference_audio_source_options.values(): saved_reference_audio_source = "resource" default_reference_audio_source_label = next( @@ -880,7 +882,7 @@ def render_indextts2_tts_settings(tr): options=reference_audio_source_labels, selection_mode="single", default=default_reference_audio_source_label, - key="indextts2_reference_audio_source_selection", + key="indextts_reference_audio_source_selection", help=tr("Reference Audio Source Help"), label_visibility="collapsed", width="stretch", @@ -890,24 +892,24 @@ def render_indextts2_tts_settings(tr): reference_audio_source = reference_audio_source_options[reference_audio_source_label] reference_audio = saved_reference_audio - reference_audio_options = get_indextts2_reference_audio_options() + reference_audio_options = get_indextts_reference_audio_options() if reference_audio_source == "resource" and reference_audio_options: - selected_audio_index = get_indextts2_reference_audio_index(reference_audio_options, saved_reference_audio) + selected_audio_index = get_indextts_reference_audio_index(reference_audio_options, saved_reference_audio) select_col, preview_col = st.columns([5, 1]) with select_col: selected_audio_option = reference_audio_options[st.selectbox( tr("Reference Audio Path"), options=range(len(reference_audio_options)), index=selected_audio_index, - format_func=lambda x: format_indextts2_reference_audio_option(reference_audio_options[x]), + format_func=lambda x: format_indextts_reference_audio_option(reference_audio_options[x]), help=tr("Reference Audio Path Help"), label_visibility="collapsed" )] - reference_audio = copy_indextts2_reference_audio(selected_audio_option["path"]) + reference_audio = 
copy_indextts_reference_audio(selected_audio_option["path"]) with preview_col: render_reference_audio_preview_button( reference_audio, - "indextts2_resource_reference_audio_preview", + "indextts_resource_reference_audio_preview", tr, ) elif reference_audio_source == "resource": @@ -926,7 +928,7 @@ def render_indextts2_tts_settings(tr): ) if uploaded_file is not None: - target_dir = utils.storage_dir(INDEXTTS2_REFERENCE_AUDIO_COPY_SUBDIR, create=True) + target_dir = utils.storage_dir(INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR, create=True) audio_path = os.path.join(target_dir, f"uploaded_{uploaded_file.name}") with open(audio_path, "wb") as f: f.write(uploaded_file.getbuffer()) @@ -935,11 +937,11 @@ def render_indextts2_tts_settings(tr): with preview_col: render_reference_audio_preview_button( reference_audio, - "indextts2_upload_reference_audio_preview", + "indextts_upload_reference_audio_preview", tr, ) - preview_audio_path = st.session_state.get("indextts2_reference_audio_preview_path", "") + preview_audio_path = st.session_state.get("indextts_reference_audio_preview_path", "") if preview_audio_path == reference_audio and os.path.isfile(preview_audio_path): with open(preview_audio_path, "rb") as audio_file: st.audio(audio_file.read(), format=get_audio_mime_type(preview_audio_path)) @@ -949,7 +951,7 @@ def render_indextts2_tts_settings(tr): ("普通推理", tr("Standard Inference")), ("快速推理", tr("Fast Inference")), ] - infer_mode_index = 0 if config.indextts2.get("infer_mode", "普通推理") == "普通推理" else 1 + infer_mode_index = 0 if config.indextts.get("infer_mode", "普通推理") == "普通推理" else 1 infer_mode = infer_mode_options[st.selectbox( tr("Inference Mode"), options=range(len(infer_mode_options)), @@ -967,7 +969,7 @@ def render_indextts2_tts_settings(tr): tr("Sampling Temperature"), min_value=0.1, max_value=2.0, - value=float(config.indextts2.get("temperature", 1.0)), + value=float(config.indextts.get("temperature", 1.0)), step=0.1, help=tr("Sampling Temperature Help") ) @@ -976,7 
+978,7 @@ def render_indextts2_tts_settings(tr): "Top P", min_value=0.0, max_value=1.0, - value=float(config.indextts2.get("top_p", 0.8)), + value=float(config.indextts.get("top_p", 0.8)), step=0.05, help=tr("Top P Help") ) @@ -985,7 +987,7 @@ def render_indextts2_tts_settings(tr): "Top K", min_value=0, max_value=100, - value=int(config.indextts2.get("top_k", 30)), + value=int(config.indextts.get("top_k", 30)), step=5, help=tr("Top K Help") ) @@ -995,7 +997,7 @@ def render_indextts2_tts_settings(tr): tr("Num Beams"), min_value=1, max_value=10, - value=int(config.indextts2.get("num_beams", 3)), + value=int(config.indextts.get("num_beams", 3)), step=1, help=tr("Num Beams Help") ) @@ -1004,36 +1006,36 @@ def render_indextts2_tts_settings(tr): tr("Repetition Penalty"), min_value=1.0, max_value=20.0, - value=float(config.indextts2.get("repetition_penalty", 10.0)), + value=float(config.indextts.get("repetition_penalty", 10.0)), step=0.5, help=tr("Repetition Penalty Help") ) do_sample = st.checkbox( tr("Enable Sampling"), - value=config.indextts2.get("do_sample", True), + value=config.indextts.get("do_sample", True), help=tr("Enable Sampling Help") ) # 显示使用说明 - with st.expander(tr("IndexTTS2 Usage Instructions Title"), expanded=False): - st.markdown(tr("IndexTTS2 Usage Instructions")) + with st.expander(tr("IndexTTS Usage Instructions Title"), expanded=False): + st.markdown(tr("IndexTTS Usage Instructions")) # 保存配置 - config.indextts2["api_url"] = api_url - config.indextts2["reference_audio_source"] = reference_audio_source - config.indextts2["reference_audio"] = reference_audio - config.indextts2["infer_mode"] = infer_mode - config.indextts2["temperature"] = temperature - config.indextts2["top_p"] = top_p - config.indextts2["top_k"] = top_k - config.indextts2["num_beams"] = num_beams - config.indextts2["repetition_penalty"] = repetition_penalty - config.indextts2["do_sample"] = do_sample + config.indextts["api_url"] = api_url + config.indextts["reference_audio_source"] = 
reference_audio_source + config.indextts["reference_audio"] = reference_audio + config.indextts["infer_mode"] = infer_mode + config.indextts["temperature"] = temperature + config.indextts["top_p"] = top_p + config.indextts["top_k"] = top_k + config.indextts["num_beams"] = num_beams + config.indextts["repetition_penalty"] = repetition_penalty + config.indextts["do_sample"] = do_sample # 保存 voice_name 用于兼容性 if reference_audio: - config.ui["voice_name"] = f"indextts2:{reference_audio}" + config.ui["voice_name"] = f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}" def render_doubaotts_settings(tr): @@ -1317,12 +1319,12 @@ def render_voice_preview_new(tr, selected_engine): voice_name = f"qwen3:{vt}" voice_rate = config.ui.get("qwen3_rate", 1.0) voice_pitch = 1.0 # Qwen3 TTS 不支持音调调节 - elif selected_engine == "indextts2": - reference_audio = config.indextts2.get("reference_audio", "") + elif selected_engine == config.INDEXTTS_ENGINE: + reference_audio = config.indextts.get("reference_audio", "") if reference_audio: - voice_name = f"indextts2:{reference_audio}" - voice_rate = 1.0 # IndexTTS2 不支持速度调节 - voice_pitch = 1.0 # IndexTTS2 不支持音调调节 + voice_name = f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}" + voice_rate = 1.0 # IndexTTS-1.5 不支持速度调节 + voice_pitch = 1.0 # IndexTTS-1.5 不支持音调调节 elif selected_engine == "doubaotts": voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming") voice_name = voice_type @@ -1599,5 +1601,5 @@ def get_audio_params(): 'bgm_type': st.session_state.get('bgm_type', 'random'), 'bgm_file': st.session_state.get('bgm_file', ''), 'bgm_volume': st.session_state.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME), - 'tts_engine': st.session_state.get('tts_engine', "indextts2"), + 'tts_engine': st.session_state.get('tts_engine', config.INDEXTTS_ENGINE), } diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 654185c..bf8ffe3 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -271,7 +271,7 @@ "Disabled subtitles help": "This TTS 
engine does not support subtitle generation. Please use another TTS engine.", "Tencent Cloud TTS": "Tencent Cloud TTS", "Tongyi Qwen3 TTS": "Tongyi Qwen3 TTS", - "IndexTTS2 Voice Clone": "IndexTTS2 Voice Clone", + "IndexTTS Voice Clone": "IndexTTS-1.5 Voice Clone", "Doubao TTS": "Doubao TTS", "Edge TTS features": "Completely free, but service stability can vary and voice cloning is not supported.", "Edge TTS use case": "Testing and lightweight use", @@ -281,9 +281,9 @@ "Tencent Cloud TTS use case": "Personal and enterprise users who need stable Chinese speech synthesis", "Tongyi Qwen3 TTS features": "Alibaba Cloud Tongyi Qwen speech synthesis with high-quality voices and multiple voice options.", "High-quality Chinese speech synthesis use case": "Users who need high-quality Chinese speech synthesis", - "IndexTTS2 features": "A locally or privately deployed voice-cloning engine. Choose a resource audio file or upload a reference audio file, then synthesize narration in that voice.", - "IndexTTS2 use case": "Best for fixed narrator voices, character dubbing, or generating multiple videos with the same voice. Start the IndexTTS2 API service before use. Deployment package: https://pan.quark.cn/s/0767c9bcefd5", - "IndexTTS2 download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5", + "IndexTTS features": "A locally or privately deployed IndexTTS-1.5 voice-cloning engine. Choose a resource audio file or upload a reference audio file, then synthesize narration in that voice.", + "IndexTTS use case": "Best for fixed narrator voices, character dubbing, or generating multiple videos with the same voice. Start the IndexTTS-1.5 API service before use. 
Deployment package: https://pan.quark.cn/s/0767c9bcefd5", + "IndexTTS download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5", "Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.", "Select TTS Engine": "Select TTS Engine", "Select TTS Engine Help": "Choose the text-to-speech engine you want to use.", @@ -330,6 +330,10 @@ "Export to Jianying Draft": "📤 Export to Jianying Draft", "Please configure Jianying draft folder in basic settings": "Please configure the Jianying draft folder in Basic Settings", "Jianying draft folder does not exist": "Jianying draft folder does not exist: {path}", + "Jianying export dialog title": "Confirm draft name", + "Jianying export dialog description": "Confirm the Jianying draft name before exporting. Once complete, you can open it from the Jianying draft folder.", + "Jianying export destination": "Save location", + "Jianying draft name": "Draft name", "Please enter Jianying draft name": "Please enter the Jianying draft name", "Confirm Export": "Confirm Export", "Please enter draft name": "Please enter a draft name", @@ -435,7 +439,7 @@ "Qwen TTS Model Help": "Qwen TTS model name, for example qwen3-tts-flash", "Select Qwen3 TTS Voice": "Select a Qwen3 TTS voice", "API URL": "API URL", - "IndexTTS2 API URL Help": "IndexTTS2 API service URL", + "IndexTTS API URL Help": "IndexTTS-1.5 API service URL", "Reference Audio Source": "Reference Audio Source", "Reference Audio Source Help": "Choose a reference audio from the resource directory or upload a new one.", "Select from Resource Directory": "Select from Resource Directory", @@ -463,8 +467,8 @@ "Repetition Penalty Help": "Higher values reduce repetition, but overly high values may sound unnatural.", "Enable Sampling": "Enable Sampling", "Enable Sampling Help": "Enable sampling for more natural speech.", - "IndexTTS2 Usage Instructions Title": "💡 IndexTTS2 Usage Instructions", - "IndexTTS2 Usage 
Instructions": "**Zero-shot voice cloning**\n\n1. **Prepare reference audio**: upload or specify a clear audio file (3-10 seconds recommended)\n2. **Set API URL**: make sure the IndexTTS2 service is running\n3. **Start synthesis**: the system will use the reference voice to synthesize new speech\n\n**Notes**:\n- Reference audio quality directly affects synthesis quality\n- Use clean audio without background noise when possible\n- Keep text length within a reasonable range\n- The first synthesis may take longer", + "IndexTTS Usage Instructions Title": "💡 IndexTTS-1.5 Usage Instructions", + "IndexTTS Usage Instructions": "**Zero-shot voice cloning**\n\n1. **Prepare reference audio**: upload or specify a clear audio file (3-10 seconds recommended)\n2. **Set API URL**: make sure the IndexTTS-1.5 service is running\n3. **Start synthesis**: the system will use the reference voice to synthesize new speech\n\n**Notes**:\n- Reference audio quality directly affects synthesis quality\n- Use clean audio without background noise when possible\n- Keep text length within a reasonable range\n- The first synthesis may take longer", "Volcengine Access Key Help": "Volcengine Access Key", "Volcengine Secret Key Help": "Volcengine Secret Key", "Doubao AppID Help": "Doubao TTS application AppID", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 1fc60f0..84ea88e 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -252,7 +252,7 @@ "Disabled subtitles help": "当前 TTS 引擎不支持字幕生成,请使用其他 TTS 引擎", "Tencent Cloud TTS": "腾讯云 TTS", "Tongyi Qwen3 TTS": "通义千问 Qwen3 TTS", - "IndexTTS2 Voice Clone": "IndexTTS2 语音克隆", + "IndexTTS Voice Clone": "IndexTTS-1.5 语音克隆", "Doubao TTS": "豆包语音 TTS", "Edge TTS features": "完全免费,但服务稳定性一般,不支持语音克隆功能", "Edge TTS use case": "测试和轻量级使用", @@ -262,9 +262,9 @@ "Tencent Cloud TTS use case": "个人和企业用户,需要稳定的中文语音合成", "Tongyi Qwen3 TTS features": "阿里云通义千问语音合成,音质优秀,支持多种音色", "High-quality Chinese speech synthesis use case": "需要高质量中文语音合成的用户", - "IndexTTS2 features": 
"本地/私有部署的语音克隆引擎。选择资源目录音频或上传参考音频后,可按该音色合成旁白。", - "IndexTTS2 use case": "适合需要固定旁白音色、角色配音或批量生成同一音色视频的场景。使用前请先启动 IndexTTS2 API 服务;部署包下载:https://pan.quark.cn/s/0767c9bcefd5", - "IndexTTS2 download link": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", + "IndexTTS features": "本地/私有部署的 IndexTTS-1.5 语音克隆引擎。选择资源目录音频或上传参考音频后,可按该音色合成旁白。", + "IndexTTS use case": "适合需要固定旁白音色、角色配音或批量生成同一音色视频的场景。使用前请先启动 IndexTTS-1.5 API 服务;部署包下载:https://pan.quark.cn/s/0767c9bcefd5", + "IndexTTS download link": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", "Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", "Select TTS Engine": "选择 TTS 引擎", "Select TTS Engine Help": "选择您要使用的文本转语音引擎", @@ -312,6 +312,10 @@ "Export to Jianying Draft": "📤 导出到剪映草稿", "Please configure Jianying draft folder in basic settings": "请在基础设置中配置剪映草稿地址", "Jianying draft folder does not exist": "剪映草稿文件夹不存在: {path}", + "Jianying export dialog title": "确认草稿名称", + "Jianying export dialog description": "导出前请确认剪映草稿名称,完成后可在剪映草稿目录中打开。", + "Jianying export destination": "保存目录", + "Jianying draft name": "草稿名称", "Please enter Jianying draft name": "请输入剪映草稿名称", "Confirm Export": "确认导出", "Please enter draft name": "请输入草稿名称", @@ -417,7 +421,7 @@ "Qwen TTS Model Help": "Qwen TTS 模型名,例如 qwen3-tts-flash", "Select Qwen3 TTS Voice": "选择 Qwen3 TTS 音色", "API URL": "API 地址", - "IndexTTS2 API URL Help": "IndexTTS2 API 服务地址", + "IndexTTS API URL Help": "IndexTTS-1.5 API 服务地址", "Reference Audio Source": "参考音频来源", "Reference Audio Source Help": "选择从资源目录选择参考音频,或上传新的参考音频", "Select from Resource Directory": "从资源目录选择", @@ -445,8 +449,8 @@ "Repetition Penalty Help": "值越大越能避免重复,但过大可能导致不自然", "Enable Sampling": "启用采样", "Enable Sampling Help": "启用采样可以获得更自然的语音", - "IndexTTS2 Usage Instructions Title": "💡 IndexTTS2 使用说明", - "IndexTTS2 Usage Instructions": "**零样本语音克隆**\n\n1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒)\n2. **设置 API 地址**:确保 IndexTTS2 服务正常运行\n3. 
**开始合成**:系统会自动使用参考音频的音色合成新语音\n\n**注意事项**:\n- 参考音频质量直接影响合成效果\n- 建议使用无背景噪音的清晰音频\n- 文本长度建议控制在合理范围内\n- 首次合成可能需要较长时间", + "IndexTTS Usage Instructions Title": "💡 IndexTTS-1.5 使用说明", + "IndexTTS Usage Instructions": "**零样本语音克隆**\n\n1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒)\n2. **设置 API 地址**:确保 IndexTTS-1.5 服务正常运行\n3. **开始合成**:系统会自动使用参考音频的音色合成新语音\n\n**注意事项**:\n- 参考音频质量直接影响合成效果\n- 建议使用无背景噪音的清晰音频\n- 文本长度建议控制在合理范围内\n- 首次合成可能需要较长时间", "Volcengine Access Key Help": "火山引擎 Access Key", "Volcengine Secret Key Help": "火山引擎 Secret Key", "Doubao AppID Help": "豆包语音应用 AppID", From d147fe66e4d6591d2977ff1a497d1856d22366b9 Mon Sep 17 00:00:00 2001 From: viccy Date: Sat, 6 Jun 2026 14:31:09 +0800 Subject: [PATCH 12/24] =?UTF-8?q?feat(tts):=20=E6=96=B0=E5=A2=9EIndexTTS-2?= =?UTF-8?q?=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=E5=BC=95=E6=93=8E=E6=94=AF?= =?UTF-8?q?=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 实现兼容IndexTTS2-Pack API的完整TTS调用流程,包含音频下载、错误重试等处理 重构原有IndexTTS-1.5代码,抽象通用逻辑以同时兼容indextts和indextts2两个引擎 新增IndexTTS-2的WebUI配置界面,支持情感控制与高级生成参数调整 更新配置示例文件与中英多语言文案,完善配置迁移逻辑兼容旧版配置 新增对应单元测试覆盖参数处理与配置迁移流程 --- app/config/config.py | 46 +- app/config/test_config_bootstrap_unittest.py | 23 +- app/services/jianying_task.py | 38 +- app/services/test_jianying_task_unittest.py | 17 +- app/services/voice.py | 155 ++++++- config.example.toml | 46 +- webui.py | 5 + webui/components/audio_settings.py | 418 +++++++++++++++---- webui/i18n/en.json | 33 ++ webui/i18n/zh.json | 33 ++ 10 files changed, 689 insertions(+), 125 deletions(-) diff --git a/app/config/config.py b/app/config/config.py index ac44df1..ae19945 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -10,34 +10,51 @@ root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__fi config_file = f"{root_dir}/config.toml" version_file = f"{root_dir}/project_version" INDEXTTS_ENGINE = "indextts" -INDEXTTS_LEGACY_ENGINE = "indextts2" INDEXTTS_DISPLAY_NAME = "IndexTTS-1.5" 
+INDEXTTS2_ENGINE = "indextts2" +INDEXTTS2_DISPLAY_NAME = "IndexTTS-2" INDEXTTS_VOICE_PREFIX = f"{INDEXTTS_ENGINE}:" -INDEXTTS_LEGACY_VOICE_PREFIX = f"{INDEXTTS_LEGACY_ENGINE}:" +INDEXTTS2_VOICE_PREFIX = f"{INDEXTTS2_ENGINE}:" def normalize_tts_engine_name(tts_engine: str) -> str: - if tts_engine == INDEXTTS_LEGACY_ENGINE: - return INDEXTTS_ENGINE return tts_engine def normalize_indextts_voice_prefix(voice_name: str) -> str: - if isinstance(voice_name, str) and voice_name.startswith(INDEXTTS_LEGACY_VOICE_PREFIX): - return f"{INDEXTTS_VOICE_PREFIX}{voice_name[len(INDEXTTS_LEGACY_VOICE_PREFIX):]}" return voice_name +def _is_legacy_indextts2_config(indextts2_config) -> bool: + if not isinstance(indextts2_config, dict): + return False + api_url = str(indextts2_config.get("api_url", "")) + has_indextts2_fields = any( + key in indextts2_config + for key in ( + "emotion_mode", + "emotion_alpha", + "max_text_tokens_per_segment", + "max_mel_tokens", + "vec_calm", + ) + ) + return "8081" in api_url and not has_indextts2_fields + + def migrate_indextts_config(config_data): - if "indextts" not in config_data and INDEXTTS_LEGACY_ENGINE in config_data: - config_data["indextts"] = config_data[INDEXTTS_LEGACY_ENGINE] + migrated_legacy_indextts2 = _is_legacy_indextts2_config(config_data.get(INDEXTTS2_ENGINE)) + if migrated_legacy_indextts2: + if "indextts" not in config_data: + config_data["indextts"] = config_data[INDEXTTS2_ENGINE] + config_data.pop(INDEXTTS2_ENGINE, None) ui_config = config_data.get("ui") if isinstance(ui_config, dict): - if "tts_engine" in ui_config: - ui_config["tts_engine"] = normalize_tts_engine_name(ui_config.get("tts_engine", "")) - if "voice_name" in ui_config: - ui_config["voice_name"] = normalize_indextts_voice_prefix(ui_config.get("voice_name", "")) + if migrated_legacy_indextts2 and ui_config.get("tts_engine") == INDEXTTS2_ENGINE: + ui_config["tts_engine"] = INDEXTTS_ENGINE + if ui_config.get("voice_name", "").startswith(INDEXTTS2_VOICE_PREFIX) and 
ui_config.get("tts_engine") == INDEXTTS_ENGINE: + ui_config["voice_name"] = f"{INDEXTTS_VOICE_PREFIX}{ui_config['voice_name'][len(INDEXTTS2_VOICE_PREFIX):]}" return config_data @@ -113,7 +130,7 @@ def save_config(): _cfg["tts_qwen"] = tts_qwen _cfg["fun_asr"] = fun_asr _cfg["indextts"] = indextts - _cfg.pop(INDEXTTS_LEGACY_ENGINE, None) + _cfg["indextts2"] = indextts2 _cfg["doubaotts"] = doubaotts f.write(toml.dumps(_cfg)) @@ -129,7 +146,8 @@ ui = _cfg.get("ui", {}) frames = _cfg.get("frames", {}) tts_qwen = _cfg.get("tts_qwen", {}) fun_asr = _cfg.get("fun_asr", {}) -indextts = _cfg.get("indextts", _cfg.get(INDEXTTS_LEGACY_ENGINE, {})) +indextts = _cfg.get("indextts", {}) +indextts2 = _cfg.get("indextts2", {}) doubaotts = _cfg.get("doubaotts", {}) hostname = socket.gethostname() diff --git a/app/config/test_config_bootstrap_unittest.py b/app/config/test_config_bootstrap_unittest.py index 691b6da..8034d02 100644 --- a/app/config/test_config_bootstrap_unittest.py +++ b/app/config/test_config_bootstrap_unittest.py @@ -64,7 +64,7 @@ hide_config = true self.assertEqual("Pro/zai-org/GLM-5", saved_config["app"]["text_openai_model_name"]) self.assertTrue(saved_config["app"]["hide_config"]) - def test_indextts_legacy_config_is_migrated(self): + def test_legacy_indextts2_config_is_migrated_to_indextts_15(self): migrated = cfg.migrate_indextts_config( { "indextts2": {"api_url": "http://127.0.0.1:8081/tts"}, @@ -76,9 +76,30 @@ hide_config = true ) self.assertEqual("http://127.0.0.1:8081/tts", migrated["indextts"]["api_url"]) + self.assertNotIn("indextts2", migrated) self.assertEqual("indextts", migrated["ui"]["tts_engine"]) self.assertEqual("indextts:/tmp/reference.wav", migrated["ui"]["voice_name"]) + def test_indextts2_config_is_kept_as_separate_engine(self): + migrated = cfg.migrate_indextts_config( + { + "indextts": {"api_url": "http://127.0.0.1:8081/tts"}, + "indextts2": { + "api_url": "http://192.168.3.6:7863/tts", + "emotion_mode": "speaker", + }, + "ui": { + 
"tts_engine": "indextts2", + "voice_name": "indextts2:/tmp/reference.wav", + }, + } + ) + + self.assertEqual("http://127.0.0.1:8081/tts", migrated["indextts"]["api_url"]) + self.assertEqual("http://192.168.3.6:7863/tts", migrated["indextts2"]["api_url"]) + self.assertEqual("indextts2", migrated["ui"]["tts_engine"]) + self.assertEqual("indextts2:/tmp/reference.wav", migrated["ui"]["voice_name"]) + class OpenAICompatibleModelDefaultsTests(unittest.TestCase): def test_ui_keeps_full_model_name_and_openai_provider(self): diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index dd9db81..345f6b7 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -49,14 +49,20 @@ def get_audio_duration_ffprobe(audio_file: str) -> float: return get_media_duration_ffprobe(audio_file) -def _strip_indextts_prefix(voice_name: str) -> str: - voice_name = config.normalize_indextts_voice_prefix(voice_name or "") - prefix = config.INDEXTTS_VOICE_PREFIX +def _strip_tts_voice_prefix(voice_name: str, prefix: str) -> str: + voice_name = voice_name or "" if voice_name.startswith(prefix): return voice_name[len(prefix):] return voice_name +def _strip_indextts_prefix(voice_name: str) -> str: + return _strip_tts_voice_prefix( + config.normalize_indextts_voice_prefix(voice_name or ""), + config.INDEXTTS_VOICE_PREFIX, + ) + + def _floor_duration_to_milliseconds(duration: float) -> float: return int(duration * 1000) / 1000.0 @@ -101,24 +107,32 @@ def _clamp_duration_to_media( def _normalize_indextts_reference_audio(params: VideoClipParams) -> None: - """Ensure IndexTTS-1.5 uses the configured reference audio instead of a stale UI voice.""" + """Ensure IndexTTS engines use the configured reference audio instead of a stale UI voice.""" params.tts_engine = config.normalize_tts_engine_name(params.tts_engine) - if params.tts_engine != config.INDEXTTS_ENGINE: + if params.tts_engine == config.INDEXTTS_ENGINE: + tts_config = config.indextts + voice_prefix = 
config.INDEXTTS_VOICE_PREFIX + display_name = "IndexTTS-1.5" + elif params.tts_engine == config.INDEXTTS2_ENGINE: + tts_config = config.indextts2 + voice_prefix = config.INDEXTTS2_VOICE_PREFIX + display_name = "IndexTTS-2" + else: return - candidate = _strip_indextts_prefix(getattr(params, "voice_name", "") or "") + candidate = _strip_tts_voice_prefix(getattr(params, "voice_name", "") or "", voice_prefix) if candidate and os.path.isfile(candidate): - params.voice_name = f"{config.INDEXTTS_VOICE_PREFIX}{candidate}" - logger.info(f"IndexTTS-1.5 使用参考音频: {candidate}") + params.voice_name = f"{voice_prefix}{candidate}" + logger.info(f"{display_name} 使用参考音频: {candidate}") return - configured_ref = _strip_indextts_prefix(config.indextts.get("reference_audio", "") or "") + configured_ref = _strip_tts_voice_prefix(tts_config.get("reference_audio", "") or "", voice_prefix) if configured_ref and os.path.isfile(configured_ref): - params.voice_name = f"{config.INDEXTTS_VOICE_PREFIX}{configured_ref}" - logger.info(f"IndexTTS-1.5 使用配置中的参考音频: {configured_ref}") + params.voice_name = f"{voice_prefix}{configured_ref}" + logger.info(f"{display_name} 使用配置中的参考音频: {configured_ref}") return - raise ValueError("IndexTTS-1.5 参考音频不存在,请在音频设置中上传或选择有效的参考音频") + raise ValueError(f"{display_name} 参考音频不存在,请在音频设置中上传或选择有效的参考音频") def start_export_jianying_draft(task_id: str, params: VideoClipParams): diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py index c073d3f..18897a4 100644 --- a/app/services/test_jianying_task_unittest.py +++ b/app/services/test_jianying_task_unittest.py @@ -31,14 +31,25 @@ class JianyingTaskTests(unittest.TestCase): self.assertEqual(f"indextts:{ref_path}", params.voice_name) - def test_normalize_indextts_accepts_legacy_engine_and_prefix(self): + def test_normalize_indextts2_uses_valid_param_reference(self): with tempfile.NamedTemporaryFile(suffix=".wav") as ref: params = VideoClipParams(tts_engine="indextts2", 
voice_name=f"indextts2:{ref.name}") jianying_task._normalize_indextts_reference_audio(params) - self.assertEqual("indextts", params.tts_engine) - self.assertEqual(f"indextts:{ref.name}", params.voice_name) + self.assertEqual("indextts2", params.tts_engine) + self.assertEqual(f"indextts2:{ref.name}", params.voice_name) + + def test_normalize_indextts2_uses_config_reference_when_param_is_stale(self): + with tempfile.TemporaryDirectory() as temp_dir: + ref_path = Path(temp_dir) / "reference.wav" + ref_path.write_bytes(b"fake wav") + params = VideoClipParams(tts_engine="indextts2", voice_name="zh-CN-YunjianNeural") + + with patch.dict(jianying_task.config.indextts2, {"reference_audio": str(ref_path)}, clear=False): + jianying_task._normalize_indextts_reference_audio(params) + + self.assertEqual(f"indextts2:{ref_path}", params.voice_name) def test_normalize_indextts_requires_existing_reference_audio(self): params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural") diff --git a/app/services/voice.py b/app/services/voice.py index 38d70ee..2be5c87 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -21,6 +21,7 @@ except ImportError: MOVIEPY_AVAILABLE = False logger.warning("moviepy 未安装,将使用估算方法计算音频时长") import time +from urllib.parse import urljoin from app.config import config from app.utils import utils @@ -1293,6 +1294,10 @@ def tts( if tts_engine == config.INDEXTTS_ENGINE: logger.info("分发到 IndexTTS-1.5") return indextts_tts(text, voice_name, voice_file, speed=voice_rate) + + if tts_engine == config.INDEXTTS2_ENGINE: + logger.info("分发到 IndexTTS-2") + return indextts2_tts(text, voice_name, voice_file) if tts_engine == "doubaotts": logger.info("分发到豆包语音 TTS") @@ -1778,12 +1783,13 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f voice_name = config.normalize_indextts_voice_prefix(parse_voice_name(voice_name)) output_dir = utils.task_dir(task_id) tts_results = [] + audio_extension = ".wav" if tts_engine in 
(config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else ".mp3" for item in list_script: if item['OST'] != 1: # 将时间戳中的冒号替换为下划线 timestamp = item['timestamp'].replace(':', '_') - audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") + audio_file = os.path.join(output_dir, f"audio_{timestamp}{audio_extension}") subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt") text = item['narration'] @@ -1803,8 +1809,13 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"或者使用其他 tts 引擎") continue else: - # SoulVoice、Qwen3、IndexTTS-1.5、豆包语音 引擎不生成字幕文件 - if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == config.INDEXTTS_ENGINE or tts_engine == "doubaotts": + # SoulVoice、Qwen3、IndexTTS、豆包语音 引擎不生成精确字幕文件 + if ( + is_soulvoice_voice(voice_name) + or is_qwen_engine(tts_engine) + or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) + or tts_engine == "doubaotts" + ): # 获取实际音频文件的时长 duration = get_audio_duration_from_file(audio_file) if duration <= 0: @@ -2234,6 +2245,17 @@ def parse_indextts_voice(voice_name: str) -> str: return voice_name +def parse_indextts2_voice(voice_name: str) -> str: + """ + 解析 IndexTTS-2 语音名称 + 支持格式:indextts2:reference_audio_path + 返回参考音频文件路径 + """ + if isinstance(voice_name, str) and voice_name.startswith(config.INDEXTTS2_VOICE_PREFIX): + return voice_name[len(config.INDEXTTS2_VOICE_PREFIX):] + return voice_name + + def indextts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: """ 使用 IndexTTS-1.5 API 进行零样本语音克隆 @@ -2344,3 +2366,130 @@ def indextts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0 logger.error("IndexTTS-1.5 TTS 生成失败,已达到最大重试次数") return None + + +def _normalize_indextts2_api_url(api_url: str) -> str: + api_url = (api_url or "http://192.168.3.6:7863/tts").strip() + if api_url.endswith("/tts"): + return api_url + return f"{api_url.rstrip('/')}/tts" + + +def _get_configured_proxies() -> dict: 
+ if not config.proxy.get("http"): + return {} + return { + "http": config.proxy.get("http"), + "https": config.proxy.get("https", config.proxy.get("http")), + } + + +def _download_indextts2_audio(response: requests.Response, api_url: str, voice_file: str, proxies: dict) -> bool: + content_type = response.headers.get("content-type", "").lower() + if "application/json" not in content_type: + with open(voice_file, "wb") as f: + f.write(response.content) + return os.path.getsize(voice_file) > 0 + + result = response.json() + downloads = result.get("downloads") if isinstance(result, dict) else {} + download_url = downloads.get("wav") if isinstance(downloads, dict) else "" + if not download_url: + logger.error(f"IndexTTS-2 API 响应中没有音频下载地址: {result}") + return False + + audio_url = urljoin(api_url, download_url) + audio_response = requests.get(audio_url, proxies=proxies, timeout=120) + if audio_response.status_code != 200: + logger.error(f"IndexTTS-2 音频下载失败: {audio_response.status_code} - {audio_response.text}") + return False + + with open(voice_file, "wb") as f: + f.write(audio_response.content) + return os.path.getsize(voice_file) > 0 + + +def indextts2_tts(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]: + """ + 使用 IndexTTS-2 API 进行零样本语音克隆。 + 接口兼容 IndexTTS2-Pack 的 POST /tts multipart form。 + """ + api_url = _normalize_indextts2_api_url(config.indextts2.get("api_url", "http://192.168.3.6:7863/tts")) + reference_audio_path = parse_indextts2_voice(voice_name) + + if not reference_audio_path or not os.path.exists(reference_audio_path): + logger.error(f"IndexTTS-2 参考音频文件不存在: {reference_audio_path}") + return None + + emotion_mode = config.indextts2.get("emotion_mode", "speaker") + emotion_audio_path = config.indextts2.get("emotion_audio", "") + data = { + "text": text.strip(), + "emotion_mode": emotion_mode, + "emotion_alpha": config.indextts2.get("emotion_alpha", 0.65), + "emotion_text": config.indextts2.get("emotion_text", ""), + "use_random": 
str(bool(config.indextts2.get("use_random", False))).lower(), + "max_text_tokens_per_segment": config.indextts2.get("max_text_tokens_per_segment", 120), + "vec_happy": config.indextts2.get("vec_happy", 0.0), + "vec_angry": config.indextts2.get("vec_angry", 0.0), + "vec_sad": config.indextts2.get("vec_sad", 0.0), + "vec_afraid": config.indextts2.get("vec_afraid", 0.0), + "vec_disgusted": config.indextts2.get("vec_disgusted", 0.0), + "vec_melancholic": config.indextts2.get("vec_melancholic", 0.0), + "vec_surprised": config.indextts2.get("vec_surprised", 0.0), + "vec_calm": config.indextts2.get("vec_calm", 0.8), + "temperature": config.indextts2.get("temperature", 0.8), + "top_p": config.indextts2.get("top_p", 0.8), + "top_k": config.indextts2.get("top_k", 30), + "num_beams": config.indextts2.get("num_beams", 3), + "repetition_penalty": config.indextts2.get("repetition_penalty", 10.0), + "max_mel_tokens": config.indextts2.get("max_mel_tokens", 1500), + } + + proxies = _get_configured_proxies() + for attempt in range(3): + files = {} + try: + files["speaker_audio"] = open(reference_audio_path, "rb") + if emotion_mode == "audio": + if not emotion_audio_path or not os.path.exists(emotion_audio_path): + logger.error(f"IndexTTS-2 情感参考音频文件不存在: {emotion_audio_path}") + return None + files["emotion_audio"] = open(emotion_audio_path, "rb") + + logger.info(f"第 {attempt + 1} 次调用 IndexTTS-2 API: {api_url}") + response = requests.post( + api_url, + files=files, + data=data, + proxies=proxies, + timeout=180, + ) + + if response.status_code == 200 and _download_indextts2_audio(response, api_url, voice_file, proxies): + logger.info(f"IndexTTS-2 成功生成音频: {voice_file}, 大小: {os.path.getsize(voice_file)} 字节") + sub_maker = new_sub_maker() + duration = get_audio_duration_from_file(voice_file) + duration_ms = int(duration * 1000) if duration > 0 else max(1000, int(len(text) * 200)) + add_subtitle_event(sub_maker, 0, duration_ms * 10000, text) + return sub_maker + + logger.error(f"IndexTTS-2 
API 调用失败: {response.status_code} - {response.text}") + except requests.exceptions.Timeout: + logger.error(f"IndexTTS-2 API 调用超时 (尝试 {attempt + 1}/3)") + except requests.exceptions.RequestException as e: + logger.error(f"IndexTTS-2 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)") + except Exception as e: + logger.error(f"IndexTTS-2 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)") + finally: + for file_obj in files.values(): + try: + file_obj.close() + except Exception: + pass + + if attempt < 2: + time.sleep(2) + + logger.error("IndexTTS-2 TTS 生成失败,已达到最大重试次数") + return None diff --git a/config.example.toml b/config.example.toml index 652ffb0..0b807e3 100644 --- a/config.example.toml +++ b/config.example.toml @@ -113,21 +113,21 @@ # 使用阿里百炼在线 fun-asr 时,访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取 API Key api_key = "" model = "fun-asr" - + [indextts] # IndexTTS-1.5 语音克隆配置 # 这是一个开源的零样本语音克隆项目,需要自行部署 # 项目地址:https://github.com/index-tts/index-tts # 默认 API 地址(本地部署) api_url = "http://127.0.0.1:8081/tts" - + # 默认参考音频(可选) reference_audio_source = "resource" # reference_audio = "/path/to/reference_audio.wav" - + # 推理模式:普通推理 / 快速推理 infer_mode = "普通推理" - + # 高级参数 temperature = 1.0 top_p = 0.8 @@ -135,6 +135,42 @@ do_sample = true num_beams = 3 repetition_penalty = 10.0 + +[indextts2] + # IndexTTS-2 语音克隆配置 + # 支持 IndexTTS2-Pack FastAPI 接口:POST /tts + api_url = "http://192.168.3.6:7863/tts" + + # 默认参考音频(可选),音色列表复用 IndexTTS-1.5 的资源目录 + reference_audio_source = "resource" + # reference_audio = "/path/to/reference_audio.wav" + + # 情感控制:speaker / audio / vector / text + emotion_mode = "speaker" + emotion_audio = "" + emotion_alpha = 0.65 + emotion_text = "" + use_random = false + max_text_tokens_per_segment = 120 + + # 8 维情感向量,顺序:happy, angry, sad, afraid, disgusted, melancholic, surprised, calm + vec_happy = 0.0 + vec_angry = 0.0 + vec_sad = 0.0 + vec_afraid = 0.0 + vec_disgusted = 0.0 + vec_melancholic = 0.0 + vec_surprised = 0.0 + vec_calm = 0.8 + + # 高级生成参数 + temperature = 0.8 + 
top_p = 0.8 + top_k = 30 + num_beams = 3 + repetition_penalty = 10.0 + max_mel_tokens = 1500 + [doubaotts] # 豆包语音 TTS 配置 # 申请流程: @@ -153,7 +189,7 @@ silence_duration = 0.125 [ui] - # TTS引擎选择 (indextts, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) + # TTS引擎选择 (indextts, indextts2, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) tts_engine = "indextts" # Edge TTS 配置 diff --git a/webui.py b/webui.py index f57d240..68c24a7 100644 --- a/webui.py +++ b/webui.py @@ -233,6 +233,11 @@ def get_voice_name_for_tts_engine(tts_engine: str) -> str: return f"tencent:{config.ui.get('tencent_voice_type', '101001')}" if tts_engine == 'qwen3_tts': return f"qwen3:{config.ui.get('qwen_voice_type', 'Cherry')}" + if tts_engine == config.INDEXTTS2_ENGINE: + reference_audio = config.indextts2.get('reference_audio', '') + if reference_audio: + return f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}" + return config.ui.get('voice_name', '') if config.normalize_tts_engine_name(tts_engine) == config.INDEXTTS_ENGINE: reference_audio = config.indextts.get('reference_audio', '') if reference_audio: diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index 5b12cdd..c5ec08c 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -57,6 +57,7 @@ def get_tts_engine_options(tr=lambda key: key): """获取TTS引擎选项""" return { config.INDEXTTS_ENGINE: config.INDEXTTS_DISPLAY_NAME, + config.INDEXTTS2_ENGINE: config.INDEXTTS2_DISPLAY_NAME, "edge_tts": "Edge TTS", "qwen3_tts": tr("Tongyi Qwen3 TTS"), "tencent_tts": tr("Tencent Cloud TTS"), @@ -98,6 +99,12 @@ def get_tts_engine_descriptions(tr=lambda key: key): "use_case": tr("IndexTTS use case"), "registration": None }, + config.INDEXTTS2_ENGINE: { + "title": config.INDEXTTS2_DISPLAY_NAME, + "features": tr("IndexTTS2 features"), + "use_case": tr("IndexTTS2 use case"), + "registration": None + }, "doubaotts": { "title": tr("Doubao TTS"), "features": tr("Doubao TTS 
features"), @@ -325,7 +332,7 @@ def get_audio_mime_type(audio_path): return "audio/mp3" -def render_reference_audio_preview_button(reference_audio, key, tr): +def render_reference_audio_preview_button(reference_audio, key, tr, preview_state_key="indextts_reference_audio_preview_path"): """渲染参考音频试听按钮""" can_preview = bool(reference_audio and os.path.isfile(reference_audio)) if st.button( @@ -336,7 +343,102 @@ def render_reference_audio_preview_button(reference_audio, key, tr): disabled=not can_preview, use_container_width=True, ): - st.session_state["indextts_reference_audio_preview_path"] = reference_audio + st.session_state[preview_state_key] = reference_audio + + +def render_indextts_reference_audio_selector(tr, tts_config, key_prefix): + """渲染 IndexTTS 系列共用的参考音频选择器。""" + saved_reference_audio = tts_config.get("reference_audio", "") + reference_audio_source_options = { + tr("Select from Resource Directory"): "resource", + tr("Upload Reference Audio"): "upload", + } + reference_audio_source_labels = list(reference_audio_source_options.keys()) + saved_reference_audio_source = tts_config.get("reference_audio_source", "resource") + if saved_reference_audio_source not in reference_audio_source_options.values(): + saved_reference_audio_source = "resource" + default_reference_audio_source_label = next( + label + for label, source_value in reference_audio_source_options.items() + if source_value == saved_reference_audio_source + ) + + st.markdown(f"**{tr('Reference Audio Path')}**") + reference_audio_source_label = st.pills( + tr("Reference Audio Source"), + options=reference_audio_source_labels, + selection_mode="single", + default=default_reference_audio_source_label, + key=f"{key_prefix}_reference_audio_source_selection", + help=tr("Reference Audio Source Help"), + label_visibility="collapsed", + width="stretch", + ) + if not reference_audio_source_label: + reference_audio_source_label = default_reference_audio_source_label + reference_audio_source = 
reference_audio_source_options[reference_audio_source_label] + + reference_audio = saved_reference_audio + preview_state_key = f"{key_prefix}_reference_audio_preview_path" + reference_audio_options = get_indextts_reference_audio_options() + if reference_audio_source == "resource" and reference_audio_options: + selected_audio_index = get_indextts_reference_audio_index(reference_audio_options, saved_reference_audio) + select_col, preview_col = st.columns([5, 1]) + with select_col: + selected_audio_option = reference_audio_options[st.selectbox( + tr("Reference Audio Path"), + options=range(len(reference_audio_options)), + index=selected_audio_index, + format_func=lambda x: format_indextts_reference_audio_option(reference_audio_options[x]), + help=tr("Reference Audio Path Help"), + label_visibility="collapsed", + key=f"{key_prefix}_reference_audio_select", + )] + reference_audio = copy_indextts_reference_audio(selected_audio_option["path"]) + with preview_col: + render_reference_audio_preview_button( + reference_audio, + f"{key_prefix}_resource_reference_audio_preview", + tr, + preview_state_key=preview_state_key, + ) + elif reference_audio_source == "resource": + st.warning(tr("No Reference Audio Resources Found")) + + if reference_audio_source == "upload": + if saved_reference_audio_source != "upload": + reference_audio = "" + upload_col, preview_col = st.columns([5, 1]) + with upload_col: + uploaded_file = st.file_uploader( + tr("Upload Reference Audio File"), + type=["wav", "mp3"], + help=tr("Upload Reference Audio Help"), + label_visibility="collapsed", + key=f"{key_prefix}_reference_audio_upload", + ) + + if uploaded_file is not None: + target_dir = utils.storage_dir(INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR, create=True) + audio_path = os.path.join(target_dir, f"uploaded_{uploaded_file.name}") + with open(audio_path, "wb") as f: + f.write(uploaded_file.getbuffer()) + reference_audio = audio_path + st.success(tr("Audio uploaded").format(path=audio_path)) + with 
preview_col: + render_reference_audio_preview_button( + reference_audio, + f"{key_prefix}_upload_reference_audio_preview", + tr, + preview_state_key=preview_state_key, + ) + + preview_audio_path = st.session_state.get(preview_state_key, "") + if preview_audio_path == reference_audio and os.path.isfile(preview_audio_path): + with open(preview_audio_path, "rb") as audio_file: + st.audio(audio_file.read(), format=get_audio_mime_type(preview_audio_path)) + + return reference_audio_source, reference_audio def render_bgm_preview_button(bgm_file, key, tr): @@ -442,6 +544,8 @@ def render_tts_settings(tr): render_qwen3_tts_settings(tr) elif selected_engine == config.INDEXTTS_ENGINE: render_indextts_tts_settings(tr) + elif selected_engine == config.INDEXTTS2_ENGINE: + render_indextts2_tts_settings(tr) elif selected_engine == "doubaotts": render_doubaotts_settings(tr) @@ -861,90 +965,11 @@ def render_indextts_tts_settings(tr): help=tr("IndexTTS API URL Help") ) - saved_reference_audio = config.indextts.get("reference_audio", "") - reference_audio_source_options = { - tr("Select from Resource Directory"): "resource", - tr("Upload Reference Audio"): "upload", - } - reference_audio_source_labels = list(reference_audio_source_options.keys()) - saved_reference_audio_source = config.indextts.get("reference_audio_source", "resource") - if saved_reference_audio_source not in reference_audio_source_options.values(): - saved_reference_audio_source = "resource" - default_reference_audio_source_label = next( - label - for label, source_value in reference_audio_source_options.items() - if source_value == saved_reference_audio_source + reference_audio_source, reference_audio = render_indextts_reference_audio_selector( + tr, + config.indextts, + "indextts", ) - - st.markdown(f"**{tr('Reference Audio Path')}**") - reference_audio_source_label = st.pills( - tr("Reference Audio Source"), - options=reference_audio_source_labels, - selection_mode="single", - 
default=default_reference_audio_source_label, - key="indextts_reference_audio_source_selection", - help=tr("Reference Audio Source Help"), - label_visibility="collapsed", - width="stretch", - ) - if not reference_audio_source_label: - reference_audio_source_label = default_reference_audio_source_label - reference_audio_source = reference_audio_source_options[reference_audio_source_label] - - reference_audio = saved_reference_audio - reference_audio_options = get_indextts_reference_audio_options() - if reference_audio_source == "resource" and reference_audio_options: - selected_audio_index = get_indextts_reference_audio_index(reference_audio_options, saved_reference_audio) - select_col, preview_col = st.columns([5, 1]) - with select_col: - selected_audio_option = reference_audio_options[st.selectbox( - tr("Reference Audio Path"), - options=range(len(reference_audio_options)), - index=selected_audio_index, - format_func=lambda x: format_indextts_reference_audio_option(reference_audio_options[x]), - help=tr("Reference Audio Path Help"), - label_visibility="collapsed" - )] - reference_audio = copy_indextts_reference_audio(selected_audio_option["path"]) - with preview_col: - render_reference_audio_preview_button( - reference_audio, - "indextts_resource_reference_audio_preview", - tr, - ) - elif reference_audio_source == "resource": - st.warning(tr("No Reference Audio Resources Found")) - - if reference_audio_source == "upload": - if saved_reference_audio_source != "upload": - reference_audio = "" - upload_col, preview_col = st.columns([5, 1]) - with upload_col: - uploaded_file = st.file_uploader( - tr("Upload Reference Audio File"), - type=["wav", "mp3"], - help=tr("Upload Reference Audio Help"), - label_visibility="collapsed" - ) - - if uploaded_file is not None: - target_dir = utils.storage_dir(INDEXTTS_REFERENCE_AUDIO_COPY_SUBDIR, create=True) - audio_path = os.path.join(target_dir, f"uploaded_{uploaded_file.name}") - with open(audio_path, "wb") as f: - 
f.write(uploaded_file.getbuffer()) - reference_audio = audio_path - st.success(tr("Audio uploaded").format(path=audio_path)) - with preview_col: - render_reference_audio_preview_button( - reference_audio, - "indextts_upload_reference_audio_preview", - tr, - ) - - preview_audio_path = st.session_state.get("indextts_reference_audio_preview_path", "") - if preview_audio_path == reference_audio and os.path.isfile(preview_audio_path): - with open(preview_audio_path, "rb") as audio_file: - st.audio(audio_file.read(), format=get_audio_mime_type(preview_audio_path)) # 推理模式 infer_mode_options = [ @@ -1038,6 +1063,217 @@ def render_indextts_tts_settings(tr): config.ui["voice_name"] = f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}" +def render_indextts2_tts_settings(tr): + """渲染 IndexTTS-2 TTS 设置""" + api_url = st.text_input( + tr("API URL"), + value=config.indextts2.get("api_url", "http://192.168.3.6:7863/tts"), + help=tr("IndexTTS2 API URL Help") + ) + + reference_audio_source, reference_audio = render_indextts_reference_audio_selector( + tr, + config.indextts2, + "indextts2", + ) + + emotion_mode_options = [ + ("speaker", tr("Emotion Mode Speaker")), + ("audio", tr("Emotion Mode Audio")), + ("vector", tr("Emotion Mode Vector")), + ("text", tr("Emotion Mode Text")), + ] + saved_emotion_mode = config.indextts2.get("emotion_mode", "speaker") + emotion_mode_values = [item[0] for item in emotion_mode_options] + if saved_emotion_mode not in emotion_mode_values: + saved_emotion_mode = "speaker" + + with st.expander(tr("IndexTTS2 Emotion Parameters"), expanded=False): + emotion_mode = emotion_mode_options[st.selectbox( + tr("Emotion Mode"), + options=range(len(emotion_mode_options)), + index=emotion_mode_values.index(saved_emotion_mode), + format_func=lambda x: emotion_mode_options[x][1], + help=tr("Emotion Mode Help"), + )][0] + + emotion_alpha = st.slider( + tr("Emotion Alpha"), + min_value=0.0, + max_value=1.0, + value=float(config.indextts2.get("emotion_alpha", 0.65)), + 
step=0.05, + help=tr("Emotion Alpha Help"), + ) + + emotion_audio = config.indextts2.get("emotion_audio", "") + emotion_text = config.indextts2.get("emotion_text", "") + if emotion_mode == "audio": + emotion_audio_col, emotion_preview_col = st.columns([5, 1]) + with emotion_audio_col: + emotion_audio = st.text_input( + tr("Emotion Reference Audio Path"), + value=emotion_audio, + help=tr("Emotion Reference Audio Path Help"), + ) + with emotion_preview_col: + render_reference_audio_preview_button( + emotion_audio, + "indextts2_emotion_audio_preview", + tr, + preview_state_key="indextts2_emotion_audio_preview_path", + ) + preview_audio_path = st.session_state.get("indextts2_emotion_audio_preview_path", "") + if preview_audio_path == emotion_audio and os.path.isfile(preview_audio_path): + with open(preview_audio_path, "rb") as audio_file: + st.audio(audio_file.read(), format=get_audio_mime_type(preview_audio_path)) + elif emotion_mode == "text": + emotion_text = st.text_input( + tr("Emotion Text"), + value=emotion_text, + help=tr("Emotion Text Help"), + placeholder=tr("Emotion Text Placeholder"), + ) + + use_random = st.checkbox( + tr("Use Random Emotion"), + value=bool(config.indextts2.get("use_random", False)), + help=tr("Use Random Emotion Help"), + ) + + emotion_vector_defaults = { + "vec_happy": 0.0, + "vec_angry": 0.0, + "vec_sad": 0.0, + "vec_afraid": 0.0, + "vec_disgusted": 0.0, + "vec_melancholic": 0.0, + "vec_surprised": 0.0, + "vec_calm": 0.8, + } + emotion_vector_labels = { + "vec_happy": tr("Emotion Happy"), + "vec_angry": tr("Emotion Angry"), + "vec_sad": tr("Emotion Sad"), + "vec_afraid": tr("Emotion Afraid"), + "vec_disgusted": tr("Emotion Disgusted"), + "vec_melancholic": tr("Emotion Melancholic"), + "vec_surprised": tr("Emotion Surprised"), + "vec_calm": tr("Emotion Calm"), + } + emotion_vector_values = {} + if emotion_mode == "vector": + vec_cols = st.columns(2) + for index, (field, default_value) in enumerate(emotion_vector_defaults.items()): + with 
vec_cols[index % 2]: + emotion_vector_values[field] = st.slider( + emotion_vector_labels[field], + min_value=0.0, + max_value=1.0, + value=float(config.indextts2.get(field, default_value)), + step=0.05, + ) + else: + emotion_vector_values = { + field: float(config.indextts2.get(field, default_value)) + for field, default_value in emotion_vector_defaults.items() + } + + with st.expander(tr("Advanced Parameters"), expanded=False): + col1, col2 = st.columns(2) + + with col1: + temperature = st.slider( + tr("Sampling Temperature"), + min_value=0.1, + max_value=2.0, + value=float(config.indextts2.get("temperature", 0.8)), + step=0.1, + help=tr("Sampling Temperature Help") + ) + + top_p = st.slider( + "Top P", + min_value=0.0, + max_value=1.0, + value=float(config.indextts2.get("top_p", 0.8)), + step=0.05, + help=tr("Top P Help") + ) + + top_k = st.slider( + "Top K", + min_value=0, + max_value=100, + value=int(config.indextts2.get("top_k", 30)), + step=5, + help=tr("Top K Help") + ) + + max_text_tokens_per_segment = st.slider( + tr("Max Text Tokens Per Segment"), + min_value=20, + max_value=600, + value=int(config.indextts2.get("max_text_tokens_per_segment", 120)), + step=10, + help=tr("Max Text Tokens Per Segment Help") + ) + + with col2: + num_beams = st.slider( + tr("Num Beams"), + min_value=1, + max_value=10, + value=int(config.indextts2.get("num_beams", 3)), + step=1, + help=tr("Num Beams Help") + ) + + repetition_penalty = st.slider( + tr("Repetition Penalty"), + min_value=0.1, + max_value=20.0, + value=float(config.indextts2.get("repetition_penalty", 10.0)), + step=0.1, + help=tr("Repetition Penalty Help") + ) + + max_mel_tokens = st.slider( + tr("Max Mel Tokens"), + min_value=50, + max_value=1815, + value=int(config.indextts2.get("max_mel_tokens", 1500)), + step=10, + help=tr("Max Mel Tokens Help") + ) + + with st.expander(tr("IndexTTS2 Usage Instructions Title"), expanded=False): + st.markdown(tr("IndexTTS2 Usage Instructions")) + + config.indextts2["api_url"] = 
api_url + config.indextts2["reference_audio_source"] = reference_audio_source + config.indextts2["reference_audio"] = reference_audio + config.indextts2["emotion_mode"] = emotion_mode + config.indextts2["emotion_audio"] = emotion_audio + config.indextts2["emotion_alpha"] = emotion_alpha + config.indextts2["emotion_text"] = emotion_text + config.indextts2["use_random"] = use_random + config.indextts2["max_text_tokens_per_segment"] = max_text_tokens_per_segment + for field, value in emotion_vector_values.items(): + config.indextts2[field] = value + config.indextts2["temperature"] = temperature + config.indextts2["top_p"] = top_p + config.indextts2["top_k"] = top_k + config.indextts2["num_beams"] = num_beams + config.indextts2["repetition_penalty"] = repetition_penalty + config.indextts2["max_mel_tokens"] = max_mel_tokens + + if reference_audio: + config.ui["voice_name"] = f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}" + st.session_state['voice_rate'] = 1.0 + st.session_state['voice_pitch'] = 1.0 + + def render_doubaotts_settings(tr): """渲染豆包语音 TTS 设置""" # AK 输入 @@ -1325,6 +1561,12 @@ def render_voice_preview_new(tr, selected_engine): voice_name = f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}" voice_rate = 1.0 # IndexTTS-1.5 不支持速度调节 voice_pitch = 1.0 # IndexTTS-1.5 不支持音调调节 + elif selected_engine == config.INDEXTTS2_ENGINE: + reference_audio = config.indextts2.get("reference_audio", "") + if reference_audio: + voice_name = f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}" + voice_rate = 1.0 # IndexTTS-2 使用自身生成参数 + voice_pitch = 1.0 elif selected_engine == "doubaotts": voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming") voice_name = voice_type @@ -1337,7 +1579,9 @@ def render_voice_preview_new(tr, selected_engine): with st.spinner(tr("Synthesizing Voice")): temp_dir = utils.storage_dir("temp", create=True) - audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3") + audio_format = "audio/wav" if selected_engine in 
(config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else "audio/mp3" + audio_extension = ".wav" if audio_format == "audio/wav" else ".mp3" + audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}{audio_extension}") sub_maker = voice.tts( text=play_content, @@ -1354,7 +1598,7 @@ def render_voice_preview_new(tr, selected_engine): # 播放音频 with open(audio_file, 'rb') as audio_file_obj: audio_bytes = audio_file_obj.read() - st.audio(audio_bytes, format='audio/mp3') + st.audio(audio_bytes, format=audio_format) # 清理临时文件 try: diff --git a/webui/i18n/en.json b/webui/i18n/en.json index bf8ffe3..8e3356c 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -284,6 +284,8 @@ "IndexTTS features": "A locally or privately deployed IndexTTS-1.5 voice-cloning engine. Choose a resource audio file or upload a reference audio file, then synthesize narration in that voice.", "IndexTTS use case": "Best for fixed narrator voices, character dubbing, or generating multiple videos with the same voice. Start the IndexTTS-1.5 API service before use. Deployment package: https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5", + "IndexTTS2 features": "A locally or privately deployed IndexTTS-2 voice-cloning engine with emotion control and fuller generation parameters.", + "IndexTTS2 use case": "Best for fixed voices, emotional narration, and local speech synthesis workflows that need finer sampling controls. Start the IndexTTS-2 API service before use.", "Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.", "Select TTS Engine": "Select TTS Engine", "Select TTS Engine Help": "Choose the text-to-speech engine you want to use.", @@ -440,6 +442,7 @@ "Select Qwen3 TTS Voice": "Select a Qwen3 TTS voice", "API URL": "API URL", "IndexTTS API URL Help": "IndexTTS-1.5 API service URL", + "IndexTTS2 API URL Help": "IndexTTS-2 API service URL. 
You can enter the service root or the full /tts endpoint.", "Reference Audio Source": "Reference Audio Source", "Reference Audio Source Help": "Choose a reference audio from the resource directory or upload a new one.", "Select from Resource Directory": "Select from Resource Directory", @@ -469,6 +472,36 @@ "Enable Sampling Help": "Enable sampling for more natural speech.", "IndexTTS Usage Instructions Title": "💡 IndexTTS-1.5 Usage Instructions", "IndexTTS Usage Instructions": "**Zero-shot voice cloning**\n\n1. **Prepare reference audio**: upload or specify a clear audio file (3-10 seconds recommended)\n2. **Set API URL**: make sure the IndexTTS-1.5 service is running\n3. **Start synthesis**: the system will use the reference voice to synthesize new speech\n\n**Notes**:\n- Reference audio quality directly affects synthesis quality\n- Use clean audio without background noise when possible\n- Keep text length within a reasonable range\n- The first synthesis may take longer", + "IndexTTS2 Emotion Parameters": "🎭 Emotion Parameters", + "Emotion Mode": "Emotion Mode", + "Emotion Mode Help": "Choose the emotion control source for IndexTTS-2.", + "Emotion Mode Speaker": "Same as speaker reference", + "Emotion Mode Audio": "Use emotion reference audio", + "Emotion Mode Vector": "Use emotion vector", + "Emotion Mode Text": "Use emotion text", + "Emotion Alpha": "Emotion Alpha", + "Emotion Alpha Help": "Controls how strongly the emotion condition affects generation. 0 is weak, 1 is strong.", + "Emotion Reference Audio Path": "Emotion Reference Audio Path", + "Emotion Reference Audio Path Help": "Local emotion reference audio path used when emotion_mode=audio.", + "Emotion Text": "Emotion Text", + "Emotion Text Help": "Emotion description used when emotion_mode=text, such as happy, nervous, or aggrieved.", + "Emotion Text Placeholder": "e.g. 
calm, nervous, happy", + "Use Random Emotion": "Use Random Emotion", + "Use Random Emotion Help": "Let IndexTTS-2 use random emotion sampling during generation.", + "Emotion Happy": "Happy", + "Emotion Angry": "Angry", + "Emotion Sad": "Sad", + "Emotion Afraid": "Afraid", + "Emotion Disgusted": "Disgusted", + "Emotion Melancholic": "Melancholic", + "Emotion Surprised": "Surprised", + "Emotion Calm": "Calm", + "Max Text Tokens Per Segment": "Max Text Tokens Per Segment", + "Max Text Tokens Per Segment Help": "Maximum text tokens per segment for IndexTTS-2 inference.", + "Max Mel Tokens": "Max Mel Tokens", + "Max Mel Tokens Help": "Controls the maximum mel tokens generated in one request. Higher values can produce longer audio.", + "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 Usage Instructions", + "IndexTTS2 Usage Instructions": "**IndexTTS-2 voice cloning**\n\n1. **Choose a voice**: reuse IndexTTS-1.5 resource audio or upload a reference audio file\n2. **Set API URL**: for example http://192.168.3.6:7863/tts, or enter the service root\n3. **Tune emotion**: speaker is the default; switch to audio, vector, or text when needed\n4. 
**Tune generation**: temperature, top_p, top_k, num_beams, repetition_penalty, and max_mel_tokens are sent directly to the IndexTTS-2 API\n\n**Notes**:\n- Reference audio quality directly affects cloning quality\n- The first request may load the model and take longer\n- CPU deployments are much slower than GPU deployments", "Volcengine Access Key Help": "Volcengine Access Key", "Volcengine Secret Key Help": "Volcengine Secret Key", "Doubao AppID Help": "Doubao TTS application AppID", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 84ea88e..4b16d7e 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -265,6 +265,8 @@ "IndexTTS features": "本地/私有部署的 IndexTTS-1.5 语音克隆引擎。选择资源目录音频或上传参考音频后,可按该音色合成旁白。", "IndexTTS use case": "适合需要固定旁白音色、角色配音或批量生成同一音色视频的场景。使用前请先启动 IndexTTS-1.5 API 服务;部署包下载:https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS download link": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", + "IndexTTS2 features": "本地/私有部署的 IndexTTS-2 语音克隆引擎,支持情感控制和更完整的生成参数。", + "IndexTTS2 use case": "适合需要固定音色、情绪化旁白或更细致采样控制的本地语音合成场景。使用前请先启动 IndexTTS-2 API 服务。", "Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", "Select TTS Engine": "选择 TTS 引擎", "Select TTS Engine Help": "选择您要使用的文本转语音引擎", @@ -422,6 +424,7 @@ "Select Qwen3 TTS Voice": "选择 Qwen3 TTS 音色", "API URL": "API 地址", "IndexTTS API URL Help": "IndexTTS-1.5 API 服务地址", + "IndexTTS2 API URL Help": "IndexTTS-2 API 服务地址,可填写服务根地址或完整 /tts 地址", "Reference Audio Source": "参考音频来源", "Reference Audio Source Help": "选择从资源目录选择参考音频,或上传新的参考音频", "Select from Resource Directory": "从资源目录选择", @@ -451,6 +454,36 @@ "Enable Sampling Help": "启用采样可以获得更自然的语音", "IndexTTS Usage Instructions Title": "💡 IndexTTS-1.5 使用说明", "IndexTTS Usage Instructions": "**零样本语音克隆**\n\n1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒)\n2. **设置 API 地址**:确保 IndexTTS-1.5 服务正常运行\n3. 
**开始合成**:系统会自动使用参考音频的音色合成新语音\n\n**注意事项**:\n- 参考音频质量直接影响合成效果\n- 建议使用无背景噪音的清晰音频\n- 文本长度建议控制在合理范围内\n- 首次合成可能需要较长时间", + "IndexTTS2 Emotion Parameters": "🎭 情感参数", + "Emotion Mode": "情感控制方式", + "Emotion Mode Help": "选择 IndexTTS-2 的情感控制来源", + "Emotion Mode Speaker": "与音色参考相同", + "Emotion Mode Audio": "使用情感参考音频", + "Emotion Mode Vector": "使用情感向量", + "Emotion Mode Text": "使用情感描述文本", + "Emotion Alpha": "情感权重", + "Emotion Alpha Help": "控制情感条件的影响强度,0 表示弱,1 表示强", + "Emotion Reference Audio Path": "情感参考音频路径", + "Emotion Reference Audio Path Help": "emotion_mode=audio 时使用的本地情感参考音频路径", + "Emotion Text": "情感描述文本", + "Emotion Text Help": "emotion_mode=text 时使用的情感描述,例如开心、紧张、委屈", + "Emotion Text Placeholder": "例如:沉稳、紧张、开心", + "Use Random Emotion": "启用随机情感", + "Use Random Emotion Help": "让 IndexTTS-2 在生成时使用随机情感采样", + "Emotion Happy": "开心", + "Emotion Angry": "愤怒", + "Emotion Sad": "悲伤", + "Emotion Afraid": "害怕", + "Emotion Disgusted": "厌恶", + "Emotion Melancholic": "忧郁", + "Emotion Surprised": "惊讶", + "Emotion Calm": "平静", + "Max Text Tokens Per Segment": "单段最大文本 Token", + "Max Text Tokens Per Segment Help": "IndexTTS-2 分段推理的最大文本 token 数", + "Max Mel Tokens": "最大 Mel Tokens", + "Max Mel Tokens Help": "控制单次生成的最大 mel token 数,值越大可生成更长音频", + "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 使用说明", + "IndexTTS2 Usage Instructions": "**IndexTTS-2 语音克隆**\n\n1. **选择音色**:复用 IndexTTS-1.5 的资源音频或上传参考音频\n2. **设置 API 地址**:例如 http://192.168.3.6:7863/tts,也可以填写服务根地址\n3. **调整情感参数**:默认使用 speaker,可按需切换到 audio、vector 或 text\n4. 
**调整生成参数**:temperature、top_p、top_k、num_beams、repetition_penalty 和 max_mel_tokens 会直接传给 IndexTTS-2 接口\n\n**注意事项**:\n- 参考音频质量会直接影响克隆效果\n- 首次请求可能需要加载模型,耗时更长\n- CPU 部署生成速度会明显慢于 GPU", "Volcengine Access Key Help": "火山引擎 Access Key", "Volcengine Secret Key Help": "火山引擎 Secret Key", "Doubao AppID Help": "豆包语音应用 AppID", From 342fc15f3bf3beb515655de540d9a259bc9cf337 Mon Sep 17 00:00:00 2001 From: viccy Date: Sun, 7 Jun 2026 01:24:32 +0800 Subject: [PATCH 13/24] =?UTF-8?q?feat(tts,search,video):=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9EOmniVoice=20TTS=E3=80=81=E8=81=94=E7=BD=91=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E4=B8=8E=E5=A4=9A=E8=A7=86=E9=A2=91=E5=89=AA=E8=BE=91?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增OmniVoice语音合成引擎全流程支持,包含配置项、WebUI界面与服务实现 集成Tavily联网搜索能力,支持短剧剧情分析前自动检索剧情背景信息 新增多视频源剪辑支持,完善脚本校验规则并重构剪辑逻辑适配多视频路径 重构LLM剧情分析Prompt,优化输出格式适配多场景与联网检索结果 调整streamlit版本至1.56.0修复兼容性问题 新增相关单元测试与多语言翻译,更新配置示例文件 --- app/config/config.py | 5 + app/config/defaults.py | 3 + app/services/clip_video.py | 114 +++++++-- app/services/jianying_task.py | 9 +- app/services/llm/unified_service.py | 15 +- app/services/llm/validators.py | 12 + .../short_drama_narration/plot_analysis.py | 105 +++++---- .../script_generation.py | 25 +- app/services/task.py | 2 + app/services/tavily_search.py | 116 +++++++++ app/services/test_jianying_task_unittest.py | 17 ++ ...est_multi_video_script_sources_unittest.py | 84 +++++++ app/services/voice.py | 165 ++++++++++++- app/utils/check_script.py | 17 ++ config.example.toml | 32 ++- requirements.txt | 2 +- webui.py | 7 + webui/components/audio_settings.py | 192 ++++++++++++++- webui/components/basic_settings.py | 27 +++ webui/components/script_settings.py | 170 +++++++++++-- webui/components/subtitle_settings.py | 2 +- webui/i18n/en.json | 42 ++++ webui/i18n/zh.json | 42 ++++ webui/tools/generate_short_summary.py | 223 +++++++++++++++++- 24 files changed, 1320 insertions(+), 108 deletions(-) create 
mode 100644 app/services/tavily_search.py create mode 100644 app/services/test_multi_video_script_sources_unittest.py diff --git a/app/config/config.py b/app/config/config.py index ae19945..de17645 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -13,8 +13,11 @@ INDEXTTS_ENGINE = "indextts" INDEXTTS_DISPLAY_NAME = "IndexTTS-1.5" INDEXTTS2_ENGINE = "indextts2" INDEXTTS2_DISPLAY_NAME = "IndexTTS-2" +OMNIVOICE_ENGINE = "omnivoice" +OMNIVOICE_DISPLAY_NAME = "OmniVoice" INDEXTTS_VOICE_PREFIX = f"{INDEXTTS_ENGINE}:" INDEXTTS2_VOICE_PREFIX = f"{INDEXTTS2_ENGINE}:" +OMNIVOICE_VOICE_PREFIX = f"{OMNIVOICE_ENGINE}:" def normalize_tts_engine_name(tts_engine: str) -> str: @@ -131,6 +134,7 @@ def save_config(): _cfg["fun_asr"] = fun_asr _cfg["indextts"] = indextts _cfg["indextts2"] = indextts2 + _cfg["omnivoice"] = omnivoice _cfg["doubaotts"] = doubaotts f.write(toml.dumps(_cfg)) @@ -148,6 +152,7 @@ tts_qwen = _cfg.get("tts_qwen", {}) fun_asr = _cfg.get("fun_asr", {}) indextts = _cfg.get("indextts", {}) indextts2 = _cfg.get("indextts2", {}) +omnivoice = _cfg.get("omnivoice", {}) doubaotts = _cfg.get("doubaotts", {}) hostname = socket.gethostname() diff --git a/app/config/defaults.py b/app/config/defaults.py index a001978..9f648fa 100644 --- a/app/config/defaults.py +++ b/app/config/defaults.py @@ -35,6 +35,9 @@ DEFAULT_LLM_APP_CONFIG = { "text_openai_model_name": DEFAULT_TEXT_OPENAI_MODEL_NAME, "text_openai_api_key": "", "text_openai_base_url": DEFAULT_OPENAI_COMPATIBLE_BASE_URL, + "tavily_api_key": "", + "tavily_search_depth": "basic", + "tavily_max_results": 5, } DEFAULT_LLM_APP_CONFIG.update(DEFAULT_LLM_GENERATION_APP_CONFIG) diff --git a/app/services/clip_video.py b/app/services/clip_video.py index 8455703..93f9ddd 100644 --- a/app/services/clip_video.py +++ b/app/services/clip_video.py @@ -32,6 +32,82 @@ def parse_timestamp(timestamp: str) -> tuple: return start_time, end_time +def _normalize_video_origin_paths( + video_origin_path: str, + video_origin_paths: 
Optional[List[str]] = None, +) -> List[str]: + paths = [] + if video_origin_paths: + paths.extend(video_origin_paths) + if video_origin_path: + paths.insert(0, video_origin_path) + + normalized_paths = [] + seen = set() + for item in paths: + if not isinstance(item, str): + continue + item = item.strip() + if not item or item in seen: + continue + normalized_paths.append(item) + seen.add(item) + return normalized_paths + + +def _coerce_video_id(value) -> Optional[int]: + try: + video_id = int(value) + except (TypeError, ValueError): + return None + return video_id if video_id > 0 else None + + +def _match_video_id_by_name(video_name: str, video_origin_paths: List[str]) -> Optional[int]: + video_name = str(video_name or "").strip() + if not video_name: + return None + + expected_name = os.path.basename(video_name) + for index, video_path in enumerate(video_origin_paths, start=1): + if os.path.basename(video_path) == expected_name: + return index + return None + + +def _resolve_script_video_path(script_item: Dict, video_origin_paths: List[str]) -> str: + explicit_path = ( + script_item.get("source_video_path") + or script_item.get("video_origin_path") + or script_item.get("origin_video_path") + ) + if explicit_path and os.path.exists(explicit_path): + return explicit_path + + video_id = _coerce_video_id(script_item.get("video_id") or script_item.get("video_index")) + matched_video_id = _match_video_id_by_name( + script_item.get("video_name") or script_item.get("source_video"), + video_origin_paths, + ) + if matched_video_id: + video_id = matched_video_id + + if video_id is not None: + if video_id <= len(video_origin_paths): + return video_origin_paths[video_id - 1] + logger.warning( + f"片段 {script_item.get('_id')} 的 video_id={video_id} 超出视频数量 " + f"{len(video_origin_paths)},默认使用第一个视频" + ) + + return video_origin_paths[0] + + +def _safe_output_id(value) -> str: + safe_value = str(value if value is not None else "unknown") + return "".join(char if char.isalnum() or 
char in ("-", "_") else "_" for char in safe_value) + + def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str: """ 根据开始时间和持续时间计算结束时间 @@ -579,7 +655,7 @@ def _process_narration_only_segment( # 生成输出文件名 safe_start_time = start_time.replace(':', '-').replace(',', '-') safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-') - output_filename = f"ost0_vid_{safe_start_time}@{safe_end_time}.mp4" + output_filename = f"ost0_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4" output_path = os.path.join(output_dir, output_filename) # 构建FFmpeg命令 - 移除音频 @@ -622,7 +698,7 @@ def _process_original_audio_segment( # 生成输出文件名 safe_start_time = start_time.replace(':', '-').replace(',', '-') safe_end_time = end_time.replace(':', '-').replace(',', '-') - output_filename = f"ost1_vid_{safe_start_time}@{safe_end_time}.mp4" + output_filename = f"ost1_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4" output_path = os.path.join(output_dir, output_filename) # 构建FFmpeg命令 - 保持原声 @@ -674,7 +750,7 @@ def _process_mixed_segment( # 生成输出文件名 safe_start_time = start_time.replace(':', '-').replace(',', '-') safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-') - output_filename = f"ost2_vid_{safe_start_time}@{safe_end_time}.mp4" + output_filename = f"ost2_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4" output_path = os.path.join(output_dir, output_filename) # 构建FFmpeg命令 - 保持原声 @@ -782,28 +858,34 @@ def clip_video_unified( script_list: List[Dict], tts_results: List[Dict], output_dir: Optional[str] = None, - task_id: Optional[str] = None + task_id: Optional[str] = None, + video_origin_paths: Optional[List[str]] = None ) -> Dict[str, str]: """ 基于OST类型的统一视频裁剪策略 - 消除双重裁剪问题 Args: - video_origin_path: 原始视频的路径 + video_origin_path: 原始视频的路径;旧脚本或无 video_id 片段默认使用该视频 script_list: 完整的脚本列表,包含所有片段信息 tts_results: TTS结果列表,仅包含OST=0和OST=2的片段 output_dir: 输出目录路径,默认为None时会自动生成 task_id: 
任务ID,用于生成唯一的输出目录,默认为None时会自动生成 + video_origin_paths: 多个原始视频路径,脚本片段可用 video_id/video_name 指定来源 Returns: Dict[str, str]: 片段ID到裁剪后视频路径的映射 """ - # 检查视频文件是否存在 - if not os.path.exists(video_origin_path): - raise FileNotFoundError(f"视频文件不存在: {video_origin_path}") + video_source_paths = _normalize_video_origin_paths(video_origin_path, video_origin_paths) + if not video_source_paths: + raise FileNotFoundError("视频文件不存在: 未提供原始视频路径") + + missing_video_paths = [item for item in video_source_paths if not os.path.exists(item)] + if missing_video_paths: + raise FileNotFoundError(f"视频文件不存在: {', '.join(missing_video_paths)}") # 如果未提供task_id,则根据输入生成一个唯一ID if task_id is None: - content_for_hash = f"{video_origin_path}_{json.dumps(script_list)}" + content_for_hash = f"{json.dumps(video_source_paths, ensure_ascii=False)}_{json.dumps(script_list, ensure_ascii=False)}" task_id = hashlib.md5(content_for_hash.encode()).hexdigest() # 设置输出目录 @@ -840,29 +922,33 @@ def clip_video_unified( failed_clips = [] success_count = 0 - logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段") + logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段,源视频{len(video_source_paths)}个") for i, script_item in enumerate(script_list, 1): _id = script_item.get("_id") ost = script_item.get("OST", 0) timestamp = script_item["timestamp"] + source_video_path = _resolve_script_video_path(script_item, video_source_paths) - logger.info(f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, 时间戳:{timestamp}") + logger.info( + f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, " + f"视频:{os.path.basename(source_video_path)}, 时间戳:{timestamp}" + ) try: if ost == 0: # 纯解说片段 output_path = _process_narration_only_segment( - video_origin_path, script_item, tts_map, output_dir, + source_video_path, script_item, tts_map, output_dir, encoder_config, hwaccel_args ) elif ost == 1: # 纯原声片段 output_path = _process_original_audio_segment( - video_origin_path, script_item, output_dir, + source_video_path, script_item, output_dir, encoder_config, hwaccel_args ) elif ost 
== 2: # 解说+原声混合片段 output_path = _process_mixed_segment( - video_origin_path, script_item, tts_map, output_dir, + source_video_path, script_item, tts_map, output_dir, encoder_config, hwaccel_args ) else: diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index 345f6b7..a24304c 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -107,7 +107,7 @@ def _clamp_duration_to_media( def _normalize_indextts_reference_audio(params: VideoClipParams) -> None: - """Ensure IndexTTS engines use the configured reference audio instead of a stale UI voice.""" + """Ensure local clone TTS engines use configured reference audio instead of a stale UI voice.""" params.tts_engine = config.normalize_tts_engine_name(params.tts_engine) if params.tts_engine == config.INDEXTTS_ENGINE: tts_config = config.indextts @@ -117,6 +117,12 @@ def _normalize_indextts_reference_audio(params: VideoClipParams) -> None: tts_config = config.indextts2 voice_prefix = config.INDEXTTS2_VOICE_PREFIX display_name = "IndexTTS-2" + elif params.tts_engine == config.OMNIVOICE_ENGINE: + tts_config = config.omnivoice + if tts_config.get("mode", "auto") != "voice_clone": + return + voice_prefix = config.OMNIVOICE_VOICE_PREFIX + display_name = "OmniVoice" else: return @@ -199,6 +205,7 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): logger.info("\n\n## 3. 
统一视频裁剪(基于OST类型)") video_clip_result = clip_video.clip_video_unified( video_origin_path=params.video_origin_path, + video_origin_paths=getattr(params, "video_origin_paths", []), script_list=list_script, tts_results=tts_results ) diff --git a/app/services/llm/unified_service.py b/app/services/llm/unified_service.py index 0c31b5a..63cc48f 100644 --- a/app/services/llm/unified_service.py +++ b/app/services/llm/unified_service.py @@ -12,6 +12,7 @@ from loguru import logger from .manager import LLMServiceManager from .validators import OutputValidator from .exceptions import LLMServiceError +from app.services.prompts import PromptManager # 提供商注册由 webui.py:main() 显式调用(见 LLM 提供商注册机制重构) # 这样更可靠,错误也更容易调试 @@ -181,12 +182,20 @@ class UnifiedLLMService: LLMServiceError: 服务调用失败时抛出 """ try: - # 构建分析提示词 - system_prompt = "你是一位专业的剧本分析师和剧情概括助手。请仔细分析字幕内容,提取关键剧情信息。" + prompt = PromptManager.get_prompt( + category="short_drama_narration", + name="plot_analysis", + parameters={"subtitle_content": subtitle_content}, + ) + prompt_object = PromptManager.get_prompt_object( + category="short_drama_narration", + name="plot_analysis", + ) + system_prompt = prompt_object.get_system_prompt() # 生成分析结果 result = await UnifiedLLMService.generate_text( - prompt=subtitle_content, + prompt=prompt, system_prompt=system_prompt, provider=provider, temperature=temperature, diff --git a/app/services/llm/validators.py b/app/services/llm/validators.py index 1614e14..1ef30e2 100644 --- a/app/services/llm/validators.py +++ b/app/services/llm/validators.py @@ -113,6 +113,8 @@ class OutputValidator: "required": ["_id", "timestamp", "picture", "narration"], "properties": { "_id": {"type": "number"}, + "video_id": {"type": "number"}, + "video_name": {"type": "string"}, "timestamp": {"type": "string"}, "picture": {"type": "string"}, "narration": {"type": "string"}, @@ -161,6 +163,16 @@ class OutputValidator: item_id = item.get("_id") if not isinstance(item_id, (int, float)) or item_id <= 0: raise 
ValidationError(f"第{index+1}项ID必须为正整数: {item_id}", "invalid_id") + + video_id = item.get("video_id") + if video_id not in (None, "") and ( + not isinstance(video_id, (int, float)) or video_id <= 0 + ): + raise ValidationError(f"第{index+1}项video_id必须为正整数: {video_id}", "invalid_video_id") + + video_name = item.get("video_name") + if video_name not in (None, "") and not isinstance(video_name, str): + raise ValidationError(f"第{index+1}项video_name必须为字符串: {video_name}", "invalid_video_name") @staticmethod def validate_subtitle_analysis(output: str) -> str: diff --git a/app/services/prompts/short_drama_narration/plot_analysis.py b/app/services/prompts/short_drama_narration/plot_analysis.py index 0f8ffb1..a50dbe7 100644 --- a/app/services/prompts/short_drama_narration/plot_analysis.py +++ b/app/services/prompts/short_drama_narration/plot_analysis.py @@ -19,72 +19,79 @@ class PlotAnalysisPrompt(TextPrompt): metadata = PromptMetadata( name="plot_analysis", category="short_drama_narration", - version="v1.0", - description="分析短剧字幕内容,提供详细的剧情分析和分段解析", + version="v1.1", + description="结合字幕和可选联网检索上下文,输出适合短剧解说脚本生成的结构化剧情理解", model_type=ModelType.TEXT, output_format=OutputFormat.TEXT, - tags=["短剧", "剧情分析", "字幕解析", "分段分析"], + tags=["短剧", "剧情分析", "字幕解析", "分段分析", "联网检索", "解说脚本素材"], parameters=["subtitle_content"] ) super().__init__(metadata) - self._system_prompt = "你是一位专业的剧本分析师和剧情概括助手。" + self._system_prompt = "你是一位专业的短剧解说策划和剧本分析师。请输出克制、结构化、可直接供下游解说脚本生成使用的剧情理解材料。" def get_template(self) -> str: return """# 角色 -你是一位专业的剧本分析师和剧情概括助手。 +你是一位专业的短剧解说策划和剧本分析师。你的输出不是给观众看的成片文案,而是给下游“短剧解说脚本生成器”使用的结构化剧情理解材料。 -# 任务 -我将为你提供一部短剧的完整字幕文本。请你基于这些字幕,完成以下任务: -1. **整体剧情分析**:简要概括整个短剧的核心剧情脉络、主要冲突和结局(如果有的话)。 -2. 
**分段剧情解析与时间戳定位**: - * 将整个短剧划分为若干个关键的剧情段落(例如:开端、发展、转折、高潮、结局,或根据具体情节自然划分)。 - * 段落数应该与字幕长度成正比。 - * 对于每一个剧情段落: - * **概括该段落的主要内容**:用简洁的语言描述这段剧情发生了什么。 - * **标注对应的时间戳范围**:明确指出该剧情段落对应的开始字幕时间戳和结束字幕时间戳。请直接从字幕中提取时间信息。 +# 输入说明 +下面的输入可能只包含一个视频的原始字幕,也可能包含多个视频文件的字幕;也可能同时包含 Tavily 联网检索结果和原始字幕。 +- 联网检索结果只能用于辅助识别短剧名称、人物关系、时代背景、公开剧情梗概。 +- 原始字幕是唯一可信的当前片段事实来源。 +- 如果联网检索结果与字幕冲突,必须以字幕为准。 +- 如果联网检索结果包含当前字幕尚未出现的后续剧情,只能放在“字幕未覆盖/需谨慎信息”中,不能写进当前剧情事实。 +- 多个视频字幕会以“视频 1: 文件名”“视频 2: 文件名”等标题分隔。时间戳均为对应视频内部时间,不是拼接后的累计时间。 -# 输入格式 -字幕内容通常包含时间戳和对话,例如: -``` -00:00:05,000 --> 00:00:10,000 -[角色A]: 你好吗? -00:00:10,500 --> 00:00:15,000 -[角色B]: 我很好,谢谢。发生了一些有趣的事情。 -... (更多字幕内容) ... -``` -我将把实际字幕粘贴在下方。 +# 核心任务 +请基于输入完成剧情理解,目标是帮助后续生成高质量短剧解说脚本: +1. 识别短剧名称、当前字幕范围、视频来源、联网检索辅助信息和字幕事实边界。 +2. 统一人物称呼,避免同一人物出现多个名字写法。 +3. 用 100-180 字概括当前字幕覆盖的剧情,不提前剧透字幕未出现的内容。 +4. 按视频来源和字幕时间顺序拆分关键剧情段落,并为每段标注准确 video_id / video_name / 时间戳。 +5. 提炼解说创作可用的钩子、冲突、爽点/泪点/悬念点和建议保留原声片段。 -# 输出格式要求 -请按照以下格式清晰地呈现分析结果: +# 强制输出规则 +1. 禁止输出寒暄、解释身份或“好的,我将……”等聊天式开场。 +2. 禁止编造字幕中没有的具体事件、对白、关系进展或结局。 +3. 时间戳必须直接来自对应视频字幕;无法确定时写“字幕未明确”,不要猜测。 +4. 多视频场景下必须明确每段来自哪个视频文件,禁止把不同视频的同名时间戳混在一起。 +5. 人名必须统一:优先采用联网检索中的正式名称;如果字幕写法不同,在人物表中保留“字幕称呼”。 +6. 内容要简洁、客观、可复用,避免散文化长段落。 +7. 
必须严格按照下面的 Markdown 格式输出,不要添加额外章节。 -**一、整体剧情概括:** -[此处填写对整个短剧剧情的概括] +# 输出格式 +## 一、基础识别 +- 短剧名称:[如输入可判断则填写,否则写“未知”] +- 当前字幕范围:[开始时间戳] --> [结束时间戳];无法确定则写“字幕未明确” +- 视频来源:[列出视频编号、文件名和各自字幕时间范围;单视频也要写] +- 联网检索确认:[仅写可辅助理解的公开信息;没有联网结果则写“未启用/未提供”] +- 字幕内实际出现:[列出当前字幕真实出现的关键事实,2-4 条] +- 字幕未覆盖/需谨慎信息:[列出联网结果提到但当前字幕未发生的内容;没有则写“无”] -**二、分段剧情解析:** +## 二、人物与关系 +| 统一称呼 | 字幕称呼 | 身份/关系 | 当前剧情作用 | 确定性 | +|---|---|---|---|---| +| [人物名] | [字幕原文称呼] | [身份或关系] | [在当前片段中的作用] | 字幕明确/联网辅助/合理推断 | -**剧情段落 1:[段落主题/概括,例如:主角登场与背景介绍]** -* **时间戳:** [开始时间戳] --> [结束时间戳] -* **内容概要:** [对这段剧情的详细描述] +## 三、整体剧情概括 +[100-180 字,只概括当前字幕覆盖的剧情。必须包含核心冲突、人物动机和当前悬念。] -**剧情段落 2:[段落主题/概括,例如:第一个冲突出现]** -* **时间戳:** [开始时间戳] --> [结束时间戳] -* **内容概要:** [对这段剧情的详细描述] +## 四、分段剧情解析 +| 视频 | 时间戳 | 段落主题 | 剧情事件 | 情绪/冲突功能 | +|---|---|---|---|---| +| [video_id + video_name] | [开始] --> [结束] | [简短主题] | [当前段落发生了什么] | [铺垫/冲突升级/人物塑造/反转/悬念/情绪爆发等] | -... (根据实际剧情段落数量继续) ... +## 五、解说创作重点 +- 开场钩子:[用一句话指出最适合开场抓人的冲突或疑问] +- 核心冲突:[当前片段最主要的矛盾] +- 爽点/泪点/情绪点:[列 1-3 条,没有则写“无明显”] +- 悬念点:[当前片段留下的疑问或后续期待] +- 建议保留原声片段: + 1. [video_id + video_name + 时间戳]:[保留理由;如果没有合适原声,写“无明显”] -**剧情段落 N:[段落主题/概括,例如:结局与反思]** -* **时间戳:** [开始时间戳] --> [结束时间戳] -* **内容概要:** [对这段剧情的详细描述] +## 六、联网信息校验 +- 可用于辅助理解的信息:[联网结果中可帮助理解当前字幕的信息;没有则写“无”] +- 与字幕不一致或字幕未覆盖的信息:[必须列出,不要混入当前剧情事实;没有则写“无”] -# 注意事项 -* 请确保时间戳的准确性,直接引用字幕中的时间。 -* 剧情段落的划分应合乎逻辑,能够反映剧情的起承转合。 -* 语言表达应简洁、准确、客观。 - -# 限制 -1. 严禁输出与分析结果无关的内容 -2. 
时间戳必须严格按照字幕中的实际时间 - -# 请处理以下字幕: +# 输入内容 ${subtitle_content}""" diff --git a/app/services/prompts/short_drama_narration/script_generation.py b/app/services/prompts/short_drama_narration/script_generation.py index 0184cb1..234fc98 100644 --- a/app/services/prompts/short_drama_narration/script_generation.py +++ b/app/services/prompts/short_drama_narration/script_generation.py @@ -43,11 +43,14 @@ class ScriptGenerationPrompt(ParameterizedPrompt): ${plot_analysis} -### 原始字幕(含精确时间戳) +### 原始字幕(含视频编号和精确时间戳) ${subtitle_content} +字幕可能来自多个视频文件。每个字幕分段标题会以“视频 1: 文件名”“视频 2: 文件名”等形式标识来源。 +生成脚本时必须把每个片段绑定到对应视频来源,时间戳表示该视频文件内部的局部时间,不是把多个视频拼接后的全局时间。 + ## 短剧解说创作核心要素 ### 1. 黄金开场(3秒法则) @@ -137,11 +140,18 @@ ${subtitle_content} ### 时间戳管理(绝对不能违反) - **时间戳绝对不能重叠**,确保剪辑后无重复画面 -- **时间段必须连续且不交叉**,严格按时间顺序排列 -- **每个时间戳都必须在原始字幕中找到对应范围** +- **同一个 video_id 内的时间段必须连续且不交叉**,严格按该视频内时间顺序排列 +- **跨视频可以切换 video_id**,但每个时间戳都必须来自对应视频字幕分段 +- **每个时间戳都必须在对应视频的原始字幕中找到对应范围** - 可以拆分原时间片段,但必须保持时间连续性 - 时间戳的格式必须与原始字幕中的格式完全一致 +### 多视频来源规范(多集/多文件必须遵守) +- **video_id**:必须填写,取字幕分段标题里的视频编号,例如“视频 3”就填 3 +- **video_name**:必须填写对应的视频文件名,例如“3_20260607002212.mp4” +- **timestamp**:只填写对应 video_id 内部的时间范围,不要换算成多个视频拼接后的累计时间 +- 如果剧情跨多个视频推进,脚本可以按故事顺序在不同 video_id 之间切换,但不得把视频 2 的时间戳写到 video_id=1 + ### 时长控制(1/3原则) - **解说视频总长度 = 原视频长度的 1/3** - 精确控制节奏和密度,既不能过短也不能过长 @@ -159,6 +169,8 @@ ${subtitle_content} ```json { "_id": 序号, + "video_id": 视频编号, + "video_name": "视频文件名", "timestamp": "开始时间-结束时间", "picture": "画面内容描述", "narration": "播放原片+序号", @@ -242,6 +254,8 @@ ${subtitle_content} "items": [ { "_id": 1, + "video_id": 1, + "video_name": "1.mp4", "timestamp": "00:00:01,000-00:00:05,500", "picture": "女主角林小雨慌张地道歉,男主角沈墨轩冷漠地看着她", "narration": "一个普通女孩的命运即将因为一杯咖啡彻底改变!她撞到的这个男人,竟然是...", @@ -249,6 +263,8 @@ ${subtitle_content} }, { "_id": 2, + "video_id": 1, + "video_name": "1.mp4", "timestamp": "00:00:05,500-00:00:08,000", "picture": "沈墨轩质问林小雨,语气冷厉威严", "narration": "播放原片2", @@ -256,6 +272,8 @@ ${subtitle_content} }, { "_id": 3, + "video_id": 2, + 
"video_name": "2.mp4", "timestamp": "00:00:08,000-00:00:12,000", "picture": "林小雨惊慌失措,沈墨轩眼中闪过一丝兴趣", "narration": "霸道总裁的经典开场!一杯咖啡引发的爱情故事就这样开始了...", @@ -281,6 +299,7 @@ ${subtitle_content} - **原声片段标识**:OST=1表示原声,OST=0表示解说 - **原声格式规范**:narration字段必须使用"播放原片+序号"格式 - **关键情绪点**:必须保留原片原声,增强观众代入感 +- **视频来源**:每个片段必须包含 video_id 和 video_name,用于定位多个上传视频中的源文件 - **时间戳精度**:精确到毫秒级别,确保与字幕完美匹配 - **逻辑连贯性**:严格遵循剧情发展顺序 diff --git a/app/services/task.py b/app/services/task.py index bf8c45b..28b05ea 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -225,6 +225,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di # 使用新的统一裁剪策略 video_clip_result = clip_video.clip_video_unified( video_origin_path=params.video_origin_path, + video_origin_paths=getattr(params, "video_origin_paths", []), script_list=list_script, tts_results=tts_results ) @@ -477,6 +478,7 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): # 使用新的统一裁剪策略 video_clip_result = clip_video.clip_video_unified( video_origin_path=params.video_origin_path, + video_origin_paths=getattr(params, "video_origin_paths", []), script_list=list_script, tts_results=tts_results ) diff --git a/app/services/tavily_search.py b/app/services/tavily_search.py new file mode 100644 index 0000000..586a7ee --- /dev/null +++ b/app/services/tavily_search.py @@ -0,0 +1,116 @@ +"""Tavily-powered web search helpers for plot analysis.""" + +from __future__ import annotations + +import os +from typing import Any + +import requests +from loguru import logger + + +TAVILY_API_BASE_URL = "https://api.tavily.com" +DEFAULT_SEARCH_DEPTH = "basic" +DEFAULT_MAX_RESULTS = 5 +DEFAULT_TIMEOUT = 20 + + +class TavilySearchError(RuntimeError): + """Raised when Tavily search cannot be completed.""" + + +def _trim_text(value: Any, max_chars: int) -> str: + text = str(value or "").strip() + if len(text) <= max_chars: + return text + return f"{text[:max_chars].rstrip()}..." 
+ + +def search_short_drama( + short_name: str, + api_key: str | None = None, + *, + search_depth: str = DEFAULT_SEARCH_DEPTH, + max_results: int = DEFAULT_MAX_RESULTS, + timeout: int = DEFAULT_TIMEOUT, +) -> dict[str, Any]: + """Search web context for a short drama name with Tavily.""" + short_name = str(short_name or "").strip() + if not short_name: + raise TavilySearchError("短剧名称不能为空") + + api_key = (api_key or os.getenv("TAVILY_API_KEY") or "").strip() + if not api_key: + raise TavilySearchError("Tavily API Key 未配置") + + query = f"{short_name} 短剧 剧情 介绍 人物 结局" + payload = { + "query": query, + "search_depth": search_depth or DEFAULT_SEARCH_DEPTH, + "topic": "general", + "max_results": max(1, min(int(max_results or DEFAULT_MAX_RESULTS), 10)), + "include_answer": True, + "include_raw_content": False, + "include_images": False, + } + + try: + response = requests.post( + f"{TAVILY_API_BASE_URL}/search", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json=payload, + timeout=timeout, + ) + except requests.RequestException as exc: + raise TavilySearchError(f"Tavily 请求失败: {exc}") from exc + + if response.status_code >= 400: + message = _trim_text(response.text, 500) + raise TavilySearchError(f"Tavily 请求失败: HTTP {response.status_code} {message}") + + try: + data = response.json() + except ValueError as exc: + raise TavilySearchError("Tavily 返回内容不是有效 JSON") from exc + + logger.info( + "Tavily 短剧检索完成: query={}, results={}", + query, + len(data.get("results") or []), + ) + return data + + +def format_search_context(search_data: dict[str, Any], *, max_chars: int = 6000) -> str: + """Format Tavily response into compact LLM context.""" + if not search_data: + return "" + + lines = [ + "# Tavily 联网检索结果", + f"检索 query: {search_data.get('query', '')}", + ] + + answer = _trim_text(search_data.get("answer"), 1200) + if answer: + lines.extend(["", "## 综合回答", answer]) + + results = search_data.get("results") or [] + if results: + 
lines.extend(["", "## 搜索来源"]) + for index, result in enumerate(results, start=1): + title = _trim_text(result.get("title"), 120) + url = _trim_text(result.get("url"), 240) + content = _trim_text(result.get("content") or result.get("raw_content"), 700) + lines.extend( + [ + f"{index}. 标题: {title}", + f" 来源: {url}", + f" 摘要: {content}", + ] + ) + + return _trim_text("\n".join(lines).strip(), max_chars) diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py index 18897a4..0a1660f 100644 --- a/app/services/test_jianying_task_unittest.py +++ b/app/services/test_jianying_task_unittest.py @@ -51,6 +51,23 @@ class JianyingTaskTests(unittest.TestCase): self.assertEqual(f"indextts2:{ref_path}", params.voice_name) + def test_normalize_omnivoice_clone_uses_valid_param_reference(self): + with tempfile.NamedTemporaryFile(suffix=".wav") as ref: + params = VideoClipParams(tts_engine="omnivoice", voice_name=f"omnivoice:{ref.name}") + + with patch.dict(jianying_task.config.omnivoice, {"mode": "voice_clone"}, clear=False): + jianying_task._normalize_indextts_reference_audio(params) + + self.assertEqual(f"omnivoice:{ref.name}", params.voice_name) + + def test_normalize_omnivoice_auto_does_not_require_reference(self): + params = VideoClipParams(tts_engine="omnivoice", voice_name="omnivoice:auto") + + with patch.dict(jianying_task.config.omnivoice, {"mode": "auto", "reference_audio": ""}, clear=False): + jianying_task._normalize_indextts_reference_audio(params) + + self.assertEqual("omnivoice:auto", params.voice_name) + def test_normalize_indextts_requires_existing_reference_audio(self): params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural") diff --git a/app/services/test_multi_video_script_sources_unittest.py b/app/services/test_multi_video_script_sources_unittest.py new file mode 100644 index 0000000..dd6fce8 --- /dev/null +++ b/app/services/test_multi_video_script_sources_unittest.py @@ -0,0 +1,84 @@ +import 
json +import os +import tempfile +import unittest +from unittest import mock + +from app.services import clip_video +from app.utils import check_script + + +class TestMultiVideoScriptSources(unittest.TestCase): + def test_check_format_accepts_optional_video_source_fields(self): + script = [ + { + "_id": 1, + "video_id": 2, + "video_name": "2.mp4", + "timestamp": "00:00:00,000-00:00:03,000", + "picture": "画面", + "narration": "解说", + "OST": 0, + } + ] + + result = check_script.check_format(json.dumps(script, ensure_ascii=False)) + + self.assertTrue(result["success"]) + + def test_clip_video_unified_resolves_source_by_video_id_and_name(self): + with tempfile.TemporaryDirectory() as temp_dir: + video_1 = os.path.join(temp_dir, "1.mp4") + video_2 = os.path.join(temp_dir, "2.mp4") + for video_path in [video_1, video_2]: + with open(video_path, "wb") as file: + file.write(b"video") + + output_dir = os.path.join(temp_dir, "clips") + used_sources = [] + + def fake_process(source_video_path, script_item, output_dir_arg, *_args): + used_sources.append(source_video_path) + output_path = os.path.join(output_dir_arg, f"{script_item['_id']}.mp4") + with open(output_path, "wb") as file: + file.write(b"clip") + return output_path + + script_list = [ + { + "_id": 1, + "video_id": 2, + "timestamp": "00:00:00,000-00:00:03,000", + "picture": "视频2画面", + "narration": "播放原片1", + "OST": 1, + }, + { + "_id": 2, + "video_name": "1.mp4", + "timestamp": "00:00:03,000-00:00:06,000", + "picture": "视频1画面", + "narration": "播放原片2", + "OST": 1, + }, + ] + + with ( + mock.patch.object(clip_video, "check_hardware_acceleration", return_value=None), + mock.patch.object(clip_video, "_process_original_audio_segment", side_effect=fake_process), + ): + result = clip_video.clip_video_unified( + video_origin_path=video_1, + video_origin_paths=[video_1, video_2], + script_list=script_list, + tts_results=[], + output_dir=output_dir, + task_id="multi-video-test", + ) + + self.assertEqual([video_2, video_1], 
used_sources) + self.assertEqual({1, 2}, set(result.keys())) + + +if __name__ == "__main__": + unittest.main() diff --git a/app/services/voice.py b/app/services/voice.py index 2be5c87..476c2fe 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import re import json @@ -1298,6 +1300,10 @@ def tts( if tts_engine == config.INDEXTTS2_ENGINE: logger.info("分发到 IndexTTS-2") return indextts2_tts(text, voice_name, voice_file) + + if tts_engine == config.OMNIVOICE_ENGINE: + logger.info("分发到 OmniVoice") + return omnivoice_tts(text, voice_name, voice_file, speed=voice_rate) if tts_engine == "doubaotts": logger.info("分发到豆包语音 TTS") @@ -1783,7 +1789,11 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f voice_name = config.normalize_indextts_voice_prefix(parse_voice_name(voice_name)) output_dir = utils.task_dir(task_id) tts_results = [] - audio_extension = ".wav" if tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else ".mp3" + audio_extension = ".wav" if tts_engine in ( + config.INDEXTTS_ENGINE, + config.INDEXTTS2_ENGINE, + config.OMNIVOICE_ENGINE, + ) else ".mp3" for item in list_script: if item['OST'] != 1: @@ -1809,11 +1819,11 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"或者使用其他 tts 引擎") continue else: - # SoulVoice、Qwen3、IndexTTS、豆包语音 引擎不生成精确字幕文件 + # SoulVoice、Qwen3、IndexTTS、OmniVoice、豆包语音 引擎不生成精确字幕文件 if ( is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) - or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) + or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE, config.OMNIVOICE_ENGINE) or tts_engine == "doubaotts" ): # 获取实际音频文件的时长 @@ -2256,6 +2266,17 @@ def parse_indextts2_voice(voice_name: str) -> str: return voice_name +def parse_omnivoice_voice(voice_name: str) -> str: + """ + 解析 OmniVoice 语音名称 + 支持格式:omnivoice:reference_audio_path + 返回参考音频文件路径或模式名 + """ + if 
isinstance(voice_name, str) and voice_name.startswith(config.OMNIVOICE_VOICE_PREFIX): + return voice_name[len(config.OMNIVOICE_VOICE_PREFIX):] + return voice_name + + def indextts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: """ 使用 IndexTTS-1.5 API 进行零样本语音克隆 @@ -2493,3 +2514,141 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str) -> Union[SubMaker logger.error("IndexTTS-2 TTS 生成失败,已达到最大重试次数") return None + + +def _normalize_omnivoice_api_url(api_url: str) -> str: + api_url = (api_url or "http://127.0.0.1:7866/tts").strip() + if api_url.endswith("/tts"): + return api_url + if api_url.endswith("/tts/json"): + return f"{api_url[:-len('/tts/json')]}/tts" + return f"{api_url.rstrip('/')}/tts" + + +def _download_omnivoice_audio(response: requests.Response, api_url: str, voice_file: str, proxies: dict) -> bool: + content_type = response.headers.get("content-type", "").lower() + if "application/json" not in content_type: + with open(voice_file, "wb") as f: + f.write(response.content) + return os.path.getsize(voice_file) > 0 + + result = response.json() + audio_url = result.get("audio_url") if isinstance(result, dict) else "" + if not audio_url: + logger.error(f"OmniVoice API 响应中没有音频下载地址: {result}") + return False + + audio_response = requests.get(urljoin(api_url, audio_url), proxies=proxies, timeout=180) + if audio_response.status_code != 200: + logger.error(f"OmniVoice 音频下载失败: {audio_response.status_code} - {audio_response.text}") + return False + + with open(voice_file, "wb") as f: + f.write(audio_response.content) + return os.path.getsize(voice_file) > 0 + + +def _optional_omnivoice_generation_data(voice_speed: float) -> dict: + omnivoice_config = getattr(config, "omnivoice", {}) or {} + data = { + "speed": voice_speed or omnivoice_config.get("speed", 1.0), + } + + optional_fields = { + "num_step": omnivoice_config.get("num_step"), + "guidance_scale": omnivoice_config.get("guidance_scale"), + "duration": 
omnivoice_config.get("duration"), + } + for key, value in optional_fields.items(): + if value not in (None, ""): + data[key] = value + + for key in ("denoise", "postprocess_output", "preprocess_prompt"): + if key in omnivoice_config: + data[key] = str(bool(omnivoice_config.get(key))).lower() + + return data + + +def omnivoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: + """ + 使用 OmniVoice-Pack FastAPI 服务进行语音合成。 + 支持自动音色、指令音色和参考音频克隆三种模式。 + """ + omnivoice_config = getattr(config, "omnivoice", {}) or {} + api_url = _normalize_omnivoice_api_url(omnivoice_config.get("api_url", "http://127.0.0.1:7866/tts")) + mode = omnivoice_config.get("mode", "auto") + language = (omnivoice_config.get("language", "zh") or "").strip() + instruct = (omnivoice_config.get("instruct", "") or "").strip() + ref_text = (omnivoice_config.get("ref_text", "") or "").strip() + parsed_voice = parse_omnivoice_voice(voice_name) + if mode != "voice_clone" and parsed_voice and os.path.isfile(parsed_voice): + mode = "voice_clone" + + reference_audio_path = "" + if mode == "voice_clone": + candidate = parsed_voice + if candidate and os.path.isfile(candidate): + reference_audio_path = candidate + else: + reference_audio_path = parse_omnivoice_voice(omnivoice_config.get("reference_audio", "") or "") + + if not reference_audio_path or not os.path.exists(reference_audio_path): + logger.error(f"OmniVoice 参考音频文件不存在: {reference_audio_path}") + return None + elif mode != "voice_design": + instruct = "" + + data = { + "text": text.strip(), + "language": language, + **_optional_omnivoice_generation_data(speed), + } + if mode == "voice_design" and instruct: + data["instruct"] = instruct + if mode == "voice_clone" and ref_text: + data["ref_text"] = ref_text + + proxies = _get_configured_proxies() + for attempt in range(3): + files = {} + try: + if reference_audio_path: + files["ref_audio"] = open(reference_audio_path, "rb") + + logger.info(f"第 {attempt + 1} 次调用 
OmniVoice API: {api_url}, mode={mode}") + response = requests.post( + api_url, + files=files or None, + data=data, + proxies=proxies, + timeout=240, + ) + + if response.status_code == 200 and _download_omnivoice_audio(response, api_url, voice_file, proxies): + logger.info(f"OmniVoice 成功生成音频: {voice_file}, 大小: {os.path.getsize(voice_file)} 字节") + sub_maker = new_sub_maker() + duration = get_audio_duration_from_file(voice_file) + duration_ms = int(duration * 1000) if duration > 0 else max(1000, int(len(text) * 200)) + add_subtitle_event(sub_maker, 0, duration_ms * 10000, text) + return sub_maker + + logger.error(f"OmniVoice API 调用失败: {response.status_code} - {response.text}") + except requests.exceptions.Timeout: + logger.error(f"OmniVoice API 调用超时 (尝试 {attempt + 1}/3)") + except requests.exceptions.RequestException as e: + logger.error(f"OmniVoice API 网络错误: {str(e)} (尝试 {attempt + 1}/3)") + except Exception as e: + logger.error(f"OmniVoice TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)") + finally: + for file_obj in files.values(): + try: + file_obj.close() + except Exception: + pass + + if attempt < 2: + time.sleep(2) + + logger.error("OmniVoice TTS 生成失败,已达到最大重试次数") + return None diff --git a/app/utils/check_script.py b/app/utils/check_script.py index 9c745e6..0e6f692 100644 --- a/app/utils/check_script.py +++ b/app/utils/check_script.py @@ -57,6 +57,23 @@ def check_format(script_content: str) -> Dict[str, Any]: 'details': f'当前值: {clip["_id"]} (类型: {type(clip["_id"]).__name__})' } + # 验证可选视频来源字段。旧脚本可以不包含,新脚本用于多视频定位。 + if 'video_id' in clip and clip['video_id'] not in ("", None): + if not isinstance(clip['video_id'], int) or clip['video_id'] <= 0: + return { + 'success': False, + 'message': f'第{i+1}个片段的video_id必须是正整数', + 'details': f'当前值: {clip["video_id"]} (类型: {type(clip["video_id"]).__name__})' + } + + if 'video_name' in clip and clip['video_name'] not in ("", None): + if not isinstance(clip['video_name'], str): + return { + 'success': False, + 'message': 
f'第{i+1}个片段的video_name必须是字符串', + 'details': f'当前值: {clip["video_name"]} (类型: {type(clip["video_name"]).__name__})' + } + # 验证 timestamp 字段格式 timestamp_pattern = r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$' if not isinstance(clip['timestamp'], str) or not re.match(timestamp_pattern, clip['timestamp']): diff --git a/config.example.toml b/config.example.toml index 0b807e3..547724e 100644 --- a/config.example.toml +++ b/config.example.toml @@ -49,6 +49,12 @@ text_openai_max_tokens = 65536 text_openai_thinking_level = "auto" # auto/off/low/medium/high + # ===== Tavily 联网搜索配置 ===== + # 用于短剧剧情理解前,按短剧名称检索公开剧情/人物/分集信息 + tavily_api_key = "" # 获取地址:https://app.tavily.com + tavily_search_depth = "basic" # basic / advanced / fast / ultra-fast + tavily_max_results = 5 + # ===== API Keys 参考 ===== # 主流 LLM Providers API Key 获取地址: # @@ -171,6 +177,30 @@ repetition_penalty = 10.0 max_mel_tokens = 1500 +[omnivoice] + # OmniVoice-Pack 语音合成配置 + # 支持 OmniVoice-Pack FastAPI 接口:POST /tts + api_url = "http://127.0.0.1:7866/tts" + language = "zh" + + # 生成模式:auto / voice_design / voice_clone + mode = "auto" + instruct = "" + + # voice_clone 模式下使用,音色列表复用 IndexTTS-1.5 的资源目录 + reference_audio_source = "resource" + reference_audio = "" + ref_text = "" + + # 高级生成参数 + num_step = 32 + guidance_scale = 2.0 + speed = 1.0 + duration = "" + denoise = true + postprocess_output = true + preprocess_prompt = true + [doubaotts] # 豆包语音 TTS 配置 # 申请流程: @@ -189,7 +219,7 @@ silence_duration = 0.125 [ui] - # TTS引擎选择 (indextts, indextts2, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) + # TTS引擎选择 (indextts, indextts2, omnivoice, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) tts_engine = "indextts" # Edge TTS 配置 diff --git a/requirements.txt b/requirements.txt index c6011de..be125ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ requests>=2.32.0 moviepy==2.1.1 edge-tts==7.2.7 -streamlit>=1.57.0 +streamlit==1.56.0 watchdog==6.0.0 loguru>=0.7.3 tomli>=2.2.1 diff 
--git a/webui.py b/webui.py index 68c24a7..5ba26a3 100644 --- a/webui.py +++ b/webui.py @@ -243,6 +243,12 @@ def get_voice_name_for_tts_engine(tts_engine: str) -> str: if reference_audio: return f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}" return config.ui.get('voice_name', '') + if tts_engine == config.OMNIVOICE_ENGINE: + mode = config.omnivoice.get('mode', 'auto') + reference_audio = config.omnivoice.get('reference_audio', '') + if mode == 'voice_clone' and reference_audio: + return f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}" + return f"{config.OMNIVOICE_VOICE_PREFIX}{mode}" if tts_engine == 'doubaotts': return config.ui.get('doubaotts_voice_type', 'BV700_streaming') if tts_engine == 'soulvoice': @@ -263,6 +269,7 @@ def get_jianying_export_params(draft_name=None) -> VideoClipParams: return VideoClipParams( video_clip_json_path=st.session_state['video_clip_json_path'], video_origin_path=st.session_state['video_origin_path'], + video_origin_paths=st.session_state.get('video_origin_paths', []), tts_engine=tts_engine, voice_name=voice_name, voice_rate=voice_rate, diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index c5ec08c..cab5413 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -40,6 +40,11 @@ BGM_RESOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/bgms-safe" BGM_TRACKS_JSON = os.path.join(BGM_RESOURCE_DIR, "tracks.json") BGM_UPLOAD_SUBDIR = "uploaded_bgms" BGM_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg") +LOCAL_TTS_ENGINES = { + config.INDEXTTS_ENGINE, + config.INDEXTTS2_ENGINE, + config.OMNIVOICE_ENGINE, +} def get_soulvoice_voices(): @@ -55,9 +60,10 @@ def get_soulvoice_voices(): def get_tts_engine_options(tr=lambda key: key): """获取TTS引擎选项""" - return { + engine_options = { config.INDEXTTS_ENGINE: config.INDEXTTS_DISPLAY_NAME, config.INDEXTTS2_ENGINE: config.INDEXTTS2_DISPLAY_NAME, + config.OMNIVOICE_ENGINE: config.OMNIVOICE_DISPLAY_NAME, 
"edge_tts": "Edge TTS", "qwen3_tts": tr("Tongyi Qwen3 TTS"), "tencent_tts": tr("Tencent Cloud TTS"), @@ -65,6 +71,25 @@ def get_tts_engine_options(tr=lambda key: key): "azure_speech": "Azure Speech Services" } + return { + engine: format_tts_engine_option(engine, display_name, tr) + for engine, display_name in engine_options.items() + } + + +def get_tts_engine_deployment_label(tts_engine, tr=lambda key: key): + """获取TTS引擎部署类型标签""" + if tts_engine in LOCAL_TTS_ENGINES: + return tr("Local Deployment") + + return tr("Cloud Service") + + +def format_tts_engine_option(tts_engine, display_name, tr=lambda key: key): + """格式化TTS引擎下拉显示名""" + deployment_label = get_tts_engine_deployment_label(tts_engine, tr) + return f"{display_name} [{deployment_label}]" + def get_tts_engine_descriptions(tr=lambda key: key): """获取TTS引擎详细描述""" @@ -105,6 +130,12 @@ def get_tts_engine_descriptions(tr=lambda key: key): "use_case": tr("IndexTTS2 use case"), "registration": None }, + config.OMNIVOICE_ENGINE: { + "title": config.OMNIVOICE_DISPLAY_NAME, + "features": tr("OmniVoice features"), + "use_case": tr("OmniVoice use case"), + "registration": None + }, "doubaotts": { "title": tr("Doubao TTS"), "features": tr("Doubao TTS features"), @@ -546,6 +577,8 @@ def render_tts_settings(tr): render_indextts_tts_settings(tr) elif selected_engine == config.INDEXTTS2_ENGINE: render_indextts2_tts_settings(tr) + elif selected_engine == config.OMNIVOICE_ENGINE: + render_omnivoice_tts_settings(tr) elif selected_engine == "doubaotts": render_doubaotts_settings(tr) @@ -1274,6 +1307,148 @@ def render_indextts2_tts_settings(tr): st.session_state['voice_pitch'] = 1.0 +def render_omnivoice_tts_settings(tr): + """渲染 OmniVoice TTS 设置""" + omnivoice_config = config.omnivoice + + api_url = st.text_input( + tr("API URL"), + value=omnivoice_config.get("api_url", "http://127.0.0.1:7866/tts"), + help=tr("OmniVoice API URL Help"), + ) + + language = st.text_input( + tr("OmniVoice Language Code"), + 
value=omnivoice_config.get("language", "zh"), + help=tr("OmniVoice Language Code Help"), + placeholder="zh", + ) + + mode_options = [ + ("auto", tr("OmniVoice Mode Auto")), + ("voice_design", tr("OmniVoice Mode Voice Design")), + ("voice_clone", tr("OmniVoice Mode Voice Clone")), + ] + mode_values = [item[0] for item in mode_options] + saved_mode = omnivoice_config.get("mode", "auto") + if saved_mode not in mode_values: + saved_mode = "auto" + + mode = mode_options[st.selectbox( + tr("OmniVoice Generation Mode"), + options=range(len(mode_options)), + index=mode_values.index(saved_mode), + format_func=lambda x: mode_options[x][1], + help=tr("OmniVoice Generation Mode Help"), + )][0] + + instruct = omnivoice_config.get("instruct", "") + reference_audio_source = omnivoice_config.get("reference_audio_source", "resource") + reference_audio = omnivoice_config.get("reference_audio", "") + ref_text = omnivoice_config.get("ref_text", "") + + if mode == "voice_design": + instruct = st.text_area( + tr("OmniVoice Instruct"), + value=instruct, + help=tr("OmniVoice Instruct Help"), + placeholder=tr("OmniVoice Instruct Placeholder"), + height=80, + ) + elif mode == "voice_clone": + reference_audio_source, reference_audio = render_indextts_reference_audio_selector( + tr, + omnivoice_config, + "omnivoice", + ) + ref_text = st.text_area( + tr("OmniVoice Reference Text"), + value=ref_text, + help=tr("OmniVoice Reference Text Help"), + placeholder=tr("OmniVoice Reference Text Placeholder"), + height=90, + ) + + with st.expander(tr("Advanced Parameters"), expanded=False): + col1, col2 = st.columns(2) + with col1: + num_step = st.slider( + "Num Step", + min_value=4, + max_value=64, + value=int(omnivoice_config.get("num_step", 32)), + step=1, + help=tr("OmniVoice Num Step Help"), + ) + guidance_scale = st.slider( + "Guidance Scale", + min_value=0.1, + max_value=10.0, + value=float(omnivoice_config.get("guidance_scale", 2.0)), + step=0.1, + help=tr("OmniVoice Guidance Scale Help"), + ) + 
voice_rate = st.slider( + tr("Voice Rate"), + min_value=0.5, + max_value=2.0, + value=float(omnivoice_config.get("speed", 1.0)), + step=0.1, + help=tr("Voice Rate Help 0.5-2.0"), + ) + with col2: + saved_duration = omnivoice_config.get("duration", "") + duration_value = float(saved_duration) if saved_duration not in (None, "") else 0.0 + duration = st.number_input( + tr("OmniVoice Duration"), + min_value=0.0, + max_value=120.0, + value=duration_value, + step=0.5, + help=tr("OmniVoice Duration Help"), + ) + denoise = st.checkbox( + tr("OmniVoice Denoise"), + value=bool(omnivoice_config.get("denoise", True)), + help=tr("OmniVoice Denoise Help"), + ) + postprocess_output = st.checkbox( + tr("OmniVoice Postprocess Output"), + value=bool(omnivoice_config.get("postprocess_output", True)), + help=tr("OmniVoice Postprocess Output Help"), + ) + preprocess_prompt = st.checkbox( + tr("OmniVoice Preprocess Prompt"), + value=bool(omnivoice_config.get("preprocess_prompt", True)), + help=tr("OmniVoice Preprocess Prompt Help"), + ) + + with st.expander(tr("OmniVoice Usage Instructions Title"), expanded=False): + st.markdown(tr("OmniVoice Usage Instructions")) + + config.omnivoice["api_url"] = api_url + config.omnivoice["language"] = language + config.omnivoice["mode"] = mode + config.omnivoice["instruct"] = instruct + config.omnivoice["reference_audio_source"] = reference_audio_source + config.omnivoice["reference_audio"] = reference_audio + config.omnivoice["ref_text"] = ref_text + config.omnivoice["num_step"] = num_step + config.omnivoice["guidance_scale"] = guidance_scale + config.omnivoice["speed"] = voice_rate + config.omnivoice["duration"] = duration if duration > 0 else "" + config.omnivoice["denoise"] = denoise + config.omnivoice["postprocess_output"] = postprocess_output + config.omnivoice["preprocess_prompt"] = preprocess_prompt + + if mode == "voice_clone" and reference_audio: + config.ui["voice_name"] = f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}" + else: + 
config.ui["voice_name"] = f"{config.OMNIVOICE_VOICE_PREFIX}{mode}" + st.session_state["voice_rate"] = voice_rate + st.session_state["voice_pitch"] = 1.0 + + def render_doubaotts_settings(tr): """渲染豆包语音 TTS 设置""" # AK 输入 @@ -1567,6 +1742,15 @@ def render_voice_preview_new(tr, selected_engine): voice_name = f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}" voice_rate = 1.0 # IndexTTS-2 使用自身生成参数 voice_pitch = 1.0 + elif selected_engine == config.OMNIVOICE_ENGINE: + mode = config.omnivoice.get("mode", "auto") + reference_audio = config.omnivoice.get("reference_audio", "") + if mode == "voice_clone" and reference_audio: + voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}" + else: + voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{mode}" + voice_rate = config.omnivoice.get("speed", 1.0) + voice_pitch = 1.0 elif selected_engine == "doubaotts": voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming") voice_name = voice_type @@ -1579,7 +1763,11 @@ def render_voice_preview_new(tr, selected_engine): with st.spinner(tr("Synthesizing Voice")): temp_dir = utils.storage_dir("temp", create=True) - audio_format = "audio/wav" if selected_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else "audio/mp3" + audio_format = "audio/wav" if selected_engine in ( + config.INDEXTTS_ENGINE, + config.INDEXTTS2_ENGINE, + config.OMNIVOICE_ENGINE, + ) else "audio/mp3" audio_extension = ".wav" if audio_format == "audio/wav" else ".mp3" audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}{audio_extension}") diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index a8185bc..1ea746c 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -260,6 +260,7 @@ def render_basic_settings(tr): with left_config_panel: render_language_settings(tr) render_proxy_settings(tr) + render_tavily_search_settings(tr) with middle_config_panel: render_vision_llm_settings(tr) # 视觉分析模型设置 @@ -345,6 +346,32 @@ 
def render_proxy_settings(tr): config.ui["jianying_draft_path"] = jianying_draft_path +def render_tavily_search_settings(tr): + """Render Tavily API key settings used by short drama web search.""" + st.subheader(tr("Tavily Search Settings")) + st.markdown( + f"{tr('API Key URL')}: " + "[https://app.tavily.com](https://app.tavily.com)" + ) + + tavily_api_key = st.text_input( + tr("Tavily API Key"), + value=config.app.get("tavily_api_key", ""), + type="password", + help=tr("Tavily API Key Help"), + key="tavily_api_key_input", + ) + + if update_app_config_if_changed("tavily_api_key", str(tavily_api_key or "").strip()): + try: + config.save_config() + st.session_state["tavily_api_key"] = str(tavily_api_key or "").strip() + st.success(tr("Tavily config saved")) + except Exception as e: + st.error(f"{tr('Failed to save config')}: {str(e)}") + logger.error(f"保存 Tavily 配置失败: {str(e)}") + + def test_vision_model_connection(api_key, base_url, model_name, provider, tr): """测试视觉模型连接 diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 9b03457..d8b296e 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -17,7 +17,7 @@ from webui.tools.generate_script_short import generate_script_short from webui.tools.generate_short_summary import analyze_short_drama_plot, generate_script_short_sunmmary -SCRIPT_TABLE_BASE_COLUMNS = ["_id", "timestamp", "picture", "narration", "OST"] +SCRIPT_TABLE_BASE_COLUMNS = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"] VIDEO_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"] VIDEO_GLOB_PATTERNS = [f"*.{suffix}" for suffix in VIDEO_UPLOAD_TYPES] @@ -99,15 +99,24 @@ def _read_subtitle_file(path): return f.read() -def _build_combined_subtitle_content(subtitle_paths): +def _build_combined_subtitle_content(subtitle_paths, video_paths=None): sections = [] subtitle_contents = {} - for subtitle_path in subtitle_paths: + video_paths = 
_normalize_video_paths(video_paths) + for index, subtitle_path in enumerate(subtitle_paths, start=1): if not subtitle_path or not os.path.exists(subtitle_path): continue content = _read_subtitle_file(subtitle_path) subtitle_contents[subtitle_path] = content - sections.append(f"# {os.path.basename(subtitle_path)}\n{content}".strip()) + video_path = video_paths[index - 1] if index <= len(video_paths) else "" + if video_path: + header = ( + f"# 视频 {index}: {os.path.basename(video_path)}\n" + f"字幕文件: {os.path.basename(subtitle_path)}" + ) + else: + header = f"# 视频 {index}\n字幕文件: {os.path.basename(subtitle_path)}" + sections.append(f"{header}\n{content}".strip()) return "\n\n".join(sections), subtitle_contents @@ -120,7 +129,10 @@ def _selected_subtitle_paths(): def _set_subtitle_state(subtitle_paths): subtitle_paths = _normalize_video_paths(subtitle_paths) - subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths) + subtitle_content, subtitle_contents = _build_combined_subtitle_content( + subtitle_paths, + _selected_video_paths(), + ) st.session_state['subtitle_path'] = subtitle_paths[0] if subtitle_paths else None st.session_state['subtitle_paths'] = subtitle_paths st.session_state['subtitle_content'] = subtitle_content if subtitle_content else None @@ -128,6 +140,20 @@ def _set_subtitle_state(subtitle_paths): st.session_state['subtitle_file_processed'] = bool(subtitle_paths) +def _short_drama_plot_analysis_signature(subtitle_paths, video_theme, web_search_enabled, video_paths=None): + theme = str(video_theme or "").strip() if web_search_enabled else "" + return json.dumps( + { + "subtitle_paths": _normalize_video_paths(subtitle_paths), + "video_paths": _normalize_video_paths(video_paths), + "video_theme": theme, + "web_search_enabled": bool(web_search_enabled), + }, + ensure_ascii=False, + sort_keys=True, + ) + + def render_script_panel(tr): """渲染脚本配置面板""" with st.container(border=True): @@ -525,16 +551,71 @@ def short_drama_summary(tr): 
render_fun_asr_transcription(tr) render_subtitle_preview(tr) - current_subtitle_path = st.session_state.get('subtitle_path', '') - plot_analysis_source = st.session_state.get('short_drama_plot_analysis_subtitle_path') - if plot_analysis_source and plot_analysis_source != current_subtitle_path: - st.session_state['short_drama_plot_analysis'] = "" - st.session_state['short_drama_plot_analysis_subtitle_path'] = "" + current_subtitle_paths = _selected_subtitle_paths() + current_subtitle_path = current_subtitle_paths[0] if current_subtitle_paths else '' - name_cols = st.columns([4, 1.2], vertical_alignment="bottom") + st.markdown( + """ + + """, + unsafe_allow_html=True, + ) + + name_cols = st.columns([3.4, 1.1, 2], vertical_alignment="bottom") with name_cols[0]: video_theme = st.text_input(tr("短剧名称")) with name_cols[1]: + web_search_enabled = st.toggle( + tr("联网搜索"), + key="short_drama_web_search_enabled", + help=tr("Enable Web Search Help"), + disabled=not current_subtitle_path, + ) + with name_cols[2]: analyze_plot_clicked = st.button( tr("剧情理解"), key="short_drama_plot_analysis_button", @@ -543,17 +624,37 @@ def short_drama_summary(tr): ) st.session_state['video_theme'] = video_theme + current_signature = _short_drama_plot_analysis_signature( + current_subtitle_paths, + video_theme, + web_search_enabled, + _selected_video_paths(), + ) + saved_signature = st.session_state.get('short_drama_plot_analysis_signature') + legacy_source = st.session_state.get('short_drama_plot_analysis_subtitle_path') + if ( + (saved_signature and saved_signature != current_signature) + or (legacy_source and legacy_source != current_subtitle_path) + ): + st.session_state['short_drama_plot_analysis'] = "" + st.session_state['short_drama_plot_analysis_subtitle_path'] = "" + st.session_state['short_drama_plot_analysis_signature'] = "" + if analyze_plot_clicked: with st.spinner(tr("Analyzing plot...")): plot_analysis = analyze_short_drama_plot( - current_subtitle_path, + current_subtitle_paths, 
st.session_state.get('temperature', 0.7), tr, subtitle_content=st.session_state.get('subtitle_content', ''), + short_name=video_theme, + enable_web_search=web_search_enabled, + video_paths=_selected_video_paths(), ) if plot_analysis: st.session_state['short_drama_plot_analysis'] = plot_analysis st.session_state['short_drama_plot_analysis_subtitle_path'] = current_subtitle_path + st.session_state['short_drama_plot_analysis_signature'] = current_signature st.success(tr("Plot analysis completed")) if st.session_state.get('short_drama_plot_analysis'): @@ -575,7 +676,10 @@ def render_subtitle_preview(tr): subtitle_contents = {} if subtitle_paths and (not subtitle_content or not subtitle_contents): - subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths) + subtitle_content, subtitle_contents = _build_combined_subtitle_content( + subtitle_paths, + _selected_video_paths(), + ) st.session_state['subtitle_content'] = subtitle_content st.session_state['subtitle_contents'] = subtitle_contents @@ -724,7 +828,7 @@ def _normalize_script_table_value(column, value): if _is_blank_table_value(value): return "" - if column in {"_id", "OST"}: + if column in {"_id", "video_id", "OST"}: try: return int(value) except (TypeError, ValueError): @@ -783,6 +887,14 @@ def render_video_script_editor(tr): column_order=column_order, column_config={ "_id": st.column_config.NumberColumn(tr("Script Column ID"), step=1, format="%d", width=52), + "video_id": st.column_config.NumberColumn( + tr("Script Column Video ID"), + min_value=1, + step=1, + format="%d", + width=80, + ), + "video_name": st.column_config.TextColumn(tr("Script Column Video Name"), width=180), "timestamp": st.column_config.TextColumn(tr("Script Column Timestamp"), width=200), "picture": st.column_config.TextColumn(tr("Script Column Picture"), width=320), "narration": st.column_config.TextColumn(tr("Script Column Narration"), width=480), @@ -1057,7 +1169,10 @@ def render_fun_asr_transcription(tr): 
st.error(tr("Fun-ASR failed without subtitle file")) return - subtitle_content, subtitle_contents = _build_combined_subtitle_content(generated_paths) + subtitle_content, subtitle_contents = _build_combined_subtitle_content( + generated_paths, + media_paths, + ) if not subtitle_content.strip(): clear_fun_asr_subtitle_state() st.error(tr("Fun-ASR failed without subtitle file")) @@ -1112,20 +1227,35 @@ def render_script_buttons(tr, params): generate_script_short(tr, params, custom_clips) elif script_path == "summary": # 执行 短剧解说 脚本生成 - subtitle_path = st.session_state.get('subtitle_path') + subtitle_paths = _selected_subtitle_paths() + subtitle_path = subtitle_paths[0] if subtitle_paths else None video_theme = st.session_state.get('video_theme') temperature = st.session_state.get('temperature') + web_search_enabled = bool(st.session_state.get('short_drama_web_search_enabled', False)) + current_signature = _short_drama_plot_analysis_signature( + subtitle_paths, + video_theme, + web_search_enabled, + _selected_video_paths(), + ) plot_analysis = "" - if st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path: + if st.session_state.get('short_drama_plot_analysis_signature') == current_signature: + plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + elif ( + not web_search_enabled + and st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path + ): plot_analysis = st.session_state.get('short_drama_plot_analysis', '') generate_script_short_sunmmary( params, - subtitle_path, + subtitle_paths, video_theme, temperature, tr, plot_analysis=plot_analysis, subtitle_content=st.session_state.get('subtitle_content', ''), + enable_web_search=web_search_enabled, + video_paths=_selected_video_paths(), ) else: load_script(tr, script_path) @@ -1172,6 +1302,8 @@ def save_script_with_validation(tr, video_clip_json_details): example_script = [ { "_id": 1, + "video_id": 1, + "video_name": "1.mp4", "timestamp": 
"00:00:00,600-00:00:07,559", "picture": "工地上,蔡晓艳奋力救人,场面混乱", "narration": "灾后重建,工地上险象环生!泼辣女工蔡晓艳挺身而出,救人第一!", @@ -1179,6 +1311,8 @@ def save_script_with_validation(tr, video_clip_json_details): }, { "_id": 2, + "video_id": 2, + "video_name": "2.mp4", "timestamp": "00:00:08,240-00:00:12,359", "picture": "领导视察,蔡晓艳不屑一顾", "narration": "播放原片4", diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py index f719d5e..96a7a7a 100644 --- a/webui/components/subtitle_settings.py +++ b/webui/components/subtitle_settings.py @@ -604,7 +604,7 @@ def render_font_settings(tr): def is_disabled_subtitle_settings(tts_engine:str)->bool: """是否禁用字幕设置""" - return tts_engine=="soulvoice" or tts_engine=="qwen3_tts" + return tts_engine=="soulvoice" or tts_engine=="qwen3_tts" or tts_engine==config.OMNIVOICE_ENGINE def render_position_settings(tr): """渲染位置设置""" diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 8e3356c..284d9a6 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -15,6 +15,8 @@ "Video script table help": "Edit the full script JSON as a table. You can add or delete rows; saving will validate and write the script file again.", "Raw JSON Preview": "Raw JSON Preview", "Script Column ID": "ID", + "Script Column Video ID": "Video", + "Script Column Video Name": "Video Name", "Script Column Timestamp": "Timestamp", "Script Column Picture": "Picture", "Script Column Narration": "Narration", @@ -286,7 +288,11 @@ "IndexTTS download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS2 features": "A locally or privately deployed IndexTTS-2 voice-cloning engine with emotion control and fuller generation parameters.", "IndexTTS2 use case": "Best for fixed voices, emotional narration, and local speech synthesis workflows that need finer sampling controls. 
Start the IndexTTS-2 API service before use.", + "OmniVoice features": "A locally or privately deployed OmniVoice-Pack multilingual TTS engine with automatic voice generation, voice design, and reference-audio cloning.", + "OmniVoice use case": "Best for local controllable multilingual narration, voice design, or reference-audio cloning. Start the OmniVoice-Pack API service before use.", "Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.", + "Local Deployment": "Local Deployment", + "Cloud Service": "Cloud Service", "Select TTS Engine": "Select TTS Engine", "Select TTS Engine Help": "Choose the text-to-speech engine you want to use.", "TTS Engine Details": "📋 {engine} Details", @@ -413,6 +419,16 @@ "Subtitle calibration succeeded for multiple files": "Subtitle calibration succeeded for {count} files: {files}", "Subtitle calibration failed": "Subtitle calibration failed", "Transcribed subtitles storage hint": "Previously transcribed subtitles are saved in {path}; drag a file from that folder to upload", + "Tavily Search Settings": "Tavily Web Search", + "Tavily API Key": "Tavily API Key", + "Tavily API Key Help": "Used for web search before short drama plot analysis. 
When Web Search is enabled, the app searches plot, character, and episode context by drama name, then combines it with subtitles.", + "Tavily config saved": "Tavily configuration saved", + "联网搜索": "Web Search", + "Enable Web Search Help": "When enabled, plot analysis searches the web with Tavily by short drama name before combining those results with subtitles.", + "Please configure Tavily API Key in Basic Settings": "Please configure the Tavily API Key in Basic Settings first", + "Please enter short drama name before web search": "Please enter the short drama name before enabling web search", + "Searching short drama with Tavily...": "Searching short drama context with Tavily...", + "Tavily search failed": "Tavily search failed", "剧情理解": "Plot Analysis", "剧情理解结果": "Plot Analysis Result", "Analyzing plot...": "Analyzing plot...", @@ -443,6 +459,30 @@ "API URL": "API URL", "IndexTTS API URL Help": "IndexTTS-1.5 API service URL", "IndexTTS2 API URL Help": "IndexTTS-2 API service URL. You can enter the service root or the full /tts endpoint.", + "OmniVoice API URL Help": "OmniVoice-Pack API service URL. You can enter the service root or the full /tts endpoint.", + "OmniVoice Language Code": "Synthesis Language", + "OmniVoice Language Code Help": "The language parameter sent to OmniVoice-Pack, such as zh or en.", + "OmniVoice Generation Mode": "Generation Mode", + "OmniVoice Generation Mode Help": "Automatic voice needs no extra fields; voice design uses an instruction; reference-audio cloning needs reference audio and matching text.", + "OmniVoice Mode Auto": "Automatic Voice", + "OmniVoice Mode Voice Design": "Voice Design", + "OmniVoice Mode Voice Clone": "Reference Audio Clone", + "OmniVoice Instruct": "Voice Instruction", + "OmniVoice Instruct Help": "Describe the desired voice, such as gender, pitch, accent, or style.", + "OmniVoice Instruct Placeholder": "e.g. 
female, low pitch, british accent", + "OmniVoice Reference Text": "Reference Audio Text", + "OmniVoice Reference Text Help": "The exact transcript of the reference audio. Required when the deployed service has ASR disabled.", + "OmniVoice Reference Text Placeholder": "Enter the text spoken in the reference audio", + "OmniVoice Num Step Help": "Diffusion generation steps. Higher values usually improve quality but slow generation.", + "OmniVoice Guidance Scale Help": "Controls how strongly text conditions guide generation.", + "OmniVoice Duration": "Target Duration (seconds)", + "OmniVoice Duration Help": "0 lets the model decide the duration automatically.", + "OmniVoice Denoise": "Enable Denoise", + "OmniVoice Denoise Help": "Ask OmniVoice-Pack to denoise the generated output.", + "OmniVoice Postprocess Output": "Postprocess Output", + "OmniVoice Postprocess Output Help": "Enable OmniVoice-Pack output post-processing.", + "OmniVoice Preprocess Prompt": "Preprocess Text", + "OmniVoice Preprocess Prompt Help": "Enable OmniVoice-Pack text preprocessing.", "Reference Audio Source": "Reference Audio Source", "Reference Audio Source Help": "Choose a reference audio from the resource directory or upload a new one.", "Select from Resource Directory": "Select from Resource Directory", @@ -502,6 +542,8 @@ "Max Mel Tokens Help": "Controls the maximum mel tokens generated in one request. Higher values can produce longer audio.", "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 Usage Instructions", "IndexTTS2 Usage Instructions": "**IndexTTS-2 voice cloning**\n\n1. **Choose a voice**: reuse IndexTTS-1.5 resource audio or upload a reference audio file\n2. **Set API URL**: for example http://192.168.3.6:7863/tts, or enter the service root\n3. **Tune emotion**: speaker is the default; switch to audio, vector, or text when needed\n4. 
**Tune generation**: temperature, top_p, top_k, num_beams, repetition_penalty, and max_mel_tokens are sent directly to the IndexTTS-2 API\n\n**Notes**:\n- Reference audio quality directly affects cloning quality\n- The first request may load the model and take longer\n- CPU deployments are much slower than GPU deployments", + "OmniVoice Usage Instructions Title": "OmniVoice Usage Instructions", + "OmniVoice Usage Instructions": "**OmniVoice-Pack speech synthesis**\n\n1. **Automatic voice**: set the API URL and language, then synthesize directly.\n2. **Voice design**: fill instruct with the desired gender, pitch, accent, or style.\n3. **Reference-audio clone**: upload or choose reference audio and fill its matching transcript.\n\n**Notes**:\n- The default service URL is http://127.0.0.1:7866/tts\n- Reference-audio cloning requires reference text when the service has no ASR model loaded\n- OmniVoice returns WAV audio, and NarratoAI estimates subtitle segment timing from the audio duration", "Volcengine Access Key Help": "Volcengine Access Key", "Volcengine Secret Key Help": "Volcengine Secret Key", "Doubao AppID Help": "Doubao TTS application AppID", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 4b16d7e..76872eb 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -159,6 +159,8 @@ "Video script table help": "在表格中编辑完整脚本 JSON。可新增、删除行;保存时会重新校验并写入脚本文件。", "Raw JSON Preview": "原始 JSON 预览", "Script Column ID": "序号", + "Script Column Video ID": "视频", + "Script Column Video Name": "视频文件", "Script Column Timestamp": "时间戳", "Script Column Picture": "画面描述", "Script Column Narration": "解说台词", @@ -267,7 +269,11 @@ "IndexTTS download link": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS2 features": "本地/私有部署的 IndexTTS-2 语音克隆引擎,支持情感控制和更完整的生成参数。", "IndexTTS2 use case": "适合需要固定音色、情绪化旁白或更细致采样控制的本地语音合成场景。使用前请先启动 IndexTTS-2 API 服务。", + "OmniVoice features": "本地/私有部署的 OmniVoice-Pack 多语种语音合成引擎,支持自动音色、指令音色和参考音频克隆。", + "OmniVoice use case": 
"适合需要本地可控、多语言旁白、音色设计或参考音频克隆的场景。使用前请先启动 OmniVoice-Pack API 服务。", "Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", + "Local Deployment": "本地部署", + "Cloud Service": "云端服务", "Select TTS Engine": "选择 TTS 引擎", "Select TTS Engine Help": "选择您要使用的文本转语音引擎", "TTS Engine Details": "📋 {engine} 详细说明", @@ -395,6 +401,16 @@ "Subtitle calibration succeeded for multiple files": "字幕校准成功,共 {count} 个文件: {files}", "Subtitle calibration failed": "字幕校准失败", "Transcribed subtitles storage hint": "之前转录生成的字幕保存在 {path},可从该目录拖入上传", + "Tavily Search Settings": "Tavily 联网搜索", + "Tavily API Key": "Tavily API Key", + "Tavily API Key Help": "用于短剧剧情理解前的联网检索。开启“联网搜索”后,会先按短剧名称检索剧情、人物和分集信息,再结合字幕分析。", + "Tavily config saved": "Tavily 配置已保存", + "联网搜索": "联网搜索", + "Enable Web Search Help": "开启后,剧情理解会先使用 Tavily 按短剧名称联网检索,再结合检索结果和字幕分析剧情。", + "Please configure Tavily API Key in Basic Settings": "请先在基础设置中配置 Tavily API Key", + "Please enter short drama name before web search": "开启联网搜索前,请先填写短剧名称", + "Searching short drama with Tavily...": "正在使用 Tavily 检索短剧信息...", + "Tavily search failed": "Tavily 检索失败", "剧情理解": "剧情理解", "剧情理解结果": "剧情理解结果", "Analyzing plot...": "正在理解剧情...", @@ -425,6 +441,30 @@ "API URL": "API 地址", "IndexTTS API URL Help": "IndexTTS-1.5 API 服务地址", "IndexTTS2 API URL Help": "IndexTTS-2 API 服务地址,可填写服务根地址或完整 /tts 地址", + "OmniVoice API URL Help": "OmniVoice-Pack API 服务地址,可填写服务根地址或完整 /tts 地址", + "OmniVoice Language Code": "合成语言", + "OmniVoice Language Code Help": "传给 OmniVoice-Pack 的 language 参数,例如 zh、en。", + "OmniVoice Generation Mode": "生成模式", + "OmniVoice Generation Mode Help": "自动音色无需额外参数;指令音色使用描述词;参考音频克隆需要参考音频和对应文本。", + "OmniVoice Mode Auto": "自动音色", + "OmniVoice Mode Voice Design": "指令音色", + "OmniVoice Mode Voice Clone": "参考音频克隆", + "OmniVoice Instruct": "音色指令", + "OmniVoice Instruct Help": "描述希望生成的音色,例如性别、音高、口音或风格。", + "OmniVoice Instruct Placeholder": "例如:female, low pitch, british accent", + "OmniVoice Reference Text": "参考音频文本", + "OmniVoice Reference Text Help": "参考音频对应的逐字文本;当前部署未启用 ASR 
时必须填写。", + "OmniVoice Reference Text Placeholder": "请输入参考音频中实际朗读的内容", + "OmniVoice Num Step Help": "扩散生成步数,值越大通常质量更高但速度更慢。", + "OmniVoice Guidance Scale Help": "控制文本条件的引导强度。", + "OmniVoice Duration": "目标时长(秒)", + "OmniVoice Duration Help": "0 表示由模型自动决定时长。", + "OmniVoice Denoise": "启用降噪", + "OmniVoice Denoise Help": "让 OmniVoice-Pack 对生成结果执行降噪处理。", + "OmniVoice Postprocess Output": "后处理输出", + "OmniVoice Postprocess Output Help": "启用 OmniVoice-Pack 的输出后处理。", + "OmniVoice Preprocess Prompt": "预处理文本", + "OmniVoice Preprocess Prompt Help": "启用 OmniVoice-Pack 的文本预处理。", "Reference Audio Source": "参考音频来源", "Reference Audio Source Help": "选择从资源目录选择参考音频,或上传新的参考音频", "Select from Resource Directory": "从资源目录选择", @@ -484,6 +524,8 @@ "Max Mel Tokens Help": "控制单次生成的最大 mel token 数,值越大可生成更长音频", "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 使用说明", "IndexTTS2 Usage Instructions": "**IndexTTS-2 语音克隆**\n\n1. **选择音色**:复用 IndexTTS-1.5 的资源音频或上传参考音频\n2. **设置 API 地址**:例如 http://192.168.3.6:7863/tts,也可以填写服务根地址\n3. **调整情感参数**:默认使用 speaker,可按需切换到 audio、vector 或 text\n4. **调整生成参数**:temperature、top_p、top_k、num_beams、repetition_penalty 和 max_mel_tokens 会直接传给 IndexTTS-2 接口\n\n**注意事项**:\n- 参考音频质量会直接影响克隆效果\n- 首次请求可能需要加载模型,耗时更长\n- CPU 部署生成速度会明显慢于 GPU", + "OmniVoice Usage Instructions Title": "OmniVoice 使用说明", + "OmniVoice Usage Instructions": "**OmniVoice-Pack 语音合成**\n\n1. **自动音色**:只需要设置 API 地址和语言,可直接合成。\n2. **指令音色**:填写 instruct 描述想要的性别、音高、口音或风格。\n3. 
**参考音频克隆**:上传或选择参考音频,并填写该音频对应文本。\n\n**注意事项**:\n- 当前默认服务地址为 http://127.0.0.1:7866/tts\n- 参考音频克隆在服务未加载 ASR 模型时必须填写参考文本\n- OmniVoice 返回 WAV 音频,系统会按音频时长估算字幕段落", "Volcengine Access Key Help": "火山引擎 Access Key", "Volcengine Secret Key Help": "火山引擎 Secret Key", "Doubao AppID Help": "豆包语音应用 AppID", diff --git a/webui/tools/generate_short_summary.py b/webui/tools/generate_short_summary.py index eb42361..d06431c 100644 --- a/webui/tools/generate_short_summary.py +++ b/webui/tools/generate_short_summary.py @@ -17,12 +17,101 @@ from loguru import logger from app.config import config from app.services.SDE.short_drama_explanation import analyze_subtitle, generate_narration_script from app.services.subtitle_text import read_subtitle_text +from app.services.tavily_search import TavilySearchError, format_search_context, search_short_drama # 导入新的LLM服务模块 - 确保提供商被注册 import app.services.llm # 这会触发提供商注册 from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter import re +def _normalize_paths(paths): + if isinstance(paths, str): + paths = [paths] + if not paths: + return [] + + normalized_paths = [] + seen = set() + for path in paths: + if not isinstance(path, str): + continue + path = path.strip() + if not path or path in seen: + continue + normalized_paths.append(path) + seen.add(path) + return normalized_paths + + +def _build_combined_subtitle_content(subtitle_paths, video_paths=None): + sections = [] + video_paths = _normalize_paths(video_paths) + for index, subtitle_path in enumerate(_normalize_paths(subtitle_paths), start=1): + if not os.path.exists(subtitle_path): + continue + + video_path = video_paths[index - 1] if index <= len(video_paths) else "" + if video_path: + header = ( + f"# 视频 {index}: {os.path.basename(video_path)}\n" + f"字幕文件: {os.path.basename(subtitle_path)}" + ) + else: + header = f"# 视频 {index}\n字幕文件: {os.path.basename(subtitle_path)}" + sections.append(f"{header}\n{read_subtitle_text(subtitle_path).text}".strip()) + + return "\n\n".join(sections) + 
+ +def _coerce_video_id(value): + try: + video_id = int(value) + except (TypeError, ValueError): + return None + return video_id if video_id > 0 else None + + +def _match_video_id_by_name(video_name, video_paths): + video_name = str(video_name or "").strip() + if not video_name: + return None + + for index, video_path in enumerate(video_paths, start=1): + if os.path.basename(video_path) == os.path.basename(video_name): + return index + return None + + +def _normalize_narration_items_video_sources(items, video_paths): + video_paths = _normalize_paths(video_paths) + if not video_paths: + return items + + normalized_items = [] + for item in items: + if not isinstance(item, dict): + normalized_items.append(item) + continue + + item_copy = item.copy() + video_id = _coerce_video_id(item_copy.get("video_id") or item_copy.get("video_index")) + matched_video_id = _match_video_id_by_name( + item_copy.get("video_name") or item_copy.get("source_video"), + video_paths, + ) + if matched_video_id: + video_id = matched_video_id + if video_id is None or video_id > len(video_paths): + logger.warning(f"片段 {item_copy.get('_id')} 未提供有效 video_id,默认使用视频 1") + video_id = 1 + + item_copy["video_id"] = video_id + item_copy["video_name"] = os.path.basename(video_paths[video_id - 1]) + normalized_items.append(item_copy) + + return normalized_items + + def parse_and_fix_json(json_string): """ 解析并修复JSON字符串 @@ -135,12 +224,83 @@ def parse_and_fix_json(json_string): return None -def analyze_short_drama_plot(subtitle_path, temperature, tr=lambda key: key, subtitle_content=None): +def _get_tavily_api_key() -> str: + return ( + st.session_state.get("tavily_api_key") + or config.app.get("tavily_api_key") + or "" + ).strip() + + +def _build_tavily_context(short_name: str, tr=lambda key: key) -> str | None: + short_name = str(short_name or "").strip() + if not short_name: + st.error(tr("Please enter short drama name before web search")) + return None + + api_key = _get_tavily_api_key() + if not 
api_key: + st.error(tr("Please configure Tavily API Key in Basic Settings")) + return None + + try: + search_data = search_short_drama( + short_name, + api_key, + search_depth=config.app.get("tavily_search_depth", "basic"), + max_results=config.app.get("tavily_max_results", 5), + ) + return format_search_context(search_data) + except TavilySearchError as e: + logger.error(f"Tavily 短剧检索失败: {str(e)}") + st.error(f"{tr('Tavily search failed')}: {str(e)}") + return None + except Exception as e: + logger.error(f"Tavily 短剧检索异常: {traceback.format_exc()}") + st.error(f"{tr('Tavily search failed')}: {str(e)}") + return None + + +def _build_plot_analysis_input( + subtitle_content: str, + short_name: str = "", + enable_web_search: bool = False, + tr=lambda key: key, +) -> str | None: + subtitle_content = str(subtitle_content or "").strip() + if not enable_web_search: + return subtitle_content + + tavily_context = _build_tavily_context(short_name, tr) + if tavily_context is None: + return None + + return f"""# 分析补充说明 +请先参考 Tavily 联网检索结果理解短剧名称、人物关系、剧情背景和公开剧情梗概,再结合原始字幕完成剧情理解。 +如果联网检索结果与字幕内容冲突,请以字幕内容为准;时间戳必须只从字幕内容中提取。 + +{tavily_context} + +# 原始字幕 +{subtitle_content}""" + + +def analyze_short_drama_plot( + subtitle_path, + temperature, + tr=lambda key: key, + subtitle_content=None, + short_name: str = "", + enable_web_search: bool = False, + video_paths=None, +): """仅执行短剧字幕剧情理解,返回可编辑的剧情分析文本。""" - if not subtitle_path: + subtitle_paths = _normalize_paths(subtitle_path) + if not subtitle_paths: st.error(tr("Please generate or upload subtitles first")) return None - if not os.path.exists(subtitle_path): + missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)] + if missing_subtitle_paths: st.error(tr("Subtitle file does not exist")) return None @@ -149,19 +309,31 @@ def analyze_short_drama_plot(subtitle_path, temperature, tr=lambda key: key, sub text_model = config.app.get(f'text_{text_provider}_model_name') text_base_url = 
config.app.get(f'text_{text_provider}_base_url') - subtitle_content = str(subtitle_content or "").strip() or read_subtitle_text(subtitle_path).text + subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content( + subtitle_paths, + video_paths, + ) if not subtitle_content: st.error(tr("Subtitle file is empty or unreadable")) return None + plot_analysis_input = _build_plot_analysis_input( + subtitle_content, + short_name=short_name, + enable_web_search=enable_web_search, + tr=tr, + ) + if plot_analysis_input is None: + return None + try: logger.info("使用新的LLM服务架构进行字幕分析") analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) - analysis_result = analyzer.analyze_subtitle(subtitle_content) + analysis_result = analyzer.analyze_subtitle(plot_analysis_input) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") analysis_result = analyze_subtitle( - subtitle_content=subtitle_content, + subtitle_content=plot_analysis_input, api_key=text_api_key, model=text_model, base_url=text_base_url, @@ -186,6 +358,8 @@ def generate_script_short_sunmmary( tr=lambda key: key, plot_analysis=None, subtitle_content=None, + enable_web_search: bool = False, + video_paths=None, ): """ 生成 短剧解说 视频脚本 @@ -204,7 +378,12 @@ def generate_script_short_sunmmary( try: with st.spinner(tr("Generating script...")): - if not params.video_origin_path: + selected_video_paths = _normalize_paths( + video_paths + or getattr(params, "video_origin_paths", []) + or getattr(params, "video_origin_path", "") + ) + if not selected_video_paths: st.error(tr("Please select video file first")) return """ @@ -212,7 +391,9 @@ def generate_script_short_sunmmary( """ update_progress(30, tr("Parsing subtitles...")) # 判断字幕文件是否存在 - if not os.path.exists(subtitle_path): + subtitle_paths = _normalize_paths(subtitle_path) + missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)] + if not subtitle_paths or 
missing_subtitle_paths: st.error(tr("Subtitle file does not exist")) return @@ -225,7 +406,10 @@ def generate_script_short_sunmmary( text_base_url = config.app.get(f'text_{text_provider}_base_url') # 读取字幕文件内容(无论使用哪种实现都需要) - subtitle_content = str(subtitle_content or "").strip() or read_subtitle_text(subtitle_path).text + subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content( + subtitle_paths, + selected_video_paths, + ) if not subtitle_content: st.error(tr("Subtitle file is empty or unreadable")) return @@ -238,16 +422,27 @@ def generate_script_short_sunmmary( "analysis": str(plot_analysis).strip(), } else: + plot_analysis_input = subtitle_content + if enable_web_search: + update_progress(40, tr("Searching short drama with Tavily...")) + plot_analysis_input = _build_plot_analysis_input( + subtitle_content, + short_name=video_theme, + enable_web_search=True, + tr=tr, + ) + if plot_analysis_input is None: + return try: # 优先使用新的LLM服务架构 logger.info("使用新的LLM服务架构进行字幕分析") - analysis_result = analyzer.analyze_subtitle(subtitle_content) + analysis_result = analyzer.analyze_subtitle(plot_analysis_input) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") # 回退到旧的实现 analysis_result = analyze_subtitle( - subtitle_content=subtitle_content, + subtitle_content=plot_analysis_input, api_key=text_api_key, model=text_model, base_url=text_base_url, @@ -320,7 +515,11 @@ def generate_script_short_sunmmary( logger.error(f"JSON结构错误,缺少items字段: {narration_dict}") st.stop() - script = json.dumps(narration_dict['items'], ensure_ascii=False, indent=2) + narration_items = _normalize_narration_items_video_sources( + narration_dict['items'], + selected_video_paths, + ) + script = json.dumps(narration_items, ensure_ascii=False, indent=2) if script is None: st.error(tr("Script generation failed check logs")) From e6e39d2dcdcabab97bdb771a0a74ba04c9f08c13 Mon Sep 17 00:00:00 2001 From: viccy Date: Sun, 7 Jun 2026 17:10:48 +0800 Subject: [PATCH 
14/24] =?UTF-8?q?feat(short-drama):=20=E5=AE=8C=E6=95=B4?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E7=9F=AD=E5=89=A7=E8=A7=A3=E8=AF=B4=E5=89=AA?= =?UTF-8?q?=E8=BE=91=E5=85=A8=E6=B5=81=E7=A8=8B=E5=B9=B6=E6=96=B0=E5=A2=9E?= =?UTF-8?q?LLM=E6=B5=81=E5=BC=8F=E7=94=9F=E6=88=90=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增短剧解说全流程四类提示词模板:解说文案生成、片段规划、文案画面匹配、脚本修复 - 重构原有脚本生成提示词至v2.1,改为基于上游规划片段生成合规解说脚本 - 为LLM基础服务层新增流式文本生成接口,完善OpenAI兼容提供商的流式实现,支持流式回调与推理内容提取 - 重构OpenAI兼容文本提供商的生成逻辑,提取公共参数构建方法 - 新增多语言国际化文案,覆盖解说语言、短剧类型、原片占比等配置项与交互提示 - 新增多套单元测试,覆盖脚本校验、适配器流程、工具函数等模块 - 封装SubtitleAnalyzerAdapter,统一短剧解说脚本生成的整套业务接口 - 新增前端交互所需的解说文案审核相关提示文案 --- app/services/SDE/short_drama_explanation.py | 365 ++++++++++++++- app/services/llm/base.py | 21 + app/services/llm/migration_adapter.py | 273 ++++++++++- .../llm/openai_compatible_provider.py | 157 ++++++- ...test_subtitle_adapter_pipeline_unittest.py | 177 +++++++ app/services/llm/unified_service.py | 31 ++ .../prompts/short_drama_narration/__init__.py | 24 + .../short_drama_narration/narration_copy.py | 88 ++++ .../script_generation.py | 315 +++---------- .../short_drama_narration/script_matching.py | 131 ++++++ .../short_drama_narration/script_repair.py | 96 ++++ .../short_drama_narration/segment_planning.py | 104 +++++ .../short_drama_narration_validation.py | 435 ++++++++++++++++++ ...ort_drama_narration_validation_unittest.py | 290 ++++++++++++ webui/components/script_settings.py | 239 ++++++++-- webui/i18n/en.json | 42 ++ webui/i18n/zh.json | 42 ++ webui/tools/generate_short_summary.py | 298 ++++++++---- .../test_generate_short_summary_unittest.py | 27 ++ 19 files changed, 2737 insertions(+), 418 deletions(-) create mode 100644 app/services/llm/test_subtitle_adapter_pipeline_unittest.py create mode 100644 app/services/prompts/short_drama_narration/narration_copy.py create mode 100644 app/services/prompts/short_drama_narration/script_matching.py create mode 100644 
app/services/prompts/short_drama_narration/script_repair.py create mode 100644 app/services/prompts/short_drama_narration/segment_planning.py create mode 100644 app/services/short_drama_narration_validation.py create mode 100644 app/services/test_short_drama_narration_validation_unittest.py create mode 100644 webui/tools/test_generate_short_summary_unittest.py diff --git a/app/services/SDE/short_drama_explanation.py b/app/services/SDE/short_drama_explanation.py index 4fc2478..6910324 100644 --- a/app/services/SDE/short_drama_explanation.py +++ b/app/services/SDE/short_drama_explanation.py @@ -11,7 +11,7 @@ import os import json import requests -from typing import Dict, Any, Optional +from typing import Dict, Any, Optional, Tuple from loguru import logger from app.config import config from app.utils.utils import get_uuid, storage_dir @@ -363,7 +363,179 @@ class SubtitleAnalyzer: logger.error(f"保存分析结果时发生错误: {str(e)}") return "" - def generate_narration_script(self, short_name: str, plot_analysis: str, subtitle_content: str = "", temperature: float = 0.7) -> Dict[str, Any]: + def _render_prompt(self, name: str, parameters: Dict[str, Any]) -> Tuple[str, Optional[str]]: + prompt = PromptManager.get_prompt( + category="short_drama_narration", + name=name, + parameters=parameters, + ) + prompt_object = PromptManager.get_prompt_object( + category="short_drama_narration", + name=name, + ) + return prompt, prompt_object.get_system_prompt() + + def _generate_json_text( + self, + prompt: str, + system_prompt: Optional[str], + temperature: float, + ) -> Dict[str, Any]: + if self.is_native_gemini: + return self._generate_narration_with_native_gemini(prompt, temperature, system_prompt, json_output=True) + return self._generate_narration_with_openai_compatible(prompt, temperature, system_prompt, json_output=True) + + def _generate_plain_text( + self, + prompt: str, + system_prompt: Optional[str], + temperature: float, + ) -> Dict[str, Any]: + if self.is_native_gemini: + result = 
self._generate_narration_with_native_gemini(prompt, temperature, system_prompt, json_output=False) + else: + result = self._generate_narration_with_openai_compatible(prompt, temperature, system_prompt, json_output=False) + if result.get("status") == "success": + result["narration_copy"] = str(result.get("narration_script", "")).strip() + return result + + def generate_narration_copy( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str = "", + temperature: float = 0.7, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + ) -> Dict[str, Any]: + """生成供用户审核修改的解说正文。""" + try: + prompt, system_prompt = self._render_prompt( + "narration_copy", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "narration_language": narration_language, + }, + ) + return self._generate_plain_text(prompt, system_prompt, temperature) + except Exception as e: + logger.error(f"解说文案正文生成过程中发生错误: {str(e)}") + return { + "status": "error", + "message": str(e), + "temperature": temperature, + } + + def match_narration_copy_to_script( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str, + narration_copy: str, + temperature: float = 0.3, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + original_sound_ratio: int = 30, + ) -> Dict[str, Any]: + """将用户审核后的解说正文匹配到字幕时间戳。""" + try: + prompt, system_prompt = self._render_prompt( + "script_matching", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "narration_copy": narration_copy, + "narration_language": narration_language, + "original_sound_ratio": int(original_sound_ratio), + }, + ) + return self._generate_json_text(prompt, system_prompt, min(float(temperature), 0.3)) + except Exception as e: + logger.error(f"解说文案画面匹配过程中发生错误: {str(e)}") + return { + "status": "error", + "message": str(e), + 
"temperature": temperature, + } + + def plan_narration_segments( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str = "", + temperature: float = 0.3, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + ) -> Dict[str, Any]: + """规划短剧解说片段,只输出片段来源和意图。""" + try: + prompt, system_prompt = self._render_prompt( + "segment_planning", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "narration_language": narration_language, + }, + ) + return self._generate_json_text(prompt, system_prompt, min(float(temperature), 0.3)) + except Exception as e: + logger.error(f"片段规划过程中发生错误: {str(e)}") + return { + "status": "error", + "message": str(e), + "temperature": temperature, + } + + def repair_narration_script( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str, + invalid_script: str, + validation_errors: str, + temperature: float = 0.3, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + ) -> Dict[str, Any]: + """根据确定性校验错误修复解说脚本。""" + try: + prompt, system_prompt = self._render_prompt( + "script_repair", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "invalid_script": invalid_script, + "validation_errors": validation_errors, + "narration_language": narration_language, + }, + ) + return self._generate_json_text(prompt, system_prompt, min(float(temperature), 0.3)) + except Exception as e: + logger.error(f"解说文案修复过程中发生错误: {str(e)}") + return { + "status": "error", + "message": str(e), + "temperature": temperature, + } + + def generate_narration_script( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str = "", + temperature: float = 0.7, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + ) -> Dict[str, Any]: """ 根据剧情分析生成解说文案 @@ -372,28 +544,36 @@ class SubtitleAnalyzer: plot_analysis: 
剧情分析内容 subtitle_content: 原始字幕内容,用于提供准确的时间戳信息 temperature: 生成温度,控制创造性,默认0.7 + narration_language: 解说台词目标语言 Returns: Dict[str, Any]: 包含生成结果的字典 """ try: - # 使用新的提示词管理系统构建提示词 - prompt = PromptManager.get_prompt( - category="short_drama_narration", - name="script_generation", - parameters={ + segment_plan_result = self.plan_narration_segments( + short_name=short_name, + plot_analysis=plot_analysis, + subtitle_content=subtitle_content, + temperature=temperature, + narration_language=narration_language, + drama_genre=drama_genre, + ) + if segment_plan_result["status"] != "success": + return segment_plan_result + + prompt, system_prompt = self._render_prompt( + "script_generation", + { "drama_name": short_name, + "drama_genre": drama_genre, "plot_analysis": plot_analysis, - "subtitle_content": subtitle_content - } + "subtitle_content": subtitle_content, + "segment_plan": segment_plan_result["narration_script"], + "narration_language": narration_language, + }, ) - if self.is_native_gemini: - # 使用原生Gemini API格式 - return self._generate_narration_with_native_gemini(prompt, temperature) - else: - # 使用OpenAI兼容格式 - return self._generate_narration_with_openai_compatible(prompt, temperature) + return self._generate_json_text(prompt, system_prompt, temperature) except Exception as e: logger.error(f"解说文案生成过程中发生错误: {str(e)}") @@ -403,16 +583,35 @@ class SubtitleAnalyzer: "temperature": self.temperature } - def _generate_narration_with_native_gemini(self, prompt: str, temperature: float) -> Dict[str, Any]: + def _generate_narration_with_native_gemini( + self, + prompt: str, + temperature: float, + system_prompt: Optional[str] = None, + json_output: bool = True, + ) -> Dict[str, Any]: """使用原生Gemini API生成解说文案""" try: # 构建原生Gemini API请求数据 # 为了确保JSON输出,在提示词中添加更强的约束 - enhanced_prompt = f"{prompt}\n\n请确保输出严格的JSON格式,不要包含任何其他文字或标记。" + enhanced_prompt = ( + f"{prompt}\n\n请确保输出严格的JSON格式,不要包含任何其他文字或标记。" + if json_output + else prompt + ) payload = { "systemInstruction": { - "parts": [{"text": 
"你是一位专业的短视频解说脚本撰写专家。你必须严格按照JSON格式输出,不能包含任何其他文字、说明或代码块标记。"}] + "parts": [ + { + "text": system_prompt + or ( + "你必须严格按照JSON格式输出,不能包含任何其他文字、说明或代码块标记。" + if json_output + else "你是一位专业的短剧解说文案助手。" + ) + } + ] }, "contents": [{ "parts": [{"text": enhanced_prompt}] @@ -423,7 +622,6 @@ class SubtitleAnalyzer: "topP": 0.95, "maxOutputTokens": 64000, "candidateCount": 1, - "stopSequences": ["```", "注意", "说明"] }, "safetySettings": [ { @@ -444,6 +642,8 @@ class SubtitleAnalyzer: } ] } + if json_output: + payload["generationConfig"]["stopSequences"] = ["```", "注意", "说明"] # 构建请求URL url = f"{self.base_url}/models/{self.model}:generateContent" @@ -523,21 +723,27 @@ class SubtitleAnalyzer: "temperature": temperature } - def _generate_narration_with_openai_compatible(self, prompt: str, temperature: float) -> Dict[str, Any]: + def _generate_narration_with_openai_compatible( + self, + prompt: str, + temperature: float, + system_prompt: Optional[str] = None, + json_output: bool = True, + ) -> Dict[str, Any]: """使用OpenAI兼容API生成解说文案""" try: # 构建OpenAI格式的请求数据 payload = { "model": self.model, "messages": [ - {"role": "system", "content": "你是一位专业的短视频解说脚本撰写专家。"}, + {"role": "system", "content": system_prompt or ("你必须严格按照JSON格式输出。" if json_output else "你是一位专业的短剧解说文案助手。")}, {"role": "user", "content": prompt} ], "temperature": temperature } # 对特定模型添加响应格式设置 - if self.model not in ["deepseek-reasoner"]: + if json_output and self.model not in ["deepseek-reasoner"]: payload["response_format"] = {"type": "json_object"} # 构建请求地址 @@ -691,7 +897,9 @@ def generate_narration_script( temperature: float = 1.0, save_result: bool = False, output_path: Optional[str] = None, - provider: Optional[str] = None + provider: Optional[str] = None, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", ) -> Dict[str, Any]: """ 根据剧情分析生成解说文案的便捷函数 @@ -707,6 +915,7 @@ def generate_narration_script( save_result: 是否保存结果到文件 output_path: 输出文件路径 provider: 提供商类型 + narration_language: 解说台词目标语言 Returns: Dict[str, 
Any]: 包含生成结果的字典 @@ -721,7 +930,14 @@ def generate_narration_script( ) # 生成解说文案 - result = analyzer.generate_narration_script(short_name, plot_analysis, subtitle_content or "", temperature) + result = analyzer.generate_narration_script( + short_name, + plot_analysis, + subtitle_content or "", + temperature, + narration_language, + drama_genre, + ) # 保存结果 if save_result and result["status"] == "success": @@ -730,6 +946,107 @@ def generate_narration_script( return result +def generate_narration_copy( + short_name: str = None, + plot_analysis: str = None, + subtitle_content: str = None, + api_key: Optional[str] = None, + model: Optional[str] = None, + base_url: Optional[str] = None, + temperature: float = 0.7, + provider: Optional[str] = None, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", +) -> Dict[str, Any]: + """生成可供用户审核修改的解说正文。""" + analyzer = SubtitleAnalyzer( + temperature=temperature, + api_key=api_key, + model=model, + base_url=base_url, + provider=provider, + ) + + return analyzer.generate_narration_copy( + short_name=short_name, + plot_analysis=plot_analysis or "", + subtitle_content=subtitle_content or "", + temperature=temperature, + narration_language=narration_language, + drama_genre=drama_genre, + ) + + +def match_narration_copy_to_script( + short_name: str = None, + plot_analysis: str = None, + subtitle_content: str = None, + narration_copy: str = None, + api_key: Optional[str] = None, + model: Optional[str] = None, + base_url: Optional[str] = None, + temperature: float = 0.3, + provider: Optional[str] = None, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + original_sound_ratio: int = 30, +) -> Dict[str, Any]: + """将用户审核后的解说正文匹配到字幕时间戳。""" + analyzer = SubtitleAnalyzer( + temperature=temperature, + api_key=api_key, + model=model, + base_url=base_url, + provider=provider, + ) + + return analyzer.match_narration_copy_to_script( + short_name=short_name, + plot_analysis=plot_analysis or "", + 
subtitle_content=subtitle_content or "", + narration_copy=narration_copy or "", + temperature=temperature, + narration_language=narration_language, + drama_genre=drama_genre, + original_sound_ratio=original_sound_ratio, + ) + + +def repair_narration_script( + short_name: str = None, + plot_analysis: str = None, + subtitle_content: str = None, + invalid_script: str = None, + validation_errors: str = None, + api_key: Optional[str] = None, + model: Optional[str] = None, + base_url: Optional[str] = None, + temperature: float = 0.3, + provider: Optional[str] = None, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", +) -> Dict[str, Any]: + """根据校验错误修复解说文案的便捷函数。""" + analyzer = SubtitleAnalyzer( + temperature=temperature, + api_key=api_key, + model=model, + base_url=base_url, + provider=provider, + ) + + return analyzer.repair_narration_script( + short_name=short_name, + plot_analysis=plot_analysis or "", + subtitle_content=subtitle_content or "", + invalid_script=invalid_script or "", + validation_errors=validation_errors or "", + temperature=temperature, + narration_language=narration_language, + drama_genre=drama_genre, + ) + + if __name__ == '__main__': text_api_key = "skxxxx" text_model = "gemini-2.0-flash" diff --git a/app/services/llm/base.py b/app/services/llm/base.py index 737ceb9..e1a899a 100644 --- a/app/services/llm/base.py +++ b/app/services/llm/base.py @@ -178,6 +178,27 @@ class TextModelProvider(BaseLLMProvider): 生成的文本内容 """ pass + + async def generate_text_stream(self, + prompt: str, + system_prompt: Optional[str] = None, + temperature: float = 1.0, + max_tokens: Optional[int] = None, + response_format: Optional[str] = None, + on_chunk=None, + **kwargs) -> str: + """生成文本内容并尽可能回调流式片段;默认退化为一次性输出。""" + result = await self.generate_text( + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, + max_tokens=max_tokens, + response_format=response_format, + **kwargs, + ) + if on_chunk: + on_chunk({"type": "content", "text": 
result}) + return result def _build_messages(self, prompt: str, system_prompt: Optional[str] = None) -> List[Dict[str, str]]: """构建消息列表""" diff --git a/app/services/llm/migration_adapter.py b/app/services/llm/migration_adapter.py index 7bd5142..96b165f 100644 --- a/app/services/llm/migration_adapter.py +++ b/app/services/llm/migration_adapter.py @@ -225,6 +225,229 @@ class SubtitleAnalyzerAdapter: output = output.strip() return output + + def _render_prompt(self, name: str, parameters: Dict[str, Any]) -> tuple[str, Optional[str]]: + prompt = PromptManager.get_prompt( + category="short_drama_narration", + name=name, + parameters=parameters, + ) + prompt_object = PromptManager.get_prompt_object( + category="short_drama_narration", + name=name, + ) + return prompt, prompt_object.get_system_prompt() + + def _generate_json_text( + self, + prompt: str, + system_prompt: Optional[str], + temperature: float, + stream_callback=None, + ) -> str: + generate_func = ( + UnifiedLLMService.generate_text_stream + if stream_callback + else UnifiedLLMService.generate_text + ) + kwargs = { + "prompt": prompt, + "system_prompt": system_prompt, + "provider": self.provider, + "temperature": temperature, + "response_format": "json", + "api_key": self.api_key, + "api_base": self.base_url, + } + if stream_callback: + kwargs["on_chunk"] = stream_callback + result = self._run_async_safely(generate_func, **kwargs) + return self._clean_json_output(result) + + def _generate_plain_text(self, prompt: str, system_prompt: Optional[str], temperature: float) -> str: + result = self._run_async_safely( + UnifiedLLMService.generate_text, + prompt=prompt, + system_prompt=system_prompt, + provider=self.provider, + temperature=temperature, + api_key=self.api_key, + api_base=self.base_url, + ) + return str(result or "").strip() + + def generate_narration_copy( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str = "", + temperature: float = 0.7, + narration_language: str = "简体中文(中国)", + 
drama_genre: str = "逆袭/复仇", + ) -> Dict[str, Any]: + """Generate editable narration copy before timeline matching.""" + try: + prompt, system_prompt = self._render_prompt( + "narration_copy", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "narration_language": narration_language, + }, + ) + narration_copy = self._generate_plain_text(prompt, system_prompt, temperature) + return { + "status": "success", + "narration_copy": narration_copy, + "model": self.model, + "temperature": temperature, + } + except Exception as e: + logger.error(f"解说文案正文生成失败: {str(e)}") + return { + "status": "error", + "message": str(e), + "temperature": temperature, + } + + def match_narration_copy_to_script( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str, + narration_copy: str, + temperature: float = 0.3, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + original_sound_ratio: int = 30, + stream_callback=None, + ) -> Dict[str, Any]: + """Match reviewed narration copy to source footage and return JSON script.""" + try: + prompt, system_prompt = self._render_prompt( + "script_matching", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "narration_copy": narration_copy, + "narration_language": narration_language, + "original_sound_ratio": int(original_sound_ratio), + }, + ) + narration_script = self._generate_json_text( + prompt, + system_prompt, + min(float(temperature), 0.3), + stream_callback=stream_callback, + ) + return { + "status": "success", + "narration_script": narration_script, + "model": self.model, + "temperature": temperature, + } + except Exception as e: + logger.error(f"解说文案画面匹配失败: {str(e)}") + return { + "status": "error", + "message": str(e), + "temperature": temperature, + } + + def plan_narration_segments( + self, + short_name: str, + plot_analysis: str, 
+ subtitle_content: str = "", + temperature: float = 0.3, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + ) -> str: + """Plan source segments before generating final copy.""" + prompt, system_prompt = self._render_prompt( + "segment_planning", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "narration_language": narration_language, + }, + ) + return self._generate_json_text(prompt, system_prompt, min(float(temperature), 0.3)) + + def generate_narration_script_from_plan( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str, + segment_plan: str, + temperature: float = 0.7, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + ) -> str: + prompt, system_prompt = self._render_prompt( + "script_generation", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "segment_plan": segment_plan, + "narration_language": narration_language, + }, + ) + return self._generate_json_text(prompt, system_prompt, temperature) + + def repair_narration_script( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str, + invalid_script: str, + validation_errors: str, + temperature: float = 0.3, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + stream_callback=None, + ) -> Dict[str, Any]: + """Repair a generated script once after deterministic validation fails.""" + try: + prompt, system_prompt = self._render_prompt( + "script_repair", + { + "drama_name": short_name, + "drama_genre": drama_genre, + "plot_analysis": plot_analysis, + "subtitle_content": subtitle_content, + "invalid_script": invalid_script, + "validation_errors": validation_errors, + "narration_language": narration_language, + }, + ) + repaired_script = self._generate_json_text( + prompt, + system_prompt, + min(float(temperature), 0.3), + 
stream_callback=stream_callback, + ) + return { + "status": "success", + "narration_script": repaired_script, + "model": self.model, + "temperature": temperature, + } + except Exception as e: + logger.error(f"解说文案修复失败: {str(e)}") + return { + "status": "error", + "message": str(e), + "temperature": temperature, + } def analyze_subtitle(self, subtitle_content: str) -> Dict[str, Any]: """ @@ -262,7 +485,15 @@ class SubtitleAnalyzerAdapter: "temperature": 1.0 } - def generate_narration_script(self, short_name: str, plot_analysis: str, subtitle_content: str = "", temperature: float = 0.7) -> Dict[str, Any]: + def generate_narration_script( + self, + short_name: str, + plot_analysis: str, + subtitle_content: str = "", + temperature: float = 0.7, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", + ) -> Dict[str, Any]: """ 生成解说文案 - 兼容原有接口 @@ -271,36 +502,30 @@ class SubtitleAnalyzerAdapter: plot_analysis: 剧情分析内容 subtitle_content: 原始字幕内容,用于提供准确的时间戳信息 temperature: 生成温度 + narration_language: 解说台词目标语言 Returns: 生成结果字典 """ try: - # 使用新的提示词管理系统构建提示词 - prompt = PromptManager.get_prompt( - category="short_drama_narration", - name="script_generation", - parameters={ - "drama_name": short_name, - "plot_analysis": plot_analysis, - "subtitle_content": subtitle_content - } - ) - - # 使用统一服务生成文案 - result = self._run_async_safely( - UnifiedLLMService.generate_text, - prompt=prompt, - system_prompt="你是一位专业的短视频解说脚本撰写专家。", - provider=self.provider, + segment_plan = self.plan_narration_segments( + short_name=short_name, + plot_analysis=plot_analysis, + subtitle_content=subtitle_content, temperature=temperature, - response_format="json", - api_key=self.api_key, - api_base=self.base_url + narration_language=narration_language, + drama_genre=drama_genre, + ) + + cleaned_result = self.generate_narration_script_from_plan( + short_name=short_name, + plot_analysis=plot_analysis, + subtitle_content=subtitle_content, + segment_plan=segment_plan, + temperature=temperature, + 
narration_language=narration_language, + drama_genre=drama_genre, ) - - # 清理JSON输出 - cleaned_result = self._clean_json_output(result) # 新的提示词系统返回的是包含items数组的JSON格式 # 为了保持向后兼容,我们需要直接返回这个JSON字符串 diff --git a/app/services/llm/openai_compatible_provider.py b/app/services/llm/openai_compatible_provider.py index 9a2b183..d469955 100644 --- a/app/services/llm/openai_compatible_provider.py +++ b/app/services/llm/openai_compatible_provider.py @@ -233,25 +233,17 @@ class OpenAICompatibleVisionProvider(_OpenAICompatibleBase, VisionModelProvider) class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider): """OpenAI 兼容文本模型提供商。""" - async def generate_text( + def _build_text_completion_kwargs( self, - prompt: str, - system_prompt: Optional[str] = None, - temperature: float = 1.0, - max_tokens: Optional[int] = None, - response_format: Optional[str] = None, - **kwargs, - ) -> str: - messages = self._build_messages(prompt, system_prompt) + messages: List[Dict[str, str]], + temperature: float, + max_tokens: Optional[int], + response_format: Optional[str], + kwargs: Dict[str, Any], + ) -> Dict[str, Any]: model_name = _normalize_model_name(self.model_name) - - client = self._build_client( - api_key_override=kwargs.get("api_key"), - base_url_override=kwargs.get("api_base"), - timeout_override=config.app.get("llm_text_timeout", 180), - ) - - temperature_override = kwargs.pop("temperature", None) + generation_kwargs = dict(kwargs) + temperature_override = generation_kwargs.pop("temperature", None) if temperature_override is None and temperature != 1.0: temperature_override = temperature @@ -263,12 +255,63 @@ class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider): self._build_chat_completion_options( "text", temperature=temperature_override, - max_tokens=kwargs.pop("max_tokens", max_tokens), - **kwargs, + max_tokens=generation_kwargs.pop("max_tokens", max_tokens), + **generation_kwargs, ) ) if response_format == "json": 
completion_kwargs["response_format"] = {"type": "json_object"} + return completion_kwargs + + @staticmethod + def _emit_stream_chunk(on_chunk, chunk_type: str, text: str): + if not on_chunk or not text: + return + try: + on_chunk({"type": chunk_type, "text": text}) + except Exception as exc: + logger.debug(f"流式回调更新失败: {exc}") + + @staticmethod + def _extract_reasoning_delta(delta: Any) -> str: + if delta is None: + return "" + if hasattr(delta, "reasoning_content"): + value = getattr(delta, "reasoning_content") + if value: + return str(value) + if hasattr(delta, "model_dump"): + data = delta.model_dump(exclude_none=True) + for key in ("reasoning_content", "reasoning", "thinking"): + value = data.get(key) + if value: + return str(value) + return "" + + async def generate_text( + self, + prompt: str, + system_prompt: Optional[str] = None, + temperature: float = 1.0, + max_tokens: Optional[int] = None, + response_format: Optional[str] = None, + **kwargs, + ) -> str: + messages = self._build_messages(prompt, system_prompt) + + client = self._build_client( + api_key_override=kwargs.get("api_key"), + base_url_override=kwargs.get("api_base"), + timeout_override=config.app.get("llm_text_timeout", 180), + ) + + completion_kwargs = self._build_text_completion_kwargs( + messages, + temperature, + max_tokens, + response_format, + kwargs, + ) try: response = await client.chat.completions.create(**completion_kwargs) @@ -306,5 +349,81 @@ class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider): logger.error(f"OpenAI 兼容接口调用失败: {exc}") raise APICallError(f"调用失败: {exc}") + async def generate_text_stream( + self, + prompt: str, + system_prompt: Optional[str] = None, + temperature: float = 1.0, + max_tokens: Optional[int] = None, + response_format: Optional[str] = None, + on_chunk=None, + **kwargs, + ) -> str: + messages = self._build_messages(prompt, system_prompt) + client = self._build_client( + api_key_override=kwargs.get("api_key"), + 
base_url_override=kwargs.get("api_base"), + timeout_override=config.app.get("llm_text_timeout", 180), + ) + completion_kwargs = self._build_text_completion_kwargs( + messages, + temperature, + max_tokens, + response_format, + kwargs, + ) + completion_kwargs["stream"] = True + + async def collect_stream() -> str: + content_parts: List[str] = [] + stream = await client.chat.completions.create(**completion_kwargs) + async for chunk in stream: + if not getattr(chunk, "choices", None): + continue + delta = chunk.choices[0].delta + reasoning_delta = self._extract_reasoning_delta(delta) + if reasoning_delta: + self._emit_stream_chunk(on_chunk, "reasoning", reasoning_delta) + + content_delta = getattr(delta, "content", None) if delta is not None else None + if content_delta: + content_parts.append(content_delta) + self._emit_stream_chunk(on_chunk, "content", content_delta) + + result = "".join(content_parts).strip() + if result: + self._emit_stream_chunk(on_chunk, "done", "") + return result + raise APICallError("OpenAI 兼容接口返回空响应") + + try: + return await collect_stream() + + except OpenAIBadRequestError as exc: + error_msg = str(exc) + if response_format == "json" and _is_response_format_error(error_msg): + logger.warning("目标网关不支持流式 response_format,回退为提示词约束 JSON 输出") + completion_kwargs.pop("response_format", None) + messages[-1]["content"] += "\n\n请确保输出严格的JSON格式,不要包含任何其他文字或标记。" + result = await collect_stream() + return _clean_json_output(result) + + if _is_content_filter_error(error_msg): + raise ContentFilterError(f"内容被安全过滤器阻止: {error_msg}") + raise APICallError(f"请求错误: {error_msg}") + + except OpenAIAuthError as exc: + logger.error(f"OpenAI 兼容接口认证失败: {exc}") + raise AuthenticationError(str(exc)) + except OpenAIRateLimitError as exc: + logger.error(f"OpenAI 兼容接口速率限制: {exc}") + raise RateLimitError(str(exc)) + except OpenAIAPIError as exc: + logger.error(f"OpenAI 兼容接口 API 错误: {exc}") + raise APICallError(f"API 错误: {exc}") + except Exception as exc: + 
logger.error(f"OpenAI 兼容接口流式调用失败: {exc}") + raise APICallError(f"流式调用失败: {exc}") + async def _make_api_call(self, payload: Dict[str, Any]) -> Dict[str, Any]: return payload diff --git a/app/services/llm/test_subtitle_adapter_pipeline_unittest.py b/app/services/llm/test_subtitle_adapter_pipeline_unittest.py new file mode 100644 index 0000000..2245031 --- /dev/null +++ b/app/services/llm/test_subtitle_adapter_pipeline_unittest.py @@ -0,0 +1,177 @@ +import json +import unittest +from unittest import mock + +from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter +from app.services.llm.unified_service import UnifiedLLMService + + +class SubtitleAnalyzerAdapterPipelineTests(unittest.TestCase): + def test_generate_narration_copy_uses_plain_text_prompt_with_selected_type(self): + adapter = SubtitleAnalyzerAdapter( + api_key="sk-test", + model="test-model", + base_url="https://example.test/v1", + provider="openai", + ) + + with mock.patch.object(adapter, "_run_async_safely", return_value="她被家人逼到绝路,反击从这一刻开始。") as call: + result = adapter.generate_narration_copy( + short_name="测试短剧", + plot_analysis="女主被家人误会后反击。", + subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n女主被误会。", + temperature=0.7, + narration_language="简体中文(中国)", + drama_genre="家庭伦理", + ) + + self.assertEqual("success", result["status"]) + self.assertIn("反击", result["narration_copy"]) + self.assertIn("家庭伦理", call.call_args.kwargs["prompt"]) + self.assertNotIn("response_format", call.call_args.kwargs) + + def test_match_narration_copy_to_script_uses_json_prompt_with_selected_type(self): + adapter = SubtitleAnalyzerAdapter( + api_key="sk-test", + model="test-model", + base_url="https://example.test/v1", + provider="openai", + ) + matched = json.dumps( + { + "items": [ + { + "_id": 1, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被家人误会", + "narration": "她被家人逼到绝路,反击从这一刻开始。", + "OST": 0, + } + ] + }, + ensure_ascii=False, + ) + + with 
mock.patch.object(adapter, "_run_async_safely", return_value=matched) as call: + result = adapter.match_narration_copy_to_script( + short_name="测试短剧", + plot_analysis="女主被家人误会后反击。", + subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n女主被误会。", + narration_copy="她被家人逼到绝路,反击从这一刻开始。", + temperature=0.7, + narration_language="简体中文(中国)", + drama_genre="家庭伦理", + original_sound_ratio=60, + ) + + self.assertEqual("success", result["status"]) + self.assertEqual(1, json.loads(result["narration_script"])["items"][0]["_id"]) + self.assertIn("家庭伦理", call.call_args.kwargs["prompt"]) + self.assertIn("60%", call.call_args.kwargs["prompt"]) + self.assertEqual("json", call.call_args.kwargs["response_format"]) + + def test_match_narration_copy_to_script_uses_streaming_when_callback_exists(self): + adapter = SubtitleAnalyzerAdapter( + api_key="sk-test", + model="test-model", + base_url="https://example.test/v1", + provider="openai", + ) + matched = json.dumps({"items": []}, ensure_ascii=False) + + with mock.patch.object(adapter, "_run_async_safely", return_value=matched) as call: + result = adapter.match_narration_copy_to_script( + short_name="测试短剧", + plot_analysis="女主被家人误会后反击。", + subtitle_content="# 视频 1: 1.mp4", + narration_copy="她被家人逼到绝路,反击从这一刻开始。", + stream_callback=lambda _event: None, + ) + + self.assertEqual("success", result["status"]) + self.assertIs(UnifiedLLMService.generate_text_stream, call.call_args.args[0]) + self.assertIn("on_chunk", call.call_args.kwargs) + + def test_generate_narration_script_plans_segments_before_copywriting(self): + adapter = SubtitleAnalyzerAdapter( + api_key="sk-test", + model="test-model", + base_url="https://example.test/v1", + provider="openai", + ) + responses = iter( + [ + json.dumps( + { + "segments": [ + { + "_id": 1, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "OST": 0, + "intent": "开场钩子", + } + ] + }, + ensure_ascii=False, + ), + json.dumps( + { + "items": [ + { + "_id": 1, + 
"video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被误会", + "narration": "她被所有人误会,真正的反击却刚刚开始。", + "OST": 0, + } + ] + }, + ensure_ascii=False, + ), + ] + ) + + with mock.patch.object(adapter, "_run_async_safely", side_effect=lambda *_args, **_kwargs: next(responses)) as call: + result = adapter.generate_narration_script( + short_name="测试短剧", + plot_analysis="女主被误会后反击。", + subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n女主被误会。", + temperature=0.7, + narration_language="简体中文(中国)", + ) + + self.assertEqual("success", result["status"]) + self.assertEqual(2, call.call_count) + self.assertEqual(1, json.loads(result["narration_script"])["items"][0]["_id"]) + + def test_repair_narration_script_returns_repaired_json(self): + adapter = SubtitleAnalyzerAdapter( + api_key="sk-test", + model="test-model", + base_url="https://example.test/v1", + provider="openai", + ) + repaired = json.dumps({"items": []}, ensure_ascii=False) + + with mock.patch.object(adapter, "_run_async_safely", return_value=repaired): + result = adapter.repair_narration_script( + short_name="测试短剧", + plot_analysis="", + subtitle_content="# 视频 1: 1.mp4", + invalid_script="{bad}", + validation_errors="时间戳错误", + narration_language="简体中文(中国)", + ) + + self.assertEqual("success", result["status"]) + self.assertEqual(repaired, result["narration_script"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/app/services/llm/unified_service.py b/app/services/llm/unified_service.py index 63cc48f..071e8da 100644 --- a/app/services/llm/unified_service.py +++ b/app/services/llm/unified_service.py @@ -108,6 +108,37 @@ class UnifiedLLMService: except Exception as e: logger.error(f"文本生成失败: {str(e)}") raise LLMServiceError(f"文本生成失败: {str(e)}") + + @staticmethod + async def generate_text_stream(prompt: str, + system_prompt: Optional[str] = None, + provider: Optional[str] = None, + temperature: float = 1.0, + max_tokens: Optional[int] = None, + 
response_format: Optional[str] = None, + on_chunk=None, + **kwargs) -> str: + """ + 流式生成文本内容;不支持流式的 provider 会退化为一次性返回。 + """ + try: + text_provider = LLMServiceManager.get_text_provider(provider) + result = await text_provider.generate_text_stream( + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, + max_tokens=max_tokens, + response_format=response_format, + on_chunk=on_chunk, + **kwargs + ) + + logger.info(f"流式文本生成完成,生成内容长度: {len(result)} 字符") + return result + + except Exception as e: + logger.error(f"流式文本生成失败: {str(e)}") + raise LLMServiceError(f"流式文本生成失败: {str(e)}") @staticmethod async def generate_narration_script(prompt: str, diff --git a/app/services/prompts/short_drama_narration/__init__.py b/app/services/prompts/short_drama_narration/__init__.py index dfa0171..3cee4de 100644 --- a/app/services/prompts/short_drama_narration/__init__.py +++ b/app/services/prompts/short_drama_narration/__init__.py @@ -10,7 +10,11 @@ """ from .plot_analysis import PlotAnalysisPrompt +from .narration_copy import NarrationCopyPrompt +from .segment_planning import SegmentPlanningPrompt from .script_generation import ScriptGenerationPrompt +from .script_matching import ScriptMatchingPrompt +from .script_repair import ScriptRepairPrompt from ..manager import PromptManager @@ -20,14 +24,34 @@ def register_prompts(): # 注册剧情分析提示词 plot_analysis_prompt = PlotAnalysisPrompt() PromptManager.register_prompt(plot_analysis_prompt, is_default=True) + + # 注册可审核解说文案提示词 + narration_copy_prompt = NarrationCopyPrompt() + PromptManager.register_prompt(narration_copy_prompt, is_default=True) + + # 注册片段规划提示词 + segment_planning_prompt = SegmentPlanningPrompt() + PromptManager.register_prompt(segment_planning_prompt, is_default=True) # 注册解说脚本生成提示词 script_generation_prompt = ScriptGenerationPrompt() PromptManager.register_prompt(script_generation_prompt, is_default=True) + # 注册文案画面匹配提示词 + script_matching_prompt = ScriptMatchingPrompt() + 
PromptManager.register_prompt(script_matching_prompt, is_default=True) + + # 注册解说脚本修复提示词 + script_repair_prompt = ScriptRepairPrompt() + PromptManager.register_prompt(script_repair_prompt, is_default=True) + __all__ = [ "PlotAnalysisPrompt", + "NarrationCopyPrompt", + "SegmentPlanningPrompt", "ScriptGenerationPrompt", + "ScriptMatchingPrompt", + "ScriptRepairPrompt", "register_prompts" ] diff --git a/app/services/prompts/short_drama_narration/narration_copy.py b/app/services/prompts/short_drama_narration/narration_copy.py new file mode 100644 index 0000000..362e0fd --- /dev/null +++ b/app/services/prompts/short_drama_narration/narration_copy.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 短剧解说-解说文案 +@File : narration_copy.py +@Description: 生成可供用户审核修改的短剧解说正文 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class NarrationCopyPrompt(ParameterizedPrompt): + """短剧解说正文生成提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="narration_copy", + category="short_drama_narration", + version="v1.0", + description="基于剧情理解和字幕生成可审核修改的短剧解说正文,不绑定时间戳", + model_type=ModelType.TEXT, + output_format=OutputFormat.TEXT, + tags=["短剧", "解说文案", "爆款开头", "叙事连续性", "用户审核"], + parameters=["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"], + ) + super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "subtitle_content"]) + + self._system_prompt = ( + "你是一位短剧解说文案创作者。你只输出可供用户审核修改的解说正文," + "不要输出JSON、时间戳、编号、标题、解释或Markdown。" + ) + + def get_template(self) -> str: + return """# 短剧解说正文创作任务 + +## 目标 +为短剧《${drama_name}》创作一份可直接给用户审核修改的解说文案正文。此阶段不做画面匹配,不输出时间戳。 + +## 剧情理解材料 + +${plot_analysis} + + +## 原始字幕 + +${subtitle_content} + + +## 输出语言 + +${narration_language} + + +## 用户选择的短剧类型 + +${drama_genre} + + +## 类型写作规则 +必须按用户选择的短剧类型调整表达重点,不要自行改判类型: +- 霸总/甜宠:突出误会、身份差、暧昧拉扯、守护感和情绪反差。 +- 逆袭/复仇:突出羞辱、反击、打脸、身份揭露和爽点升级。 +- 家庭伦理:突出亲情撕扯、秘密、委屈、选择和道德冲突。 +- 
古装/权谋:突出身份、局势、算计、立场和反转。 +- 悬疑/犯罪:突出线索、危机、动机和未揭开的疑问。 +- 都市情感:突出关系裂痕、现实压力、误会和情绪拉扯。 +- 年代/乡村:突出家庭处境、人情压力、生活困境和命运转折。 +- 自定义类型:严格服从用户填写的类型方向。 + +## 开头钩子公式 +开头必须使用“高能反转 + 情绪冲突 + 悬念钩子”: +1. 强身份或强处境:兵王、单亲妈妈、被赶出家门的女人、被全家看不起的人等。 +2. 致命反差:刚立功就被迫退役、刚回家就发现钱被输光、刚结婚就遇到孩子/婆婆阻挠。 +3. 后续悬念:真正的噩梦才开始、他要讨回的不是钱、这段关系真正难的不是相爱。 + +## 写作规则 +1. 必须使用 ${narration_language}。 +2. 严格基于剧情理解和字幕事实,不编造核心情节、身份、结局。 +3. 先写完整故事线,再写金句;不要只堆爆点。 +4. 每句话只表达一个信息点,适合后续按句匹配画面。 +5. 句子尽量短,单句优先 15-35 字;信息复杂时拆成多句。 +6. 每 2-3 句要有明确因果承接,让观众知道为什么从上一幕来到下一幕。 +7. 总长度控制在 300-650 字;短素材取下限,长素材取上限。 +8. 不要使用编号、项目符号、章节标题或括号说明。 + +## 输出要求 +只输出解说正文。不要输出 JSON、时间戳、代码块或任何解释。""" diff --git a/app/services/prompts/short_drama_narration/script_generation.py b/app/services/prompts/short_drama_narration/script_generation.py index 234fc98..955f0c1 100644 --- a/app/services/prompts/short_drama_narration/script_generation.py +++ b/app/services/prompts/short_drama_narration/script_generation.py @@ -19,234 +19,112 @@ class ScriptGenerationPrompt(ParameterizedPrompt): metadata = PromptMetadata( name="script_generation", category="short_drama_narration", - version="v2.0", - description="基于短剧解说创作核心要素,生成高质量解说脚本,包含黄金开场、爽点放大、个性吐槽等专业技巧", + version="v2.1", + description="基于已规划片段生成高质量短剧解说脚本,重点补足剧情承接、因果解释和观众理解路径", model_type=ModelType.TEXT, output_format=OutputFormat.JSON, tags=["短剧", "解说脚本", "文案生成", "原声片段", "黄金开场", "爽点放大", "个性吐槽", "悬念预埋"], - parameters=["drama_name", "plot_analysis", "subtitle_content"] + parameters=[ + "drama_name", + "drama_genre", + "plot_analysis", + "subtitle_content", + "segment_plan", + "narration_language", + ] ) - super().__init__(metadata, required_parameters=["drama_name", "plot_analysis"]) + super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "segment_plan"]) - self._system_prompt = "你是一位顶级的短剧解说up主,精通短视频创作的所有核心技巧。你必须严格按照JSON格式输出,绝不能包含任何其他文字、说明或代码块标记。" + self._system_prompt = ( + "你是一位短剧解说文案写手。你必须严格按照JSON格式输出," + "只能补充picture和narration,不能改动上游片段规划中的_id、video_id、video_name、timestamp和OST。" + ) def 
get_template(self) -> str: - return """# 短剧解说脚本创作任务 + return """# 短剧解说脚本文案生成任务 ## 任务目标 -我是一位专业的短剧解说up主,需要为短剧《${drama_name}》创作一份高质量的解说脚本。目标是让观众在短时间内了解剧情精华,并产生强烈的继续观看欲望。 +为短剧《${drama_name}》生成最终可剪辑解说脚本。片段已经由上游规划完成,你只能补充 picture 和 narration,不能改变片段来源和时间戳。 -## 素材信息 +## 输入材料 ### 剧情概述 ${plot_analysis} +### 已规划片段(必须逐项照抄结构字段) + +${segment_plan} + + ### 原始字幕(含视频编号和精确时间戳) ${subtitle_content} +### 解说台词语言 + +${narration_language} + + +### 用户选择的短剧类型 + +${drama_genre} + + 字幕可能来自多个视频文件。每个字幕分段标题会以“视频 1: 文件名”“视频 2: 文件名”等形式标识来源。 生成脚本时必须把每个片段绑定到对应视频来源,时间戳表示该视频文件内部的局部时间,不是把多个视频拼接后的全局时间。 +所有 OST=0 的 narration 字段必须使用上方指定的解说台词语言输出;不要因为原始字幕是其他语言就切回字幕原语言。 +OST=1 的原声片段 narration 字段必须继续使用“播放原片+序号”格式,不要翻译这个固定标记。 -## 短剧解说创作核心要素 +## 绝对绑定规则 +1. 输出 items 数量、顺序和 _id 必须与 segment_plan 完全一致。 +2. 每个 item 的 _id、video_id、video_name、timestamp、OST 必须逐字复制 segment_plan,不得新增、删除、合并、拆分或改动。 +3. 你只能补充 picture 和 narration 两个字段。 +4. OST=1 的 narration 必须写成“播放原片+_id”,例如 _id 为 5 时写“播放原片5”。 +5. OST=0 的 narration 必须使用 ${narration_language},并严格基于剧情和字幕,不虚构字幕外的具体事件。 -### 1. 黄金开场(3秒法则) -**开头3秒内必须制造强烈钩子,激发"想知道后续发展"的强烈好奇心** -- **悬念设置**:直接抛出最核心的冲突或疑问 - * 示例:"身为一个名声恶臭的政客,他知道自己早晚会被暗杀" - * 技巧:直接定性角色身份和处境,制造紧张感 -- **冲突展示**:展现最激烈的对立关系 - * 示例:"而这一天,就在他刚露头的时候..." - * 技巧:用时间节点强调关键时刻的到来 -- **情感共鸣**:触及观众内心的普遍情感 -- **反转预告**:暗示即将发生的惊人转折 - * 技巧:使用"没想到"、"原来"、"竟然"等词汇预告反转 +## 叙事连续性要求 +- 你必须把每个 OST=0 当成“观众理解剧情的桥”,不能只概括当前画面。 +- 每个 OST=0 narration 要尽量回答:上一段发生了什么、为什么会发展到这一段、这一段带来什么新矛盾。 +- 跨 video_id 或跨时间大跳跃时,OST=0 必须明确补出承接句,例如“可这段婚姻真正难的不是相爱,而是两个孩子和婆婆都还没接纳她”。 +- 原声片段前后的 OST=0 要解释原声的重要性,避免观众只看到对白片段合集。 +- 如果 segment_plan 中有 story_role、intent、transition 字段,必须利用它们组织 narration,但不要把这些字段输出到最终 JSON。 +- 结尾 OST=0 要留下后续阻力或悬念;如果结尾是 OST=1,则前一个 OST=0 必须提前点出这段原声会把矛盾推向哪里。 -### 2. 
主线提炼(去繁就简) -**快节奏解说,速度超越原剧,专注核心主线** -- 舍弃次要情节和配角,只保留推动主线的关键人物 -- 突出核心矛盾冲突,每个片段都要推进主要故事线 -- 快速跳过铺垫,直击剧情要害 -- 确保每个解说片段都有明确的剧情推进作用 -- **转折技巧**:大量使用"而这时"、"就在这时"、"没多久"等时间转折词 +## 开头钩子要求 +- 第一段必须是 OST=0 解说钩子,不能直接播放原片。 +- 开头用“高能反转 + 情绪冲突 + 悬念钩子”:强身份/强处境 + 致命反差 + 后续悬念。 +- 写法示例方向:一个刚立功的兵王,下一秒却被迫脱下军装;他回家的第一天,家里的钱和尊严都被赌桌吞了。 +- 示例只用于理解公式,必须基于当前字幕事实原创,不要夸大到字幕没有的情节。 -### 3. 爽点放大(情绪引爆) -**精准识别剧中"爽点"并用富有感染力的语言放大** -- **主角逆袭**:突出弱者变强、反败为胜的瞬间 -- **反派被打脸**:强调恶人得到报应的痛快感 -- **智商在线**:赞美角色的机智和策略 - * 示例:"豺狼已经提前数日跟踪这名清洁工,并在他身上放了窃听器" - * 技巧:展现角色的深谋远虑和专业能力 -- **情感爆发**:放大感人、愤怒、震撼等强烈情绪 -- 使用激昂语气和富有感染力的词汇调动观众情绪 +## 解说密度与画面节奏 +- OST=0 文案必须能被当前 timestamp 的画面承载,按“解说字数 / 5 = 所需视频秒数”估算。 +- 如果画面只有 6 秒,就不要写 80 字;应压缩到约 30 字,或依赖 segment_plan 选择更长画面。 +- 优先短句,单句只表达一个信息点;不要把人物介绍、前因、反转和悬念全塞进一个短画面。 +- 长信息要拆成多段,每段只承担一个叙事功能,让画面节奏跟上解说。 -### 4. 个性吐槽(增加趣味) -**以观众视角进行犀利点评,体现解说员独特人设** -- 避免单纯复述剧情,要有自己的观点和态度 -- **"上帝视角"分析技巧**: - * 揭示角色内心:"他莫名地笑了一下" - * 分析动机:"豺狼的这几步都是事先算好的" - * 预判后果:"这又会有何代价呢" -- 适当吐槽剧情的套路或角色的愚蠢行为 -- 用幽默、犀利的语言增加观看趣味 -- 站在观众立场,说出观众想说的话 -- **心理活动描述**:深入角色内心,增强代入感 +## 用户选择类型文案规则 +短剧类型由用户手动选择为 ${drama_genre},不得自行改判。必须按对应方向写: +- 霸总/甜宠:突出误会、身份差、暧昧拉扯、守护感和情绪反差。 +- 逆袭/复仇:突出羞辱、反击、打脸、身份揭露和爽点升级。 +- 家庭伦理:突出亲情撕扯、秘密、委屈、选择和道德冲突。 +- 古装/权谋:突出身份、局势、算计、立场和反转。 +- 悬疑/犯罪:突出线索、危机、动机和未揭开的疑问。 +- 都市情感:突出关系裂痕、现实压力、误会和情绪拉扯。 +- 年代/乡村:突出家庭处境、人情压力、生活困境和命运转折。 +- 自定义类型:严格服从用户填写的类型方向。 -### 5. 悬念预埋(引导互动) -**在关键节点和结尾处"卖关子",激发互动欲望** -- 在剧情高潮前停止,留下"接下来会发生什么"的疑问 -- **悬念设置技巧**: - * 问题抛出:"那么,UDC究竟是谁呢?" - * 反转预告:"而从这句话开始,所有的专业、体面和虚伪的平静都将分崩瓦解" - * 时间悬念:"几分钟后..."、"不久之后..." -- 提出引导性问题:"你们觉得他会怎么做?" -- 预告后续精彩:"更劲爆的还在后面" -- 为后续内容预热,激发评论、点赞、关注 +## 文案质量要求 +- 开场片段要有强钩子,直接点出冲突、悬念或情绪爆点。 +- 每段解说优先 25-90 字,具体长度必须服从画面时长;短画面宁可少说,不要密集灌信息。 +- 可以使用“没想到”“可下一秒”“而这时”“真正的问题来了”等短剧转折语,但不要堆砌。 +- picture 要描述画面和人物状态,便于后期识别素材。 +- 少用孤立信息句,多用承接句;不要让观众感觉剧情突然跳场。 +- 不要解释规则,不要输出 Markdown,不要输出代码块。 -### 6. 卡点配合(视听协调) -**考虑文案与画面、音乐的完美结合** -- 在情感高潮处预设BGM卡点 -- 解说节奏要配合画面节奏 -- 重要台词处保留原声,解说适时停顿 -- 追求文案+画面+音乐的协同效应 - -## 专业解说语言技巧 - -### 1. 
氛围营造技巧 -**通过环境和细节描述增强画面感和代入感** -- **环境描述**:"在这个距离,枪声都无法传到那边" -- **细节刻画**:"他的床头有酒,身边的纸碟堆满烟头" -- **氛围渲染**:"黑暗树林里有一间仓房" -- **情绪描述**:"孤独又无助的豺狼,竟在这时露出了反常的一面" - -### 2. 情感词汇运用 -**使用富有感染力的词汇调动观众情绪** -- **紧张感**:"名声恶臭"、"早晚会被暗杀"、"动用军警资源" -- **神秘感**:"尘封的传奇"、"高度机密"、"暗藏玄机" -- **震撼感**:"空前绝后的一枪"、"天衣无缝"、"神不知鬼不觉" -- **悲伤感**:"目光非常悲伤"、"注定永远无法哀悼" - -### 3. 节奏控制技巧 -**通过语言节奏控制观众注意力** -- **快节奏推进**:使用短句,密集信息 -- **慢节奏渲染**:使用长句,详细描述 -- **停顿技巧**:在关键信息前适当停顿 -- **重复强调**:重要信息适当重复 - -## 严格技术要求 - -### 时间戳管理(绝对不能违反) -- **时间戳绝对不能重叠**,确保剪辑后无重复画面 -- **同一个 video_id 内的时间段必须连续且不交叉**,严格按该视频内时间顺序排列 -- **跨视频可以切换 video_id**,但每个时间戳都必须来自对应视频字幕分段 -- **每个时间戳都必须在对应视频的原始字幕中找到对应范围** -- 可以拆分原时间片段,但必须保持时间连续性 -- 时间戳的格式必须与原始字幕中的格式完全一致 - -### 多视频来源规范(多集/多文件必须遵守) -- **video_id**:必须填写,取字幕分段标题里的视频编号,例如“视频 3”就填 3 -- **video_name**:必须填写对应的视频文件名,例如“3_20260607002212.mp4” -- **timestamp**:只填写对应 video_id 内部的时间范围,不要换算成多个视频拼接后的累计时间 -- 如果剧情跨多个视频推进,脚本可以按故事顺序在不同 video_id 之间切换,但不得把视频 2 的时间戳写到 video_id=1 - -### 时长控制(1/3原则) -- **解说视频总长度 = 原视频长度的 1/3** -- 精确控制节奏和密度,既不能过短也不能过长 -- 合理分配解说和原声的时间比例 - -### 剧情连贯性 -- **保持故事逻辑完整**,确保情节发展自然流畅 -- **严格按照时间顺序**,禁止跳跃式叙述 -- **符合因果逻辑**:先发生A,再发生B,A导致B - -## 原声片段使用规范 - -### 原声片段格式要求 -原声片段必须严格按照以下JSON格式: -```json -{ - "_id": 序号, - "video_id": 视频编号, - "video_name": "视频文件名", - "timestamp": "开始时间-结束时间", - "picture": "画面内容描述", - "narration": "播放原片+序号", - "OST": 1 -} -``` - -### 原声片段插入策略 - -#### 1. 关键情绪爆发点 -**在角色强烈情绪表达时必须保留原声,增强观众代入感** -- **愤怒爆发**:角色愤怒咆哮、情绪失控的瞬间 - * 参考:"Come on, you bastard. Reaching."(愤怒对峙) -- **感动落泪**:角色感动哭泣、情感宣泄的时刻 -- **震惊反应**:角色震惊、不敢置信的表情和台词 - * 参考:"Are you sure about that?"(质疑震惊) -- **绝望崩溃**:角色绝望、崩溃的情感表达 - * 参考:"Charles you're scaring me, what's wrong"(恐惧绝望) -- **狂欢庆祝**:角色兴奋、狂欢的情绪高潮 - -#### 2. 重要对白时刻 -**保留推动剧情发展的关键台词和对话** -- **身份揭露**:揭示角色真实身份的重要台词 -- **真相大白**:揭晓谜底、真相的关键对话 -- **情感告白**:爱情告白、情感表达的重要台词 - * 参考:"i'm really not good"(情感表达) -- **威胁警告**:反派威胁、警告的重要对白 - * 参考:"You do not want to make enemies of these people"(威胁警告) -- **决定宣布**:角色做出重要决定的宣告 - -#### 3. 
爽点瞬间 -**在"爽点"时刻保留原声增强痛快感** -- **主角逆袭**:弱者反击、逆转局面的台词 -- **反派被打脸**:恶人得到报应、被揭穿的瞬间 -- **智商碾压**:主角展现智慧、碾压对手的台词 - * 参考:"That is a fucking work of art guys"(技能展示) -- **正义伸张**:正义得到伸张、恶有恶报的时刻 -- **实力展现**:主角展现真实实力、震撼全场 - -#### 4. 悬念节点 -**在制造悬念或揭晓答案的关键时刻保留原声** -- **悬念制造**:制造悬念、留下疑问的台词 -- **答案揭晓**:揭晓答案、解开谜团的对话 -- **转折预告**:暗示即将发生转折的重要台词 -- **危机降临**:危机来临、紧张时刻的对白 - -#### 5. 经典台词时刻 -**保留具有强烈感染力和记忆点的经典台词** -- **哲理感悟**:角色的人生感悟和哲理思考 -- **幽默调侃**:轻松幽默的对话增加趣味性 -- **专业术语**:体现角色专业性的术语和对话 - * 参考:"The scanner will pick up the metal components"(专业解释) -- **情感共鸣**:能引起观众共鸣的经典表达 - -### 原声片段技术规范 - -#### 格式规范 -- **OST字段**:设置为1表示保留原声(解说片段设置为0) -- **narration格式**:严格使用"播放原片+序号"(如"播放原片26") -- **picture字段**:详细描述画面内容,便于后期剪辑参考 -- **时间戳精度**:必须与字幕中的重要对白时间精确匹配 - -#### 比例控制 -- **原声与解说比例**:7:3(原声70%,解说30%) -- **分布均匀**:原声片段要在整个视频中均匀分布 -- **长度适中**:单个原声片段时长控制在3-8秒 -- **衔接自然**:原声片段与解说片段之间衔接自然流畅 - -#### 选择原则 -- **情感优先**:优先选择情感强烈的台词和对话 -- **剧情关键**:必须是推动剧情发展的重要内容 -- **观众共鸣**:选择能引起观众共鸣的经典台词 -- **视听效果**:考虑台词的声音效果和表演张力 -- **代入感强**:选择能让观众产生强烈代入感的对话 - -## 输出格式要求 +## 输出格式 请严格按照以下JSON格式输出,绝不添加任何其他文字、说明或代码块标记: @@ -282,45 +160,4 @@ ${subtitle_content} ] } -## 质量标准 - -### 解说文案要求: -- **字数控制**:每段解说文案80-150字 -- **语言风格**:生动有趣,富有感染力,符合短视频观众喜好 - * 参考风格:"身为一个名声恶臭的政客,他知道自己早晚会被暗杀" - * 直接定性,制造紧张感和代入感 -- **情感调动**:能够有效调动观众情绪,产生代入感 - * 使用"而这时"、"没想到"、"原来"等转折词增强戏剧性 -- **节奏把控**:快节奏但不失条理,紧凑但不混乱 - * 短句推进剧情,长句渲染氛围 - -### 技术规范: -- **解说与原片比例**:3:7(解说30%,原片70%) -- **原声片段标识**:OST=1表示原声,OST=0表示解说 -- **原声格式规范**:narration字段必须使用"播放原片+序号"格式 -- **关键情绪点**:必须保留原片原声,增强观众代入感 -- **视频来源**:每个片段必须包含 video_id 和 video_name,用于定位多个上传视频中的源文件 -- **时间戳精度**:精确到毫秒级别,确保与字幕完美匹配 -- **逻辑连贯性**:严格遵循剧情发展顺序 - -### 创作原则: -1. **只输出JSON内容**,不要任何说明性文字 -2. **严格基于提供的剧情和字幕**,不虚构内容 -3. **突出核心冲突**,舍弃无关细节 -4. **强化观众体验**,始终考虑观看感受 -5. **保持专业水准**,体现解说up主的专业素养 -6. **融入经典解说技巧**: - - 大量使用"上帝视角"分析 - - 适时插入心理活动描述 - - 运用悬念设置和反转技巧 - - 保持强烈的画面感和代入感 - -### 参考解说风格示例: -- **开场悬念**:"身为一个名声恶臭的政客,他知道自己早晚会被暗杀" -- **转折技巧**:"而这一天,就在他刚露头的时候..." 
-- **上帝视角**:"豺狼已经提前数日跟踪这名清洁工" -- **情感渲染**:"孤独又无助的豺狼,竟在这时露出了反常的一面" -- **悬念设置**:"那么,UDC究竟是谁呢?" -- **反转预告**:"而从这句话开始,所有的专业、体面和虚伪的平静都将分崩瓦解" - 现在请基于以上要求,为短剧《${drama_name}》创作解说脚本:""" diff --git a/app/services/prompts/short_drama_narration/script_matching.py b/app/services/prompts/short_drama_narration/script_matching.py new file mode 100644 index 0000000..61052e5 --- /dev/null +++ b/app/services/prompts/short_drama_narration/script_matching.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 短剧解说-文案画面匹配 +@File : script_matching.py +@Description: 将用户审核后的解说文案匹配到字幕时间戳并生成最终剪辑脚本 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class ScriptMatchingPrompt(ParameterizedPrompt): + """短剧解说文案画面匹配提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="script_matching", + category="short_drama_narration", + version="v1.0", + description="将审核后的解说文案按叙事节奏拆分,并匹配到字幕时间戳生成最终剪辑JSON", + model_type=ModelType.TEXT, + output_format=OutputFormat.JSON, + tags=["短剧", "画面匹配", "剪辑脚本", "时间戳", "用户文案"], + parameters=[ + "drama_name", + "drama_genre", + "plot_analysis", + "subtitle_content", + "narration_copy", + "narration_language", + "original_sound_ratio", + ], + ) + super().__init__( + metadata, + required_parameters=["drama_name", "subtitle_content", "narration_copy"], + ) + + self._system_prompt = ( + "你是一位懂叙事节奏的短剧剪辑师。你必须严格输出JSON," + "核心任务是把用户审核后的解说文案逐句匹配到最合适的原视频字幕时间戳。" + ) + + def get_template(self) -> str: + return """# 短剧解说文案画面匹配任务 + +## 目标 +用户已经审核并修改了解说文案。请根据这份文案和原始字幕,生成最终可剪辑 JSON 脚本。 + +## 剧名 +${drama_name} + +## 剧情理解材料 + +${plot_analysis} + + +## 用户审核后的解说文案 + +${narration_copy} + + +## 原始字幕(含视频编号和局部时间戳) + +${subtitle_content} + + +## 输出语言 + +${narration_language} + + +## 用户选择的短剧类型 + +${drama_genre} + + +## 用户选择的原片占比 + +${original_sound_ratio}% + + +## 匹配流程 +1. 先按句号、问号、感叹号、省略号切分解说文案,得到候选解说句。 +2. 逗号只在明显分割两个动作、场景、观点或描述对象时切分;不要切出没有独立意义的碎片。 +3. 不要求每个候选句都单独输出为 OST=0;可以合并、压缩相邻候选句作为剧情桥段,但不能改变用户文案的核心意思。 +4. 
为每个解说片段寻找最匹配的原始字幕画面,优先选择能表达该句核心含义的画面。 +5. 使用公式估算所需画面时长:所需秒数 = 解说字数 / 5。匹配画面时长尽量接近,误差优先控制在 ±0.5 秒。 +6. 如果一句解说太长,必须拆成多个 OST=0 片段,分别匹配不同或连续画面。 +7. timestamp 必须使用对应 video_id 内部局部时间戳,不得换算为多个视频拼接后的累计时间。 +8. 同一 video_id 内时间段不得交叉或重叠。 +9. 第一段必须是 OST=0 解说钩子,不能直接播放原片。 +10. OST=1 原声片段的总时长占比要尽量接近用户选择的 ${original_sound_ratio}%。这里按最终 items 的 timestamp 总时长估算,不按片段数量估算。 +11. 不要自行判断或改写短剧类型;画面匹配和 picture 描述要服务用户选择的 ${drama_genre} 叙事重点。 + +## 原片占比规则 +- ${original_sound_ratio}% = 0% 时,不要输出 OST=1,全部使用解说承接。 +- ${original_sound_ratio}% 在 10%-30% 时,只保留关键对白、反转、情绪爆发或爽点原声。 +- ${original_sound_ratio}% 在 40%-60% 时,解说负责串联因果,原片负责承载关键场面和对白。 +- ${original_sound_ratio}% 在 70%-90% 时,以原片对白和表演为主,解说只做开场钩子、转场桥和必要补充。 +- 如果原片占比与“第一段必须 OST=0”冲突,优先保证第一段是 OST=0,然后在后续片段提高 OST=1 时长占比。 +- 选择高原片占比时,可以把用户文案合并成更少的 OST=0 桥段,不要为了逐句使用文案而压低原片占比。 + +## 字段规则 +- _id:从 1 开始连续递增。 +- video_id:来自字幕分段标题,例如“视频 2”就填 2。 +- video_name:对应视频文件名,必须从字幕分段标题提取。 +- timestamp:格式为 "HH:MM:SS,mmm-HH:MM:SS,mmm"。 +- picture:描述匹配画面中人物、动作、情绪和场景。 +- narration:OST=0 时填写用户文案片段;OST=1 时填写“播放原片+_id”。 +- OST:解说片段填 0,原声片段填 1。 + +## 输出格式 +只输出严格 JSON: + +{ + "items": [ + { + "_id": 1, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:06,000", + "picture": "主角站在门口,震惊地看着屋内混乱的场面", + "narration": "一个刚立功的兵王,回家的第一天就发现家里四百万被亲爹输光。", + "OST": 0 + } + ] +} + +现在请基于用户审核后的解说文案生成最终剪辑脚本。""" diff --git a/app/services/prompts/short_drama_narration/script_repair.py b/app/services/prompts/short_drama_narration/script_repair.py new file mode 100644 index 0000000..0784c6c --- /dev/null +++ b/app/services/prompts/short_drama_narration/script_repair.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 短剧解说-脚本修复 +@File : script_repair.py +@Description: 短剧解说脚本校验失败后的JSON修复提示词 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class ScriptRepairPrompt(ParameterizedPrompt): + """短剧解说脚本修复提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="script_repair", + 
category="short_drama_narration", + version="v1.0", + description="根据确定性校验错误修复短剧解说脚本JSON,优先修正时间戳、视频来源和格式问题", + model_type=ModelType.TEXT, + output_format=OutputFormat.JSON, + tags=["短剧", "解说脚本", "JSON修复", "时间戳校验", "多视频"], + parameters=[ + "drama_name", + "drama_genre", + "plot_analysis", + "subtitle_content", + "invalid_script", + "validation_errors", + "narration_language", + ], + ) + super().__init__( + metadata, + required_parameters=["drama_name", "subtitle_content", "invalid_script", "validation_errors"], + ) + + self._system_prompt = ( + "你是一位短剧解说脚本JSON修复器。你只能根据校验错误修复JSON," + "必须输出严格JSON,不能输出解释、Markdown或代码块。" + ) + + def get_template(self) -> str: + return """# 短剧解说脚本修复任务 + +## 修复目标 +下面的短剧《${drama_name}》解说脚本未通过剪辑校验。请只根据校验错误和字幕内容修复它,输出一个完整可剪辑的 JSON。 + +## 剧情理解材料 + +${plot_analysis} + + +## 校验错误 + +${validation_errors} + + +## 当前无效脚本 + +${invalid_script} + + +## 可用字幕窗口 + +${subtitle_content} + + +## 解说台词目标语言 + +${narration_language} + + +## 用户选择的短剧类型 + +${drama_genre} + + +## 修复规则 +1. 只输出 JSON,不要任何解释、标题、Markdown 或代码块。 +2. 输出根对象必须是 {"items": [...]}。 +3. 每个 item 必须包含 _id、video_id、video_name、timestamp、picture、narration、OST。 +4. video_id、video_name 和 timestamp 必须来自对应字幕窗口;不得把不同视频的同名时间戳混用。 +5. 同一 video_id 内片段不得交叉或重叠。 +6. OST=1 的 narration 必须是“播放原片+序号”;OST=0 的 narration 必须使用 ${narration_language}。 +7. 禁止连续 3 个或更多 OST=1;必须插入或改写 OST=0 解说片段承接剧情。 +8. 跨 video_id 切换前后不能都是 OST=1;必须至少有一个 OST=0 片段解释场景和剧情为什么切换。 +9. OST=0 narration 要补足因果承接,不要只概括当前画面。 +10. 第一段必须是 OST=0 解说钩子,按“高能反转 + 情绪冲突 + 悬念钩子”写,不要直接播放原片。 +11. OST=0 文案必须匹配画面时长,按“解说字数 / 5 = 所需视频秒数”估算;过密时要缩短文案、延长时间戳或拆成多个片段。 +12. 不要自行改判短剧类型;如需改写 narration,必须按用户选择的 ${drama_genre} 保持表达重点。 +13. 
尽量保留原脚本中没有错误的片段;无法修复的片段可以删除,但剩余片段必须重新按 1 开始编号。 + +请输出修复后的完整 JSON。""" diff --git a/app/services/prompts/short_drama_narration/segment_planning.py b/app/services/prompts/short_drama_narration/segment_planning.py new file mode 100644 index 0000000..da574bf --- /dev/null +++ b/app/services/prompts/short_drama_narration/segment_planning.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 短剧解说-片段规划 +@File : segment_planning.py +@Description: 短剧解说脚本片段规划提示词 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class SegmentPlanningPrompt(ParameterizedPrompt): + """短剧解说片段规划提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="segment_planning", + category="short_drama_narration", + version="v1.1", + description="基于剧情理解和原始字幕规划可剪辑片段,优先保证叙事连续性、跨视频承接和原声解说节奏", + model_type=ModelType.TEXT, + output_format=OutputFormat.JSON, + tags=["短剧", "解说脚本", "片段规划", "时间戳", "多视频", "原声"], + parameters=["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"], + ) + super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "subtitle_content"]) + + self._system_prompt = ( + "你是一位短剧解说剪辑规划师。你的任务是从字幕中选择可剪辑片段," + "必须严格输出JSON,不能写解说文案,不能输出Markdown或额外说明。" + ) + + def get_template(self) -> str: + return """# 短剧解说片段规划任务 + +## 目标 +为短剧《${drama_name}》规划一组可直接剪辑的视频片段。你只负责选片段和标注用途,不写最终解说台词。 + +## 剧情理解材料 + +${plot_analysis} + + +## 原始字幕(含视频编号和局部时间戳) + +${subtitle_content} + + +## 解说台词目标语言 + +${narration_language} + + +## 用户选择的短剧类型 + +${drama_genre} + + +## 叙事规划目标 +你不是在挑精彩片段合集,而是在规划一条观众能顺着看懂的短剧解说故事线。必须先想清楚“人物困境 -> 冲突触发 -> 关系变化 -> 新阻力 -> 悬念”的因果链,再选片段。 + +## 爆款开头钩子规则 +第一段必须是 OST=0 解说开场,不要直接播放原片。开头参考“高能反转 + 情绪冲突 + 悬念钩子”的公式: +- 先给人物一个强身份或强处境:兵王、单亲妈妈、被赶出家门的女人、被全家看不起的赘婿。 +- 再给一个反差冲突:刚立功就被迫退役、刚回家就发现钱被输光、刚结婚就遇到孩子/婆婆阻挠。 +- 最后抛出悬念:真正的噩梦才开始、他要讨回的不是钱、这场婚姻真正难的不是相爱。 +- 不要照抄示例,要基于字幕事实改写成当前剧情自己的钩子。 + +## 规划规则 +1. 只能使用原始字幕中真实存在的视频编号、视频文件名和时间范围。 +2. 
timestamp 必须是对应 video_id 内部的局部时间戳,禁止换算成多个视频拼接后的累计时间。 +3. 同一个 video_id 内的片段不得交叉或重叠;尽量按故事顺序排列。 +4. 每个片段必须推动主线、制造情绪点、承接原声或保留关键对白。 +5. OST=1 表示保留原声,适合关键对白、情绪爆发、身份揭露、反转和爽点;OST=0 表示后续需要配解说。 +6. 原声片段单段优先控制在 3-8 秒;解说片段可以更长,但必须能从字幕范围中定位。 +7. 短剧类型由用户手动选择为 ${drama_genre},不得自行改判;选片段时优先服务该类型的主要看点。 +8. 禁止连续 3 个或更多 OST=1;每 1-2 个原声片段后必须安排 OST=0 解说片段承接剧情。 +9. 跨 video_id 切换前后必须至少有一个 OST=0 片段作为剧情桥段,解释为什么从上一场转到下一场。 +10. 每个 OST=0 片段必须承担明确叙事功能:开场钩子、人物介绍、因果过渡、冲突升级、关系转折、阻力解释、结尾悬念。 +11. 不要跳过关键因果:例如从求婚直接跳到孩子/婆婆阻挠,中间必须用 OST=0 解释“婚姻真正的难题变成家庭接纳”。 +12. 结尾优先选择能留下后续阻力或新矛盾的片段,不要只停在原声对白堆叠上。 +13. 解说画面必须给足时长:按“解说字数 / 5 = 所需视频秒数”预估,短画面不要承载长解说。 +14. OST=0 片段如果需要讲清多层信息,应选择更长的连续画面,或拆成多个 OST=0 片段分别承接。 + +## 输出格式 +只输出严格 JSON: + +{ + "segments": [ + { + "_id": 1, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:05,500", + "OST": 0, + "story_role": "开场钩子", + "intent": "女主被羞辱,制造逆袭期待", + "transition": "从灾后恢复现场切入女主处境,引出她为什么敢和领导硬刚" + } + ] +} + +现在请规划短剧《${drama_name}》的解说片段。""" diff --git a/app/services/short_drama_narration_validation.py b/app/services/short_drama_narration_validation.py new file mode 100644 index 0000000..eb899cc --- /dev/null +++ b/app/services/short_drama_narration_validation.py @@ -0,0 +1,435 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +"""Validation helpers for short drama narration scripts.""" + +from __future__ import annotations + +import os +import re +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple + + +TIMESTAMP_RE = re.compile(r"^\d{2}:\d{2}:\d{2},\d{3}$") +SCRIPT_RANGE_RE = re.compile( + r"^(?P\d{2}:\d{2}:\d{2}[,.]\d{3})-(?P\d{2}:\d{2}:\d{2}[,.]\d{3})$" +) +SRT_RANGE_RE = re.compile( + r"(?P\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*" + r"(?P\d{2}:\d{2}:\d{2}[,.]\d{3})" +) +VIDEO_HEADER_RE = re.compile(r"^#\s*视频\s*(?P\d+)(?:\s*[::]\s*(?P.+?))?\s*$") +NARRATION_CHARS_PER_SECOND = 5.0 +NARRATION_DURATION_TOLERANCE_SECONDS = 0.5 + + +@dataclass(frozen=True) +class SubtitleCue: + video_id: int + 
video_name: str + start_ms: int + end_ms: int + text: str + timestamp: str + + +@dataclass(frozen=True) +class ScriptValidationResult: + valid: bool + errors: List[str] + items: List[Dict[str, Any]] + + +class NarrationScriptValidationError(ValueError): + """Raised when a narration script cannot be made safe for clipping.""" + + +def timestamp_to_ms(timestamp: str) -> int: + value = str(timestamp or "").strip().replace(".", ",") + if not TIMESTAMP_RE.match(value): + raise ValueError(f"时间戳格式错误: {timestamp}") + + hh, mm, rest = value.split(":") + ss, ms = rest.split(",") + return ((int(hh) * 60 + int(mm)) * 60 + int(ss)) * 1000 + int(ms) + + +def ms_to_timestamp(ms: int) -> str: + if ms < 0: + raise ValueError("毫秒时间不能为负数") + + hours, remainder = divmod(ms, 60 * 60 * 1000) + minutes, remainder = divmod(remainder, 60 * 1000) + seconds, millis = divmod(remainder, 1000) + return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}" + + +def parse_script_timestamp_range(timestamp_range: str) -> Tuple[int, int, str]: + value = str(timestamp_range or "").strip().replace(".", ",") + match = SCRIPT_RANGE_RE.match(value) + if not match: + raise ValueError("时间戳格式应为 'HH:MM:SS,mmm-HH:MM:SS,mmm'") + + start = timestamp_to_ms(match.group("start")) + end = timestamp_to_ms(match.group("end")) + return start, end, f"{ms_to_timestamp(start)}-{ms_to_timestamp(end)}" + + +def _normalize_paths(paths: Optional[Iterable[str]]) -> List[str]: + if isinstance(paths, str): + paths = [paths] + if not paths: + return [] + + normalized = [] + for path in paths: + if not isinstance(path, str): + continue + path = path.strip() + if path: + normalized.append(path) + return normalized + + +def _default_video_name(video_id: int, video_paths: Sequence[str]) -> str: + if 1 <= video_id <= len(video_paths): + return os.path.basename(video_paths[video_id - 1]) + return "" + + +def _split_subtitle_sections( + subtitle_content: str, + video_paths: Sequence[str], +) -> List[Tuple[int, str, str]]: + sections: 
List[Tuple[int, str, str]] = [] + current_video_id = 1 + current_video_name = _default_video_name(1, video_paths) + current_lines: List[str] = [] + saw_header = False + + for line in str(subtitle_content or "").splitlines(): + header_match = VIDEO_HEADER_RE.match(line.strip()) + if header_match: + if current_lines or saw_header: + sections.append((current_video_id, current_video_name, "\n".join(current_lines))) + current_lines = [] + + saw_header = True + current_video_id = int(header_match.group("video_id")) + header_video_name = str(header_match.group("video_name") or "").strip() + current_video_name = header_video_name or _default_video_name(current_video_id, video_paths) + continue + + current_lines.append(line) + + if current_lines or not sections: + sections.append((current_video_id, current_video_name, "\n".join(current_lines))) + + return sections + + +def _extract_cues_from_section(video_id: int, video_name: str, section_text: str) -> List[SubtitleCue]: + lines = str(section_text or "").splitlines() + cues: List[SubtitleCue] = [] + index = 0 + + while index < len(lines): + match = SRT_RANGE_RE.search(lines[index]) + if not match: + index += 1 + continue + + start_ms = timestamp_to_ms(match.group("start")) + end_ms = timestamp_to_ms(match.group("end")) + timestamp = f"{ms_to_timestamp(start_ms)}-{ms_to_timestamp(end_ms)}" + index += 1 + + text_lines: List[str] = [] + while index < len(lines) and lines[index].strip(): + text_lines.append(lines[index].strip()) + index += 1 + + cues.append( + SubtitleCue( + video_id=video_id, + video_name=video_name, + start_ms=start_ms, + end_ms=end_ms, + text=" ".join(text_lines).strip(), + timestamp=timestamp, + ) + ) + index += 1 + + return cues + + +def build_subtitle_index(subtitle_content: str, video_paths: Optional[Iterable[str]] = None) -> List[SubtitleCue]: + """Build a per-video subtitle index from combined SRT text.""" + normalized_video_paths = _normalize_paths(video_paths) + cues: List[SubtitleCue] = [] + + for 
video_id, video_name, section_text in _split_subtitle_sections(subtitle_content, normalized_video_paths): + cues.extend(_extract_cues_from_section(video_id, video_name, section_text)) + + return cues + + +def _coerce_positive_int(value: Any) -> Optional[int]: + try: + number = int(value) + except (TypeError, ValueError): + return None + return number if number > 0 else None + + +def _video_id_by_name(video_name: Any, video_paths: Sequence[str]) -> Optional[int]: + normalized_name = os.path.basename(str(video_name or "").strip()) + if not normalized_name: + return None + + for index, path in enumerate(video_paths, start=1): + if os.path.basename(path) == normalized_name: + return index + return None + + +def normalize_script_video_sources( + items: Sequence[Dict[str, Any]], + video_paths: Optional[Iterable[str]] = None, +) -> List[Dict[str, Any]]: + """Normalize video_name from a valid source without inventing video_id.""" + normalized_video_paths = _normalize_paths(video_paths) + normalized_items: List[Dict[str, Any]] = [] + + for raw_item in items: + item = dict(raw_item) + video_id = _coerce_positive_int(item.get("video_id") or item.get("video_index")) + matched_video_id = _video_id_by_name(item.get("video_name") or item.get("source_video"), normalized_video_paths) + if matched_video_id is not None: + video_id = matched_video_id + + if video_id is not None: + item["video_id"] = video_id + if 1 <= video_id <= len(normalized_video_paths): + item["video_name"] = os.path.basename(normalized_video_paths[video_id - 1]) + + normalized_items.append(item) + + return normalized_items + + +def _cues_for_video(cues: Sequence[SubtitleCue], video_id: int) -> List[SubtitleCue]: + return [cue for cue in cues if cue.video_id == video_id] + + +def _range_overlaps_subtitle(cues: Sequence[SubtitleCue], start_ms: int, end_ms: int) -> bool: + return any(start_ms < cue.end_ms and end_ms > cue.start_ms for cue in cues) + + +def _range_within_subtitle_bounds(cues: Sequence[SubtitleCue], 
start_ms: int, end_ms: int) -> bool: + if not cues: + return False + return min(cue.start_ms for cue in cues) <= start_ms and end_ms <= max(cue.end_ms for cue in cues) + + +def _item_ost(item: Dict[str, Any]) -> Optional[int]: + try: + return int(item.get("OST")) + except (TypeError, ValueError): + return None + + +def _item_video_id(item: Dict[str, Any]) -> Optional[int]: + return _coerce_positive_int(item.get("video_id")) + + +def count_narration_chars(text: str) -> int: + """Count visible narration characters for rough TTS/video-duration matching.""" + return len(re.sub(r"\s+", "", str(text or ""))) + + +def max_narration_chars_for_duration(start_ms: int, end_ms: int) -> int: + duration_seconds = max(0.0, (end_ms - start_ms) / 1000) + return max(8, int((duration_seconds + NARRATION_DURATION_TOLERANCE_SECONDS) * NARRATION_CHARS_PER_SECOND)) + + +def _validate_story_continuity(items: Sequence[Dict[str, Any]]) -> List[str]: + """Validate structural continuity rules that affect viewer comprehension.""" + errors: List[str] = [] + consecutive_ost = 0 + previous_item: Optional[Dict[str, Any]] = None + + for index, item in enumerate(items): + if not isinstance(item, dict): + consecutive_ost = 0 + previous_item = None + continue + + item_id = item.get("_id", index + 1) + ost = _item_ost(item) + if index == 0 and ost != 0: + errors.append(f"片段 {item_id} 必须是 OST=0 解说开场钩子,不能直接播放原片") + + if ost == 1: + consecutive_ost += 1 + if consecutive_ost > 2: + errors.append(f"片段 {item_id} 连续原声过多,必须插入 OST=0 解说承接剧情") + else: + consecutive_ost = 0 + + if previous_item is not None: + previous_video_id = _item_video_id(previous_item) + current_video_id = _item_video_id(item) + if ( + previous_video_id is not None + and current_video_id is not None + and previous_video_id != current_video_id + and _item_ost(previous_item) == 1 + and ost == 1 + ): + errors.append( + f"片段 {previous_item.get('_id')} 到片段 {item_id} 跨视频切换缺少 OST=0 解说桥段" + ) + + previous_item = item + + return errors + + +def 
validate_narration_script_items( + items: Any, + subtitle_index: Sequence[SubtitleCue], + video_paths: Optional[Iterable[str]] = None, +) -> ScriptValidationResult: + """Validate final narration items against subtitle/video source constraints.""" + errors: List[str] = [] + if not isinstance(items, list) or not items: + return ScriptValidationResult(False, ["解说脚本 items 必须是非空数组"], []) + + normalized_video_paths = _normalize_paths(video_paths) + normalized_items = normalize_script_video_sources(items, normalized_video_paths) + available_video_ids = {cue.video_id for cue in subtitle_index} + if normalized_video_paths: + available_video_ids.update(range(1, len(normalized_video_paths) + 1)) + + ranges_by_video: Dict[int, List[Tuple[int, int, int]]] = {} + seen_ids = set() + required_fields = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"] + + for index, item in enumerate(normalized_items): + if not isinstance(item, dict): + errors.append(f"第 {index + 1} 个片段必须是对象") + continue + + item_id = item.get("_id", index + 1) + coerced_item_id = _coerce_positive_int(item_id) + if coerced_item_id is None: + errors.append(f"第 {index + 1} 个片段缺少有效 _id") + coerced_item_id = index + 1 + elif coerced_item_id in seen_ids: + errors.append(f"片段 _id={coerced_item_id} 重复") + seen_ids.add(coerced_item_id) + + for field in required_fields: + if field not in item: + errors.append(f"片段 {item_id} 缺少字段 {field}") + + video_id = _coerce_positive_int(item.get("video_id")) + if video_id is None: + errors.append(f"片段 {item_id} 缺少有效 video_id") + continue + + if available_video_ids and video_id not in available_video_ids: + errors.append(f"片段 {item_id} 的 video_id={video_id} 不在已选视频范围内") + + expected_video_name = _default_video_name(video_id, normalized_video_paths) + if expected_video_name and os.path.basename(str(item.get("video_name") or "")) != expected_video_name: + errors.append(f"片段 {item_id} 的 video_name 必须是 {expected_video_name}") + + try: + start_ms, end_ms, 
normalized_timestamp = parse_script_timestamp_range(item.get("timestamp", "")) + item["timestamp"] = normalized_timestamp + except ValueError as exc: + errors.append(f"片段 {item_id}: {exc}") + continue + + if start_ms >= end_ms: + errors.append(f"片段 {item_id} 的开始时间必须早于结束时间") + continue + + video_cues = _cues_for_video(subtitle_index, video_id) + if not _range_within_subtitle_bounds(video_cues, start_ms, end_ms): + errors.append(f"片段 {item_id} 的时间戳不在视频 {video_id} 的字幕范围内") + elif not _range_overlaps_subtitle(video_cues, start_ms, end_ms): + errors.append(f"片段 {item_id} 的时间戳没有命中视频 {video_id} 的字幕内容") + + for text_field in ["picture", "narration"]: + if not isinstance(item.get(text_field), str) or not item[text_field].strip(): + errors.append(f"片段 {item_id} 的 {text_field} 不能为空") + + ost = _item_ost(item) + if item.get("OST") not in [0, 1, 2]: + errors.append(f"片段 {item_id} 的 OST 必须是 0、1 或 2") + if ost == 1 and not str(item.get("narration", "")).startswith("播放原片"): + errors.append(f"片段 {item_id} 是原声片段,narration 必须使用“播放原片+序号”") + if ost == 0: + narration_chars = count_narration_chars(item.get("narration", "")) + max_chars = max_narration_chars_for_duration(start_ms, end_ms) + if narration_chars > max_chars: + duration_seconds = (end_ms - start_ms) / 1000 + errors.append( + f"片段 {item_id} 解说过密:{narration_chars} 字需要约 {narration_chars / NARRATION_CHARS_PER_SECOND:.1f} 秒," + f"但画面只有 {duration_seconds:.1f} 秒,建议不超过 {max_chars} 字或延长画面" + ) + + ranges_by_video.setdefault(video_id, []).append((start_ms, end_ms, coerced_item_id)) + + for video_id, ranges in ranges_by_video.items(): + sorted_ranges = sorted(ranges, key=lambda item: (item[0], item[1], item[2])) + previous_start, previous_end, previous_id = sorted_ranges[0] + for start_ms, end_ms, item_id in sorted_ranges[1:]: + if start_ms < previous_end: + errors.append(f"视频 {video_id} 的片段 {item_id} 与片段 {previous_id} 时间戳重叠") + if end_ms > previous_end: + previous_start, previous_end, previous_id = start_ms, end_ms, item_id + + 
errors.extend(_validate_story_continuity(normalized_items)) + + return ScriptValidationResult(not errors, errors, normalized_items) + + +def require_valid_narration_script_items( + items: Any, + subtitle_index: Sequence[SubtitleCue], + video_paths: Optional[Iterable[str]] = None, +) -> List[Dict[str, Any]]: + result = validate_narration_script_items(items, subtitle_index, video_paths) + if not result.valid: + raise NarrationScriptValidationError("\n".join(result.errors)) + return result.items + + +def summarize_subtitle_window( + subtitle_index: Sequence[SubtitleCue], + max_cues_per_video: int = 80, +) -> str: + """Return compact subtitle context for a repair prompt.""" + lines: List[str] = [] + by_video: Dict[int, List[SubtitleCue]] = {} + for cue in subtitle_index: + by_video.setdefault(cue.video_id, []).append(cue) + + for video_id in sorted(by_video): + cues = by_video[video_id][:max_cues_per_video] + video_name = cues[0].video_name if cues else "" + header = f"# 视频 {video_id}: {video_name}" if video_name else f"# 视频 {video_id}" + lines.append(header) + for cue in cues: + text = cue.text.replace("\n", " ").strip() + lines.append(f"{cue.timestamp} {text}") + if len(by_video[video_id]) > max_cues_per_video: + lines.append(f"... 
已省略 {len(by_video[video_id]) - max_cues_per_video} 条字幕") + + return "\n".join(lines) diff --git a/app/services/test_short_drama_narration_validation_unittest.py b/app/services/test_short_drama_narration_validation_unittest.py new file mode 100644 index 0000000..8b2cebe --- /dev/null +++ b/app/services/test_short_drama_narration_validation_unittest.py @@ -0,0 +1,290 @@ +import unittest + +from app.services.short_drama_narration_validation import ( + build_subtitle_index, + normalize_script_video_sources, + validate_narration_script_items, +) + + +SUBTITLE_CONTENT = """# 视频 1: first.mp4 +字幕文件: first.srt +1 +00:00:01,000 --> 00:00:04,000 +女主被众人误会。 + +2 +00:00:04,000 --> 00:00:08,000 +男主冷眼看着她。 + +# 视频 2: second.mp4 +字幕文件: second.srt +1 +00:00:02,000 --> 00:00:05,000 +女主终于拿出证据。 + +2 +00:00:05,000 --> 00:00:09,000 +众人震惊,反派慌了。 +""" + + +class ShortDramaNarrationValidationTests(unittest.TestCase): + def setUp(self): + self.video_paths = ["/tmp/first.mp4", "/tmp/second.mp4"] + self.subtitle_index = build_subtitle_index(SUBTITLE_CONTENT, self.video_paths) + + def test_build_subtitle_index_preserves_multi_video_sources(self): + self.assertEqual(4, len(self.subtitle_index)) + self.assertEqual({1, 2}, {cue.video_id for cue in self.subtitle_index}) + self.assertEqual("first.mp4", self.subtitle_index[0].video_name) + self.assertEqual("second.mp4", self.subtitle_index[2].video_name) + self.assertEqual("00:00:02,000-00:00:05,000", self.subtitle_index[2].timestamp) + + def test_valid_script_passes_and_normalizes_video_name(self): + items = [ + { + "_id": 1, + "video_id": 1, + "video_name": "wrong-name.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被误会", + "narration": "她被当众误会。", + "OST": 0, + }, + { + "_id": 2, + "video_name": "second.mp4", + "timestamp": "00:00:02,000-00:00:05,000", + "picture": "女主拿出证据", + "narration": "播放原片2", + "OST": 1, + }, + ] + + normalized = normalize_script_video_sources(items, self.video_paths) + result = 
validate_narration_script_items(normalized, self.subtitle_index, self.video_paths) + + self.assertTrue(result.valid, result.errors) + self.assertEqual(2, result.items[1]["video_id"]) + self.assertEqual("second.mp4", result.items[1]["video_name"]) + + def test_invalid_timestamp_and_overlap_fail(self): + items = [ + { + "_id": 1, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:01,000-00:00:05,000", + "picture": "画面", + "narration": "解说", + "OST": 0, + }, + { + "_id": 2, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:04,500-00:00:08,000", + "picture": "画面", + "narration": "解说", + "OST": 0, + }, + { + "_id": 3, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "bad", + "picture": "画面", + "narration": "解说", + "OST": 0, + }, + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertFalse(result.valid) + self.assertTrue(any("重叠" in error for error in result.errors)) + self.assertTrue(any("时间戳格式" in error for error in result.errors)) + + def test_invalid_video_id_does_not_default_to_first_video(self): + items = [ + { + "_id": 1, + "video_id": 99, + "video_name": "missing.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "画面", + "narration": "解说", + "OST": 0, + } + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertFalse(result.valid) + self.assertTrue(any("video_id=99" in error for error in result.errors)) + + def test_out_of_range_timestamp_fails(self): + items = [ + { + "_id": 1, + "video_id": 2, + "video_name": "second.mp4", + "timestamp": "00:00:20,000-00:00:25,000", + "picture": "画面", + "narration": "解说", + "OST": 0, + } + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertFalse(result.valid) + self.assertTrue(any("不在视频 2 的字幕范围内" in error for error in result.errors)) + + def test_three_consecutive_original_audio_segments_fail(self): + items 
= [ + { + "_id": 1, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被误会", + "narration": "她被当众误会。", + "OST": 0, + }, + { + "_id": 2, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:04,000-00:00:05,000", + "picture": "男主看着她", + "narration": "播放原片2", + "OST": 1, + }, + { + "_id": 3, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:05,000-00:00:06,000", + "picture": "男主看着她", + "narration": "播放原片3", + "OST": 1, + }, + { + "_id": 4, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:06,000-00:00:08,000", + "picture": "男主继续观察", + "narration": "播放原片4", + "OST": 1, + }, + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertFalse(result.valid) + self.assertTrue(any("连续原声过多" in error for error in result.errors)) + + def test_cross_video_original_audio_requires_narration_bridge(self): + items = [ + { + "_id": 1, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被误会", + "narration": "她被当众误会。", + "OST": 0, + }, + { + "_id": 2, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:04,000-00:00:08,000", + "picture": "男主看着她", + "narration": "播放原片2", + "OST": 1, + }, + { + "_id": 3, + "video_id": 2, + "video_name": "second.mp4", + "timestamp": "00:00:02,000-00:00:05,000", + "picture": "女主拿出证据", + "narration": "播放原片3", + "OST": 1, + }, + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertFalse(result.valid) + self.assertTrue(any("跨视频切换缺少 OST=0 解说桥段" in error for error in result.errors)) + + def test_cross_video_switch_with_narration_bridge_passes(self): + items = [ + { + "_id": 1, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被误会", + "narration": "她被当众误会。", + "OST": 0, + }, + { + "_id": 2, + "video_id": 2, + "video_name": 
"second.mp4", + "timestamp": "00:00:02,000-00:00:05,000", + "picture": "女主拿出证据", + "narration": "播放原片2", + "OST": 1, + }, + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertTrue(result.valid, result.errors) + + def test_first_segment_must_be_narration_hook(self): + items = [ + { + "_id": 1, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被误会", + "narration": "播放原片1", + "OST": 1, + } + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertFalse(result.valid) + self.assertTrue(any("解说开场钩子" in error for error in result.errors)) + + def test_dense_narration_fails_when_video_duration_is_too_short(self): + items = [ + { + "_id": 1, + "video_id": 1, + "video_name": "first.mp4", + "timestamp": "00:00:01,000-00:00:04,000", + "picture": "女主被误会", + "narration": "她明明什么都没做却被所有人推到风口浪尖只能独自承受委屈", + "OST": 0, + } + ] + + result = validate_narration_script_items(items, self.subtitle_index, self.video_paths) + + self.assertFalse(result.valid) + self.assertTrue(any("解说过密" in error for error in result.errors)) + + +if __name__ == "__main__": + unittest.main() diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index d8b296e..d73eba0 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -14,12 +14,59 @@ from app.services.subtitle_text import decode_subtitle_bytes, read_subtitle_text from app.utils import utils, check_script from webui.tools.generate_script_docu import generate_script_docu from webui.tools.generate_script_short import generate_script_short -from webui.tools.generate_short_summary import analyze_short_drama_plot, generate_script_short_sunmmary +from webui.tools.generate_short_summary import ( + analyze_short_drama_plot, + generate_script_short_sunmmary, + generate_short_drama_narration_copy, +) SCRIPT_TABLE_BASE_COLUMNS = ["_id", 
"video_id", "video_name", "timestamp", "picture", "narration", "OST"] VIDEO_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"] VIDEO_GLOB_PATTERNS = [f"*.{suffix}" for suffix in VIDEO_UPLOAD_TYPES] +SHORT_DRAMA_NARRATION_LANGUAGE_OPTIONS = [ + ("zh-CN", "简体中文(中国)"), + ("en-US", "英语(美国)"), + ("ja-JP", "日语(日本)"), + ("ko-KR", "韩语(韩国)"), + ("fr-FR", "法语(法国)"), + ("de-DE", "德语(德国)"), + ("es-ES", "西班牙语(西班牙)"), + ("pt-BR", "葡萄牙语(巴西)"), + ("ru-RU", "俄语(俄罗斯)"), + ("custom", "自定义"), +] +SHORT_DRAMA_NARRATION_LANGUAGE_VALUES = { + "zh-CN": "简体中文(中国)", + "en-US": "英语(美国)", + "ja-JP": "日语(日本)", + "ko-KR": "韩语(韩国)", + "fr-FR": "法语(法国)", + "de-DE": "德语(德国)", + "es-ES": "西班牙语(西班牙)", + "pt-BR": "葡萄牙语(巴西)", + "ru-RU": "俄语(俄罗斯)", +} +SHORT_DRAMA_TYPE_OPTIONS = [ + ("counterattack", "逆袭/复仇"), + ("ceo_romance", "霸总/甜宠"), + ("family", "家庭伦理"), + ("costume", "古装/权谋"), + ("suspense", "悬疑/犯罪"), + ("urban_emotion", "都市情感"), + ("period_rural", "年代/乡村"), + ("custom", "自定义"), +] +SHORT_DRAMA_TYPE_VALUES = { + "counterattack": "逆袭/复仇", + "ceo_romance": "霸总/甜宠", + "family": "家庭伦理", + "costume": "古装/权谋", + "suspense": "悬疑/犯罪", + "urban_emotion": "都市情感", + "period_rural": "年代/乡村", +} +SHORT_DRAMA_ORIGINAL_SOUND_RATIO_OPTIONS = list(range(0, 100, 10)) def _normalize_video_paths(paths): @@ -154,6 +201,22 @@ def _short_drama_plot_analysis_signature(subtitle_paths, video_theme, web_search ) +def _resolve_short_drama_narration_language(): + selected_language = st.session_state.get('short_drama_narration_language_option', 'zh-CN') + custom_language = str(st.session_state.get('short_drama_custom_narration_language', '') or '').strip() + if selected_language == "custom" and custom_language: + return custom_language + return SHORT_DRAMA_NARRATION_LANGUAGE_VALUES.get(selected_language, "简体中文(中国)") + + +def _resolve_short_drama_type(): + selected_type = st.session_state.get('short_drama_type_option', 'counterattack') + custom_type = str(st.session_state.get('short_drama_custom_type', '') or 
'').strip() + if selected_type == "custom" and custom_type: + return custom_type + return SHORT_DRAMA_TYPE_VALUES.get(selected_type, "逆袭/复仇") + + def render_script_panel(tr): """渲染脚本配置面板""" with st.container(border=True): @@ -1211,41 +1274,136 @@ def render_script_buttons(tr, params): elif script_path == "short": button_name = tr("Generate Short Video Script") elif script_path == "summary": - button_name = tr("生成短剧解说脚本") + button_name = tr("生成剪辑脚本") elif script_path.endswith("json"): button_name = tr("Load Video Script") else: button_name = tr("Please Select Script File") - if st.button(button_name, key="script_action", disabled=not script_path): - if script_path == "auto": - # 执行纪录片视频脚本生成(视频无字幕无配音) - generate_script_docu(params, tr) - elif script_path == "short": - # 执行 短剧混剪 脚本生成 - custom_clips = st.session_state.get('custom_clips') - generate_script_short(tr, params, custom_clips) - elif script_path == "summary": - # 执行 短剧解说 脚本生成 - subtitle_paths = _selected_subtitle_paths() - subtitle_path = subtitle_paths[0] if subtitle_paths else None - video_theme = st.session_state.get('video_theme') - temperature = st.session_state.get('temperature') - web_search_enabled = bool(st.session_state.get('short_drama_web_search_enabled', False)) - current_signature = _short_drama_plot_analysis_signature( - subtitle_paths, - video_theme, - web_search_enabled, - _selected_video_paths(), + if script_path == "summary": + config_cols = st.columns([1.15, 1.15, 0.9, 1.15, 1.15], vertical_alignment="bottom") + with config_cols[0]: + st.selectbox( + tr("短剧类型"), + options=[code for code, _ in SHORT_DRAMA_TYPE_OPTIONS], + format_func=lambda code: tr(dict(SHORT_DRAMA_TYPE_OPTIONS).get(code, code)), + key="short_drama_type_option", ) - plot_analysis = "" - if st.session_state.get('short_drama_plot_analysis_signature') == current_signature: - plot_analysis = st.session_state.get('short_drama_plot_analysis', '') - elif ( - not web_search_enabled - and 
st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path - ): - plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + with config_cols[1]: + custom_type_disabled = ( + st.session_state.get('short_drama_type_option', 'counterattack') != "custom" + ) + st.text_input( + tr("自定义短剧类型"), + key="short_drama_custom_type", + placeholder=tr("例如:豪门虐恋"), + disabled=custom_type_disabled, + ) + with config_cols[2]: + st.selectbox( + tr("原片占比"), + options=SHORT_DRAMA_ORIGINAL_SOUND_RATIO_OPTIONS, + format_func=lambda ratio: f"{ratio}%", + index=SHORT_DRAMA_ORIGINAL_SOUND_RATIO_OPTIONS.index(30), + key="short_drama_original_sound_ratio", + ) + with config_cols[3]: + st.selectbox( + tr("解说语言"), + options=[code for code, _ in SHORT_DRAMA_NARRATION_LANGUAGE_OPTIONS], + format_func=lambda code: tr(dict(SHORT_DRAMA_NARRATION_LANGUAGE_OPTIONS).get(code, code)), + key="short_drama_narration_language_option", + ) + with config_cols[4]: + custom_language_disabled = ( + st.session_state.get('short_drama_narration_language_option', 'zh-CN') != "custom" + ) + st.text_input( + tr("自定义解说语言"), + key="short_drama_custom_narration_language", + placeholder=tr("例如:意大利语(意大利)"), + disabled=custom_language_disabled, + ) + + action_cols = st.columns([1, 1], vertical_alignment="bottom") + with action_cols[0]: + narration_copy_clicked = st.button( + tr("生成解说文案"), + key="short_drama_narration_copy_action", + disabled=not script_path, + use_container_width=True, + ) + with action_cols[1]: + action_clicked = st.button( + button_name, + key="script_action", + disabled=not script_path, + use_container_width=True, + ) + else: + narration_copy_clicked = False + action_clicked = st.button(button_name, key="script_action", disabled=not script_path) + + if script_path == "summary" and (narration_copy_clicked or action_clicked): + narration_language = _resolve_short_drama_narration_language() + drama_genre = _resolve_short_drama_type() + original_sound_ratio = 
int(st.session_state.get('short_drama_original_sound_ratio', 30)) + if ( + st.session_state.get('short_drama_type_option') == "custom" + and not str(st.session_state.get('short_drama_custom_type', '') or '').strip() + ): + st.error(tr("请输入自定义短剧类型")) + st.stop() + if ( + st.session_state.get('short_drama_narration_language_option') == "custom" + and not str(st.session_state.get('short_drama_custom_narration_language', '') or '').strip() + ): + st.error(tr("请输入自定义解说语言")) + st.stop() + + subtitle_paths = _selected_subtitle_paths() + subtitle_path = subtitle_paths[0] if subtitle_paths else None + video_theme = st.session_state.get('video_theme') + temperature = st.session_state.get('temperature') + web_search_enabled = bool(st.session_state.get('short_drama_web_search_enabled', False)) + current_signature = _short_drama_plot_analysis_signature( + subtitle_paths, + video_theme, + web_search_enabled, + _selected_video_paths(), + ) + plot_analysis = "" + if st.session_state.get('short_drama_plot_analysis_signature') == current_signature: + plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + elif ( + not web_search_enabled + and st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path + ): + plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + + if narration_copy_clicked: + with st.spinner(tr("Generating narration copy...")): + copy_result = generate_short_drama_narration_copy( + subtitle_paths, + video_theme, + temperature, + tr, + plot_analysis=plot_analysis, + subtitle_content=st.session_state.get('subtitle_content', ''), + enable_web_search=web_search_enabled, + video_paths=_selected_video_paths(), + narration_language=narration_language, + drama_genre=drama_genre, + ) + if copy_result: + st.session_state['short_drama_narration_copy'] = copy_result["narration_copy"] + if not plot_analysis: + st.session_state['short_drama_plot_analysis'] = copy_result["plot_analysis"] + 
st.session_state['short_drama_plot_analysis_subtitle_path'] = subtitle_path + st.session_state['short_drama_plot_analysis_signature'] = current_signature + st.success(tr("Narration copy generated successfully")) + + if action_clicked: generate_script_short_sunmmary( params, subtitle_paths, @@ -1256,7 +1414,28 @@ def render_script_buttons(tr, params): subtitle_content=st.session_state.get('subtitle_content', ''), enable_web_search=web_search_enabled, video_paths=_selected_video_paths(), + narration_language=narration_language, + narration_copy=st.session_state.get('short_drama_narration_copy', ''), + drama_genre=drama_genre, + original_sound_ratio=original_sound_ratio, ) + + if script_path == "summary": + st.text_area( + tr("短剧解说文案"), + key="short_drama_narration_copy", + height=220, + help=tr("Narration Copy Help"), + ) + + if action_clicked and script_path != "summary": + if script_path == "auto": + # 执行纪录片视频脚本生成(视频无字幕无配音) + generate_script_docu(params, tr) + elif script_path == "short": + # 执行 短剧混剪 脚本生成 + custom_clips = st.session_state.get('custom_clips') + generate_script_short(tr, params, custom_clips) else: load_script(tr, script_path) diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 284d9a6..e4de584 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -266,6 +266,37 @@ "字幕文件内容似乎为空,请检查文件": "The subtitle file appears to be empty. 
Please check the file.", "字幕上传成功": "Subtitle uploaded successfully", "短剧名称": "Short Drama Name", + "解说语言": "Narration Language", + "自定义解说语言": "Custom Narration Language", + "例如:意大利语(意大利)": "For example: Italian (Italy)", + "请输入自定义解说语言": "Please enter a custom narration language", + "简体中文(中国)": "Simplified Chinese (China)", + "英语(美国)": "English (United States)", + "日语(日本)": "Japanese (Japan)", + "韩语(韩国)": "Korean (South Korea)", + "法语(法国)": "French (France)", + "德语(德国)": "German (Germany)", + "西班牙语(西班牙)": "Spanish (Spain)", + "葡萄牙语(巴西)": "Portuguese (Brazil)", + "俄语(俄罗斯)": "Russian (Russia)", + "自定义": "Custom", + "短剧类型": "Short Drama Type", + "自定义短剧类型": "Custom Short Drama Type", + "原片占比": "Original Footage Ratio", + "例如:豪门虐恋": "For example: billionaire angst romance", + "请输入自定义短剧类型": "Please enter a custom short drama type", + "逆袭/复仇": "Counterattack / Revenge", + "霸总/甜宠": "CEO Romance / Sweet Romance", + "家庭伦理": "Family Ethics", + "古装/权谋": "Costume / Power Struggle", + "悬疑/犯罪": "Suspense / Crime", + "都市情感": "Urban Romance", + "年代/乡村": "Period / Rural", + "生成解说文案": "Generate Narration Copy", + "生成剪辑脚本": "Generate Editing Script", + "短剧解说文案": "Short Drama Narration Copy", + "Narration Copy Help": "Generate the narration copy first, review or rewrite it here, then generate the editing script to match footage and timestamps.", + "Narration copy generated successfully": "Narration copy generated. Please review and edit it.", "生成短剧解说脚本": "Generate Short Drama Narration Script", "请输入视频脚本": "Please enter the video script", "TTS engine does not support precise subtitles": "⚠️ {engine} does not support precise subtitle generation", @@ -587,11 +618,22 @@ "Preparing script generation": "Preparing script generation", "Script generation failed check logs": "Script generation failed. 
Please check the logs.", "Parsing subtitles...": "Parsing subtitles...", + "Analyzing subtitles with model...": "Waiting for the model to analyze subtitles...", "Subtitle file does not exist": "Subtitle file does not exist", "Subtitle file is empty or unreadable": "Subtitle file is empty or unreadable", "Generating narration copy...": "Generating narration copy...", + "Generated narration copy is empty": "The generated narration copy is empty", + "Please generate and review narration copy first": "Please generate and review the narration copy first", + "Matching narration copy to footage...": "Matching narration copy to footage and timestamps...", + "Waiting for model stream...": "Waiting for model stream...", + "Streaming unavailable fallback waiting...": "Streaming is unavailable for this request. Waiting for the full response...", + "LLM stream window title": "Model reasoning / output stream", + "Model reasoning stream": "[Model reasoning]", + "Model output preview": "[Model output preview]", + "Repairing narration script...": "Repairing narration script...", "Generated narration JSON parse failed": "The generated narration format is invalid and could not be parsed as JSON", "Generated narration missing items field": "The generated narration is missing the required 'items' field", + "Generated narration validation failed": "The generated narration script failed validation", "Preparing output...": "Preparing output..." 
} } diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 76872eb..e400adb 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -561,6 +561,37 @@ "字幕文件内容似乎为空,请检查文件": "字幕文件内容似乎为空,请检查文件", "字幕上传成功": "字幕上传成功", "短剧名称": "短剧名称", + "解说语言": "解说语言", + "自定义解说语言": "自定义解说语言", + "例如:意大利语(意大利)": "例如:意大利语(意大利)", + "请输入自定义解说语言": "请输入自定义解说语言", + "简体中文(中国)": "简体中文(中国)", + "英语(美国)": "英语(美国)", + "日语(日本)": "日语(日本)", + "韩语(韩国)": "韩语(韩国)", + "法语(法国)": "法语(法国)", + "德语(德国)": "德语(德国)", + "西班牙语(西班牙)": "西班牙语(西班牙)", + "葡萄牙语(巴西)": "葡萄牙语(巴西)", + "俄语(俄罗斯)": "俄语(俄罗斯)", + "自定义": "自定义", + "短剧类型": "短剧类型", + "自定义短剧类型": "自定义短剧类型", + "原片占比": "原片占比", + "例如:豪门虐恋": "例如:豪门虐恋", + "请输入自定义短剧类型": "请输入自定义短剧类型", + "逆袭/复仇": "逆袭/复仇", + "霸总/甜宠": "霸总/甜宠", + "家庭伦理": "家庭伦理", + "古装/权谋": "古装/权谋", + "悬疑/犯罪": "悬疑/犯罪", + "都市情感": "都市情感", + "年代/乡村": "年代/乡村", + "生成解说文案": "生成解说文案", + "生成剪辑脚本": "生成剪辑脚本", + "短剧解说文案": "短剧解说文案", + "Narration Copy Help": "先点击生成解说文案;审核、删改或重写这段文案后,再点击生成剪辑脚本匹配画面和时间戳。", + "Narration copy generated successfully": "解说文案已生成,可先审核修改", "生成短剧解说脚本": "生成短剧解说脚本", "请输入视频脚本": "请输入视频脚本", "自定义片段": "自定义片段", @@ -583,11 +614,22 @@ "Preparing script generation": "开始准备生成脚本", "Script generation failed check logs": "生成脚本失败,请检查日志", "Parsing subtitles...": "正在解析字幕...", + "Analyzing subtitles with model...": "正在等待模型分析字幕...", "Subtitle file does not exist": "字幕文件不存在", "Subtitle file is empty or unreadable": "字幕文件内容为空或无法读取", "Generating narration copy...": "正在生成文案...", + "Generated narration copy is empty": "生成的解说文案为空", + "Please generate and review narration copy first": "请先生成并审核解说文案", + "Matching narration copy to footage...": "正在根据解说文案匹配画面和时间戳...", + "Waiting for model stream...": "正在等待模型流式输出...", + "Streaming unavailable fallback waiting...": "当前接口未返回流式内容,正在等待完整响应...", + "LLM stream window title": "模型思考 / 输出流", + "Model reasoning stream": "【模型思考】", + "Model output preview": "【模型输出预览】", + "Repairing narration script...": "正在修复解说脚本...", "Generated narration JSON parse failed": "生成的解说文案格式错误,无法解析为 JSON", "Generated 
narration missing items field": "生成的解说文案缺少必要的 'items' 字段", + "Generated narration validation failed": "生成的解说脚本校验失败", "Preparing output...": "整理输出..." } } diff --git a/webui/tools/generate_short_summary.py b/webui/tools/generate_short_summary.py index d06431c..ab1e71b 100644 --- a/webui/tools/generate_short_summary.py +++ b/webui/tools/generate_short_summary.py @@ -11,12 +11,20 @@ import os import json import time import traceback +import html import streamlit as st from loguru import logger from app.config import config -from app.services.SDE.short_drama_explanation import analyze_subtitle, generate_narration_script +from app.services.SDE.short_drama_explanation import ( + analyze_subtitle, + generate_narration_copy as generate_narration_copy_legacy, + match_narration_copy_to_script as match_narration_copy_to_script_legacy, +) from app.services.subtitle_text import read_subtitle_text +from app.services.short_drama_narration_validation import ( + normalize_script_video_sources, +) from app.services.tavily_search import TavilySearchError, format_search_context, search_short_drama # 导入新的LLM服务模块 - 确保提供商被注册 import app.services.llm # 这会触发提供商注册 @@ -24,6 +32,9 @@ from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter import re +PUBLIC_SCRIPT_FIELDS = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"] + + def _normalize_paths(paths): if isinstance(paths, str): paths = [paths] @@ -63,53 +74,23 @@ def _build_combined_subtitle_content(subtitle_paths, video_paths=None): return "\n\n".join(sections) -def _coerce_video_id(value): - try: - video_id = int(value) - except (TypeError, ValueError): - return None - return video_id if video_id > 0 else None - - -def _match_video_id_by_name(video_name, video_paths): - video_name = str(video_name or "").strip() - if not video_name: - return None - - for index, video_path in enumerate(video_paths, start=1): - if os.path.basename(video_path) == os.path.basename(video_name): - return index - return 
None - - def _normalize_narration_items_video_sources(items, video_paths): - video_paths = _normalize_paths(video_paths) - if not video_paths: - return items + return normalize_script_video_sources(items, _normalize_paths(video_paths)) - normalized_items = [] - for item in items: - if not isinstance(item, dict): - normalized_items.append(item) - continue - item_copy = item.copy() - video_id = _coerce_video_id(item_copy.get("video_id") or item_copy.get("video_index")) - matched_video_id = _match_video_id_by_name( - item_copy.get("video_name") or item_copy.get("source_video"), - video_paths, - ) - if matched_video_id: - video_id = matched_video_id - if video_id is None or video_id > len(video_paths): - logger.warning(f"片段 {item_copy.get('_id')} 未提供有效 video_id,默认使用视频 1") - video_id = 1 +def _strip_planner_only_fields(items): + return [ + {field: item[field] for field in PUBLIC_SCRIPT_FIELDS if field in item} + for item in items + if isinstance(item, dict) + ] - item_copy["video_id"] = video_id - item_copy["video_name"] = os.path.basename(video_paths[video_id - 1]) - normalized_items.append(item_copy) - return normalized_items +def _format_progress_status(progress, message: str = "", tr=lambda key: key): + message = str(message or "").strip() + if message: + return message + return f"{tr('Progress')}: {progress}%" def parse_and_fix_json(json_string): @@ -203,25 +184,9 @@ def parse_and_fix_json(json_string): logger.debug(f"综合修复失败: {e}") pass - # 如果所有方法都失败,尝试创建一个基本的结构 + # 如果所有方法都失败,直接返回 None,避免生成不可剪辑的默认假脚本 logger.error(f"所有JSON解析方法都失败,原始内容: {json_string[:200]}...") - - # 尝试从文本中提取关键信息创建基本结构 - try: - # 这是一个简单的回退方案 - return { - "items": [ - { - "_id": 1, - "timestamp": "00:00:00,000-00:00:10,000", - "picture": "解析失败,使用默认内容", - "narration": json_string[:100] + "..." 
if len(json_string) > 100 else json_string, - "OST": 0 - } - ] - } - except Exception: - return None + return None def _get_tavily_api_key() -> str: @@ -350,6 +315,100 @@ def analyze_short_drama_plot( return analysis_result["analysis"] +def generate_short_drama_narration_copy( + subtitle_path, + video_theme, + temperature, + tr=lambda key: key, + plot_analysis=None, + subtitle_content=None, + enable_web_search: bool = False, + video_paths=None, + narration_language: str = "简体中文(中国)", + drama_genre: str = "逆袭/复仇", +): + """生成可由用户审核修改的短剧解说正文,不绑定时间戳。""" + subtitle_paths = _normalize_paths(subtitle_path) + if not subtitle_paths: + st.error(tr("Please generate or upload subtitles first")) + return None + missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)] + if missing_subtitle_paths: + st.error(tr("Subtitle file does not exist")) + return None + + selected_video_paths = _normalize_paths(video_paths) + subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content( + subtitle_paths, + selected_video_paths, + ) + if not subtitle_content: + st.error(tr("Subtitle file is empty or unreadable")) + return None + + analysis_text = str(plot_analysis or "").strip() + if not analysis_text: + analysis_text = analyze_short_drama_plot( + subtitle_paths, + temperature, + tr, + subtitle_content=subtitle_content, + short_name=video_theme, + enable_web_search=enable_web_search, + video_paths=selected_video_paths, + ) + if not analysis_text: + return None + + text_provider = config.app.get('text_llm_provider', 'gemini').lower() + text_api_key = config.app.get(f'text_{text_provider}_api_key') + text_model = config.app.get(f'text_{text_provider}_model_name') + text_base_url = config.app.get(f'text_{text_provider}_base_url') + + try: + logger.info("使用新的LLM服务架构生成可审核解说文案") + analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) + narration_result = analyzer.generate_narration_copy( + 
short_name=video_theme, + plot_analysis=analysis_text, + subtitle_content=subtitle_content, + temperature=temperature, + narration_language=narration_language, + drama_genre=drama_genre, + ) + except Exception as e: + logger.warning(f"使用新LLM服务生成文案失败,回退到旧实现: {str(e)}") + narration_result = generate_narration_copy_legacy( + short_name=video_theme, + plot_analysis=analysis_text, + subtitle_content=subtitle_content, + api_key=text_api_key, + model=text_model, + base_url=text_base_url, + temperature=temperature, + provider=text_provider, + narration_language=narration_language, + drama_genre=drama_genre, + ) + + if narration_result.get("status") != "success": + logger.error(f"解说文案正文生成失败: {narration_result.get('message')}") + st.error(tr("Script generation failed check logs")) + return None + + narration_copy = str(narration_result.get("narration_copy", "")).strip() + if not narration_copy: + logger.error("模型返回空解说文案正文") + st.error(tr("Generated narration copy is empty")) + return None + + return { + "narration_copy": narration_copy, + "plot_analysis": analysis_text, + "subtitle_content": subtitle_content, + } + + def generate_script_short_sunmmary( params, subtitle_path, @@ -360,21 +419,79 @@ def generate_script_short_sunmmary( subtitle_content=None, enable_web_search: bool = False, video_paths=None, + narration_language: str = "简体中文(中国)", + narration_copy: str = "", + drama_genre: str = "逆袭/复仇", + original_sound_ratio: int = 30, ): """ 生成 短剧解说 视频脚本 要求: 提供高质量短剧字幕 适合场景: 短剧 """ - progress_bar = st.progress(0) + progress_bar = st.empty() status_text = st.empty() + stream_text = st.empty() + stream_state = { + "reasoning": "", + "content": "", + "last_update": 0.0, + } def update_progress(progress: float, message: str = ""): progress_bar.progress(progress) + status_text.text(_format_progress_status(progress, message, tr)) + + def update_waiting(message: str = ""): + progress_bar.empty() if message: - status_text.text(f"{progress}% - {message}") + status_text.text(message) 
else: - status_text.text(f"{tr('Progress')}: {progress}%") + status_text.empty() + + def update_stream_window(event): + event = event or {} + chunk_type = str(event.get("type") or "content") + chunk_text = str(event.get("text") or "") + if chunk_type == "done" or not chunk_text: + return + + bucket = "reasoning" if chunk_type == "reasoning" else "content" + stream_state[bucket] += chunk_text + + now = time.time() + if now - stream_state["last_update"] < 0.12: + return + stream_state["last_update"] = now + + blocks = [] + if stream_state["reasoning"].strip(): + blocks.append( + f"{tr('Model reasoning stream')}\n" + f"{stream_state['reasoning'][-900:]}" + ) + if stream_state["content"].strip(): + blocks.append( + f"{tr('Model output preview')}\n" + f"{stream_state['content'][-900:]}" + ) + + preview = "\n\n".join(blocks)[-1800:] + escaped_preview = html.escape(preview) + stream_text.markdown( + f""" +
+
+ {html.escape(tr('LLM stream window title'))} +
+
{escaped_preview}
+
+ """, + unsafe_allow_html=True, + ) try: with st.spinner(tr("Generating script...")): @@ -414,9 +531,14 @@ def generate_script_short_sunmmary( st.error(tr("Subtitle file is empty or unreadable")) return + narration_copy = str(narration_copy or "").strip() + if not narration_copy: + st.error(tr("Please generate and review narration copy first")) + return + analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) if plot_analysis and str(plot_analysis).strip(): - logger.info("使用用户编辑后的剧情理解结果生成解说文案") + logger.info("使用用户编辑后的剧情理解结果匹配剪辑脚本") analysis_result = { "status": "success", "analysis": str(plot_analysis).strip(), @@ -424,7 +546,7 @@ def generate_script_short_sunmmary( else: plot_analysis_input = subtitle_content if enable_web_search: - update_progress(40, tr("Searching short drama with Tavily...")) + update_waiting(tr("Searching short drama with Tavily...")) plot_analysis_input = _build_plot_analysis_input( subtitle_content, short_name=video_theme, @@ -436,11 +558,13 @@ def generate_script_short_sunmmary( try: # 优先使用新的LLM服务架构 logger.info("使用新的LLM服务架构进行字幕分析") + update_waiting(tr("Analyzing subtitles with model...")) analysis_result = analyzer.analyze_subtitle(plot_analysis_input) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") # 回退到旧的实现 + update_waiting(tr("Analyzing subtitles with model...")) analysis_result = analyze_subtitle( subtitle_content=plot_analysis_input, api_key=text_api_key, @@ -451,42 +575,50 @@ def generate_script_short_sunmmary( provider=text_provider ) """ - 3. 根据剧情生成解说文案 + 3. 
根据用户审核后的文案匹配画面与时间戳 """ if analysis_result["status"] == "success": logger.info("字幕分析成功!") - update_progress(60, tr("Generating narration copy...")) + update_waiting() - # 根据剧情生成解说文案 - 使用新的LLM服务架构 try: - # 优先使用新的LLM服务架构 - logger.info("使用新的LLM服务架构生成解说文案") - narration_result = analyzer.generate_narration_script( + logger.info("使用新的LLM服务架构将审核文案匹配到字幕画面") + update_waiting(tr("Matching narration copy to footage...")) + stream_text.info(tr("Waiting for model stream...")) + narration_result = analyzer.match_narration_copy_to_script( short_name=video_theme, plot_analysis=analysis_result["analysis"], - subtitle_content=subtitle_content, # 传递原始字幕内容 - temperature=temperature + subtitle_content=subtitle_content, + narration_copy=narration_copy, + temperature=temperature, + narration_language=narration_language, + drama_genre=drama_genre, + original_sound_ratio=original_sound_ratio, + stream_callback=update_stream_window, ) except Exception as e: - logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") - # 回退到旧的实现 - narration_result = generate_narration_script( + logger.warning(f"使用新LLM服务匹配画面失败,回退到旧实现: {str(e)}") + stream_text.info(tr("Streaming unavailable fallback waiting...")) + narration_result = match_narration_copy_to_script_legacy( short_name=video_theme, plot_analysis=analysis_result["analysis"], - subtitle_content=subtitle_content, # 传递原始字幕内容 + subtitle_content=subtitle_content, + narration_copy=narration_copy, api_key=text_api_key, model=text_model, base_url=text_base_url, - save_result=True, temperature=temperature, - provider=text_provider + provider=text_provider, + narration_language=narration_language, + drama_genre=drama_genre, + original_sound_ratio=original_sound_ratio, ) if narration_result["status"] == "success": - logger.info("\n解说文案生成成功!") + logger.info("\n剪辑脚本匹配成功!") logger.info(narration_result["narration_script"]) else: - logger.info(f"\n解说文案生成失败: {narration_result['message']}") + logger.info(f"\n剪辑脚本匹配失败: {narration_result['message']}") st.error(tr("Script 
generation failed check logs")) st.stop() else: @@ -519,6 +651,7 @@ def generate_script_short_sunmmary( narration_dict['items'], selected_video_paths, ) + narration_items = _strip_planner_only_fields(narration_items) script = json.dumps(narration_items, ensure_ascii=False, indent=2) if script is None: @@ -543,3 +676,4 @@ def generate_script_short_sunmmary( time.sleep(2) progress_bar.empty() status_text.empty() + stream_text.empty() diff --git a/webui/tools/test_generate_short_summary_unittest.py b/webui/tools/test_generate_short_summary_unittest.py new file mode 100644 index 0000000..30122d6 --- /dev/null +++ b/webui/tools/test_generate_short_summary_unittest.py @@ -0,0 +1,27 @@ +import unittest + +from webui.tools.generate_short_summary import _format_progress_status, parse_and_fix_json + + +class GenerateShortSummaryJsonTests(unittest.TestCase): + def test_progress_message_does_not_prefix_fake_percentage(self): + status = _format_progress_status(60, "正在生成文案...") + + self.assertEqual("正在生成文案...", status) + self.assertNotIn("60%", status) + + def test_invalid_json_does_not_create_default_fake_script(self): + self.assertIsNone(parse_and_fix_json("not a json response")) + + def test_json_code_block_is_parsed(self): + parsed = parse_and_fix_json( + """```json +{"items": [{"_id": 1, "timestamp": "00:00:01,000-00:00:02,000"}]} +```""" + ) + + self.assertEqual(1, parsed["items"][0]["_id"]) + + +if __name__ == "__main__": + unittest.main() From 34d5532119fcfee5e98096e244bb4b8dc4682cfd Mon Sep 17 00:00:00 2001 From: viccy Date: Sun, 7 Jun 2026 17:58:02 +0800 Subject: [PATCH 15/24] =?UTF-8?q?feat(subtitle):=20=E6=96=B0=E5=A2=9E=20Fi?= =?UTF-8?q?reRedASR2=20=E6=9C=AC=E5=9C=B0=20ASR=20=E5=90=8E=E7=AB=AF?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加 FireRedASR2 本地 ASR 转写后端的完整支持: 1. 新增配置参数与数据模型字段 2. 更新示例配置文件,添加默认本地服务地址 3. 完善任务服务中的转写逻辑,支持 FireRedASR 后端 4. 更新 WebUI 界面,新增对应配置选项 5. 补充中英文多语言翻译 6. 
新增本地 FireRedASR 服务的单元测试 --- app/models/schema.py | 1 + app/services/fun_asr_subtitle.py | 168 +++++++++++++++++- app/services/task.py | 15 +- .../test_fun_asr_subtitle_unittest.py | 86 +++++++++ config.example.toml | 3 +- webui/components/script_settings.py | 40 ++++- webui/components/subtitle_settings.py | 24 ++- webui/i18n/en.json | 11 +- webui/i18n/zh.json | 11 +- 9 files changed, 331 insertions(+), 28 deletions(-) diff --git a/app/models/schema.py b/app/models/schema.py index e0547e7..b492bb1 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -201,6 +201,7 @@ class VideoClipParams(BaseModel): subtitle_auto_transcribe_enabled: bool = False subtitle_auto_transcribe_backend: str = "local" subtitle_auto_transcribe_api_url: str = "" + subtitle_auto_transcribe_firered_api_url: str = "" subtitle_auto_transcribe_api_key: str = "" subtitle_auto_transcribe_hotword: str = "" subtitle_auto_transcribe_enable_spk: bool = False diff --git a/app/services/fun_asr_subtitle.py b/app/services/fun_asr_subtitle.py index 2567bc8..f6042d4 100644 --- a/app/services/fun_asr_subtitle.py +++ b/app/services/fun_asr_subtitle.py @@ -24,6 +24,7 @@ TRANSCRIPTION_URL = f"{DASHSCOPE_BASE_URL}/api/v1/services/audio/asr/transcripti TASK_URL_TEMPLATE = f"{DASHSCOPE_BASE_URL}/api/v1/tasks/{{task_id}}" MODEL_NAME = "fun-asr" LOCAL_FUN_ASR_API_URL = "http://127.0.0.1:7860" +LOCAL_FIRERED_ASR_API_URL = "http://127.0.0.1:7867" TERMINAL_FAILED_STATUSES = {"FAILED", "CANCELED", "UNKNOWN"} PUNCTUATION_BREAKS = set(",。!?;,.!?;") @@ -131,7 +132,11 @@ def _absolute_local_download_url(api_url: str, download_url: str) -> str: return urljoin(f"{_local_base_url(api_url)}/", download_url) -def _raise_for_local_http(response: requests.Response, action: str) -> None: +def _raise_for_local_http( + response: requests.Response, + action: str, + service_name: str = "本地 FunASR-Pack 服务", +) -> None: status_code = getattr(response, "status_code", 200) if status_code and status_code >= 400: detail = "" @@ 
-142,16 +147,20 @@ def _raise_for_local_http(response: requests.Response, action: str) -> None: except Exception: detail = "" suffix = f": {detail}" if detail else "" - raise FunAsrError(f"{action}失败{suffix},请确认本地 FunASR-Pack 服务可用") + raise FunAsrError(f"{action}失败{suffix},请确认{service_name}可用") try: response.raise_for_status() except Exception as exc: - raise FunAsrError(f"{action}失败,请确认本地 FunASR-Pack 服务可用") from exc + raise FunAsrError(f"{action}失败,请确认{service_name}可用") from exc -def _local_json(response: requests.Response, action: str) -> dict[str, Any]: - _raise_for_local_http(response, action) +def _local_json( + response: requests.Response, + action: str, + service_name: str = "本地 FunASR-Pack 服务", +) -> dict[str, Any]: + _raise_for_local_http(response, action, service_name=service_name) try: data = response.json() except Exception as exc: @@ -520,6 +529,19 @@ def request_local_fun_asr_health(api_url: str = LOCAL_FUN_ASR_API_URL, session=r return _local_json(response, "检查本地 FunASR-Pack 服务") +def request_local_firered_asr_health( + api_url: str = LOCAL_FIRERED_ASR_API_URL, + session=requests, +) -> dict[str, Any]: + """Fetch FireRedASR2-AED-Pack health metadata from the local service.""" + response = _session_get(session, f"{_local_base_url(api_url)}/health", timeout=10) + return _local_json( + response, + "检查本地 FireRedASR2-AED-Pack 服务", + service_name="本地 FireRedASR2-AED-Pack 服务", + ) + + def request_local_fun_asr( local_file: str, api_url: str = LOCAL_FUN_ASR_API_URL, @@ -548,21 +570,61 @@ def request_local_fun_asr( return _local_json(response, "调用本地 FunASR-Pack ASR API") +def request_local_firered_asr( + local_file: str, + api_url: str = LOCAL_FIRERED_ASR_API_URL, + enable_vad: Optional[bool] = True, + enable_lid: Optional[bool] = True, + enable_punc: Optional[bool] = True, + return_timestamp: Optional[bool] = True, + timeout: float = 600.0, + session=requests, +) -> dict[str, Any]: + """Call the local FireRedASR2-AED-Pack `/asr` API and return its JSON 
result.""" + _require_local_file(local_file) + data: dict[str, str] = {} + options = { + "enable_vad": enable_vad, + "enable_lid": enable_lid, + "enable_punc": enable_punc, + "return_timestamp": return_timestamp, + } + for key, value in options.items(): + if value is not None: + data[key] = "true" if value else "false" + + with open(local_file, "rb") as file_obj: + files = {"file": (_safe_upload_name(local_file), file_obj)} + response = _session_post( + session, + _local_asr_url(api_url), + data=data, + files=files, + timeout=timeout, + ) + return _local_json( + response, + "调用本地 FireRedASR2-AED-Pack ASR API", + service_name="本地 FireRedASR2-AED-Pack 服务", + ) + + def download_local_srt( download_url: str, api_url: str = LOCAL_FUN_ASR_API_URL, subtitle_file: str = "", session=requests, + service_name: str = "本地 FunASR-Pack 服务", ) -> str: """Download an SRT exposed by FunASR-Pack and save it as a NarratoAI subtitle.""" absolute_url = _absolute_local_download_url(api_url, download_url) if not absolute_url: raise FunAsrError("本地 FunASR-Pack 结果缺少 SRT 下载地址") response = _session_get(session, absolute_url, timeout=60) - _raise_for_local_http(response, "下载本地 FunASR-Pack SRT") + _raise_for_local_http(response, "下载本地 SRT", service_name=service_name) srt_content = _response_text(response) if not srt_content.strip(): - raise FunAsrError("本地 FunASR-Pack 返回了空 SRT") + raise FunAsrError(f"{service_name}返回了空 SRT") return write_srt_file(srt_content, subtitle_file) @@ -665,6 +727,45 @@ def local_fun_asr_result_to_srt( return "\n".join(lines).rstrip() + "\n" +def firered_asr_result_to_srt(result_json: dict[str, Any]) -> str: + """Convert a FireRedASR2-AED-Pack JSON response into SRT when no SRT URL is returned.""" + blocks: list[dict[str, Any]] = [] + sentences = result_json.get("sentences") + if isinstance(sentences, list): + for sentence in sentences: + if not isinstance(sentence, dict): + continue + text = str(sentence.get("text") or "").strip() + if not text: + continue + start = 
sentence.get("start_ms", sentence.get("begin_time", sentence.get("start_time", 0))) + end = sentence.get("end_ms", sentence.get("end_time")) + start_ms = _timestamp_ms(start, "firered.sentence.start_ms") + end_ms = _timestamp_ms(end, "firered.sentence.end_ms") if end is not None else start_ms + 500 + blocks.append({"start": start_ms, "end": end_ms, "text": text}) + + if not blocks: + return local_fun_asr_result_to_srt(result_json) + + lines = [] + for index, block in enumerate(blocks, start=1): + lines.append(_srt_block(index, block["start"], block["end"], block["text"])) + return "\n".join(lines).rstrip() + "\n" + + +def _get_local_srt_download_url(result_json: dict[str, Any]) -> str: + downloads = result_json.get("downloads") or {} + if isinstance(downloads, dict): + download_url = downloads.get("srt") + if download_url: + return str(download_url) + for key in ("srt_url", "srt_download_url", "download_url"): + download_url = result_json.get(key) + if download_url: + return str(download_url) + return "" + + def create_with_local_fun_asr( local_file: str, subtitle_file: str = "", @@ -689,8 +790,7 @@ def create_with_local_fun_asr( if isinstance(srt_file, str) and srt_file and os.path.isfile(srt_file): output_file = copy_srt_file(srt_file, subtitle_file) else: - downloads = result_json.get("downloads") or {} - download_url = downloads.get("srt") if isinstance(downloads, dict) else "" + download_url = _get_local_srt_download_url(result_json) if download_url: output_file = download_local_srt( download_url, @@ -710,6 +810,56 @@ def create_with_local_fun_asr( raise FunAsrError("本地 FunASR-Pack 字幕转写失败,请检查服务地址、文件或模型状态") from exc +def create_with_local_firered_asr( + local_file: str, + subtitle_file: str = "", + api_url: str = LOCAL_FIRERED_ASR_API_URL, + enable_vad: Optional[bool] = True, + enable_lid: Optional[bool] = True, + enable_punc: Optional[bool] = True, + return_timestamp: Optional[bool] = True, + timeout: float = 600.0, + session=requests, +) -> Optional[str]: + 
"""Create an SRT file through a locally running FireRedASR2-AED-Pack API.""" + service_name = "本地 FireRedASR2-AED-Pack 服务" + try: + result_json = request_local_firered_asr( + local_file=local_file, + api_url=api_url, + enable_vad=enable_vad, + enable_lid=enable_lid, + enable_punc=enable_punc, + return_timestamp=return_timestamp, + timeout=timeout, + session=session, + ) + + srt_file = result_json.get("srt_file") + if isinstance(srt_file, str) and srt_file and os.path.isfile(srt_file): + output_file = copy_srt_file(srt_file, subtitle_file) + else: + download_url = _get_local_srt_download_url(result_json) + if download_url: + output_file = download_local_srt( + download_url, + api_url=api_url, + subtitle_file=subtitle_file, + session=session, + service_name=service_name, + ) + else: + srt_content = firered_asr_result_to_srt(result_json) + output_file = write_srt_file(srt_content, subtitle_file) + + logger.info(f"本地 FireRedASR2-AED-Pack 字幕文件已生成: {output_file}") + return output_file + except FunAsrError: + raise + except Exception as exc: + raise FunAsrError("本地ASR字幕转写失败,请检查 FireRedASR2-AED-Pack 服务地址、文件或模型状态") from exc + + def create_with_fun_asr( local_file: str, subtitle_file: str = "", diff --git a/app/services/task.py b/app/services/task.py index 28b05ea..74e7804 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -24,7 +24,7 @@ def _is_auto_transcription_enabled(params: VideoClipParams) -> bool: def _get_auto_transcription_backend(params: VideoClipParams) -> str: backend = str(getattr(params, "subtitle_auto_transcribe_backend", "") or "").strip().lower() - if backend not in {"local", "bailian"}: + if backend not in {"local", "firered", "bailian"}: backend = "local" return backend @@ -80,6 +80,19 @@ def _transcribe_final_video(task_id: str, video_path: str, params: VideoClipPara hotword=str(getattr(params, "subtitle_auto_transcribe_hotword", "") or "").strip(), enable_spk=bool(getattr(params, "subtitle_auto_transcribe_enable_spk", False)), ) + elif 
backend == "firered": + api_url = str( + getattr(params, "subtitle_auto_transcribe_firered_api_url", "") + or config.fun_asr.get("firered_api_url", fun_asr_subtitle.LOCAL_FIRERED_ASR_API_URL) + ).strip() + if not api_url: + raise ValueError("请先输入本地ASR API 地址") + + generated_path = fun_asr_subtitle.create_with_local_firered_asr( + local_file=video_path, + subtitle_file=subtitle_file, + api_url=api_url, + ) else: api_key = str( getattr(params, "subtitle_auto_transcribe_api_key", "") diff --git a/app/services/test_fun_asr_subtitle_unittest.py b/app/services/test_fun_asr_subtitle_unittest.py index d59550d..d5a3ccd 100644 --- a/app/services/test_fun_asr_subtitle_unittest.py +++ b/app/services/test_fun_asr_subtitle_unittest.py @@ -481,6 +481,91 @@ class LocalFunAsrServiceTests(unittest.TestCase): self.assertIn("世界。", srt) +class LocalFireRedAsrServiceTests(unittest.TestCase): + def test_request_local_firered_asr_posts_file_and_options(self): + class LocalSession: + def __init__(self): + self.calls = [] + + def post(self, url, **kwargs): + self.calls.append(("POST", url, kwargs)) + return FakeResponse({"text": "你好", "srt_url": "/outputs/out.srt"}) + + with tempfile.TemporaryDirectory() as tmp_dir: + local_file = Path(tmp_dir) / "audio.wav" + local_file.write_bytes(b"audio") + session = LocalSession() + + result = fasr.request_local_firered_asr( + str(local_file), + api_url="127.0.0.1:7867", + enable_vad=True, + enable_lid=False, + enable_punc=True, + return_timestamp=True, + timeout=456, + session=session, + ) + + self.assertEqual("你好", result["text"]) + self.assertEqual("POST", session.calls[0][0]) + self.assertEqual("http://127.0.0.1:7867/asr", session.calls[0][1]) + self.assertEqual( + { + "enable_vad": "true", + "enable_lid": "false", + "enable_punc": "true", + "return_timestamp": "true", + }, + session.calls[0][2]["data"], + ) + self.assertEqual(456, session.calls[0][2]["timeout"]) + self.assertIn("file", session.calls[0][2]["files"]) + + def 
test_create_with_local_firered_asr_downloads_srt_url(self): + class LocalSession: + def __init__(self): + self.calls = [] + + def post(self, url, **kwargs): + self.calls.append(("POST", url, kwargs)) + return FakeResponse({"text": "你好", "srt_url": "/outputs/result.srt"}) + + def get(self, url, **kwargs): + self.calls.append(("GET", url, kwargs)) + return FakeResponse(text="1\n00:00:00,000 --> 00:00:01,000\n你好\n") + + with tempfile.TemporaryDirectory() as tmp_dir: + local_file = Path(tmp_dir) / "audio.wav" + local_file.write_bytes(b"audio") + subtitle_file = Path(tmp_dir) / "out.srt" + session = LocalSession() + + result_path = fasr.create_with_local_firered_asr( + str(local_file), + subtitle_file=str(subtitle_file), + api_url="http://127.0.0.1:7867", + session=session, + ) + + self.assertEqual(str(subtitle_file), result_path) + self.assertEqual("http://127.0.0.1:7867/outputs/result.srt", session.calls[1][1]) + self.assertIn("你好", subtitle_file.read_text(encoding="utf-8")) + + def test_firered_asr_result_to_srt_uses_sentence_timestamps(self): + result = { + "sentences": [ + {"text": "你好。", "start_ms": 40, "end_ms": 900}, + {"text": "欢迎观看。", "start_ms": 900, "end_ms": 2100}, + ] + } + + srt = fasr.firered_asr_result_to_srt(result) + + self.assertIn("1\n00:00:00,040 --> 00:00:00,900\n你好。", srt) + self.assertIn("2\n00:00:00,900 --> 00:00:02,100\n欢迎观看。", srt) + + class FunAsrConfigTests(unittest.TestCase): def test_save_config_persists_fun_asr_section(self): original_config_file = cfg.config_file @@ -503,6 +588,7 @@ class FunAsrConfigTests(unittest.TestCase): config_data = tomllib.loads(Path("config.example.toml").read_text(encoding="utf-8")) self.assertEqual("local", config_data["fun_asr"]["backend"]) self.assertEqual("http://127.0.0.1:7860", config_data["fun_asr"]["api_url"]) + self.assertEqual("http://127.0.0.1:7867", config_data["fun_asr"]["firered_api_url"]) self.assertEqual("fun-asr", config_data["fun_asr"]["model"]) self.assertIn("api_key", 
config_data["fun_asr"]) diff --git a/config.example.toml b/config.example.toml index 547724e..3c815c3 100644 --- a/config.example.toml +++ b/config.example.toml @@ -110,10 +110,11 @@ [fun_asr] # Fun-ASR 字幕转录配置 - # backend = "local" 使用本地 FunASR-Pack API;backend = "bailian" 使用阿里百炼在线 fun-asr + # backend = "local" 使用本地 FunASR-Pack API;backend = "firered" 使用本地 FireRedASR2-AED-Pack API;backend = "bailian" 使用阿里百炼在线 fun-asr auto_transcribe_enabled = false backend = "local" api_url = "http://127.0.0.1:7860" + firered_api_url = "http://127.0.0.1:7867" hotword = "" enable_spk = false # 使用阿里百炼在线 fun-asr 时,访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取 API Key diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index d73eba0..555e1e1 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -996,11 +996,12 @@ def render_fun_asr_transcription(tr): backend_options = { tr("Local FunASR-Pack API"): "local", + tr("Local FireRedASR API"): "firered", tr("Ali Bailian Online Fun-ASR"): "bailian", tr("上传字幕文件"): "upload", } saved_backend = str(config.fun_asr.get("backend", "")).strip().lower() - if saved_backend not in {"local", "bailian", "upload"}: + if saved_backend not in {"local", "firered", "bailian", "upload"}: saved_backend = ( "bailian" if config.fun_asr.get("api_key") and not config.fun_asr.get("api_url") @@ -1012,6 +1013,7 @@ def render_fun_asr_transcription(tr): backend = saved_backend api_key = "" api_url = config.fun_asr.get("api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL) + firered_api_url = config.fun_asr.get("firered_api_url", fun_asr_subtitle.LOCAL_FIRERED_ASR_API_URL) hotword = config.fun_asr.get("hotword", "") enable_spk = bool(config.fun_asr.get("enable_spk", False)) media_paths = _selected_video_paths() @@ -1020,11 +1022,10 @@ def render_fun_asr_transcription(tr): with subtitle_cols[0]: with st.expander(tr("Ali Bailian Fun-ASR Subtitle Transcription"), expanded=False): - 
backend_label = st.radio( + backend_label = st.selectbox( tr("Subtitle Processing Method"), options=backend_labels, index=backend_values.index(saved_backend), - horizontal=True, key="fun_asr_backend", ) backend = backend_options[backend_label] @@ -1051,6 +1052,14 @@ def render_fun_asr_transcription(tr): help=tr("Enable speaker diarization Help"), key="fun_asr_enable_spk", ) + elif backend == "firered": + st.caption(tr("Local FireRed-ASR upload caption")) + firered_api_url = st.text_input( + tr("Local FireRedASR API URL"), + value=firered_api_url, + help=tr("Local FireRedASR API URL Help"), + key="fun_asr_firered_api_url", + ) else: st.caption(tr("Fun-ASR upload caption")) st.markdown( @@ -1166,6 +1175,10 @@ def render_fun_asr_transcription(tr): clear_fun_asr_subtitle_state() st.error(tr("Please enter local FunASR-Pack API URL")) return + if backend == "firered" and not str(firered_api_url).strip(): + clear_fun_asr_subtitle_state() + st.error(tr("Please enter local FireRedASR API URL")) + return missing_paths = [path for path in media_paths if not os.path.exists(path)] if not media_paths or missing_paths: clear_fun_asr_subtitle_state() @@ -1184,22 +1197,25 @@ def render_fun_asr_transcription(tr): config.fun_asr["backend"] = backend config.fun_asr["api_url"] = str(api_url).strip() + config.fun_asr["firered_api_url"] = str(firered_api_url).strip() config.fun_asr["api_key"] = api_key.strip() config.fun_asr["hotword"] = str(hotword).strip() config.fun_asr["enable_spk"] = bool(enable_spk) config.fun_asr["model"] = "fun-asr" config.save_config() - spinner_text = ( - tr("Transcribing with local FunASR-Pack...") - if backend == "local" - else tr("Transcribing with Fun-ASR...") - ) + if backend == "local": + spinner_text = tr("Transcribing with local FunASR-Pack...") + elif backend == "firered": + spinner_text = tr("Transcribing with local FireRedASR...") + else: + spinner_text = tr("Transcribing with Fun-ASR...") with st.spinner(spinner_text): progress_bar = st.progress(0) 
if len(media_paths) > 1 else None generated_paths = [] for index, media_path in enumerate(media_paths, start=1): - subtitle_name = f"{os.path.splitext(os.path.basename(media_path))[0]}_fun_asr.srt" + subtitle_suffix = "firered_asr" if backend == "firered" else "fun_asr" + subtitle_name = f"{os.path.splitext(os.path.basename(media_path))[0]}_{subtitle_suffix}.srt" subtitle_path = _unique_file_path(utils.subtitle_dir(), subtitle_name) if backend == "local": @@ -1210,6 +1226,12 @@ def render_fun_asr_transcription(tr): hotword=str(hotword).strip(), enable_spk=bool(enable_spk), ) + elif backend == "firered": + generated_path = fun_asr_subtitle.create_with_local_firered_asr( + local_file=media_path, + subtitle_file=subtitle_path, + api_url=str(firered_api_url).strip(), + ) else: generated_path = fun_asr_subtitle.create_with_fun_asr( local_file=media_path, diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py index 96a7a7a..2c0355d 100644 --- a/webui/components/subtitle_settings.py +++ b/webui/components/subtitle_settings.py @@ -457,7 +457,7 @@ def render_subtitle_mask_settings(tr): def _get_saved_auto_transcribe_backend(): saved_backend = str(config.fun_asr.get("backend", "")).strip().lower() - if saved_backend not in {"local", "bailian"}: + if saved_backend not in {"local", "firered", "bailian"}: saved_backend = ( "bailian" if config.fun_asr.get("api_key") and not config.fun_asr.get("api_url") @@ -481,6 +481,7 @@ def render_auto_transcription_settings(tr): backend = _get_saved_auto_transcribe_backend() api_url = config.fun_asr.get("api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL) + firered_api_url = config.fun_asr.get("firered_api_url", fun_asr_subtitle.LOCAL_FIRERED_ASR_API_URL) hotword = config.fun_asr.get("hotword", "") enable_spk = bool(config.fun_asr.get("enable_spk", False)) api_key = config.fun_asr.get("api_key", "") @@ -488,6 +489,7 @@ def render_auto_transcription_settings(tr): if not auto_transcribe_enabled: 
st.session_state['subtitle_auto_transcribe_backend'] = backend st.session_state['subtitle_auto_transcribe_api_url'] = api_url + st.session_state['subtitle_auto_transcribe_firered_api_url'] = firered_api_url st.session_state['subtitle_auto_transcribe_hotword'] = hotword st.session_state['subtitle_auto_transcribe_enable_spk'] = enable_spk st.session_state['subtitle_auto_transcribe_api_key'] = api_key @@ -495,17 +497,17 @@ def render_auto_transcription_settings(tr): backend_options = { tr("Local FunASR-Pack API"): "local", + tr("Local FireRedASR API"): "firered", tr("Ali Bailian Online Fun-ASR"): "bailian", } backend_values = list(backend_options.values()) backend_labels = list(backend_options.keys()) - backend_label = st.radio( + backend_label = st.selectbox( tr("Subtitle Processing Method"), options=backend_labels, index=backend_values.index(backend), - horizontal=True, - key="subtitle_auto_transcribe_backend_radio", + key="subtitle_auto_transcribe_backend_select", ) backend = backend_options[backend_label] @@ -529,6 +531,14 @@ def render_auto_transcription_settings(tr): help=tr("Enable speaker diarization Help"), key="subtitle_auto_transcribe_enable_spk_checkbox", ) + elif backend == "firered": + st.caption(tr("Auto Transcription FireRed Caption")) + firered_api_url = st.text_input( + tr("Local FireRedASR API URL"), + value=firered_api_url, + help=tr("Local FireRedASR API URL Help"), + key="subtitle_auto_transcribe_firered_api_url_input", + ) else: st.caption(tr("Auto Transcription Online Caption")) st.markdown( @@ -546,6 +556,7 @@ def render_auto_transcription_settings(tr): config.fun_asr["backend"] = backend config.fun_asr["api_url"] = str(api_url).strip() + config.fun_asr["firered_api_url"] = str(firered_api_url).strip() config.fun_asr["api_key"] = str(api_key).strip() config.fun_asr["hotword"] = str(hotword).strip() config.fun_asr["enable_spk"] = bool(enable_spk) @@ -553,6 +564,7 @@ def render_auto_transcription_settings(tr): 
st.session_state['subtitle_auto_transcribe_backend'] = backend st.session_state['subtitle_auto_transcribe_api_url'] = str(api_url).strip() + st.session_state['subtitle_auto_transcribe_firered_api_url'] = str(firered_api_url).strip() st.session_state['subtitle_auto_transcribe_api_key'] = str(api_key).strip() st.session_state['subtitle_auto_transcribe_hotword'] = str(hotword).strip() st.session_state['subtitle_auto_transcribe_enable_spk'] = bool(enable_spk) @@ -692,6 +704,10 @@ def get_subtitle_params(): 'subtitle_auto_transcribe_api_url', config.fun_asr.get("api_url", "") ), + 'subtitle_auto_transcribe_firered_api_url': st.session_state.get( + 'subtitle_auto_transcribe_firered_api_url', + config.fun_asr.get("firered_api_url", "") + ), 'subtitle_auto_transcribe_api_key': st.session_state.get( 'subtitle_auto_transcribe_api_key', config.fun_asr.get("api_key", "") diff --git a/webui/i18n/en.json b/webui/i18n/en.json index e4de584..fae8c95 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -414,14 +414,19 @@ "Ali Bailian Fun-ASR Subtitle Transcription": "Subtitle Processing", "Subtitle Processing Method": "Subtitle Processing Method", "Fun-ASR Backend": "Fun-ASR Backend", - "Local FunASR-Pack API": "Local", - "Ali Bailian Online Fun-ASR": "Online", + "Local FunASR-Pack API": "FunASR (Local)", + "Local FireRedASR API": "FireRedASR2 (Local)", + "Ali Bailian Online Fun-ASR": "FunASR (Online)", "Local Fun-ASR upload caption": "The current video above will be converted to SRT subtitles through the locally running FunASR-Pack API.", + "Local FireRed-ASR upload caption": "The current video above will be converted to SRT subtitles through the locally running FireRedASR2-AED-Pack API.", "Fun-ASR upload caption": "The current video above will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.", "Auto Transcription Local Caption": "After the final video is merged, it will be converted to SRT subtitles through the locally running 
FunASR-Pack API.", + "Auto Transcription FireRed Caption": "After the final video is merged, it will be converted to SRT subtitles through the locally running FireRedASR2-AED-Pack API.", "Auto Transcription Online Caption": "After the final video is merged, it will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.", "Local FunASR-Pack API URL": "Local FunASR-Pack API URL", "Local FunASR-Pack API URL Help": "For example, http://127.0.0.1:7860. A full /asr endpoint URL is also supported.", + "Local FireRedASR API URL": "Local ASR API URL", + "Local FireRedASR API URL Help": "For example, http://127.0.0.1:7867. A full /asr endpoint URL is also supported.", "Fun-ASR Hotword": "Hotword", "Fun-ASR Hotword Help": "Optional hotwords passed to the local FunASR-Pack API.", "Enable speaker diarization": "Enable speaker diarization", @@ -439,8 +444,10 @@ "Calibrate subtitles": "Calibrate Subtitles", "Please enter Ali Bailian API Key": "Please enter the Ali Bailian API Key first", "Please enter local FunASR-Pack API URL": "Please enter the local FunASR-Pack API URL first", + "Please enter local FireRedASR API URL": "Please enter the local ASR API URL first", "Please upload media to transcribe": "Please upload the audio or video file to transcribe first", "Transcribing with local FunASR-Pack...": "Transcribing subtitles with local FunASR-Pack, please wait...", + "Transcribing with local FireRedASR...": "Transcribing subtitles with local ASR, please wait...", "Transcribing with Fun-ASR...": "Transcribing subtitles with Ali Bailian Fun-ASR, please wait...", "Fun-ASR failed without subtitle file": "Fun-ASR transcription failed: no subtitle file was generated", "Subtitle transcription succeeded": "Subtitle transcription succeeded: {file}", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index e400adb..63eac0e 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -365,14 +365,19 @@ "Ali Bailian Fun-ASR Subtitle Transcription": 
"字幕处理", "Subtitle Processing Method": "字幕处理方式", "Fun-ASR Backend": "Fun-ASR 后端", - "Local FunASR-Pack API": "本地转写", - "Ali Bailian Online Fun-ASR": "在线转写", + "Local FunASR-Pack API": "FunASR(本地部署)", + "Local FireRedASR API": "FireRedASR2(本地部署)", + "Ali Bailian Online Fun-ASR": "FunASR(在线服务)", "Local Fun-ASR upload caption": "将使用上方当前视频,通过本机运行的 FunASR-Pack API 生成 SRT 字幕。", + "Local FireRed-ASR upload caption": "将使用上方当前视频,通过本机运行的 FireRedASR2-AED-Pack API 生成 SRT 字幕。", "Fun-ASR upload caption": "将使用上方当前视频,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。", "Auto Transcription Local Caption": "将在最终视频合并完成后,通过本机运行的 FunASR-Pack API 生成 SRT 字幕。", + "Auto Transcription FireRed Caption": "将在最终视频合并完成后,通过本机运行的 FireRedASR2-AED-Pack API 生成 SRT 字幕。", "Auto Transcription Online Caption": "将在最终视频合并完成后,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。", "Local FunASR-Pack API URL": "本地 FunASR-Pack API 地址", "Local FunASR-Pack API URL Help": "例如 http://127.0.0.1:7860;也可以直接填到 /asr 的完整地址。", + "Local FireRedASR API URL": "本地ASR API 地址", + "Local FireRedASR API URL Help": "例如 http://127.0.0.1:7867;也可以直接填到 /asr 的完整地址。", "Fun-ASR Hotword": "热词", "Fun-ASR Hotword Help": "可选,传给本地 FunASR-Pack 的热词参数。", "Enable speaker diarization": "启用说话人分段", @@ -390,8 +395,10 @@ "Calibrate subtitles": "校准字幕", "Please enter Ali Bailian API Key": "请先输入阿里百炼 API Key", "Please enter local FunASR-Pack API URL": "请先输入本地 FunASR-Pack API 地址", + "Please enter local FireRedASR API URL": "请先输入本地ASR API 地址", "Please upload media to transcribe": "请先上传需要转录的音频或视频文件", "Transcribing with local FunASR-Pack...": "正在使用本地 FunASR-Pack 转写字幕,请稍候...", + "Transcribing with local FireRedASR...": "正在使用本地ASR转写字幕,请稍候...", "Transcribing with Fun-ASR...": "正在使用阿里百炼 Fun-ASR 转写字幕,请稍候...", "Fun-ASR failed without subtitle file": "Fun-ASR 转写失败:未生成字幕文件", "Subtitle transcription succeeded": "字幕转写成功: {file}", From 4ab29fd7763c18808509270389eef7e885bd1493 Mon Sep 17 00:00:00 2001 From: viccy Date: Sun, 7 Jun 2026 18:36:47 +0800 Subject: [PATCH 16/24] 
=?UTF-8?q?feat:=20=E4=BC=98=E5=8C=96=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E7=94=9F=E6=88=90=E8=BF=9B=E5=BA=A6=E5=B1=95=E7=A4=BA?= =?UTF-8?q?=E4=B8=8EUI=E7=BB=86=E8=8A=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 为视频生成任务的每个处理步骤添加详细的中文状态提示 - 重构WebUI的视频生成弹窗,使用Streamlit原生状态组件优化进度展示 - 清理多语言翻译文本中的冗余表情符号,统一UI文本风格 - 调整TTS设置面板的折叠面板默认展开状态为关闭,并移除标题中的表情前缀 --- app/services/task.py | 92 +++++++++++++++++-- webui.py | 137 +++++++++++++++++++---------- webui/components/audio_settings.py | 6 +- webui/i18n/en.json | 52 +++++------ webui/i18n/zh.json | 52 +++++------ 5 files changed, 231 insertions(+), 108 deletions(-) diff --git a/app/services/task.py b/app/services/task.py index 74e7804..356b7a6 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -434,12 +434,23 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): global merged_audio_path, merged_subtitle_path logger.info(f"\n\n## 开始统一视频处理任务: {task_id}") - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0) + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=0, + message="正在初始化视频生成任务", + ) """ 1. 加载剪辑脚本 """ logger.info("\n\n## 1. 加载视频脚本") + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=5, + message="正在加载剪辑脚本", + ) video_script_path = path.join(params.video_clip_json_path) if path.exists(video_script_path): @@ -465,6 +476,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): 2. 使用 TTS 生成音频素材 """ logger.info("\n\n## 2. 
根据OST设置生成音频列表") + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=10, + message="正在生成 TTS 配音", + ) # 只为OST=0 or 2的判断生成音频, OST=0 仅保留解说 OST=2 保留解说和原声 tts_segments = [ segment for segment in list_script @@ -481,12 +498,23 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): voice_pitch=params.voice_pitch, ) - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20) + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=20, + message="TTS 配音生成完成", + ) """ 3. 统一视频裁剪 - 基于OST类型的差异化裁剪策略 """ logger.info("\n\n## 3. 统一视频裁剪(基于OST类型)") + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=30, + message="正在按脚本裁剪视频片段", + ) # 使用新的统一裁剪策略 video_clip_result = clip_video.clip_video_unified( @@ -505,12 +533,23 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): logger.info(f"统一裁剪完成,处理了 {len(video_clip_result)} 个视频片段") - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60) + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=60, + message="视频片段裁剪完成", + ) """ 4. 合并音频和字幕 """ logger.info("\n\n## 4. 合并音频和字幕") + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=65, + message="正在合并配音和字幕", + ) total_duration = sum([script["duration"] for script in new_script_list]) if tts_segments: try: @@ -540,6 +579,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): logger.warning("没有需要合并的音频/字幕") merged_audio_path = "" merged_subtitle_path = "" + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=70, + message="配音和字幕合并完成", + ) """ 5. 合并视频 @@ -549,6 +594,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4") logger.info(f"\n\n## 5. 
合并视频: => {combined_video_path}") + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=75, + message="正在合并视频片段", + ) # 使用统一裁剪后的视频片段 video_clips = [] @@ -568,7 +619,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): video_aspect=params.video_aspect, threads=params.n_threads ) - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80) + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=80, + message="视频片段合并完成", + ) """ 6. 合并字幕/BGM/配音/视频 @@ -581,6 +637,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): else output_video_path ) logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {merge_output_video_path}") + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=85, + message="正在合成最终视频", + ) bgm_path = utils.get_bgm_file( bgm_type=getattr(params, "bgm_type", "random"), @@ -634,10 +696,20 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): auto_subtitle_path = "" if auto_transcription_enabled: - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=90) + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=90, + message="正在自动转录最终视频", + ) logger.info("\n\n## 7. 自动转录最终视频字幕") auto_subtitle_path = _transcribe_final_video(task_id, merge_output_video_path, params) - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=95) + sm.state.update_task( + task_id, + state=const.TASK_STATE_PROCESSING, + progress=95, + message="正在压入自动转录字幕", + ) logger.info(f"\n\n## 8. 
压入自动转录字幕 -> {output_video_path}") _merge_auto_transcribed_subtitles( source_video_path=merge_output_video_path, @@ -657,7 +729,13 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): } if auto_subtitle_path: kwargs["subtitles"] = [auto_subtitle_path] - sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs) + sm.state.update_task( + task_id, + state=const.TASK_STATE_COMPLETE, + progress=100, + message="视频生成完成", + **kwargs + ) return kwargs diff --git a/webui.py b/webui.py index 5ba26a3..3eac2eb 100644 --- a/webui.py +++ b/webui.py @@ -170,57 +170,102 @@ def render_generate_button(): # 生成一个新的task_id用于本次处理 task_id = str(uuid.uuid4()) - # 创建进度条 - progress_bar = st.progress(0) - status_text = st.empty() + @st.dialog(tr("Generating Video"), width="large") + def generate_video_dialog(): + st.markdown( + """ + + """, + unsafe_allow_html=True, + ) - def run_task(): - try: - tm.start_subclip_unified( - task_id=task_id, - params=params - ) - except Exception as e: - logger.error(f"任务执行失败: {e}") - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, message=str(e)) + progress_bar = st.progress(0) + status_panel = st.status(tr("Generating Video"), expanded=True) + status_panel.write(tr("Generating Video")) - # 在新线程中启动任务 - thread = threading.Thread(target=run_task) - thread.start() + def run_task(): + try: + tm.start_subclip_unified( + task_id=task_id, + params=params + ) + except Exception as e: + logger.error(f"任务执行失败: {e}") + current_task = sm.state.get_task(task_id) or {} + sm.state.update_task( + task_id, + state=const.TASK_STATE_FAILED, + progress=current_task.get("progress", 0), + message=str(e), + ) - # 轮询任务状态 - while True: - task = sm.state.get_task(task_id) - if task: - progress = task.get("progress", 0) - state = task.get("state") - - # 更新进度条 - progress_bar.progress(progress / 100) - status_text.text(f"Processing... 
{progress}%") + # 在新线程中启动任务 + thread = threading.Thread(target=run_task) + thread.start() + + last_status_key = None + + # 轮询任务状态 + while True: + task = sm.state.get_task(task_id) + if task: + progress = task.get("progress", 0) + state = task.get("state") - if state == const.TASK_STATE_COMPLETE: - status_text.text(tr("Video Generation Completed")) - progress_bar.progress(1.0) - - # 显示结果 - video_files = task.get("videos", []) try: - if video_files: - player_cols = st.columns(len(video_files) * 2 + 1) - for i, url in enumerate(video_files): - player_cols[i * 2 + 1].video(url) - except Exception as e: - logger.error(f"播放视频失败: {e}") - - st.success(tr("Video Generation Completed")) - break - - elif state == const.TASK_STATE_FAILED: - st.error(f"{tr('Task failed')}: {task.get('message', 'Unknown error')}") - break - - time.sleep(0.5) + progress = int(progress) + except (TypeError, ValueError): + progress = 0 + progress = max(0, min(progress, 100)) + + # 更新进度条和阶段状态 + progress_bar.progress(progress / 100) + current_message = task.get("message") or f"Processing... 
{progress}%" + status_label = f"{current_message} ({progress}%)" + status_key = (state, progress, current_message) + if status_key != last_status_key: + status_panel.write(status_label) + last_status_key = status_key + + if state == const.TASK_STATE_COMPLETE: + status_panel.update( + label=tr("Video Generation Completed"), + state="complete", + expanded=False, + ) + progress_bar.progress(1.0) + + # 显示结果 + video_files = task.get("videos", []) + try: + if video_files: + for url in video_files: + st.video(url) + except Exception as e: + logger.error(f"播放视频失败: {e}") + + st.success(tr("Video Generation Completed")) + break + + if state == const.TASK_STATE_FAILED: + status_panel.update( + label=f"{tr('Task failed')}: {task.get('message', 'Unknown error')}", + state="error", + expanded=True, + ) + st.error(f"{tr('Task failed')}: {task.get('message', 'Unknown error')}") + break + + time.sleep(0.5) + + generate_video_dialog() def get_voice_name_for_tts_engine(tts_engine: str) -> str: diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index cab5413..8a6f4cc 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -522,7 +522,7 @@ def render_tts_settings(tr): """渲染TTS(文本转语音)设置""" # 1. 
TTS引擎选择器 - # st.subheader("🎤 TTS引擎选择") + # st.subheader("TTS引擎选择") engine_options = get_tts_engine_options(tr) engine_descriptions = get_tts_engine_descriptions(tr) @@ -553,7 +553,7 @@ def render_tts_settings(tr): if selected_engine in engine_descriptions: desc = engine_descriptions[selected_engine] - with st.expander(tr("TTS Engine Details").format(engine=desc['title']), expanded=True): + with st.expander(tr("TTS Engine Details").format(engine=desc['title']), expanded=False): st.markdown(f"**{tr('Features')}:** {desc['features']}") st.markdown(f"**{tr('Use Case')}:** {desc['use_case']}") @@ -561,7 +561,7 @@ def render_tts_settings(tr): st.markdown(f"**{tr('Registration URL')}:** [{desc['registration']}]({desc['registration']})") # 3. 根据选择的引擎渲染对应的配置界面 - # st.subheader("⚙️ 引擎配置") + # st.subheader("引擎配置") if selected_engine == "edge_tts": render_edge_tts_settings(tr) diff --git a/webui/i18n/en.json b/webui/i18n/en.json index fae8c95..0a8fb4b 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -61,7 +61,7 @@ "Preview Background Music Help": "Play the selected background music.", "Upload Background Music File": "Upload Background Music File", "Upload Background Music Help": "Upload an audio file to use as background music.", - "Background Music uploaded": "✅ Background music uploaded: {path}", + "Background Music uploaded": "Background music uploaded: {path}", "Background Music Volume": "Background Music Volume (0.2 represents 20%, background sound should not be too loud)", "Subtitle Settings": "**Subtitle Settings**", "Enable Subtitles": "Enable Subtitles (If unchecked, the following settings will not take effect)", @@ -326,13 +326,13 @@ "Cloud Service": "Cloud Service", "Select TTS Engine": "Select TTS Engine", "Select TTS Engine Help": "Choose the text-to-speech engine you want to use.", - "TTS Engine Details": "📋 {engine} Details", + "TTS Engine Details": "{engine} Details", "Features": "Features", "Use Case": "Use Case", "Registration URL": "Registration 
URL", "Voice Selection": "Voice Selection", "Select Edge TTS Voice": "Select an Edge TTS voice", - "Edge TTS Voice Description": "💡 Edge TTS Voice Notes", + "Edge TTS Voice Description": "Edge TTS Voice Notes", "Loaded voice count": "Loaded {count} voices", "Female Voice": "Female voice", "Male Voice": "Male voice", @@ -348,21 +348,21 @@ "Azure Speech Key Help": "Azure Speech Services API key", "Voice Name": "Voice Name", "Azure Voice Name Help": "Enter an Azure Speech Services voice name. You can use the official voice name directly, such as zh-CN-YunzeNeural.", - "Common Voice Reference": "💡 Common Voice Reference", + "Common Voice Reference": "Common Voice Reference", "Chinese Voices": "Chinese Voices", "English Voices": "English Voices", "Multilingual": "multilingual", - "Azure Voices Docs Notice": "💡 For more voices, see the [Azure Speech Services documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support).", + "Azure Voices Docs Notice": "For more voices, see the [Azure Speech Services documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support).", "Quick Select": "Quick Select", "Chinese Female Voice": "Chinese Female Voice", "Chinese Male Voice": "Chinese Male Voice", "English Female Voice": "English Female Voice", - "Voice name valid": "✅ Voice name is valid: {voice}", - "Voice name format may be invalid": "⚠️ Voice name format may be incorrect: {voice}", - "Azure voice name format notice": "💡 Azure voice names usually follow this format: [language]-[region]-[name]Neural", - "Azure Speech Services configured": "✅ Azure Speech Services is configured", - "Please configure service region": "⚠️ Please configure the service region", - "Please configure API Key": "⚠️ Please configure the API Key", + "Voice name valid": "Voice name is valid: {voice}", + "Voice name format may be invalid": "Voice name format may be incorrect: {voice}", + "Azure voice name format notice": 
"Azure voice names usually follow this format: [language]-[region]-[name]Neural", + "Azure Speech Services configured": "Azure Speech Services is configured", + "Please configure service region": "Please configure the service region", + "Please configure API Key": "Please configure the API Key", "Task failed": "Task failed", "Script file cannot be empty": "Script file cannot be empty", "Video file cannot be empty": "Video file cannot be empty", @@ -486,10 +486,10 @@ "Tencent Service Region Help": "Select the Tencent Cloud TTS service region", "Custom Voice": "Custom Voice", "Select Tencent TTS Voice": "Select a Tencent Cloud TTS voice", - "Tencent Cloud TTS Voice Description": "💡 Tencent Cloud TTS Voice Notes", + "Tencent Cloud TTS Voice Description": "Tencent Cloud TTS Voice Notes", "Female Voices": "Female Voices", "Male Voices": "Male Voices", - "Tencent More Voices Notice": "💡 See the official Tencent Cloud documentation for more voices.", + "Tencent More Voices Notice": "See the official Tencent Cloud documentation for more voices.", "Qwen DashScope API Key Help": "Tongyi Qwen DashScope API Key", "TTS Model Name": "TTS Model Name", "Qwen TTS Model Help": "Qwen TTS model name, for example qwen3-tts-flash", @@ -532,12 +532,12 @@ "Preview Reference Audio Help": "Play the selected reference audio.", "Upload Reference Audio File": "Upload Reference Audio File", "Upload Reference Audio Help": "Upload a clear audio clip for voice cloning", - "Audio uploaded": "✅ Audio uploaded: {path}", + "Audio uploaded": "Audio uploaded: {path}", "Inference Mode": "Inference Mode", "Standard Inference": "Standard Inference", "Fast Inference": "Fast Inference", "Inference Mode Help": "Standard inference has higher quality but is slower. 
Fast inference is faster with slightly lower quality.", - "Advanced Parameters": "🔧 Advanced Parameters", + "Advanced Parameters": "Advanced Parameters", "Sampling Temperature": "Sampling Temperature", "Sampling Temperature Help": "Controls randomness. Higher values are more random; lower values are more deterministic.", "Top P Help": "Probability threshold for nucleus sampling. Smaller values make results more deterministic.", @@ -548,9 +548,9 @@ "Repetition Penalty Help": "Higher values reduce repetition, but overly high values may sound unnatural.", "Enable Sampling": "Enable Sampling", "Enable Sampling Help": "Enable sampling for more natural speech.", - "IndexTTS Usage Instructions Title": "💡 IndexTTS-1.5 Usage Instructions", + "IndexTTS Usage Instructions Title": "IndexTTS-1.5 Usage Instructions", "IndexTTS Usage Instructions": "**Zero-shot voice cloning**\n\n1. **Prepare reference audio**: upload or specify a clear audio file (3-10 seconds recommended)\n2. **Set API URL**: make sure the IndexTTS-1.5 service is running\n3. **Start synthesis**: the system will use the reference voice to synthesize new speech\n\n**Notes**:\n- Reference audio quality directly affects synthesis quality\n- Use clean audio without background noise when possible\n- Keep text length within a reasonable range\n- The first synthesis may take longer", - "IndexTTS2 Emotion Parameters": "🎭 Emotion Parameters", + "IndexTTS2 Emotion Parameters": "Emotion Parameters", "Emotion Mode": "Emotion Mode", "Emotion Mode Help": "Choose the emotion control source for IndexTTS-2.", "Emotion Mode Speaker": "Same as speaker reference", @@ -578,7 +578,7 @@ "Max Text Tokens Per Segment Help": "Maximum text tokens per segment for IndexTTS-2 inference.", "Max Mel Tokens": "Max Mel Tokens", "Max Mel Tokens Help": "Controls the maximum mel tokens generated in one request. 
Higher values can produce longer audio.", - "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 Usage Instructions", + "IndexTTS2 Usage Instructions Title": "IndexTTS-2 Usage Instructions", "IndexTTS2 Usage Instructions": "**IndexTTS-2 voice cloning**\n\n1. **Choose a voice**: reuse IndexTTS-1.5 resource audio or upload a reference audio file\n2. **Set API URL**: for example http://192.168.3.6:7863/tts, or enter the service root\n3. **Tune emotion**: speaker is the default; switch to audio, vector, or text when needed\n4. **Tune generation**: temperature, top_p, top_k, num_beams, repetition_penalty, and max_mel_tokens are sent directly to the IndexTTS-2 API\n\n**Notes**:\n- Reference audio quality directly affects cloning quality\n- The first request may load the model and take longer\n- CPU deployments are much slower than GPU deployments", "OmniVoice Usage Instructions Title": "OmniVoice Usage Instructions", "OmniVoice Usage Instructions": "**OmniVoice-Pack speech synthesis**\n\n1. **Automatic voice**: set the API URL and language, then synthesize directly.\n2. **Voice design**: fill instruct with the desired gender, pitch, accent, or style.\n3. 
**Reference-audio clone**: upload or choose reference audio and fill its matching transcript.\n\n**Notes**:\n- The default service URL is http://127.0.0.1:7866/tts\n- Reference-audio cloning requires reference text when the service has no ASR model loaded\n- OmniVoice returns WAV audio, and NarratoAI estimates subtitle segment timing from the audio duration", @@ -594,7 +594,7 @@ "Voice Pitch Help 0.5-1.5": "Adjust voice pitch (0.5-1.5)", "Sentence Silence Duration": "Sentence-end Silence Duration (seconds)", "Sentence Silence Duration Help": "Adjust sentence-end silence duration (0.0-2.0 seconds)", - "Doubao TTS API Key Application Process": "💡 Doubao TTS API Key Application Process", + "Doubao TTS API Key Application Process": "Doubao TTS API Key Application Process", "Application Steps": "Application Steps", "Doubao TTS Step 1": "1. Open [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)", "Doubao TTS Step 2": "2. Create a new Access Key and Secret Key", @@ -602,15 +602,15 @@ "Doubao TTS Step 4": "4. Click Start Now", "Doubao TTS Step 5": "5. In the left API Service Center, find Speech Synthesis under Audio Generation (note: Speech Synthesis, not the speech synthesis large model)", "Doubao TTS Step 6": "6. Scroll to the bottom to get the APPID and Access Token", - "Doubao TTS Fill Credentials Notice": "💡 Fill the Access Key, Secret Key, AppID, and Token above.", - "Doubao TTS configured": "✅ Doubao TTS is configured", - "Please configure missing fields": "⚠️ Please configure: {fields}", - "Preview Voice Synthesis": "🎵 Preview Voice Synthesis", + "Doubao TTS Fill Credentials Notice": "Fill the Access Key, Secret Key, AppID, and Token above.", + "Doubao TTS configured": "Doubao TTS is configured", + "Please configure missing fields": "Please configure: {fields}", + "Preview Voice Synthesis": "Preview Voice Synthesis", "Voice Preview Sample": "Thanks for using NarratoAI. 
If you have any questions or suggestions, please join the community for help and discussion.", "Please configure voice settings first": "Please configure voice settings first", - "Voice synthesis successful": "✅ Voice synthesis successful!", - "Voice synthesis failed": "❌ Voice synthesis failed. Please check your configuration.", - "SoulVoice pitch not supported": "ℹ️ SoulVoice does not support pitch adjustment", + "Voice synthesis successful": "Voice synthesis successful!", + "Voice synthesis failed": "Voice synthesis failed. Please check your configuration.", + "SoulVoice pitch not supported": "SoulVoice does not support pitch adjustment", "Progress": "Progress", "Generating script...": "Generating script...", "Please select video file first": "Please select a video file first", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 63eac0e..1099604 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -49,7 +49,7 @@ "Preview Background Music Help": "播放当前背景音乐", "Upload Background Music File": "上传背景音乐文件", "Upload Background Music Help": "上传一个音频文件作为背景音乐", - "Background Music uploaded": "✅ 背景音乐已上传: {path}", + "Background Music uploaded": "背景音乐已上传: {path}", "Background Music Volume": "背景音乐音量(0.2表示20%,背景声音不宜过高)", "Subtitle Settings": "**字幕设置**", "Enable Subtitles": "启用字幕(若取消勾选,下面的设置都将不生效)", @@ -276,13 +276,13 @@ "Cloud Service": "云端服务", "Select TTS Engine": "选择 TTS 引擎", "Select TTS Engine Help": "选择您要使用的文本转语音引擎", - "TTS Engine Details": "📋 {engine} 详细说明", + "TTS Engine Details": "{engine} 详细说明", "Features": "特点", "Use Case": "适用场景", "Registration URL": "注册地址", "Voice Selection": "音色选择", "Select Edge TTS Voice": "选择 Edge TTS 音色", - "Edge TTS Voice Description": "💡 Edge TTS 音色说明", + "Edge TTS Voice Description": "Edge TTS 音色说明", "Loaded voice count": "已加载 {count} 个音色", "Female Voice": "女声", "Male Voice": "男声", @@ -298,21 +298,21 @@ "Azure Speech Key Help": "Azure Speech Services API 密钥", "Voice Name": "音色名称", "Azure Voice Name Help": "输入 Azure Speech Services 
音色名称,直接使用官方音色名称即可。例如:zh-CN-YunzeNeural", - "Common Voice Reference": "💡 常用音色参考", + "Common Voice Reference": "常用音色参考", "Chinese Voices": "中文音色", "English Voices": "英文音色", "Multilingual": "多语言", - "Azure Voices Docs Notice": "💡 更多音色请参考 [Azure Speech Services 官方文档](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support)", + "Azure Voices Docs Notice": "更多音色请参考 [Azure Speech Services 官方文档](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support)", "Quick Select": "快速选择", "Chinese Female Voice": "中文女声", "Chinese Male Voice": "中文男声", "English Female Voice": "英文女声", - "Voice name valid": "✅ 音色名称有效: {voice}", - "Voice name format may be invalid": "⚠️ 音色名称格式可能不正确: {voice}", - "Azure voice name format notice": "💡 Azure 音色名称通常格式为: [语言]-[地区]-[名称]Neural", - "Azure Speech Services configured": "✅ Azure Speech Services 配置已设置", - "Please configure service region": "⚠️ 请配置服务区域", - "Please configure API Key": "⚠️ 请配置 API Key", + "Voice name valid": "音色名称有效: {voice}", + "Voice name format may be invalid": "音色名称格式可能不正确: {voice}", + "Azure voice name format notice": "Azure 音色名称通常格式为: [语言]-[地区]-[名称]Neural", + "Azure Speech Services configured": "Azure Speech Services 配置已设置", + "Please configure service region": "请配置服务区域", + "Please configure API Key": "请配置 API Key", "Language": "界面语言", "Task failed": "任务失败", "Script file cannot be empty": "脚本文件不能为空", @@ -437,10 +437,10 @@ "Tencent Service Region Help": "选择腾讯云 TTS 服务地域", "Custom Voice": "自定义音色", "Select Tencent TTS Voice": "选择腾讯云 TTS 音色", - "Tencent Cloud TTS Voice Description": "💡 腾讯云 TTS 音色说明", + "Tencent Cloud TTS Voice Description": "腾讯云 TTS 音色说明", "Female Voices": "女声音色", "Male Voices": "男声音色", - "Tencent More Voices Notice": "💡 更多音色请参考腾讯云官方文档", + "Tencent More Voices Notice": "更多音色请参考腾讯云官方文档", "Qwen DashScope API Key Help": "通义千问 DashScope API Key", "TTS Model Name": "模型名称", "Qwen TTS Model Help": "Qwen TTS 模型名,例如 qwen3-tts-flash", @@ -483,12 +483,12 @@ "Preview 
Reference Audio Help": "播放当前参考音频", "Upload Reference Audio File": "上传参考音频文件", "Upload Reference Audio Help": "上传一段清晰的音频用于语音克隆", - "Audio uploaded": "✅ 音频已上传: {path}", + "Audio uploaded": "音频已上传: {path}", "Inference Mode": "推理模式", "Standard Inference": "普通推理", "Fast Inference": "快速推理", "Inference Mode Help": "普通推理质量更高但速度较慢,快速推理速度更快但质量略低", - "Advanced Parameters": "🔧 高级参数", + "Advanced Parameters": "高级参数", "Sampling Temperature": "采样温度 (Temperature)", "Sampling Temperature Help": "控制随机性,值越高输出越随机,值越低越确定", "Top P Help": "nucleus 采样的概率阈值,值越小结果越确定", @@ -499,9 +499,9 @@ "Repetition Penalty Help": "值越大越能避免重复,但过大可能导致不自然", "Enable Sampling": "启用采样", "Enable Sampling Help": "启用采样可以获得更自然的语音", - "IndexTTS Usage Instructions Title": "💡 IndexTTS-1.5 使用说明", + "IndexTTS Usage Instructions Title": "IndexTTS-1.5 使用说明", "IndexTTS Usage Instructions": "**零样本语音克隆**\n\n1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒)\n2. **设置 API 地址**:确保 IndexTTS-1.5 服务正常运行\n3. **开始合成**:系统会自动使用参考音频的音色合成新语音\n\n**注意事项**:\n- 参考音频质量直接影响合成效果\n- 建议使用无背景噪音的清晰音频\n- 文本长度建议控制在合理范围内\n- 首次合成可能需要较长时间", - "IndexTTS2 Emotion Parameters": "🎭 情感参数", + "IndexTTS2 Emotion Parameters": "情感参数", "Emotion Mode": "情感控制方式", "Emotion Mode Help": "选择 IndexTTS-2 的情感控制来源", "Emotion Mode Speaker": "与音色参考相同", @@ -529,7 +529,7 @@ "Max Text Tokens Per Segment Help": "IndexTTS-2 分段推理的最大文本 token 数", "Max Mel Tokens": "最大 Mel Tokens", "Max Mel Tokens Help": "控制单次生成的最大 mel token 数,值越大可生成更长音频", - "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 使用说明", + "IndexTTS2 Usage Instructions Title": "IndexTTS-2 使用说明", "IndexTTS2 Usage Instructions": "**IndexTTS-2 语音克隆**\n\n1. **选择音色**:复用 IndexTTS-1.5 的资源音频或上传参考音频\n2. **设置 API 地址**:例如 http://192.168.3.6:7863/tts,也可以填写服务根地址\n3. **调整情感参数**:默认使用 speaker,可按需切换到 audio、vector 或 text\n4. 
**调整生成参数**:temperature、top_p、top_k、num_beams、repetition_penalty 和 max_mel_tokens 会直接传给 IndexTTS-2 接口\n\n**注意事项**:\n- 参考音频质量会直接影响克隆效果\n- 首次请求可能需要加载模型,耗时更长\n- CPU 部署生成速度会明显慢于 GPU", "OmniVoice Usage Instructions Title": "OmniVoice 使用说明", "OmniVoice Usage Instructions": "**OmniVoice-Pack 语音合成**\n\n1. **自动音色**:只需要设置 API 地址和语言,可直接合成。\n2. **指令音色**:填写 instruct 描述想要的性别、音高、口音或风格。\n3. **参考音频克隆**:上传或选择参考音频,并填写该音频对应文本。\n\n**注意事项**:\n- 当前默认服务地址为 http://127.0.0.1:7866/tts\n- 参考音频克隆在服务未加载 ASR 模型时必须填写参考文本\n- OmniVoice 返回 WAV 音频,系统会按音频时长估算字幕段落", @@ -545,7 +545,7 @@ "Voice Pitch Help 0.5-1.5": "调节语音音高 (0.5-1.5)", "Sentence Silence Duration": "句尾静音时长 (秒)", "Sentence Silence Duration Help": "调节句尾静音时长 (0.0-2.0 秒)", - "Doubao TTS API Key Application Process": "💡 豆包语音 TTS API Key申请流程", + "Doubao TTS API Key Application Process": "豆包语音 TTS API Key申请流程", "Application Steps": "申请步骤", "Doubao TTS Step 1": "1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)", "Doubao TTS Step 2": "2. 新建 Access Key 和 Secret Key", @@ -553,15 +553,15 @@ "Doubao TTS Step 4": "4. 点击立即使用", "Doubao TTS Step 5": "5. 在最左边的 API 服务中心找到音频生成下面的语音合成(注意:是语音合成,不是语音合成大模型)", "Doubao TTS Step 6": "6. 
翻到最下面获取 APPID 和 Access Token", - "Doubao TTS Fill Credentials Notice": "💡 请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中", - "Doubao TTS configured": "✅ 豆包语音 TTS 配置已设置", - "Please configure missing fields": "⚠️ 请配置: {fields}", - "Preview Voice Synthesis": "🎵 试听语音合成", + "Doubao TTS Fill Credentials Notice": "请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中", + "Doubao TTS configured": "豆包语音 TTS 配置已设置", + "Please configure missing fields": "请配置: {fields}", + "Preview Voice Synthesis": "试听语音合成", "Voice Preview Sample": "感谢关注 NarratoAI,有任何问题或建议,可以加入社区频道求助或讨论", "Please configure voice settings first": "请先配置语音设置", - "Voice synthesis successful": "✅ 语音合成成功!", - "Voice synthesis failed": "❌ 语音合成失败,请检查配置", - "SoulVoice pitch not supported": "ℹ️ SoulVoice 引擎不支持音调调节", + "Voice synthesis successful": "语音合成成功!", + "Voice synthesis failed": "语音合成失败,请检查配置", + "SoulVoice pitch not supported": "SoulVoice 引擎不支持音调调节", "上传字幕文件": "上传字幕", "清除已上传字幕": "清除已上传字幕", "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)": "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312)", From ca4f2bf59455df5faf03f3b5f1d635a41f79a9c7 Mon Sep 17 00:00:00 2001 From: viccy Date: Sun, 7 Jun 2026 19:05:49 +0800 Subject: [PATCH 17/24] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E8=87=AA=E5=8A=A8=E5=AD=97=E5=B9=95=E7=94=9F=E6=88=90?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E5=B9=B6=E4=BC=98=E5=8C=96=E7=BD=91=E9=A1=B5?= =?UTF-8?q?=E8=A7=86=E9=A2=91=E5=B1=95=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加script_subtitle服务,支持基于脚本内容自动生成标准SRT字幕文件 修改任务处理流程,优先使用新的脚本字幕生成逻辑,失败时回退至原TTS字幕合并方案 优化最终视频自动转录逻辑,已生成脚本字幕时跳过重复的自动转录步骤 改进网页端弹窗视频的展示样式,根据宽高比调整预览宽度并添加黑色背景 新增完整的单元测试覆盖字幕生成相关功能 --- app/services/script_subtitle.py | 213 ++++++++++++++++++ app/services/task.py | 31 ++- app/services/test_script_subtitle_unittest.py | 94 ++++++++ webui.py | 15 +- 4 files changed, 348 insertions(+), 5 deletions(-) create mode 100644 app/services/script_subtitle.py create mode 100644 
app/services/test_script_subtitle_unittest.py diff --git a/app/services/script_subtitle.py b/app/services/script_subtitle.py new file mode 100644 index 0000000..2259580 --- /dev/null +++ b/app/services/script_subtitle.py @@ -0,0 +1,213 @@ +import os +import re +import unicodedata +from typing import Iterable, List, Optional, Sequence, Tuple + +from loguru import logger + +from app.utils import utils + + +DEFAULT_SUBTITLE_OST_TYPES = (0, 2) +DEFAULT_MAX_CHARS_PER_SUBTITLE = 12 +SENTENCE_PART_RE = re.compile(r"[^。!?!?;;,,、\n]+[。!?!?;;,,、]?") + + +def _normalize_text(text: str) -> str: + return re.sub(r"\s+", " ", str(text or "")).strip() + + +def _remove_punctuation(text: str) -> str: + return "".join( + char for char in str(text or "") + if not unicodedata.category(char).startswith("P") + ) + + +def clean_subtitle_text(text: str) -> str: + """Normalize subtitle text for burn-in display.""" + return _normalize_text(_remove_punctuation(text)) + + +def split_narration(text: str, max_chars: int = DEFAULT_MAX_CHARS_PER_SUBTITLE) -> List[str]: + """Split narration into readable subtitle chunks.""" + text = _normalize_text(text) + if not text: + return [] + + max_chars = max(1, int(max_chars or DEFAULT_MAX_CHARS_PER_SUBTITLE)) + parts = [match.group(0).strip() for match in SENTENCE_PART_RE.finditer(text)] + if not parts: + parts = [text] + + chunks = [] + current = "" + + def flush_long_part(part: str) -> str: + while len(part) > max_chars: + chunks.append(part[:max_chars].strip()) + part = part[max_chars:].strip() + return part + + for part in parts: + if not part: + continue + + if len(part) > max_chars: + if current: + chunks.append(current.strip()) + current = "" + current = flush_long_part(part) + continue + + candidate = f"{current}{part}" if current else part + if len(candidate) <= max_chars: + current = candidate + else: + if current: + chunks.append(current.strip()) + current = part + + if current: + chunks.append(current.strip()) + + return [cleaned for chunk in 
chunks if (cleaned := clean_subtitle_text(chunk))] + + +def parse_srt_like_time(time_text: str) -> float: + time_text = str(time_text or "").strip().replace(",", ".") + parts = time_text.split(":") + if len(parts) != 3: + raise ValueError(f"不支持的时间格式: {time_text}") + + hours = int(parts[0]) + minutes = int(parts[1]) + seconds = float(parts[2]) + return hours * 3600 + minutes * 60 + seconds + + +def parse_time_range(time_range: str) -> Tuple[float, float]: + if not time_range or "-" not in str(time_range): + raise ValueError(f"不支持的时间范围: {time_range}") + + start_text, end_text = str(time_range).split("-", 1) + start = parse_srt_like_time(start_text) + end = parse_srt_like_time(end_text) + if end <= start: + raise ValueError(f"结束时间必须晚于开始时间: {time_range}") + + return start, end + + +def format_srt_time(seconds: float) -> str: + milliseconds_total = max(0, int(round(float(seconds) * 1000))) + milliseconds = milliseconds_total % 1000 + total_seconds = milliseconds_total // 1000 + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + secs = total_seconds % 60 + return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}" + + +def _safe_ost_value(value) -> Optional[int]: + try: + return int(value) + except (TypeError, ValueError): + return None + + +def _resolve_item_time_range(item: dict, current_time: float) -> Tuple[Optional[Tuple[float, float]], float]: + edited_time_range = item.get("editedTimeRange") + if edited_time_range: + try: + start, end = parse_time_range(edited_time_range) + return (start, end), end + except ValueError as e: + logger.warning(f"解析 editedTimeRange 失败,将尝试使用 duration: {e}") + + duration = float(item.get("duration", 0.0) or 0.0) + if duration <= 0: + return None, current_time + + start = current_time + end = current_time + duration + return (start, end), end + + +def _build_srt_blocks( + list_script: Sequence[dict], + include_ost: Iterable[int], + max_chars: int, +) -> List[str]: + include_ost_set = {int(item) for item in 
include_ost} + blocks = [] + subtitle_index = 1 + current_time = 0.0 + + for item in list_script: + time_range, current_time = _resolve_item_time_range(item, current_time) + if not time_range: + continue + + ost = _safe_ost_value(item.get("OST")) + if ost not in include_ost_set: + continue + + chunks = split_narration(item.get("narration", ""), max_chars=max_chars) + if not chunks: + continue + + start, end = time_range + segment_duration = end - start + if segment_duration <= 0: + continue + + chunk_duration = segment_duration / len(chunks) + for chunk_index, chunk in enumerate(chunks): + chunk_start = start + chunk_duration * chunk_index + chunk_end = end if chunk_index == len(chunks) - 1 else start + chunk_duration * (chunk_index + 1) + blocks.append( + "\n".join( + [ + str(subtitle_index), + f"{format_srt_time(chunk_start)} --> {format_srt_time(chunk_end)}", + chunk, + ] + ) + ) + subtitle_index += 1 + + return blocks + + +def create_script_subtitle_file( + task_id: str, + list_script: Sequence[dict], + output_file: Optional[str] = None, + include_ost: Optional[Iterable[int]] = None, + max_chars: int = DEFAULT_MAX_CHARS_PER_SUBTITLE, +) -> str: + """Create a full SRT file from script narration and edited timeline ranges.""" + if not list_script: + return "" + + if include_ost is None: + include_ost = DEFAULT_SUBTITLE_OST_TYPES + + blocks = _build_srt_blocks(list_script, include_ost=include_ost, max_chars=max_chars) + if not blocks: + logger.warning("程序化字幕未生成内容") + return "" + + if output_file is None: + output_file = os.path.join(utils.task_dir(task_id), "script_subtitles.srt") + + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + with open(output_file, "w", encoding="utf-8") as f: + f.write("\n\n".join(blocks)) + f.write("\n") + + logger.info(f"程序化字幕生成成功: {output_file}, 共 {len(blocks)} 条") + return output_file diff --git a/app/services/task.py b/app/services/task.py index 356b7a6..f5f60ce 100644 --- 
a/app/services/task.py +++ b/app/services/task.py @@ -10,7 +10,16 @@ from app.config import config from app.config.audio_config import AudioConfig, get_recommended_volumes_for_content from app.models import const from app.models.schema import VideoClipParams -from app.services import (voice, audio_merger, subtitle_merger, clip_video, merger_video, update_script, generate_video) +from app.services import ( + voice, + audio_merger, + subtitle_merger, + clip_video, + merger_video, + update_script, + generate_video, + script_subtitle, +) from app.services import state as sm from app.utils import utils @@ -561,8 +570,20 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): ) logger.info(f"音频文件合并成功->{merged_audio_path}") - # 合并字幕文件 - merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list) + # 优先基于脚本文案和成片时间线生成字幕,失败时回退到TTS字幕合并 + merged_subtitle_path = "" + if getattr(params, "subtitle_enabled", True): + try: + merged_subtitle_path = script_subtitle.create_script_subtitle_file( + task_id=task_id, + list_script=new_script_list, + ) + except Exception as e: + logger.warning(f"程序化字幕生成失败,将尝试合并TTS字幕: {e}") + + if not merged_subtitle_path and getattr(params, "subtitle_enabled", True): + merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list) + if merged_subtitle_path: logger.info(f"字幕文件合并成功->{merged_subtitle_path}") else: @@ -630,7 +651,9 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): 6. 
合并字幕/BGM/配音/视频 """ output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") - auto_transcription_enabled = _is_auto_transcription_enabled(params) + auto_transcription_enabled = _is_auto_transcription_enabled(params) and not bool(merged_subtitle_path) + if _is_auto_transcription_enabled(params) and merged_subtitle_path: + logger.info("已生成字幕文件,跳过最终视频自动转录") merge_output_video_path = ( path.join(utils.task_dir(task_id), "combined_without_auto_subtitles.mp4") if auto_transcription_enabled diff --git a/app/services/test_script_subtitle_unittest.py b/app/services/test_script_subtitle_unittest.py new file mode 100644 index 0000000..a4eed37 --- /dev/null +++ b/app/services/test_script_subtitle_unittest.py @@ -0,0 +1,94 @@ +import tempfile +import unittest +from pathlib import Path + +from app.services import script_subtitle + + +class ScriptSubtitleTests(unittest.TestCase): + def test_split_narration_prefers_punctuation_boundaries(self): + chunks = script_subtitle.split_narration( + "她终于意识到,这场婚姻不是爱情,而是一场交易。", + max_chars=12, + ) + + self.assertEqual( + ["她终于意识到", "这场婚姻不是爱情", "而是一场交易"], + chunks, + ) + + def test_time_range_parsing_supports_milliseconds(self): + start, end = script_subtitle.parse_time_range("00:00:01,500-00:00:03,250") + + self.assertAlmostEqual(1.5, start) + self.assertAlmostEqual(3.25, end) + + def test_create_script_subtitle_file_skips_original_audio_segments(self): + list_script = [ + { + "_id": 1, + "OST": 0, + "narration": "第一句解说。第二句解说。", + "editedTimeRange": "00:00:00-00:00:04", + "duration": 4, + }, + { + "_id": 2, + "OST": 1, + "narration": "这句是原声,不应该默认生成。", + "editedTimeRange": "00:00:04-00:00:08", + "duration": 4, + }, + { + "_id": 3, + "OST": 2, + "narration": "混合片段也保留解说字幕。", + "editedTimeRange": "00:00:08-00:00:12", + "duration": 4, + }, + ] + + with tempfile.TemporaryDirectory() as temp_dir: + output_file = Path(temp_dir) / "script_subtitles.srt" + result = script_subtitle.create_script_subtitle_file( + task_id="test", + 
list_script=list_script, + output_file=str(output_file), + max_chars=16, + ) + + self.assertEqual(str(output_file), result) + content = output_file.read_text(encoding="utf-8") + + self.assertIn("00:00:00,000 -->", content) + self.assertIn("第一句解说", content) + self.assertIn("混合片段也保留解说字幕", content) + self.assertNotIn("这句是原声", content) + self.assertNotIn("。", content) + self.assertNotIn(",", content) + + def test_create_script_subtitle_file_uses_duration_when_edited_range_missing(self): + list_script = [ + { + "_id": 1, + "OST": 0, + "narration": "没有 editedTimeRange 时使用 duration。", + "duration": 3, + } + ] + + with tempfile.TemporaryDirectory() as temp_dir: + output_file = Path(temp_dir) / "script_subtitles.srt" + script_subtitle.create_script_subtitle_file( + task_id="test", + list_script=list_script, + output_file=str(output_file), + ) + content = output_file.read_text(encoding="utf-8") + + self.assertIn("00:00:00,000 -->", content) + self.assertIn("--> 00:00:03,000", content) + + +if __name__ == "__main__": + unittest.main() diff --git a/webui.py b/webui.py index 3eac2eb..34d1204 100644 --- a/webui.py +++ b/webui.py @@ -181,6 +181,11 @@ def render_generate_button(): div[data-testid="stDialog"] div[data-testid="stProgress"] { margin-bottom: 0.75rem; } + div[data-testid="stDialog"] video { + max-height: 62vh; + object-fit: contain; + background: #000; + } """, unsafe_allow_html=True, @@ -246,8 +251,16 @@ def render_generate_button(): video_files = task.get("videos", []) try: if video_files: + aspect = getattr(params, "video_aspect", "") + aspect = getattr(aspect, "value", aspect) + preview_width = 320 if aspect in { + VideoAspect.portrait.value, + VideoAspect.portrait_2.value, + } else 600 for url in video_files: - st.video(url) + _, preview_col, _ = st.columns([1, 2, 1]) + with preview_col: + st.video(url, width=preview_width) except Exception as e: logger.error(f"播放视频失败: {e}") From d10c2ff7c5aed36f30f6bd2f830221f856f8b839 Mon Sep 17 00:00:00 2001 From: viccy Date: 
Mon, 8 Jun 2026 00:30:37 +0800 Subject: [PATCH 18/24] =?UTF-8?q?feat(prompts,=20webui,=20llm):=20?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=BD=B1=E8=A7=86=E8=A7=A3=E8=AF=B4=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=E5=8F=8A=E9=85=8D=E5=A5=97=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增影视解说专属提示词模块,覆盖剧情分析、文案生成、片段规划、脚本匹配与修复全流程 - 注册影视解说模块到全局提示词系统,更新初始化加载逻辑 - 重构Tavily搜索服务,拆分通用搜索函数适配短剧和影视两类作品 - 更新WebUI界面,新增影视解说配置项、多语言翻译与版本号展示 - 升级项目版本号从0.7.9到0.8.1 - 调整LLM服务与适配器逻辑,支持自定义prompt分类适配不同解说类型 - 完善相关工具类与单元测试,覆盖影视解说场景调用流程 --- app/services/SDE/short_drama_explanation.py | 24 +- app/services/llm/migration_adapter.py | 15 +- ...test_subtitle_adapter_pipeline_unittest.py | 26 ++ app/services/llm/unified_service.py | 5 +- app/services/prompts/__init__.py | 2 + .../prompts/film_tv_narration/__init__.py | 48 +++ .../film_tv_narration/narration_copy.py | 88 ++++ .../film_tv_narration/plot_analysis.py | 99 +++++ .../film_tv_narration/script_generation.py | 152 +++++++ .../film_tv_narration/script_matching.py | 131 ++++++ .../film_tv_narration/script_repair.py | 96 +++++ .../film_tv_narration/segment_planning.py | 103 +++++ app/services/tavily_search.py | 33 +- project_version | 2 +- webui.py | 7 +- webui/components/script_settings.py | 375 +++++++++++++----- webui/i18n/en.json | 18 +- webui/i18n/zh.json | 18 +- webui/tools/generate_short_summary.py | 91 ++++- 19 files changed, 1186 insertions(+), 147 deletions(-) create mode 100644 app/services/prompts/film_tv_narration/__init__.py create mode 100644 app/services/prompts/film_tv_narration/narration_copy.py create mode 100644 app/services/prompts/film_tv_narration/plot_analysis.py create mode 100644 app/services/prompts/film_tv_narration/script_generation.py create mode 100644 app/services/prompts/film_tv_narration/script_matching.py create mode 100644 app/services/prompts/film_tv_narration/script_repair.py create mode 100644 app/services/prompts/film_tv_narration/segment_planning.py diff 
--git a/app/services/SDE/short_drama_explanation.py b/app/services/SDE/short_drama_explanation.py index 6910324..5d85679 100644 --- a/app/services/SDE/short_drama_explanation.py +++ b/app/services/SDE/short_drama_explanation.py @@ -31,6 +31,7 @@ class SubtitleAnalyzer: custom_prompt: Optional[str] = None, temperature: Optional[float] = 1.0, provider: Optional[str] = None, + prompt_category: str = "short_drama_narration", ): """ 初始化字幕分析器 @@ -49,6 +50,7 @@ class SubtitleAnalyzer: self.base_url = base_url self.temperature = temperature self.provider = provider or self._detect_provider() + self.prompt_category = prompt_category or "short_drama_narration" # 设置自定义提示词(如果提供) self.custom_prompt = custom_prompt @@ -94,7 +96,7 @@ class SubtitleAnalyzer: else: # 使用新的提示词管理系统,正确传入参数 prompt = PromptManager.get_prompt( - category="short_drama_narration", + category=self.prompt_category, name="plot_analysis", parameters={"subtitle_content": subtitle_content} ) @@ -365,12 +367,12 @@ class SubtitleAnalyzer: def _render_prompt(self, name: str, parameters: Dict[str, Any]) -> Tuple[str, Optional[str]]: prompt = PromptManager.get_prompt( - category="short_drama_narration", + category=self.prompt_category, name=name, parameters=parameters, ) prompt_object = PromptManager.get_prompt_object( - category="short_drama_narration", + category=self.prompt_category, name=name, ) return prompt, prompt_object.get_system_prompt() @@ -838,7 +840,8 @@ def analyze_subtitle( temperature: float = 1.0, save_result: bool = False, output_path: Optional[str] = None, - provider: Optional[str] = None + provider: Optional[str] = None, + prompt_category: str = "short_drama_narration", ) -> Dict[str, Any]: """ 分析字幕内容的便捷函数 @@ -865,7 +868,8 @@ def analyze_subtitle( model=model, base_url=base_url, custom_prompt=custom_prompt, - provider=provider + provider=provider, + prompt_category=prompt_category, ) logger.debug(f"使用模型: {analyzer.model} 开始分析, 温度: {analyzer.temperature}") # 分析字幕 @@ -900,6 +904,7 @@ def 
generate_narration_script( provider: Optional[str] = None, narration_language: str = "简体中文(中国)", drama_genre: str = "逆袭/复仇", + prompt_category: str = "short_drama_narration", ) -> Dict[str, Any]: """ 根据剧情分析生成解说文案的便捷函数 @@ -926,7 +931,8 @@ def generate_narration_script( api_key=api_key, model=model, base_url=base_url, - provider=provider + provider=provider, + prompt_category=prompt_category, ) # 生成解说文案 @@ -957,6 +963,7 @@ def generate_narration_copy( provider: Optional[str] = None, narration_language: str = "简体中文(中国)", drama_genre: str = "逆袭/复仇", + prompt_category: str = "short_drama_narration", ) -> Dict[str, Any]: """生成可供用户审核修改的解说正文。""" analyzer = SubtitleAnalyzer( @@ -965,6 +972,7 @@ def generate_narration_copy( model=model, base_url=base_url, provider=provider, + prompt_category=prompt_category, ) return analyzer.generate_narration_copy( @@ -990,6 +998,7 @@ def match_narration_copy_to_script( narration_language: str = "简体中文(中国)", drama_genre: str = "逆袭/复仇", original_sound_ratio: int = 30, + prompt_category: str = "short_drama_narration", ) -> Dict[str, Any]: """将用户审核后的解说正文匹配到字幕时间戳。""" analyzer = SubtitleAnalyzer( @@ -998,6 +1007,7 @@ def match_narration_copy_to_script( model=model, base_url=base_url, provider=provider, + prompt_category=prompt_category, ) return analyzer.match_narration_copy_to_script( @@ -1025,6 +1035,7 @@ def repair_narration_script( provider: Optional[str] = None, narration_language: str = "简体中文(中国)", drama_genre: str = "逆袭/复仇", + prompt_category: str = "short_drama_narration", ) -> Dict[str, Any]: """根据校验错误修复解说文案的便捷函数。""" analyzer = SubtitleAnalyzer( @@ -1033,6 +1044,7 @@ def repair_narration_script( model=model, base_url=base_url, provider=provider, + prompt_category=prompt_category, ) return analyzer.repair_narration_script( diff --git a/app/services/llm/migration_adapter.py b/app/services/llm/migration_adapter.py index 96b165f..aec7ab5 100644 --- a/app/services/llm/migration_adapter.py +++ b/app/services/llm/migration_adapter.py @@ 
-198,11 +198,19 @@ class VisionAnalyzerAdapter: class SubtitleAnalyzerAdapter: """字幕分析器适配器""" - def __init__(self, api_key: str, model: str, base_url: str, provider: str = None): + def __init__( + self, + api_key: str, + model: str, + base_url: str, + provider: str = None, + prompt_category: str = "short_drama_narration", + ): self.api_key = api_key self.model = model self.base_url = base_url self.provider = provider or "openai" + self.prompt_category = prompt_category or "short_drama_narration" def _run_async_safely(self, coro_func, *args, **kwargs): """安全地运行异步协程""" @@ -228,12 +236,12 @@ class SubtitleAnalyzerAdapter: def _render_prompt(self, name: str, parameters: Dict[str, Any]) -> tuple[str, Optional[str]]: prompt = PromptManager.get_prompt( - category="short_drama_narration", + category=self.prompt_category, name=name, parameters=parameters, ) prompt_object = PromptManager.get_prompt_object( - category="short_drama_narration", + category=self.prompt_category, name=name, ) return prompt, prompt_object.get_system_prompt() @@ -466,6 +474,7 @@ class SubtitleAnalyzerAdapter: subtitle_content=subtitle_content, provider=self.provider, temperature=1.0, + prompt_category=self.prompt_category, api_key=self.api_key, api_base=self.base_url ) diff --git a/app/services/llm/test_subtitle_adapter_pipeline_unittest.py b/app/services/llm/test_subtitle_adapter_pipeline_unittest.py index 2245031..c9ed3f9 100644 --- a/app/services/llm/test_subtitle_adapter_pipeline_unittest.py +++ b/app/services/llm/test_subtitle_adapter_pipeline_unittest.py @@ -4,6 +4,7 @@ from unittest import mock from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter from app.services.llm.unified_service import UnifiedLLMService +from app.services.prompts import PromptManager class SubtitleAnalyzerAdapterPipelineTests(unittest.TestCase): @@ -30,6 +31,31 @@ class SubtitleAnalyzerAdapterPipelineTests(unittest.TestCase): self.assertIn("家庭伦理", call.call_args.kwargs["prompt"]) 
self.assertNotIn("response_format", call.call_args.kwargs) + def test_generate_narration_copy_can_use_film_tv_prompt_category(self): + self.assertTrue(PromptManager.exists("film_tv_narration", "narration_copy")) + adapter = SubtitleAnalyzerAdapter( + api_key="sk-test", + model="test-model", + base_url="https://example.test/v1", + provider="openai", + prompt_category="film_tv_narration", + ) + + with mock.patch.object(adapter, "_run_async_safely", return_value="他发现证据不对,真正的凶手另有其人。") as call: + result = adapter.generate_narration_copy( + short_name="测试电影", + plot_analysis="主角发现证据疑点。", + subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n证据不对。", + temperature=0.7, + narration_language="简体中文(中国)", + drama_genre="悬疑/犯罪", + ) + + self.assertEqual("success", result["status"]) + self.assertIn("影视解说正文创作任务", call.call_args.kwargs["prompt"]) + self.assertIn("用户选择的影视类型", call.call_args.kwargs["prompt"]) + self.assertNotIn("短剧解说正文创作任务", call.call_args.kwargs["prompt"]) + def test_match_narration_copy_to_script_uses_json_prompt_with_selected_type(self): adapter = SubtitleAnalyzerAdapter( api_key="sk-test", diff --git a/app/services/llm/unified_service.py b/app/services/llm/unified_service.py index 071e8da..70d9ae6 100644 --- a/app/services/llm/unified_service.py +++ b/app/services/llm/unified_service.py @@ -194,6 +194,7 @@ class UnifiedLLMService: async def analyze_subtitle(subtitle_content: str, provider: Optional[str] = None, temperature: float = 1.0, + prompt_category: str = "short_drama_narration", validate_output: bool = True, **kwargs) -> str: """ @@ -214,12 +215,12 @@ class UnifiedLLMService: """ try: prompt = PromptManager.get_prompt( - category="short_drama_narration", + category=prompt_category, name="plot_analysis", parameters={"subtitle_content": subtitle_content}, ) prompt_object = PromptManager.get_prompt_object( - category="short_drama_narration", + category=prompt_category, name="plot_analysis", ) system_prompt = prompt_object.get_system_prompt() diff 
--git a/app/services/prompts/__init__.py b/app/services/prompts/__init__.py index 3338673..55674cc 100644 --- a/app/services/prompts/__init__.py +++ b/app/services/prompts/__init__.py @@ -56,11 +56,13 @@ __all__ = [ def initialize_prompts(): """初始化提示词模块,注册所有提示词""" from . import documentary + from . import film_tv_narration from . import short_drama_editing from . import short_drama_narration # 注册各模块的提示词 documentary.register_prompts() + film_tv_narration.register_prompts() short_drama_editing.register_prompts() short_drama_narration.register_prompts() diff --git a/app/services/prompts/film_tv_narration/__init__.py b/app/services/prompts/film_tv_narration/__init__.py new file mode 100644 index 0000000..e98bc60 --- /dev/null +++ b/app/services/prompts/film_tv_narration/__init__.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: NarratoAI +@File : __init__.py +@Description: 影视解说提示词模块 +""" + +from .plot_analysis import PlotAnalysisPrompt +from .narration_copy import NarrationCopyPrompt +from .segment_planning import SegmentPlanningPrompt +from .script_generation import ScriptGenerationPrompt +from .script_matching import ScriptMatchingPrompt +from .script_repair import ScriptRepairPrompt +from ..manager import PromptManager + + +def register_prompts(): + """注册影视解说相关的提示词""" + plot_analysis_prompt = PlotAnalysisPrompt() + PromptManager.register_prompt(plot_analysis_prompt, is_default=True) + + narration_copy_prompt = NarrationCopyPrompt() + PromptManager.register_prompt(narration_copy_prompt, is_default=True) + + segment_planning_prompt = SegmentPlanningPrompt() + PromptManager.register_prompt(segment_planning_prompt, is_default=True) + + script_generation_prompt = ScriptGenerationPrompt() + PromptManager.register_prompt(script_generation_prompt, is_default=True) + + script_matching_prompt = ScriptMatchingPrompt() + PromptManager.register_prompt(script_matching_prompt, is_default=True) + + script_repair_prompt = ScriptRepairPrompt() + 
PromptManager.register_prompt(script_repair_prompt, is_default=True) + + +__all__ = [ + "PlotAnalysisPrompt", + "NarrationCopyPrompt", + "SegmentPlanningPrompt", + "ScriptGenerationPrompt", + "ScriptMatchingPrompt", + "ScriptRepairPrompt", + "register_prompts", +] diff --git a/app/services/prompts/film_tv_narration/narration_copy.py b/app/services/prompts/film_tv_narration/narration_copy.py new file mode 100644 index 0000000..7c6182a --- /dev/null +++ b/app/services/prompts/film_tv_narration/narration_copy.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 影视解说-解说文案 +@File : narration_copy.py +@Description: 生成可供用户审核修改的影视解说正文 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class NarrationCopyPrompt(ParameterizedPrompt): + """影视解说正文生成提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="narration_copy", + category="film_tv_narration", + version="v1.0", + description="基于剧情理解和字幕生成可审核修改的影视解说正文,不绑定时间戳", + model_type=ModelType.TEXT, + output_format=OutputFormat.TEXT, + tags=["影视", "解说文案", "电影解说", "剧情承接", "用户审核"], + parameters=["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"], + ) + super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "subtitle_content"]) + + self._system_prompt = ( + "你是一位影视解说文案创作者。你只输出可供用户审核修改的解说正文," + "不要输出JSON、时间戳、编号、标题、解释或Markdown。" + ) + + def get_template(self) -> str: + return """# 影视解说正文创作任务 + +## 目标 +为影视作品《${drama_name}》创作一份可直接给用户审核修改的解说文案正文。此阶段不做画面匹配,不输出时间戳。 + +## 剧情理解材料 + +${plot_analysis} + + +## 原始字幕 + +${subtitle_content} + + +## 输出语言 + +${narration_language} + + +## 用户选择的影视类型 + +${drama_genre} + + +## 类型写作规则 +必须按用户选择的影视类型调整表达重点,不要自行改判类型: +- 剧情/情感:突出人物选择、关系裂痕、命运压力和情绪余波。 +- 悬疑/犯罪:突出线索、疑点、动机、误导和未揭开的真相。 +- 动作/冒险:突出目标、危险升级、身体对抗和关键抉择。 +- 喜剧/轻松:突出误会、反差、节奏包袱和人物可爱处。 +- 科幻/奇幻:突出设定规则、未知威胁、世界观反差和代价。 +- 历史/战争:突出时代处境、阵营选择、牺牲和局势变化。 +- 恐怖/惊悚:突出异常细节、压迫感、未知危险和心理悬念。 +- 自定义类型:严格服从用户填写的类型方向。 + +## 开头钩子公式 
+开头必须使用“人物困境 + 反常信息 + 悬念问题”: +1. 先点出主角或关键人物正在面对什么压力。 +2. 再抛出一个违背常识、关系突变或危险升级的信息。 +3. 最后留下观众想继续看的问题:他为什么这样做、谁在撒谎、这场选择会把所有人推向哪里。 + +## 写作规则 +1. 必须使用 ${narration_language}。 +2. 严格基于剧情理解和字幕事实,不编造核心情节、身份、结局。 +3. 先写清楚人物动机和因果链,再写情绪金句;不要只堆形容词。 +4. 每句话只表达一个信息点,适合后续按句匹配画面。 +5. 句子尽量短,单句优先 15-35 字;信息复杂时拆成多句。 +6. 每 2-3 句要有明确承接,让观众知道为什么从上一幕来到下一幕。 +7. 总长度控制在 350-750 字;短素材取下限,长素材取上限。 +8. 不要使用编号、项目符号、章节标题或括号说明。 + +## 输出要求 +只输出解说正文。不要输出 JSON、时间戳、代码块或任何解释。""" diff --git a/app/services/prompts/film_tv_narration/plot_analysis.py b/app/services/prompts/film_tv_narration/plot_analysis.py new file mode 100644 index 0000000..e32faf2 --- /dev/null +++ b/app/services/prompts/film_tv_narration/plot_analysis.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 影视解说-剧情分析 +@File : plot_analysis.py +@Description: 影视剧情分析提示词 +""" + +from ..base import TextPrompt, PromptMetadata, ModelType, OutputFormat + + +class PlotAnalysisPrompt(TextPrompt): + """影视剧情分析提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="plot_analysis", + category="film_tv_narration", + version="v1.0", + description="结合字幕和可选联网检索上下文,输出适合影视解说脚本生成的结构化剧情理解", + model_type=ModelType.TEXT, + output_format=OutputFormat.TEXT, + tags=["影视", "电影", "电视剧", "剧情分析", "字幕解析", "解说脚本素材"], + parameters=["subtitle_content"], + ) + super().__init__(metadata) + + self._system_prompt = ( + "你是一位专业的影视解说策划和剧作分析师。请输出克制、结构化、" + "可直接供下游影视解说脚本生成使用的剧情理解材料。" + ) + + def get_template(self) -> str: + return """# 角色 +你是一位专业的影视解说策划和剧作分析师。你的输出不是给观众看的成片文案,而是给下游“影视解说脚本生成器”使用的结构化剧情理解材料。 + +# 输入说明 +下面的输入可能只包含一个视频的原始字幕,也可能包含多个视频文件的字幕;也可能同时包含联网检索结果和原始字幕。 +- 联网检索结果只能用于辅助识别作品名称、人物关系、时代背景、公开剧情梗概。 +- 原始字幕是唯一可信的当前片段事实来源。 +- 如果联网检索结果与字幕冲突,必须以字幕为准。 +- 如果联网检索结果包含当前字幕尚未出现的后续剧情,只能放在“字幕未覆盖/需谨慎信息”中,不能写进当前剧情事实。 +- 多个视频字幕会以“视频 1: 文件名”“视频 2: 文件名”等标题分隔。时间戳均为对应视频内部时间,不是拼接后的累计时间。 + +# 核心任务 +请基于输入完成剧情理解,目标是帮助后续生成高质量影视解说脚本: +1. 识别作品名称、当前字幕范围、视频来源、联网检索辅助信息和字幕事实边界。 +2. 统一人物称呼,梳理人物关系、动机和当前场景中的立场变化。 +3. 用 120-220 字概括当前字幕覆盖的剧情,不提前剧透字幕未出现的内容。 +4. 
按视频来源和字幕时间顺序拆分关键剧情段落,并为每段标注准确 video_id / video_name / 时间戳。 +5. 提炼解说创作可用的开场钩子、人物困境、情绪转折、信息反转、名场面和建议保留原声片段。 + +# 强制输出规则 +1. 禁止输出寒暄、解释身份或“好的,我将……”等聊天式开场。 +2. 禁止编造字幕中没有的具体事件、对白、关系进展或结局。 +3. 时间戳必须直接来自对应视频字幕;无法确定时写“字幕未明确”,不要猜测。 +4. 多视频场景下必须明确每段来自哪个视频文件,禁止把不同视频的同名时间戳混在一起。 +5. 人名必须统一:优先采用联网检索中的正式名称;如果字幕写法不同,在人物表中保留“字幕称呼”。 +6. 内容要简洁、客观、可复用,避免散文化长段落。 +7. 必须严格按照下面的 Markdown 格式输出,不要添加额外章节。 + +# 输出格式 +## 一、基础识别 +- 作品名称:[如输入可判断则填写,否则写“未知”] +- 当前字幕范围:[开始时间戳] --> [结束时间戳];无法确定则写“字幕未明确” +- 视频来源:[列出视频编号、文件名和各自字幕时间范围;单视频也要写] +- 联网检索确认:[仅写可辅助理解的公开信息;没有联网结果则写“未启用/未提供”] +- 字幕内实际出现:[列出当前字幕真实出现的关键事实,2-5 条] +- 字幕未覆盖/需谨慎信息:[列出联网结果提到但当前字幕未发生的内容;没有则写“无”] + +## 二、人物与关系 +| 统一称呼 | 字幕称呼 | 身份/关系 | 当前动机/立场 | 确定性 | +|---|---|---|---|---| +| [人物名] | [字幕原文称呼] | [身份或关系] | [在当前片段中的目标、压力或转变] | 字幕明确/联网辅助/合理推断 | + +## 三、整体剧情概括 +[120-220 字,只概括当前字幕覆盖的剧情。必须包含核心冲突、人物动机、场景推进和当前悬念。] + +## 四、分段剧情解析 +| 视频 | 时间戳 | 段落主题 | 剧情事件 | 叙事功能 | +|---|---|---|---|---| +| [video_id + video_name] | [开始] --> [结束] | [简短主题] | [当前段落发生了什么] | [铺垫/冲突升级/人物塑造/反转/悬念/情绪爆发/名场面等] | + +## 五、解说创作重点 +- 开场钩子:[用一句话指出最适合开场抓人的冲突、谜题或人物困境] +- 核心冲突:[当前片段最主要的矛盾] +- 情绪转折/信息反转:[列 1-3 条,没有则写“无明显”] +- 名场面/高光对白:[列 1-3 条,没有则写“无明显”] +- 悬念点:[当前片段留下的疑问或后续期待] +- 建议保留原声片段: + 1. 
[video_id + video_name + 时间戳]:[保留理由;如果没有合适原声,写“无明显”] + +## 六、联网信息校验 +- 可用于辅助理解的信息:[联网结果中可帮助理解当前字幕的信息;没有则写“无”] +- 与字幕不一致或字幕未覆盖的信息:[必须列出,不要混入当前剧情事实;没有则写“无”] + +# 输入内容 +${subtitle_content}""" diff --git a/app/services/prompts/film_tv_narration/script_generation.py b/app/services/prompts/film_tv_narration/script_generation.py new file mode 100644 index 0000000..c945334 --- /dev/null +++ b/app/services/prompts/film_tv_narration/script_generation.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 影视解说-文案画面匹配 +@File : script_generation.py +@Description: 影视解说脚本生成提示词 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class ScriptGenerationPrompt(ParameterizedPrompt): + """影视解说脚本生成提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="script_generation", + category="film_tv_narration", + version="v1.0", + description="基于已规划片段生成高质量影视解说脚本,重点补足人物动机、信息承接和剧情因果", + model_type=ModelType.TEXT, + output_format=OutputFormat.JSON, + tags=["影视", "解说脚本", "文案生成", "原声片段", "悬念", "名场面"], + parameters=[ + "drama_name", + "drama_genre", + "plot_analysis", + "subtitle_content", + "segment_plan", + "narration_language", + ], + ) + super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "segment_plan"]) + + self._system_prompt = ( + "你是一位影视解说文案写手。你必须严格按照JSON格式输出," + "只能补充picture和narration,不能改动上游片段规划中的_id、video_id、video_name、timestamp和OST。" + ) + + def get_template(self) -> str: + return """# 影视解说脚本文案生成任务 + +## 任务目标 +为影视作品《${drama_name}》生成最终可剪辑解说脚本。片段已经由上游规划完成,你只能补充 picture 和 narration,不能改变片段来源和时间戳。 + +## 输入材料 + +### 剧情概述 + +${plot_analysis} + + +### 已规划片段(必须逐项照抄结构字段) + +${segment_plan} + + +### 原始字幕(含视频编号和精确时间戳) + +${subtitle_content} + + +### 解说台词语言 + +${narration_language} + + +### 用户选择的影视类型 + +${drama_genre} + + +字幕可能来自多个视频文件。每个字幕分段标题会以“视频 1: 文件名”“视频 2: 文件名”等形式标识来源。 +生成脚本时必须把每个片段绑定到对应视频来源,时间戳表示该视频文件内部的局部时间,不是把多个视频拼接后的全局时间。 +所有 OST=0 的 narration 
字段必须使用上方指定的解说台词语言输出;不要因为原始字幕是其他语言就切回字幕原语言。 +OST=1 的原声片段 narration 字段必须继续使用“播放原片+序号”格式,不要翻译这个固定标记。 + +## 绝对绑定规则 +1. 输出 items 数量、顺序和 _id 必须与 segment_plan 完全一致。 +2. 每个 item 的 _id、video_id、video_name、timestamp、OST 必须逐字复制 segment_plan,不得新增、删除、合并、拆分或改动。 +3. 你只能补充 picture 和 narration 两个字段。 +4. OST=1 的 narration 必须写成“播放原片+_id”,例如 _id 为 5 时写“播放原片5”。 +5. OST=0 的 narration 必须使用 ${narration_language},并严格基于剧情和字幕,不虚构字幕外的具体事件。 + +## 叙事连续性要求 +- 你必须把每个 OST=0 当成“观众理解剧情的桥”,不能只概括当前画面。 +- 每个 OST=0 narration 要尽量回答:上一段发生了什么、人物为什么这么做、这一段带来什么新信息或新危机。 +- 跨 video_id 或跨时间大跳跃时,OST=0 必须明确补出承接句,例如“真正危险的不是这场争吵,而是他终于发现证据指向了身边人”。 +- 原声片段前后的 OST=0 要解释原声的重要性,避免观众只看到对白片段合集。 +- 如果 segment_plan 中有 story_role、intent、transition 字段,必须利用它们组织 narration,但不要把这些字段输出到最终 JSON。 +- 结尾 OST=0 要留下后续阻力、真相疑问或人物选择;如果结尾是 OST=1,则前一个 OST=0 必须提前点出这段原声会把矛盾推向哪里。 + +## 开头钩子要求 +- 第一段必须是 OST=0 解说钩子,不能直接播放原片。 +- 开头用“人物困境 + 反常信息 + 悬念问题”:主角压力 + 异常线索/关系突变 + 后续疑问。 +- 写法示例方向:他以为这只是一次普通问询,可一句话之后,所有证据都指向了他最信任的人。 +- 示例只用于理解公式,必须基于当前字幕事实原创,不要夸大到字幕没有的情节。 + +## 解说密度与画面节奏 +- OST=0 文案必须能被当前 timestamp 的画面承载,按“解说字数 / 5 = 所需视频秒数”估算。 +- 如果画面只有 6 秒,就不要写 80 字;应压缩到约 30 字,或依赖 segment_plan 选择更长画面。 +- 优先短句,单句只表达一个信息点;不要把人物介绍、前因、反转和悬念全塞进一个短画面。 +- 长信息要拆成多段,每段只承担一个叙事功能,让画面节奏跟上解说。 + +## 用户选择类型文案规则 +影视类型由用户手动选择为 ${drama_genre},不得自行改判。必须按对应方向写: +- 剧情/情感:突出人物选择、关系裂痕、命运压力和情绪余波。 +- 悬疑/犯罪:突出线索、疑点、动机、误导和未揭开的真相。 +- 动作/冒险:突出目标、危险升级、身体对抗和关键抉择。 +- 喜剧/轻松:突出误会、反差、节奏包袱和人物可爱处。 +- 科幻/奇幻:突出设定规则、未知威胁、世界观反差和代价。 +- 历史/战争:突出时代处境、阵营选择、牺牲和局势变化。 +- 恐怖/惊悚:突出异常细节、压迫感、未知危险和心理悬念。 +- 自定义类型:严格服从用户填写的类型方向。 + +## 文案质量要求 +- 开场片段要有强钩子,直接点出冲突、疑点或人物困境。 +- 每段解说优先 25-90 字,具体长度必须服从画面时长;短画面宁可少说,不要密集灌信息。 +- 可以使用“可真正的问题是”“而他还不知道”“这句话背后”“危险已经开始靠近”等影视解说转折语,但不要堆砌。 +- picture 要描述画面和人物状态,便于后期识别素材。 +- 少用孤立信息句,多用承接句;不要让观众感觉剧情突然跳场。 +- 不要解释规则,不要输出 Markdown,不要输出代码块。 + +## 输出格式 + +请严格按照以下JSON格式输出,绝不添加任何其他文字、说明或代码块标记: + +{ + "items": [ + { + "_id": 1, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:05,500", + "picture": "男主站在审讯室门口,神情紧张地看向桌上的证据袋", + "narration": 
"他以为这只是一次普通问询,可桌上的证据却把所有矛头指向了自己。", + "OST": 0 + }, + { + "_id": 2, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:05,500-00:00:08,000", + "picture": "警官低声质问,男主沉默不语", + "narration": "播放原片2", + "OST": 1 + } + ] +} + +现在请基于以上要求,为影视作品《${drama_name}》创作解说脚本:""" diff --git a/app/services/prompts/film_tv_narration/script_matching.py b/app/services/prompts/film_tv_narration/script_matching.py new file mode 100644 index 0000000..9577e49 --- /dev/null +++ b/app/services/prompts/film_tv_narration/script_matching.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 影视解说-文案画面匹配 +@File : script_matching.py +@Description: 将用户审核后的影视解说文案匹配到字幕时间戳并生成最终剪辑脚本 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class ScriptMatchingPrompt(ParameterizedPrompt): + """影视解说文案画面匹配提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="script_matching", + category="film_tv_narration", + version="v1.0", + description="将审核后的影视解说文案按叙事节奏拆分,并匹配到字幕时间戳生成最终剪辑JSON", + model_type=ModelType.TEXT, + output_format=OutputFormat.JSON, + tags=["影视", "画面匹配", "剪辑脚本", "时间戳", "用户文案"], + parameters=[ + "drama_name", + "drama_genre", + "plot_analysis", + "subtitle_content", + "narration_copy", + "narration_language", + "original_sound_ratio", + ], + ) + super().__init__( + metadata, + required_parameters=["drama_name", "subtitle_content", "narration_copy"], + ) + + self._system_prompt = ( + "你是一位懂影视叙事节奏的剪辑师。你必须严格输出JSON," + "核心任务是把用户审核后的解说文案逐句匹配到最合适的原视频字幕时间戳。" + ) + + def get_template(self) -> str: + return """# 影视解说文案画面匹配任务 + +## 目标 +用户已经审核并修改了解说文案。请根据这份文案和原始字幕,生成最终可剪辑 JSON 脚本。 + +## 作品名 +${drama_name} + +## 剧情理解材料 + +${plot_analysis} + + +## 用户审核后的解说文案 + +${narration_copy} + + +## 原始字幕(含视频编号和局部时间戳) + +${subtitle_content} + + +## 输出语言 + +${narration_language} + + +## 用户选择的影视类型 + +${drama_genre} + + +## 用户选择的原片占比 + +${original_sound_ratio}% + + +## 匹配流程 +1. 先按句号、问号、感叹号、省略号切分解说文案,得到候选解说句。 +2. 
逗号只在明显分割两个动作、场景、观点或描述对象时切分;不要切出没有独立意义的碎片。 +3. 不要求每个候选句都单独输出为 OST=0;可以合并、压缩相邻候选句作为剧情桥段,但不能改变用户文案的核心意思。 +4. 为每个解说片段寻找最匹配的原始字幕画面,优先选择能表达该句核心含义、人物状态或信息转折的画面。 +5. 使用公式估算所需画面时长:所需秒数 = 解说字数 / 5。匹配画面时长尽量接近,误差优先控制在 ±0.5 秒。 +6. 如果一句解说太长,必须拆成多个 OST=0 片段,分别匹配不同或连续画面。 +7. timestamp 必须使用对应 video_id 内部局部时间戳,不得换算为多个视频拼接后的累计时间。 +8. 同一 video_id 内时间段不得交叉或重叠。 +9. 第一段必须是 OST=0 解说钩子,不能直接播放原片。 +10. OST=1 原声片段的总时长占比要尽量接近用户选择的 ${original_sound_ratio}%。这里按最终 items 的 timestamp 总时长估算,不按片段数量估算。 +11. 不要自行判断或改写影视类型;画面匹配和 picture 描述要服务用户选择的 ${drama_genre} 叙事重点。 + +## 原片占比规则 +- ${original_sound_ratio}% = 0% 时,不要输出 OST=1,全部使用解说承接。 +- ${original_sound_ratio}% 在 10%-30% 时,只保留关键对白、信息反转、情绪爆发或名场面原声。 +- ${original_sound_ratio}% 在 40%-60% 时,解说负责串联因果,原片负责承载关键场面和对白。 +- ${original_sound_ratio}% 在 70%-90% 时,以原片对白和表演为主,解说只做开场钩子、转场桥和必要补充。 +- 如果原片占比与“第一段必须 OST=0”冲突,优先保证第一段是 OST=0,然后在后续片段提高 OST=1 时长占比。 +- 选择高原片占比时,可以把用户文案合并成更少的 OST=0 桥段,不要为了逐句使用文案而压低原片占比。 + +## 字段规则 +- _id:从 1 开始连续递增。 +- video_id:来自字幕分段标题,例如“视频 2”就填 2。 +- video_name:对应视频文件名,必须从字幕分段标题提取。 +- timestamp:格式为 "HH:MM:SS,mmm-HH:MM:SS,mmm"。 +- picture:描述匹配画面中人物、动作、情绪、场景和关键道具。 +- narration:OST=0 时填写用户文案片段;OST=1 时填写“播放原片+_id”。 +- OST:解说片段填 0,原声片段填 1。 + +## 输出格式 +只输出严格 JSON: + +{ + "items": [ + { + "_id": 1, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:06,000", + "picture": "主角站在走廊尽头,回头看向紧闭的房门", + "narration": "他以为自己终于逃出了那间房,可真正的危险,其实才刚刚醒来。", + "OST": 0 + } + ] +} + +现在请基于用户审核后的解说文案生成最终剪辑脚本。""" diff --git a/app/services/prompts/film_tv_narration/script_repair.py b/app/services/prompts/film_tv_narration/script_repair.py new file mode 100644 index 0000000..cdd9c88 --- /dev/null +++ b/app/services/prompts/film_tv_narration/script_repair.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 影视解说-脚本修复 +@File : script_repair.py +@Description: 影视解说脚本校验失败后的JSON修复提示词 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class ScriptRepairPrompt(ParameterizedPrompt): 
+ """影视解说脚本修复提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="script_repair", + category="film_tv_narration", + version="v1.0", + description="根据确定性校验错误修复影视解说脚本JSON,优先修正时间戳、视频来源和格式问题", + model_type=ModelType.TEXT, + output_format=OutputFormat.JSON, + tags=["影视", "解说脚本", "JSON修复", "时间戳校验", "多视频"], + parameters=[ + "drama_name", + "drama_genre", + "plot_analysis", + "subtitle_content", + "invalid_script", + "validation_errors", + "narration_language", + ], + ) + super().__init__( + metadata, + required_parameters=["drama_name", "subtitle_content", "invalid_script", "validation_errors"], + ) + + self._system_prompt = ( + "你是一位影视解说脚本JSON修复器。你只能根据校验错误修复JSON," + "必须输出严格JSON,不能输出解释、Markdown或代码块。" + ) + + def get_template(self) -> str: + return """# 影视解说脚本修复任务 + +## 修复目标 +下面的影视作品《${drama_name}》解说脚本未通过剪辑校验。请只根据校验错误和字幕内容修复它,输出一个完整可剪辑的 JSON。 + +## 剧情理解材料 + +${plot_analysis} + + +## 校验错误 + +${validation_errors} + + +## 当前无效脚本 + +${invalid_script} + + +## 可用字幕窗口 + +${subtitle_content} + + +## 解说台词目标语言 + +${narration_language} + + +## 用户选择的影视类型 + +${drama_genre} + + +## 修复规则 +1. 只输出 JSON,不要任何解释、标题、Markdown 或代码块。 +2. 输出根对象必须是 {"items": [...]}。 +3. 每个 item 必须包含 _id、video_id、video_name、timestamp、picture、narration、OST。 +4. video_id、video_name 和 timestamp 必须来自对应字幕窗口;不得把不同视频的同名时间戳混用。 +5. 同一 video_id 内片段不得交叉或重叠。 +6. OST=1 的 narration 必须是“播放原片+序号”;OST=0 的 narration 必须使用 ${narration_language}。 +7. 禁止连续 3 个或更多 OST=1;必须插入或改写 OST=0 解说片段承接剧情。 +8. 跨 video_id 切换前后不能都是 OST=1;必须至少有一个 OST=0 片段解释场景和剧情为什么切换。 +9. OST=0 narration 要补足人物动机、信息承接和因果转折,不要只概括当前画面。 +10. 第一段必须是 OST=0 解说钩子,按“人物困境 + 反常信息 + 悬念问题”写,不要直接播放原片。 +11. OST=0 文案必须匹配画面时长,按“解说字数 / 5 = 所需视频秒数”估算;过密时要缩短文案、延长时间戳或拆成多个片段。 +12. 不要自行改判影视类型;如需改写 narration,必须按用户选择的 ${drama_genre} 保持表达重点。 +13. 
尽量保留原脚本中没有错误的片段;无法修复的片段可以删除,但剩余片段必须重新按 1 开始编号。 + +请输出修复后的完整 JSON。""" diff --git a/app/services/prompts/film_tv_narration/segment_planning.py b/app/services/prompts/film_tv_narration/segment_planning.py new file mode 100644 index 0000000..a1da09e --- /dev/null +++ b/app/services/prompts/film_tv_narration/segment_planning.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +@Project: 影视解说-片段规划 +@File : segment_planning.py +@Description: 影视解说脚本片段规划提示词 +""" + +from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat + + +class SegmentPlanningPrompt(ParameterizedPrompt): + """影视解说片段规划提示词""" + + def __init__(self): + metadata = PromptMetadata( + name="segment_planning", + category="film_tv_narration", + version="v1.0", + description="基于剧情理解和原始字幕规划可剪辑片段,优先保证影视叙事连续性和原声解说节奏", + model_type=ModelType.TEXT, + output_format=OutputFormat.JSON, + tags=["影视", "解说脚本", "片段规划", "时间戳", "多视频", "原声"], + parameters=["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"], + ) + super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "subtitle_content"]) + + self._system_prompt = ( + "你是一位影视解说剪辑规划师。你的任务是从字幕中选择可剪辑片段," + "必须严格输出JSON,不能写解说文案,不能输出Markdown或额外说明。" + ) + + def get_template(self) -> str: + return """# 影视解说片段规划任务 + +## 目标 +为影视作品《${drama_name}》规划一组可直接剪辑的视频片段。你只负责选片段和标注用途,不写最终解说台词。 + +## 剧情理解材料 + +${plot_analysis} + + +## 原始字幕(含视频编号和局部时间戳) + +${subtitle_content} + + +## 解说台词目标语言 + +${narration_language} + + +## 用户选择的影视类型 + +${drama_genre} + + +## 叙事规划目标 +你不是在挑精彩片段合集,而是在规划一条观众能顺着看懂的影视解说故事线。必须先想清楚“人物处境 -> 事件触发 -> 关系或信息变化 -> 新危机 -> 悬念”的因果链,再选片段。 + +## 开场钩子规则 +第一段必须是 OST=0 解说开场,不要直接播放原片。开头参考“人物困境 + 反常信息 + 悬念问题”的公式: +- 先给人物一个明确压力:被误解、被追捕、被迫选择、失去重要之人、发现异常线索。 +- 再给一个反常信息:熟人背叛、证据失效、规则被打破、危险提前出现。 +- 最后抛出问题:谁在说谎、真相藏在哪里、这次选择会付出什么代价。 +- 不要照抄示例,要基于字幕事实改写成当前作品自己的钩子。 + +## 规划规则 +1. 只能使用原始字幕中真实存在的视频编号、视频文件名和时间范围。 +2. timestamp 必须是对应 video_id 内部的局部时间戳,禁止换算成多个视频拼接后的累计时间。 +3. 
同一个 video_id 内的片段不得交叉或重叠;尽量按故事顺序排列。 +4. 每个片段必须推动主线、解释人物动机、制造情绪转折、承接原声或保留关键对白。 +5. OST=1 表示保留原声,适合关键对白、情绪爆发、真相揭露、名场面和反转;OST=0 表示后续需要配解说。 +6. 原声片段单段优先控制在 3-10 秒;解说片段可以更长,但必须能从字幕范围中定位。 +7. 影视类型由用户手动选择为 ${drama_genre},不得自行改判;选片段时优先服务该类型的主要看点。 +8. 禁止连续 3 个或更多 OST=1;每 1-2 个原声片段后必须安排 OST=0 解说片段承接剧情。 +9. 跨 video_id 切换前后必须至少有一个 OST=0 片段作为剧情桥段,解释为什么从上一场转到下一场。 +10. 每个 OST=0 片段必须承担明确叙事功能:开场钩子、人物介绍、因果过渡、信息解释、情绪转折、冲突升级、结尾悬念。 +11. 不要跳过关键因果;关系变化、线索发现、危机升级必须有画面或解说桥段承接。 +12. 结尾优先选择能留下新问题、新危险或人物选择的片段,不要只停在原声对白堆叠上。 +13. 解说画面必须给足时长:按“解说字数 / 5 = 所需视频秒数”预估,短画面不要承载长解说。 + +## 输出格式 +只输出严格 JSON: + +{ + "segments": [ + { + "_id": 1, + "video_id": 1, + "video_name": "1.mp4", + "timestamp": "00:00:01,000-00:00:05,500", + "OST": 0, + "story_role": "开场钩子", + "intent": "点出主角困境和反常线索,制造继续观看的疑问", + "transition": "从当前场景切入人物压力,引出下一段关键对白" + } + ] +} + +现在请规划影视作品《${drama_name}》的解说片段。""" diff --git a/app/services/tavily_search.py b/app/services/tavily_search.py index 586a7ee..0f61014 100644 --- a/app/services/tavily_search.py +++ b/app/services/tavily_search.py @@ -35,15 +35,37 @@ def search_short_drama( timeout: int = DEFAULT_TIMEOUT, ) -> dict[str, Any]: """Search web context for a short drama name with Tavily.""" - short_name = str(short_name or "").strip() - if not short_name: - raise TavilySearchError("短剧名称不能为空") + return search_story_context( + short_name, + api_key, + search_keywords="短剧 剧情 介绍 人物 结局", + empty_name_message="短剧名称不能为空", + search_depth=search_depth, + max_results=max_results, + timeout=timeout, + ) + + +def search_story_context( + title: str, + api_key: str | None = None, + *, + search_keywords: str = "剧情 介绍 人物 结局", + empty_name_message: str = "作品名称不能为空", + search_depth: str = DEFAULT_SEARCH_DEPTH, + max_results: int = DEFAULT_MAX_RESULTS, + timeout: int = DEFAULT_TIMEOUT, +) -> dict[str, Any]: + """Search web context for a story title with Tavily.""" + title = str(title or "").strip() + if not title: + raise TavilySearchError(empty_name_message) api_key = (api_key or 
os.getenv("TAVILY_API_KEY") or "").strip() if not api_key: raise TavilySearchError("Tavily API Key 未配置") - query = f"{short_name} 短剧 剧情 介绍 人物 结局" + query = f"{title} {search_keywords}".strip() payload = { "query": query, "search_depth": search_depth or DEFAULT_SEARCH_DEPTH, @@ -77,13 +99,12 @@ def search_short_drama( raise TavilySearchError("Tavily 返回内容不是有效 JSON") from exc logger.info( - "Tavily 短剧检索完成: query={}, results={}", + "Tavily 剧情检索完成: query={}, results={}", query, len(data.get("results") or []), ) return data - def format_search_context(search_data: dict[str, Any], *, max_chars: int = 6000) -> str: """Format Tavily response into compact LLM context.""" if not search_data: diff --git a/project_version b/project_version index 1451d48..c18d72b 100644 --- a/project_version +++ b/project_version @@ -1 +1 @@ -0.7.9 \ No newline at end of file +0.8.1 \ No newline at end of file diff --git a/webui.py b/webui.py index 34d1204..57f8eb7 100644 --- a/webui.py +++ b/webui.py @@ -129,6 +129,11 @@ def tr(key): return loc.get("Translation", {}).get(key, key) +def get_help_text(): + """返回带当前项目版本号的帮助文案""" + return tr("Get Help").replace("🎉🎉🎉", f" v{config.project_version}") + + def render_generate_button(): """渲染生成按钮和处理逻辑""" if st.button(tr("Generate Video"), use_container_width=True, type="primary"): @@ -588,7 +593,7 @@ def main(): logger.warning(f"资源初始化时出现警告: {e}") st.title(f"Narrato:blue[AI]:sunglasses: 📽️") - st.write(tr("Get Help")) + st.write(get_help_text()) # 首先渲染不依赖PyTorch的UI部分 # 渲染基础设置面板 diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 555e1e1..e57c42d 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -15,6 +15,10 @@ from app.utils import utils, check_script from webui.tools.generate_script_docu import generate_script_docu from webui.tools.generate_script_short import generate_script_short from webui.tools.generate_short_summary import ( + FILM_TV_PROMPT_CATEGORY, + 
FILM_TV_SEARCH_KEYWORDS, + SHORT_DRAMA_PROMPT_CATEGORY, + SHORT_DRAMA_SEARCH_KEYWORDS, analyze_short_drama_plot, generate_script_short_sunmmary, generate_short_drama_narration_copy, @@ -22,6 +26,12 @@ from webui.tools.generate_short_summary import ( SCRIPT_TABLE_BASE_COLUMNS = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"] +MODE_FILE = "file_selection" +MODE_AUTO = "auto" +MODE_SHORT = "short" +MODE_SHORT_SUMMARY = "summary" +MODE_FILM_SUMMARY = "film_summary" +SUMMARY_SCRIPT_MODES = {MODE_SHORT_SUMMARY, MODE_FILM_SUMMARY} VIDEO_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"] VIDEO_GLOB_PATTERNS = [f"*.{suffix}" for suffix in VIDEO_UPLOAD_TYPES] SHORT_DRAMA_NARRATION_LANGUAGE_OPTIONS = [ @@ -66,7 +76,64 @@ SHORT_DRAMA_TYPE_VALUES = { "urban_emotion": "都市情感", "period_rural": "年代/乡村", } +FILM_TV_TYPE_OPTIONS = [ + ("drama_emotion", "剧情/情感"), + ("suspense_crime", "悬疑/犯罪"), + ("action_adventure", "动作/冒险"), + ("comedy_light", "喜剧/轻松"), + ("sci_fi_fantasy", "科幻/奇幻"), + ("history_war", "历史/战争"), + ("horror_thriller", "恐怖/惊悚"), + ("custom", "自定义"), +] +FILM_TV_TYPE_VALUES = { + "drama_emotion": "剧情/情感", + "suspense_crime": "悬疑/犯罪", + "action_adventure": "动作/冒险", + "comedy_light": "喜剧/轻松", + "sci_fi_fantasy": "科幻/奇幻", + "history_war": "历史/战争", + "horror_thriller": "恐怖/惊悚", +} SHORT_DRAMA_ORIGINAL_SOUND_RATIO_OPTIONS = list(range(0, 100, 10)) +SUMMARY_MODE_CONFIGS = { + MODE_FILM_SUMMARY: { + "mode_label_key": "Film TV Narration", + "session_prefix": "film_tv", + "prompt_category": FILM_TV_PROMPT_CATEGORY, + "search_keywords": FILM_TV_SEARCH_KEYWORDS, + "web_search_context_description": "影视作品名称、人物关系、剧情背景和公开剧情梗概", + "empty_title_message_key": "Please enter film/tv title before web search", + "title_label_key": "影视名称", + "type_label_key": "影视类型", + "custom_type_label_key": "自定义影视类型", + "custom_type_placeholder_key": "例如:悬疑犯罪", + "custom_type_empty_key": "请输入自定义影视类型", + "narration_copy_label_key": "影视解说文案", + "type_options": 
FILM_TV_TYPE_OPTIONS, + "type_values": FILM_TV_TYPE_VALUES, + "default_type": "drama_emotion", + "default_type_value": "剧情/情感", + }, + MODE_SHORT_SUMMARY: { + "mode_label_key": "Short Drama Summary", + "session_prefix": "short_drama", + "prompt_category": SHORT_DRAMA_PROMPT_CATEGORY, + "search_keywords": SHORT_DRAMA_SEARCH_KEYWORDS, + "web_search_context_description": "短剧名称、人物关系、剧情背景和公开剧情梗概", + "empty_title_message_key": "Please enter short drama name before web search", + "title_label_key": "短剧名称", + "type_label_key": "短剧类型", + "custom_type_label_key": "自定义短剧类型", + "custom_type_placeholder_key": "例如:豪门虐恋", + "custom_type_empty_key": "请输入自定义短剧类型", + "narration_copy_label_key": "短剧解说文案", + "type_options": SHORT_DRAMA_TYPE_OPTIONS, + "type_values": SHORT_DRAMA_TYPE_VALUES, + "default_type": "counterattack", + "default_type_value": "逆袭/复仇", + }, +} def _normalize_video_paths(paths): @@ -201,20 +268,47 @@ def _short_drama_plot_analysis_signature(subtitle_paths, video_theme, web_search ) -def _resolve_short_drama_narration_language(): - selected_language = st.session_state.get('short_drama_narration_language_option', 'zh-CN') - custom_language = str(st.session_state.get('short_drama_custom_narration_language', '') or '').strip() +def _summary_mode_config(script_path=None): + script_path = script_path or st.session_state.get('video_clip_json_path', MODE_FILM_SUMMARY) + return SUMMARY_MODE_CONFIGS.get(script_path, SUMMARY_MODE_CONFIGS[MODE_SHORT_SUMMARY]) + + +def _summary_state_key(summary_config, suffix): + return f"{summary_config['session_prefix']}_{suffix}" + + +def _resolve_summary_narration_language(summary_config): + selected_language = st.session_state.get( + _summary_state_key(summary_config, "narration_language_option"), + "zh-CN", + ) + custom_language = str( + st.session_state.get(_summary_state_key(summary_config, "custom_narration_language"), "") or "" + ).strip() if selected_language == "custom" and custom_language: return custom_language return 
SHORT_DRAMA_NARRATION_LANGUAGE_VALUES.get(selected_language, "简体中文(中国)") -def _resolve_short_drama_type(): - selected_type = st.session_state.get('short_drama_type_option', 'counterattack') - custom_type = str(st.session_state.get('short_drama_custom_type', '') or '').strip() +def _resolve_summary_type(summary_config): + selected_type = st.session_state.get( + _summary_state_key(summary_config, "type_option"), + summary_config["default_type"], + ) + custom_type = str( + st.session_state.get(_summary_state_key(summary_config, "custom_type"), "") or "" + ).strip() if selected_type == "custom" and custom_type: return custom_type - return SHORT_DRAMA_TYPE_VALUES.get(selected_type, "逆袭/复仇") + return summary_config["type_values"].get(selected_type, summary_config["default_type_value"]) + + +def _resolve_short_drama_narration_language(): + return _resolve_summary_narration_language(SUMMARY_MODE_CONFIGS[MODE_SHORT_SUMMARY]) + + +def _resolve_short_drama_type(): + return _resolve_summary_type(SUMMARY_MODE_CONFIGS[MODE_SHORT_SUMMARY]) def render_script_panel(tr): @@ -239,9 +333,9 @@ def render_script_panel(tr): elif script_path == "short": # 短剧混剪 render_short_generate_options(tr) - elif script_path == "summary": - # 短剧解说 - short_drama_summary(tr) + elif script_path in SUMMARY_SCRIPT_MODES: + # 影视解说 / 短剧解说 + summary_narration_panel(tr, _summary_mode_config(script_path)) else: # 默认为空 pass @@ -252,15 +346,10 @@ def render_script_panel(tr): def render_script_file(tr, params): """渲染脚本文件选择""" - # 定义功能模式 - MODE_FILE = "file_selection" - MODE_AUTO = "auto" - MODE_SHORT = "short" - MODE_SUMMARY = "summary" - # 模式选项映射,按工作流优先级展示 mode_options = { - tr("Short Drama Summary"): MODE_SUMMARY, + tr("Film TV Narration"): MODE_FILM_SUMMARY, + tr("Short Drama Summary"): MODE_SHORT_SUMMARY, tr("Auto Generate"): MODE_AUTO, tr("Short Generate"): MODE_SHORT, tr("Select/Upload Script"): MODE_FILE, @@ -279,6 +368,8 @@ def render_script_file(tr, params): default_index = mode_keys.index(tr("Short 
Generate")) elif current_path == "summary": default_index = mode_keys.index(tr("Short Drama Summary")) + elif current_path == "film_summary": + default_index = mode_keys.index(tr("Film TV Narration")) elif current_path: default_index = mode_keys.index(tr("Select/Upload Script")) else: @@ -354,8 +445,12 @@ def render_script_file(tr, params): script_list.append((display_name, file['file'])) # 找到保存的脚本文件在列表中的索引 - # 如果当前path是特殊值(auto/short/summary),则重置为空 - saved_script_path = current_path if current_path not in [MODE_AUTO, MODE_SHORT, MODE_SUMMARY] else "" + # 如果当前path是特殊值(auto/short/summary/film_summary),则重置为空 + saved_script_path = ( + current_path + if current_path not in [MODE_AUTO, MODE_SHORT, MODE_SHORT_SUMMARY, MODE_FILM_SUMMARY] + else "" + ) selected_index = 0 for i, (_, path) in enumerate(script_list): @@ -558,7 +653,7 @@ def render_short_generate_options(tr): 渲染Short Generate模式下的特殊选项 在Short Generate模式下,替换原有的输入框为自定义片段选项 """ - short_drama_summary(tr) + summary_narration_panel(tr, SUMMARY_MODE_CONFIGS[MODE_SHORT_SUMMARY]) # 显示自定义片段数量选择器 custom_clips = st.number_input( tr("自定义片段"), @@ -605,8 +700,8 @@ def render_video_details(tr): return video_theme, custom_prompt -def short_drama_summary(tr): - """短剧解说 渲染视频主题和提示词""" +def summary_narration_panel(tr, summary_config): + """影视/短剧解说 渲染视频主题和提示词""" # 检查是否已经处理过字幕文件 if 'subtitle_file_processed' not in st.session_state: st.session_state['subtitle_file_processed'] = False @@ -616,22 +711,27 @@ def short_drama_summary(tr): current_subtitle_paths = _selected_subtitle_paths() current_subtitle_path = current_subtitle_paths[0] if current_subtitle_paths else '' + web_search_key = _summary_state_key(summary_config, "web_search_enabled") + plot_button_key = _summary_state_key(summary_config, "plot_analysis_button") + plot_analysis_key = _summary_state_key(summary_config, "plot_analysis") + plot_source_key = _summary_state_key(summary_config, "plot_analysis_subtitle_path") + plot_signature_key = _summary_state_key(summary_config, 
"plot_analysis_signature") st.markdown( - """ + f""" """, unsafe_allow_html=True, @@ -670,18 +770,18 @@ def short_drama_summary(tr): name_cols = st.columns([3.4, 1.1, 2], vertical_alignment="bottom") with name_cols[0]: - video_theme = st.text_input(tr("短剧名称")) + video_theme = st.text_input(tr(summary_config["title_label_key"])) with name_cols[1]: web_search_enabled = st.toggle( tr("联网搜索"), - key="short_drama_web_search_enabled", + key=web_search_key, help=tr("Enable Web Search Help"), disabled=not current_subtitle_path, ) with name_cols[2]: analyze_plot_clicked = st.button( tr("剧情理解"), - key="short_drama_plot_analysis_button", + key=plot_button_key, disabled=not current_subtitle_path, use_container_width=True, ) @@ -693,15 +793,15 @@ def short_drama_summary(tr): web_search_enabled, _selected_video_paths(), ) - saved_signature = st.session_state.get('short_drama_plot_analysis_signature') - legacy_source = st.session_state.get('short_drama_plot_analysis_subtitle_path') + saved_signature = st.session_state.get(plot_signature_key) + legacy_source = st.session_state.get(plot_source_key) if ( (saved_signature and saved_signature != current_signature) or (legacy_source and legacy_source != current_subtitle_path) ): - st.session_state['short_drama_plot_analysis'] = "" - st.session_state['short_drama_plot_analysis_subtitle_path'] = "" - st.session_state['short_drama_plot_analysis_signature'] = "" + st.session_state[plot_analysis_key] = "" + st.session_state[plot_source_key] = "" + st.session_state[plot_signature_key] = "" if analyze_plot_clicked: with st.spinner(tr("Analyzing plot...")): @@ -713,23 +813,32 @@ def short_drama_summary(tr): short_name=video_theme, enable_web_search=web_search_enabled, video_paths=_selected_video_paths(), + prompt_category=summary_config["prompt_category"], + search_keywords=summary_config["search_keywords"], + empty_title_message_key=summary_config["empty_title_message_key"], + 
web_search_context_description=summary_config["web_search_context_description"], ) if plot_analysis: - st.session_state['short_drama_plot_analysis'] = plot_analysis - st.session_state['short_drama_plot_analysis_subtitle_path'] = current_subtitle_path - st.session_state['short_drama_plot_analysis_signature'] = current_signature + st.session_state[plot_analysis_key] = plot_analysis + st.session_state[plot_source_key] = current_subtitle_path + st.session_state[plot_signature_key] = current_signature st.success(tr("Plot analysis completed")) - if st.session_state.get('short_drama_plot_analysis'): + if st.session_state.get(plot_analysis_key): st.text_area( tr("剧情理解结果"), - key="short_drama_plot_analysis", + key=plot_analysis_key, height=240, ) return video_theme +def short_drama_summary(tr): + """短剧解说 渲染视频主题和提示词""" + return summary_narration_panel(tr, SUMMARY_MODE_CONFIGS[MODE_SHORT_SUMMARY]) + + def render_subtitle_preview(tr): """渲染可折叠的当前字幕预览;没有字幕时提示用户先转写或上传。""" subtitle_paths = _selected_subtitle_paths() @@ -1295,63 +1404,88 @@ def render_script_buttons(tr, params): button_name = tr("Generate Video Script") elif script_path == "short": button_name = tr("Generate Short Video Script") - elif script_path == "summary": + elif script_path in SUMMARY_SCRIPT_MODES: button_name = tr("生成剪辑脚本") elif script_path.endswith("json"): button_name = tr("Load Video Script") else: button_name = tr("Please Select Script File") - if script_path == "summary": - config_cols = st.columns([1.15, 1.15, 0.9, 1.15, 1.15], vertical_alignment="bottom") - with config_cols[0]: + if script_path in SUMMARY_SCRIPT_MODES: + summary_config = _summary_mode_config(script_path) + type_option_key = _summary_state_key(summary_config, "type_option") + custom_type_key = _summary_state_key(summary_config, "custom_type") + original_sound_ratio_key = _summary_state_key(summary_config, "original_sound_ratio") + language_option_key = _summary_state_key(summary_config, "narration_language_option") + 
custom_language_key = _summary_state_key(summary_config, "custom_narration_language") + narration_copy_key = _summary_state_key(summary_config, "narration_copy") + + type_options = [code for code, _ in summary_config["type_options"]] + if st.session_state.get(type_option_key) not in type_options: + st.session_state[type_option_key] = summary_config["default_type"] + language_options = [code for code, _ in SHORT_DRAMA_NARRATION_LANGUAGE_OPTIONS] + if st.session_state.get(language_option_key) not in language_options: + st.session_state[language_option_key] = "zh-CN" + + show_custom_type = st.session_state.get(type_option_key, summary_config["default_type"]) == "custom" + show_custom_language = ( + st.session_state.get(language_option_key, 'zh-CN') == "custom" + ) + config_col_widths = [1.15] + if show_custom_type: + config_col_widths.append(1.15) + config_col_widths.extend([0.9, 1.15]) + if show_custom_language: + config_col_widths.append(1.15) + + config_cols = st.columns(config_col_widths, vertical_alignment="bottom") + config_col_index = 0 + with config_cols[config_col_index]: st.selectbox( - tr("短剧类型"), - options=[code for code, _ in SHORT_DRAMA_TYPE_OPTIONS], - format_func=lambda code: tr(dict(SHORT_DRAMA_TYPE_OPTIONS).get(code, code)), - key="short_drama_type_option", + tr(summary_config["type_label_key"]), + options=type_options, + format_func=lambda code: tr(dict(summary_config["type_options"]).get(code, code)), + key=type_option_key, ) - with config_cols[1]: - custom_type_disabled = ( - st.session_state.get('short_drama_type_option', 'counterattack') != "custom" - ) - st.text_input( - tr("自定义短剧类型"), - key="short_drama_custom_type", - placeholder=tr("例如:豪门虐恋"), - disabled=custom_type_disabled, - ) - with config_cols[2]: + config_col_index += 1 + if show_custom_type: + with config_cols[config_col_index]: + st.text_input( + tr(summary_config["custom_type_label_key"]), + key=custom_type_key, + placeholder=tr(summary_config["custom_type_placeholder_key"]), + ) + 
config_col_index += 1 + with config_cols[config_col_index]: st.selectbox( tr("原片占比"), options=SHORT_DRAMA_ORIGINAL_SOUND_RATIO_OPTIONS, format_func=lambda ratio: f"{ratio}%", index=SHORT_DRAMA_ORIGINAL_SOUND_RATIO_OPTIONS.index(30), - key="short_drama_original_sound_ratio", + key=original_sound_ratio_key, ) - with config_cols[3]: + config_col_index += 1 + with config_cols[config_col_index]: st.selectbox( tr("解说语言"), options=[code for code, _ in SHORT_DRAMA_NARRATION_LANGUAGE_OPTIONS], format_func=lambda code: tr(dict(SHORT_DRAMA_NARRATION_LANGUAGE_OPTIONS).get(code, code)), - key="short_drama_narration_language_option", - ) - with config_cols[4]: - custom_language_disabled = ( - st.session_state.get('short_drama_narration_language_option', 'zh-CN') != "custom" - ) - st.text_input( - tr("自定义解说语言"), - key="short_drama_custom_narration_language", - placeholder=tr("例如:意大利语(意大利)"), - disabled=custom_language_disabled, + key=language_option_key, ) + config_col_index += 1 + if show_custom_language: + with config_cols[config_col_index]: + st.text_input( + tr("自定义解说语言"), + key=custom_language_key, + placeholder=tr("例如:意大利语(意大利)"), + ) action_cols = st.columns([1, 1], vertical_alignment="bottom") with action_cols[0]: narration_copy_clicked = st.button( tr("生成解说文案"), - key="short_drama_narration_copy_action", + key=_summary_state_key(summary_config, "narration_copy_action"), disabled=not script_path, use_container_width=True, ) @@ -1366,19 +1500,31 @@ def render_script_buttons(tr, params): narration_copy_clicked = False action_clicked = st.button(button_name, key="script_action", disabled=not script_path) - if script_path == "summary" and (narration_copy_clicked or action_clicked): - narration_language = _resolve_short_drama_narration_language() - drama_genre = _resolve_short_drama_type() - original_sound_ratio = int(st.session_state.get('short_drama_original_sound_ratio', 30)) + if script_path in SUMMARY_SCRIPT_MODES and (narration_copy_clicked or action_clicked): + 
summary_config = _summary_mode_config(script_path) + type_option_key = _summary_state_key(summary_config, "type_option") + custom_type_key = _summary_state_key(summary_config, "custom_type") + original_sound_ratio_key = _summary_state_key(summary_config, "original_sound_ratio") + language_option_key = _summary_state_key(summary_config, "narration_language_option") + custom_language_key = _summary_state_key(summary_config, "custom_narration_language") + narration_copy_key = _summary_state_key(summary_config, "narration_copy") + plot_analysis_key = _summary_state_key(summary_config, "plot_analysis") + plot_source_key = _summary_state_key(summary_config, "plot_analysis_subtitle_path") + plot_signature_key = _summary_state_key(summary_config, "plot_analysis_signature") + web_search_key = _summary_state_key(summary_config, "web_search_enabled") + + narration_language = _resolve_summary_narration_language(summary_config) + drama_genre = _resolve_summary_type(summary_config) + original_sound_ratio = int(st.session_state.get(original_sound_ratio_key, 30)) if ( - st.session_state.get('short_drama_type_option') == "custom" - and not str(st.session_state.get('short_drama_custom_type', '') or '').strip() + st.session_state.get(type_option_key) == "custom" + and not str(st.session_state.get(custom_type_key, '') or '').strip() ): - st.error(tr("请输入自定义短剧类型")) + st.error(tr(summary_config["custom_type_empty_key"])) st.stop() if ( - st.session_state.get('short_drama_narration_language_option') == "custom" - and not str(st.session_state.get('short_drama_custom_narration_language', '') or '').strip() + st.session_state.get(language_option_key) == "custom" + and not str(st.session_state.get(custom_language_key, '') or '').strip() ): st.error(tr("请输入自定义解说语言")) st.stop() @@ -1387,7 +1533,7 @@ def render_script_buttons(tr, params): subtitle_path = subtitle_paths[0] if subtitle_paths else None video_theme = st.session_state.get('video_theme') temperature = 
st.session_state.get('temperature') - web_search_enabled = bool(st.session_state.get('short_drama_web_search_enabled', False)) + web_search_enabled = bool(st.session_state.get(web_search_key, False)) current_signature = _short_drama_plot_analysis_signature( subtitle_paths, video_theme, @@ -1395,13 +1541,13 @@ def render_script_buttons(tr, params): _selected_video_paths(), ) plot_analysis = "" - if st.session_state.get('short_drama_plot_analysis_signature') == current_signature: - plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + if st.session_state.get(plot_signature_key) == current_signature: + plot_analysis = st.session_state.get(plot_analysis_key, '') elif ( not web_search_enabled - and st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path + and st.session_state.get(plot_source_key) == subtitle_path ): - plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + plot_analysis = st.session_state.get(plot_analysis_key, '') if narration_copy_clicked: with st.spinner(tr("Generating narration copy...")): @@ -1416,13 +1562,17 @@ def render_script_buttons(tr, params): video_paths=_selected_video_paths(), narration_language=narration_language, drama_genre=drama_genre, + prompt_category=summary_config["prompt_category"], + search_keywords=summary_config["search_keywords"], + empty_title_message_key=summary_config["empty_title_message_key"], + web_search_context_description=summary_config["web_search_context_description"], ) if copy_result: - st.session_state['short_drama_narration_copy'] = copy_result["narration_copy"] + st.session_state[narration_copy_key] = copy_result["narration_copy"] if not plot_analysis: - st.session_state['short_drama_plot_analysis'] = copy_result["plot_analysis"] - st.session_state['short_drama_plot_analysis_subtitle_path'] = subtitle_path - st.session_state['short_drama_plot_analysis_signature'] = current_signature + st.session_state[plot_analysis_key] = copy_result["plot_analysis"] + 
st.session_state[plot_source_key] = subtitle_path + st.session_state[plot_signature_key] = current_signature st.success(tr("Narration copy generated successfully")) if action_clicked: @@ -1437,20 +1587,25 @@ def render_script_buttons(tr, params): enable_web_search=web_search_enabled, video_paths=_selected_video_paths(), narration_language=narration_language, - narration_copy=st.session_state.get('short_drama_narration_copy', ''), + narration_copy=st.session_state.get(narration_copy_key, ''), drama_genre=drama_genre, original_sound_ratio=original_sound_ratio, + prompt_category=summary_config["prompt_category"], + search_keywords=summary_config["search_keywords"], + empty_title_message_key=summary_config["empty_title_message_key"], + web_search_context_description=summary_config["web_search_context_description"], ) - if script_path == "summary": + if script_path in SUMMARY_SCRIPT_MODES: + summary_config = _summary_mode_config(script_path) st.text_area( - tr("短剧解说文案"), - key="short_drama_narration_copy", + tr(summary_config["narration_copy_label_key"]), + key=_summary_state_key(summary_config, "narration_copy"), height=220, help=tr("Narration Copy Help"), ) - if action_clicked and script_path != "summary": + if action_clicked and script_path not in SUMMARY_SCRIPT_MODES: if script_path == "auto": # 执行纪录片视频脚本生成(视频无字幕无配音) generate_script_docu(params, tr) diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 0a8fb4b..24b2f0a 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -251,6 +251,7 @@ "Batch Size": "Batch Size", "Batch Size (More keyframes consume more tokens)": "Batch Size (smaller batches consume more tokens)", "Short Drama Summary": "Short Drama Summary", + "Film TV Narration": "Film/TV Narration", "Video Type": "Creation Type", "Select/Upload Script": "Custom Script", "原生Gemini模型连接成功": "Native Gemini model connection succeeded", @@ -266,6 +267,7 @@ "字幕文件内容似乎为空,请检查文件": "The subtitle file appears to be empty. 
Please check the file.", "字幕上传成功": "Subtitle uploaded successfully", "短剧名称": "Short Drama Name", + "影视名称": "Film/TV Title", "解说语言": "Narration Language", "自定义解说语言": "Custom Narration Language", "例如:意大利语(意大利)": "For example: Italian (Italy)", @@ -282,9 +284,13 @@ "自定义": "Custom", "短剧类型": "Short Drama Type", "自定义短剧类型": "Custom Short Drama Type", + "影视类型": "Film/TV Type", + "自定义影视类型": "Custom Film/TV Type", "原片占比": "Original Footage Ratio", "例如:豪门虐恋": "For example: billionaire angst romance", + "例如:悬疑犯罪": "For example: suspense crime", "请输入自定义短剧类型": "Please enter a custom short drama type", + "请输入自定义影视类型": "Please enter a custom film/TV type", "逆袭/复仇": "Counterattack / Revenge", "霸总/甜宠": "CEO Romance / Sweet Romance", "家庭伦理": "Family Ethics", @@ -292,9 +298,16 @@ "悬疑/犯罪": "Suspense / Crime", "都市情感": "Urban Romance", "年代/乡村": "Period / Rural", + "剧情/情感": "Drama / Emotion", + "动作/冒险": "Action / Adventure", + "喜剧/轻松": "Comedy / Light", + "科幻/奇幻": "Sci-Fi / Fantasy", + "历史/战争": "History / War", + "恐怖/惊悚": "Horror / Thriller", "生成解说文案": "Generate Narration Copy", "生成剪辑脚本": "Generate Editing Script", "短剧解说文案": "Short Drama Narration Copy", + "影视解说文案": "Film/TV Narration Copy", "Narration Copy Help": "Generate the narration copy first, review or rewrite it here, then generate the editing script to match footage and timestamps.", "Narration copy generated successfully": "Narration copy generated. Please review and edit it.", "生成短剧解说脚本": "Generate Short Drama Narration Script", @@ -459,12 +472,13 @@ "Transcribed subtitles storage hint": "Previously transcribed subtitles are saved in {path}; drag a file from that folder to upload", "Tavily Search Settings": "Tavily Web Search", "Tavily API Key": "Tavily API Key", - "Tavily API Key Help": "Used for web search before short drama plot analysis. 
When Web Search is enabled, the app searches plot, character, and episode context by drama name, then combines it with subtitles.", + "Tavily API Key Help": "Used for web search before plot analysis. When Web Search is enabled, the app searches plot, character, and background context by title, then combines it with subtitles.", "Tavily config saved": "Tavily configuration saved", "联网搜索": "Web Search", - "Enable Web Search Help": "When enabled, plot analysis searches the web with Tavily by short drama name before combining those results with subtitles.", + "Enable Web Search Help": "When enabled, plot analysis searches the web with Tavily by title before combining those results with subtitles.", "Please configure Tavily API Key in Basic Settings": "Please configure the Tavily API Key in Basic Settings first", "Please enter short drama name before web search": "Please enter the short drama name before enabling web search", + "Please enter film/tv title before web search": "Please enter the film/TV title before enabling web search", "Searching short drama with Tavily...": "Searching short drama context with Tavily...", "Tavily search failed": "Tavily search failed", "剧情理解": "Plot Analysis", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 1099604..539a6d1 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -241,6 +241,7 @@ "Batch Size": "批处理大小", "Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多", "Short Drama Summary": "短剧解说", + "Film TV Narration": "影视解说", "Video Type": "创作类型", "Select/Upload Script": "自定义脚本", "Script loaded successfully": "脚本加载成功", @@ -410,12 +411,13 @@ "Transcribed subtitles storage hint": "之前转录生成的字幕保存在 {path},可从该目录拖入上传", "Tavily Search Settings": "Tavily 联网搜索", "Tavily API Key": "Tavily API Key", - "Tavily API Key Help": "用于短剧剧情理解前的联网检索。开启“联网搜索”后,会先按短剧名称检索剧情、人物和分集信息,再结合字幕分析。", + "Tavily API Key Help": "用于剧情理解前的联网检索。开启“联网搜索”后,会先按作品名称检索剧情、人物和背景信息,再结合字幕分析。", "Tavily config saved": "Tavily 配置已保存", "联网搜索": 
"联网搜索", - "Enable Web Search Help": "开启后,剧情理解会先使用 Tavily 按短剧名称联网检索,再结合检索结果和字幕分析剧情。", + "Enable Web Search Help": "开启后,剧情理解会先使用 Tavily 按作品名称联网检索,再结合检索结果和字幕分析剧情。", "Please configure Tavily API Key in Basic Settings": "请先在基础设置中配置 Tavily API Key", "Please enter short drama name before web search": "开启联网搜索前,请先填写短剧名称", + "Please enter film/tv title before web search": "开启联网搜索前,请先填写影视名称", "Searching short drama with Tavily...": "正在使用 Tavily 检索短剧信息...", "Tavily search failed": "Tavily 检索失败", "剧情理解": "剧情理解", @@ -568,6 +570,7 @@ "字幕文件内容似乎为空,请检查文件": "字幕文件内容似乎为空,请检查文件", "字幕上传成功": "字幕上传成功", "短剧名称": "短剧名称", + "影视名称": "影视名称", "解说语言": "解说语言", "自定义解说语言": "自定义解说语言", "例如:意大利语(意大利)": "例如:意大利语(意大利)", @@ -584,9 +587,13 @@ "自定义": "自定义", "短剧类型": "短剧类型", "自定义短剧类型": "自定义短剧类型", + "影视类型": "影视类型", + "自定义影视类型": "自定义影视类型", "原片占比": "原片占比", "例如:豪门虐恋": "例如:豪门虐恋", + "例如:悬疑犯罪": "例如:悬疑犯罪", "请输入自定义短剧类型": "请输入自定义短剧类型", + "请输入自定义影视类型": "请输入自定义影视类型", "逆袭/复仇": "逆袭/复仇", "霸总/甜宠": "霸总/甜宠", "家庭伦理": "家庭伦理", @@ -594,9 +601,16 @@ "悬疑/犯罪": "悬疑/犯罪", "都市情感": "都市情感", "年代/乡村": "年代/乡村", + "剧情/情感": "剧情/情感", + "动作/冒险": "动作/冒险", + "喜剧/轻松": "喜剧/轻松", + "科幻/奇幻": "科幻/奇幻", + "历史/战争": "历史/战争", + "恐怖/惊悚": "恐怖/惊悚", "生成解说文案": "生成解说文案", "生成剪辑脚本": "生成剪辑脚本", "短剧解说文案": "短剧解说文案", + "影视解说文案": "影视解说文案", "Narration Copy Help": "先点击生成解说文案;审核、删改或重写这段文案后,再点击生成剪辑脚本匹配画面和时间戳。", "Narration copy generated successfully": "解说文案已生成,可先审核修改", "生成短剧解说脚本": "生成短剧解说脚本", diff --git a/webui/tools/generate_short_summary.py b/webui/tools/generate_short_summary.py index ab1e71b..468206d 100644 --- a/webui/tools/generate_short_summary.py +++ b/webui/tools/generate_short_summary.py @@ -25,7 +25,7 @@ from app.services.subtitle_text import read_subtitle_text from app.services.short_drama_narration_validation import ( normalize_script_video_sources, ) -from app.services.tavily_search import TavilySearchError, format_search_context, search_short_drama +from app.services.tavily_search import TavilySearchError, format_search_context, search_story_context # 导入新的LLM服务模块 
- 确保提供商被注册 import app.services.llm # 这会触发提供商注册 from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter @@ -33,6 +33,10 @@ import re PUBLIC_SCRIPT_FIELDS = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"] +SHORT_DRAMA_PROMPT_CATEGORY = "short_drama_narration" +FILM_TV_PROMPT_CATEGORY = "film_tv_narration" +SHORT_DRAMA_SEARCH_KEYWORDS = "短剧 剧情 介绍 人物 结局" +FILM_TV_SEARCH_KEYWORDS = "影视 剧情 介绍 人物 结局 电影 电视剧" def _normalize_paths(paths): @@ -197,10 +201,15 @@ def _get_tavily_api_key() -> str: ).strip() -def _build_tavily_context(short_name: str, tr=lambda key: key) -> str | None: - short_name = str(short_name or "").strip() - if not short_name: - st.error(tr("Please enter short drama name before web search")) +def _build_tavily_context( + title: str, + tr=lambda key: key, + search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS, + empty_title_message_key: str = "Please enter short drama name before web search", +) -> str | None: + title = str(title or "").strip() + if not title: + st.error(tr(empty_title_message_key)) return None api_key = _get_tavily_api_key() @@ -209,9 +218,11 @@ def _build_tavily_context(short_name: str, tr=lambda key: key) -> str | None: return None try: - search_data = search_short_drama( - short_name, + search_data = search_story_context( + title, api_key, + search_keywords=search_keywords, + empty_name_message=tr(empty_title_message_key), search_depth=config.app.get("tavily_search_depth", "basic"), max_results=config.app.get("tavily_max_results", 5), ) @@ -231,17 +242,25 @@ def _build_plot_analysis_input( short_name: str = "", enable_web_search: bool = False, tr=lambda key: key, + search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS, + empty_title_message_key: str = "Please enter short drama name before web search", + web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概", ) -> str | None: subtitle_content = str(subtitle_content or "").strip() if not enable_web_search: return subtitle_content - 
tavily_context = _build_tavily_context(short_name, tr) + tavily_context = _build_tavily_context( + short_name, + tr, + search_keywords=search_keywords, + empty_title_message_key=empty_title_message_key, + ) if tavily_context is None: return None return f"""# 分析补充说明 -请先参考 Tavily 联网检索结果理解短剧名称、人物关系、剧情背景和公开剧情梗概,再结合原始字幕完成剧情理解。 +请先参考 Tavily 联网检索结果理解{web_search_context_description},再结合原始字幕完成剧情理解。 如果联网检索结果与字幕内容冲突,请以字幕内容为准;时间戳必须只从字幕内容中提取。 {tavily_context} @@ -258,6 +277,10 @@ def analyze_short_drama_plot( short_name: str = "", enable_web_search: bool = False, video_paths=None, + prompt_category: str = SHORT_DRAMA_PROMPT_CATEGORY, + search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS, + empty_title_message_key: str = "Please enter short drama name before web search", + web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概", ): """仅执行短剧字幕剧情理解,返回可编辑的剧情分析文本。""" subtitle_paths = _normalize_paths(subtitle_path) @@ -287,13 +310,22 @@ def analyze_short_drama_plot( short_name=short_name, enable_web_search=enable_web_search, tr=tr, + search_keywords=search_keywords, + empty_title_message_key=empty_title_message_key, + web_search_context_description=web_search_context_description, ) if plot_analysis_input is None: return None try: logger.info("使用新的LLM服务架构进行字幕分析") - analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) + analyzer = SubtitleAnalyzerAdapter( + text_api_key, + text_model, + text_base_url, + text_provider, + prompt_category=prompt_category, + ) analysis_result = analyzer.analyze_subtitle(plot_analysis_input) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") @@ -304,7 +336,8 @@ def analyze_short_drama_plot( base_url=text_base_url, save_result=True, temperature=temperature, - provider=text_provider + provider=text_provider, + prompt_category=prompt_category, ) if analysis_result["status"] != "success": @@ -326,6 +359,10 @@ def generate_short_drama_narration_copy( video_paths=None, narration_language: str = 
"简体中文(中国)", drama_genre: str = "逆袭/复仇", + prompt_category: str = SHORT_DRAMA_PROMPT_CATEGORY, + search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS, + empty_title_message_key: str = "Please enter short drama name before web search", + web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概", ): """生成可由用户审核修改的短剧解说正文,不绑定时间戳。""" subtitle_paths = _normalize_paths(subtitle_path) @@ -356,6 +393,10 @@ def generate_short_drama_narration_copy( short_name=video_theme, enable_web_search=enable_web_search, video_paths=selected_video_paths, + prompt_category=prompt_category, + search_keywords=search_keywords, + empty_title_message_key=empty_title_message_key, + web_search_context_description=web_search_context_description, ) if not analysis_text: return None @@ -367,7 +408,13 @@ def generate_short_drama_narration_copy( try: logger.info("使用新的LLM服务架构生成可审核解说文案") - analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) + analyzer = SubtitleAnalyzerAdapter( + text_api_key, + text_model, + text_base_url, + text_provider, + prompt_category=prompt_category, + ) narration_result = analyzer.generate_narration_copy( short_name=video_theme, plot_analysis=analysis_text, @@ -389,6 +436,7 @@ def generate_short_drama_narration_copy( provider=text_provider, narration_language=narration_language, drama_genre=drama_genre, + prompt_category=prompt_category, ) if narration_result.get("status") != "success": @@ -423,6 +471,10 @@ def generate_script_short_sunmmary( narration_copy: str = "", drama_genre: str = "逆袭/复仇", original_sound_ratio: int = 30, + prompt_category: str = SHORT_DRAMA_PROMPT_CATEGORY, + search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS, + empty_title_message_key: str = "Please enter short drama name before web search", + web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概", ): """ 生成 短剧解说 视频脚本 @@ -536,7 +588,13 @@ def generate_script_short_sunmmary( st.error(tr("Please generate and review narration copy first")) return - analyzer = 
SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) + analyzer = SubtitleAnalyzerAdapter( + text_api_key, + text_model, + text_base_url, + text_provider, + prompt_category=prompt_category, + ) if plot_analysis and str(plot_analysis).strip(): logger.info("使用用户编辑后的剧情理解结果匹配剪辑脚本") analysis_result = { @@ -552,6 +610,9 @@ def generate_script_short_sunmmary( short_name=video_theme, enable_web_search=True, tr=tr, + search_keywords=search_keywords, + empty_title_message_key=empty_title_message_key, + web_search_context_description=web_search_context_description, ) if plot_analysis_input is None: return @@ -572,7 +633,8 @@ def generate_script_short_sunmmary( base_url=text_base_url, save_result=True, temperature=temperature, - provider=text_provider + provider=text_provider, + prompt_category=prompt_category, ) """ 3. 根据用户审核后的文案匹配画面与时间戳 @@ -612,6 +674,7 @@ def generate_script_short_sunmmary( narration_language=narration_language, drama_genre=drama_genre, original_sound_ratio=original_sound_ratio, + prompt_category=prompt_category, ) if narration_result["status"] == "success": From 8e4271c2ceee5b86dd6a651d08f69470a85f03e9 Mon Sep 17 00:00:00 2001 From: viccy Date: Mon, 8 Jun 2026 01:58:40 +0800 Subject: [PATCH 19/24] =?UTF-8?q?perf(clip=5Fvideo):=20=E4=BC=98=E5=8C=96F?= =?UTF-8?q?Fmpeg=E5=89=AA=E8=BE=91=E5=91=BD=E4=BB=A4=E4=B8=BA=E5=BF=AB?= =?UTF-8?q?=E9=80=9F=E6=90=9C=E7=B4=A2=E6=A8=A1=E5=BC=8F=EF=BC=8C=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 优化了视频剪辑的FFmpeg命令参数顺序,将原本后置`-ss`的慢搜索改为前置`-ss`的快速搜索模式,大幅减少长视频剪辑时的不必要解码开销。重构了时间处理逻辑,新增辅助函数统一转换时间格式与计算裁剪时长,更新了所有兼容降级的编码命令以适配新参数格式,同时新增单元测试验证命令参数的正确性。 --- app/services/clip_video.py | 60 +++++++++++++------ ...test_subtitle_adapter_pipeline_unittest.py | 38 ++++++++++++ .../film_tv_narration/script_generation.py | 6 +- .../film_tv_narration/script_matching.py | 18 +++--- 
.../film_tv_narration/script_repair.py | 22 +++---- .../film_tv_narration/segment_planning.py | 22 +++---- ...est_multi_video_script_sources_unittest.py | 17 ++++++ 7 files changed, 135 insertions(+), 48 deletions(-) diff --git a/app/services/clip_video.py b/app/services/clip_video.py index 93f9ddd..a8235f3 100644 --- a/app/services/clip_video.py +++ b/app/services/clip_video.py @@ -32,6 +32,32 @@ def parse_timestamp(timestamp: str) -> tuple: return start_time, end_time +def _ffmpeg_time_to_seconds(time_value: str) -> float: + normalized_time = str(time_value).strip().replace(",", ".") + parts = normalized_time.split(":") + + if len(parts) == 3: + hours, minutes, seconds = parts + return int(hours) * 3600 + int(minutes) * 60 + float(seconds) + if len(parts) == 2: + minutes, seconds = parts + return int(minutes) * 60 + float(seconds) + return float(normalized_time) + + +def _calculate_ffmpeg_duration(start_time: str, end_time: str) -> str: + duration = _ffmpeg_time_to_seconds(end_time) - _ffmpeg_time_to_seconds(start_time) + if duration <= 0: + raise ValueError(f"无效的视频裁剪时间范围: {start_time} -> {end_time}") + + return f"{duration:.3f}".rstrip("0").rstrip(".") + + +def _append_fast_seek_input(cmd: List[str], input_path: str, start_time: str, end_time: str) -> None: + duration = _calculate_ffmpeg_duration(start_time, end_time) + cmd.extend(["-ss", start_time, "-i", input_path, "-t", duration]) + + def _normalize_video_origin_paths( video_origin_path: str, video_origin_paths: Optional[List[str]] = None, @@ -253,11 +279,8 @@ def build_ffmpeg_command( # 对于其他编码器,可以使用硬件解码参数 cmd.extend(hwaccel_args) - # 输入文件 - cmd.extend(["-i", input_path]) - - # 时间范围 - cmd.extend(["-ss", start_time, "-to", end_time]) + # 快速定位输入文件,避免长视频从头解码到目标片段 + _append_fast_seek_input(cmd, input_path, start_time, end_time) # 编码器设置 cmd.extend(["-c:v", encoder_config["video_codec"]]) @@ -439,11 +462,12 @@ def try_compatibility_fallback( bool: 是否成功 """ # 兼容性模式:避免所有可能的滤镜链问题 + duration = 
_calculate_ffmpeg_duration(start_time, end_time) fallback_cmd = [ "ffmpeg", "-y", "-hide_banner", "-loglevel", "error", - "-i", input_path, "-ss", start_time, - "-to", end_time, + "-i", input_path, + "-t", duration, "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", # 明确指定像素格式 @@ -480,11 +504,12 @@ def try_software_fallback( bool: 是否成功 """ # 纯软件编码 + duration = _calculate_ffmpeg_duration(start_time, end_time) fallback_cmd = [ "ffmpeg", "-y", "-hide_banner", "-loglevel", "error", - "-i", input_path, "-ss", start_time, - "-to", end_time, + "-i", input_path, + "-t", duration, "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", @@ -520,11 +545,12 @@ def try_basic_fallback( bool: 是否成功 """ # 最基本的编码参数 + duration = _calculate_ffmpeg_duration(start_time, end_time) fallback_cmd = [ "ffmpeg", "-y", "-hide_banner", "-loglevel", "error", - "-i", input_path, "-ss", start_time, - "-to", end_time, + "-i", input_path, + "-t", duration, "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", @@ -603,11 +629,12 @@ def try_fallback_encoding( bool: 是否成功 """ # 最简单的软件编码命令 + duration = _calculate_ffmpeg_duration(start_time, end_time) fallback_cmd = [ "ffmpeg", "-y", - "-i", input_path, "-ss", start_time, - "-to", end_time, + "-i", input_path, + "-t", duration, "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", @@ -801,11 +828,8 @@ def _build_ffmpeg_command_with_audio_control( elif hwaccel_args: cmd.extend(hwaccel_args) - # 输入文件 - cmd.extend(["-i", input_path]) - - # 时间范围 - cmd.extend(["-ss", start_time, "-to", end_time]) + # 快速定位输入文件,避免长视频从头解码到目标片段 + _append_fast_seek_input(cmd, input_path, start_time, end_time) # 视频编码器设置 cmd.extend(["-c:v", encoder_config["video_codec"]]) diff --git a/app/services/llm/test_subtitle_adapter_pipeline_unittest.py b/app/services/llm/test_subtitle_adapter_pipeline_unittest.py index c9ed3f9..77087e6 100644 --- a/app/services/llm/test_subtitle_adapter_pipeline_unittest.py +++ b/app/services/llm/test_subtitle_adapter_pipeline_unittest.py @@ 
-56,6 +56,44 @@ class SubtitleAnalyzerAdapterPipelineTests(unittest.TestCase): self.assertIn("用户选择的影视类型", call.call_args.kwargs["prompt"]) self.assertNotIn("短剧解说正文创作任务", call.call_args.kwargs["prompt"]) + def test_film_tv_script_prompts_exclude_intro_outro_and_ads(self): + base_parameters = { + "drama_name": "测试电影", + "drama_genre": "悬疑/犯罪", + "plot_analysis": "主角发现证据疑点。", + "subtitle_content": "# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n证据不对。", + "narration_language": "简体中文(中国)", + } + prompt_parameters = { + "segment_planning": base_parameters, + "script_matching": { + **base_parameters, + "narration_copy": "他发现证据不对,真正的凶手另有其人。", + "original_sound_ratio": 30, + }, + "script_generation": { + **base_parameters, + "segment_plan": '{"segments": []}', + }, + "script_repair": { + **base_parameters, + "invalid_script": '{"items": []}', + "validation_errors": "片段包含广告", + }, + } + + for prompt_name, parameters in prompt_parameters.items(): + with self.subTest(prompt_name=prompt_name): + prompt = PromptManager.get_prompt( + category="film_tv_narration", + name=prompt_name, + parameters=parameters, + ) + self.assertIn("片头", prompt) + self.assertIn("片尾", prompt) + self.assertIn("广告", prompt) + self.assertIn("绝对不能", prompt) + def test_match_narration_copy_to_script_uses_json_prompt_with_selected_type(self): adapter = SubtitleAnalyzerAdapter( api_key="sk-test", diff --git a/app/services/prompts/film_tv_narration/script_generation.py b/app/services/prompts/film_tv_narration/script_generation.py index c945334..e1ca2d1 100644 --- a/app/services/prompts/film_tv_narration/script_generation.py +++ b/app/services/prompts/film_tv_narration/script_generation.py @@ -77,8 +77,9 @@ ${drama_genre} OST=1 的原声片段 narration 字段必须继续使用“播放原片+序号”格式,不要翻译这个固定标记。 ## 绝对绑定规则 -1. 输出 items 数量、顺序和 _id 必须与 segment_plan 完全一致。 -2. 每个 item 的 _id、video_id、video_name、timestamp、OST 必须逐字复制 segment_plan,不得新增、删除、合并、拆分或改动。 +0. 
最高优先级:如果 segment_plan 中混入片头、片尾、演职员表、版权声明、平台水印展示、下集预告、花絮、赞助口播、商品露出、贴片广告、中插广告、片中广告或任何与主线剧情无关的推广片段,必须直接删除这些片段,绝对不能输出到最终 items;此规则高于下面所有“照抄 segment_plan”的绑定规则。 +1. 除被第 0 条删除的片头、片尾和广告片段外,输出 items 数量、顺序和 _id 必须与 segment_plan 完全一致。 +2. 除被第 0 条删除的片头、片尾和广告片段外,每个 item 的 _id、video_id、video_name、timestamp、OST 必须逐字复制 segment_plan,不得新增、合并、拆分或改动。 3. 你只能补充 picture 和 narration 两个字段。 4. OST=1 的 narration 必须写成“播放原片+_id”,例如 _id 为 5 时写“播放原片5”。 5. OST=0 的 narration 必须使用 ${narration_language},并严格基于剧情和字幕,不虚构字幕外的具体事件。 @@ -116,6 +117,7 @@ OST=1 的原声片段 narration 字段必须继续使用“播放原片+序号 ## 文案质量要求 - 开场片段要有强钩子,直接点出冲突、疑点或人物困境。 +- 最终剪辑脚本不得包含片头、片尾或任何广告片段;如果字幕内容明显属于非剧情推广,不要把它包装成剧情解说。 - 每段解说优先 25-90 字,具体长度必须服从画面时长;短画面宁可少说,不要密集灌信息。 - 可以使用“可真正的问题是”“而他还不知道”“这句话背后”“危险已经开始靠近”等影视解说转折语,但不要堆砌。 - picture 要描述画面和人物状态,便于后期识别素材。 diff --git a/app/services/prompts/film_tv_narration/script_matching.py b/app/services/prompts/film_tv_narration/script_matching.py index 9577e49..9dfa528 100644 --- a/app/services/prompts/film_tv_narration/script_matching.py +++ b/app/services/prompts/film_tv_narration/script_matching.py @@ -85,14 +85,16 @@ ${original_sound_ratio}% 1. 先按句号、问号、感叹号、省略号切分解说文案,得到候选解说句。 2. 逗号只在明显分割两个动作、场景、观点或描述对象时切分;不要切出没有独立意义的碎片。 3. 不要求每个候选句都单独输出为 OST=0;可以合并、压缩相邻候选句作为剧情桥段,但不能改变用户文案的核心意思。 -4. 为每个解说片段寻找最匹配的原始字幕画面,优先选择能表达该句核心含义、人物状态或信息转折的画面。 -5. 使用公式估算所需画面时长:所需秒数 = 解说字数 / 5。匹配画面时长尽量接近,误差优先控制在 ±0.5 秒。 -6. 如果一句解说太长,必须拆成多个 OST=0 片段,分别匹配不同或连续画面。 -7. timestamp 必须使用对应 video_id 内部局部时间戳,不得换算为多个视频拼接后的累计时间。 -8. 同一 video_id 内时间段不得交叉或重叠。 -9. 第一段必须是 OST=0 解说钩子,不能直接播放原片。 -10. OST=1 原声片段的总时长占比要尽量接近用户选择的 ${original_sound_ratio}%。这里按最终 items 的 timestamp 总时长估算,不按片段数量估算。 -11. 不要自行判断或改写影视类型;画面匹配和 picture 描述要服务用户选择的 ${drama_genre} 叙事重点。 +4. 严禁把解说文案匹配到片头、片尾、演职员表、版权声明、平台水印展示、下集预告、花絮、赞助口播、商品露出、贴片广告、中插广告、片中广告或任何与主线剧情无关的推广片段;这些内容绝对不能进入最终 items。 +5. 如果字幕或画面文字出现“广告”“赞助”“推广”“片头”“片尾”“预告”“下集”“扫码”“购买”“会员”“关注”等明显非剧情信号,必须跳过对应时间段,不得用作 OST=0 或 OST=1。 +6. 为每个解说片段寻找最匹配的原始字幕画面,优先选择能表达该句核心含义、人物状态或信息转折的画面。 +7. 
使用公式估算所需画面时长:所需秒数 = 解说字数 / 5。匹配画面时长尽量接近,误差优先控制在 ±0.5 秒。 +8. 如果一句解说太长,必须拆成多个 OST=0 片段,分别匹配不同或连续画面。 +9. timestamp 必须使用对应 video_id 内部局部时间戳,不得换算为多个视频拼接后的累计时间。 +10. 同一 video_id 内时间段不得交叉或重叠。 +11. 第一段必须是 OST=0 解说钩子,不能直接播放原片。 +12. OST=1 原声片段的总时长占比要尽量接近用户选择的 ${original_sound_ratio}%。这里按最终 items 的 timestamp 总时长估算,不按片段数量估算。 +13. 不要自行判断或改写影视类型;画面匹配和 picture 描述要服务用户选择的 ${drama_genre} 叙事重点。 ## 原片占比规则 - ${original_sound_ratio}% = 0% 时,不要输出 OST=1,全部使用解说承接。 diff --git a/app/services/prompts/film_tv_narration/script_repair.py b/app/services/prompts/film_tv_narration/script_repair.py index cdd9c88..ad7e3f5 100644 --- a/app/services/prompts/film_tv_narration/script_repair.py +++ b/app/services/prompts/film_tv_narration/script_repair.py @@ -82,15 +82,17 @@ ${drama_genre} 1. 只输出 JSON,不要任何解释、标题、Markdown 或代码块。 2. 输出根对象必须是 {"items": [...]}。 3. 每个 item 必须包含 _id、video_id、video_name、timestamp、picture、narration、OST。 -4. video_id、video_name 和 timestamp 必须来自对应字幕窗口;不得把不同视频的同名时间戳混用。 -5. 同一 video_id 内片段不得交叉或重叠。 -6. OST=1 的 narration 必须是“播放原片+序号”;OST=0 的 narration 必须使用 ${narration_language}。 -7. 禁止连续 3 个或更多 OST=1;必须插入或改写 OST=0 解说片段承接剧情。 -8. 跨 video_id 切换前后不能都是 OST=1;必须至少有一个 OST=0 片段解释场景和剧情为什么切换。 -9. OST=0 narration 要补足人物动机、信息承接和因果转折,不要只概括当前画面。 -10. 第一段必须是 OST=0 解说钩子,按“人物困境 + 反常信息 + 悬念问题”写,不要直接播放原片。 -11. OST=0 文案必须匹配画面时长,按“解说字数 / 5 = 所需视频秒数”估算;过密时要缩短文案、延长时间戳或拆成多个片段。 -12. 不要自行改判影视类型;如需改写 narration,必须按用户选择的 ${drama_genre} 保持表达重点。 -13. 尽量保留原脚本中没有错误的片段;无法修复的片段可以删除,但剩余片段必须重新按 1 开始编号。 +4. 必须删除片头、片尾、演职员表、版权声明、平台水印展示、下集预告、花絮、赞助口播、商品露出、贴片广告、中插广告、片中广告或任何与主线剧情无关的推广片段;这些内容绝对不能出现在修复后的 items 中。 +5. 如果字幕或画面文字出现“广告”“赞助”“推广”“片头”“片尾”“预告”“下集”“扫码”“购买”“会员”“关注”等明显非剧情信号,必须删除对应 item,不得改写成解说片段。 +6. video_id、video_name 和 timestamp 必须来自对应字幕窗口;不得把不同视频的同名时间戳混用。 +7. 同一 video_id 内片段不得交叉或重叠。 +8. OST=1 的 narration 必须是“播放原片+序号”;OST=0 的 narration 必须使用 ${narration_language}。 +9. 禁止连续 3 个或更多 OST=1;必须插入或改写 OST=0 解说片段承接剧情。 +10. 跨 video_id 切换前后不能都是 OST=1;必须至少有一个 OST=0 片段解释场景和剧情为什么切换。 +11. OST=0 narration 要补足人物动机、信息承接和因果转折,不要只概括当前画面。 +12. 
第一段必须是 OST=0 解说钩子,按“人物困境 + 反常信息 + 悬念问题”写,不要直接播放原片。 +13. OST=0 文案必须匹配画面时长,按“解说字数 / 5 = 所需视频秒数”估算;过密时要缩短文案、延长时间戳或拆成多个片段。 +14. 不要自行改判影视类型;如需改写 narration,必须按用户选择的 ${drama_genre} 保持表达重点。 +15. 尽量保留原脚本中没有错误的片段;无法修复的片段可以删除,但剩余片段必须重新按 1 开始编号。 请输出修复后的完整 JSON。""" diff --git a/app/services/prompts/film_tv_narration/segment_planning.py b/app/services/prompts/film_tv_narration/segment_planning.py index a1da09e..f598b9c 100644 --- a/app/services/prompts/film_tv_narration/segment_planning.py +++ b/app/services/prompts/film_tv_narration/segment_planning.py @@ -71,16 +71,18 @@ ${drama_genre} 1. 只能使用原始字幕中真实存在的视频编号、视频文件名和时间范围。 2. timestamp 必须是对应 video_id 内部的局部时间戳,禁止换算成多个视频拼接后的累计时间。 3. 同一个 video_id 内的片段不得交叉或重叠;尽量按故事顺序排列。 -4. 每个片段必须推动主线、解释人物动机、制造情绪转折、承接原声或保留关键对白。 -5. OST=1 表示保留原声,适合关键对白、情绪爆发、真相揭露、名场面和反转;OST=0 表示后续需要配解说。 -6. 原声片段单段优先控制在 3-10 秒;解说片段可以更长,但必须能从字幕范围中定位。 -7. 影视类型由用户手动选择为 ${drama_genre},不得自行改判;选片段时优先服务该类型的主要看点。 -8. 禁止连续 3 个或更多 OST=1;每 1-2 个原声片段后必须安排 OST=0 解说片段承接剧情。 -9. 跨 video_id 切换前后必须至少有一个 OST=0 片段作为剧情桥段,解释为什么从上一场转到下一场。 -10. 每个 OST=0 片段必须承担明确叙事功能:开场钩子、人物介绍、因果过渡、信息解释、情绪转折、冲突升级、结尾悬念。 -11. 不要跳过关键因果;关系变化、线索发现、危机升级必须有画面或解说桥段承接。 -12. 结尾优先选择能留下新问题、新危险或人物选择的片段,不要只停在原声对白堆叠上。 -13. 解说画面必须给足时长:按“解说字数 / 5 = 所需视频秒数”预估,短画面不要承载长解说。 +4. 严禁选择片头、片尾、演职员表、版权声明、平台水印展示、下集预告、花絮、赞助口播、商品露出、贴片广告、中插广告、片中广告或任何与主线剧情无关的推广片段;这些内容绝对不能进入 segments。 +5. 如果字幕或画面文字出现“广告”“赞助”“推广”“片头”“片尾”“预告”“下集”“扫码”“购买”“会员”“关注”等明显非剧情信号,必须整段跳过,不得用作 OST=0 或 OST=1。 +6. 每个片段必须推动主线、解释人物动机、制造情绪转折、承接原声或保留关键对白。 +7. OST=1 表示保留原声,适合关键对白、情绪爆发、真相揭露、名场面和反转;OST=0 表示后续需要配解说。 +8. 原声片段单段优先控制在 3-10 秒;解说片段可以更长,但必须能从字幕范围中定位。 +9. 影视类型由用户手动选择为 ${drama_genre},不得自行改判;选片段时优先服务该类型的主要看点。 +10. 禁止连续 3 个或更多 OST=1;每 1-2 个原声片段后必须安排 OST=0 解说片段承接剧情。 +11. 跨 video_id 切换前后必须至少有一个 OST=0 片段作为剧情桥段,解释为什么从上一场转到下一场。 +12. 每个 OST=0 片段必须承担明确叙事功能:开场钩子、人物介绍、因果过渡、信息解释、情绪转折、冲突升级、结尾悬念。 +13. 不要跳过关键因果;关系变化、线索发现、危机升级必须有画面或解说桥段承接。 +14. 结尾优先选择能留下新问题、新危险或人物选择的片段,不要只停在原声对白堆叠上。 +15. 
解说画面必须给足时长:按“解说字数 / 5 = 所需视频秒数”预估,短画面不要承载长解说。 ## 输出格式 只输出严格 JSON: diff --git a/app/services/test_multi_video_script_sources_unittest.py b/app/services/test_multi_video_script_sources_unittest.py index dd6fce8..f4b7c0f 100644 --- a/app/services/test_multi_video_script_sources_unittest.py +++ b/app/services/test_multi_video_script_sources_unittest.py @@ -9,6 +9,23 @@ from app.utils import check_script class TestMultiVideoScriptSources(unittest.TestCase): + def test_clip_command_uses_input_fast_seek(self): + encoder_config = clip_video.get_safe_encoder_config(None) + + cmd = clip_video._build_ffmpeg_command_with_audio_control( + input_path="/tmp/input.mp4", + output_path="/tmp/output.mp4", + start_time="00:27:32.240", + end_time="00:27:38.240", + encoder_config=encoder_config, + hwaccel_args=[], + remove_audio=False, + ) + + self.assertLess(cmd.index("-ss"), cmd.index("-i")) + self.assertEqual("6", cmd[cmd.index("-t") + 1]) + self.assertNotIn("-to", cmd) + def test_check_format_accepts_optional_video_source_fields(self): script = [ { From dc12f390bb83ff0afe5badee52787b58ae38ff91 Mon Sep 17 00:00:00 2001 From: viccy Date: Mon, 8 Jun 2026 13:05:30 +0800 Subject: [PATCH 20/24] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E5=8E=9F?= =?UTF-8?q?=E7=89=87=E5=AD=97=E5=B9=95=E6=94=AF=E6=8C=81=E5=B9=B6=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E8=A7=86=E9=A2=91=E5=90=88=E5=B9=B6=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 为VideoClipParams新增原字幕路径配置字段,支持单条/多条字幕路径 - 完善webui参数获取逻辑,处理字幕路径兼容性并对接前端选择 - 重构后端字幕处理流程,支持自动匹配视频对应原字幕,合并原声字幕 - 优化视频合并逻辑,新增ffmpeg无损copy合并判断,自动回退重编码提升效率 - 新增ffmpeg快速素材合并路径,支持自定义字幕样式与多音轨混合 - 新增多个单元测试覆盖字幕匹配、合并及视频合并场景 --- app/models/schema.py | 2 + app/services/generate_video.py | 1047 ++++++++++++++++- app/services/merger_video.py | 200 +++- app/services/script_subtitle.py | 252 +++- app/services/task.py | 158 ++- .../test_merger_video_concat_unittest.py | 120 ++ app/services/test_script_subtitle_unittest.py | 98 ++ 
.../test_task_subtitle_resolution_unittest.py | 46 + webui.py | 11 + webui/components/script_settings.py | 3 + 10 files changed, 1863 insertions(+), 74 deletions(-) create mode 100644 app/services/test_merger_video_concat_unittest.py create mode 100644 app/services/test_task_subtitle_resolution_unittest.py diff --git a/app/models/schema.py b/app/models/schema.py index b492bb1..5b16143 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -165,6 +165,8 @@ class VideoClipParams(BaseModel): video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径") video_origin_path: Optional[str] = Field(default="", description="原视频路径") video_origin_paths: Optional[List[str]] = Field(default=[], description="原视频路径列表") + original_subtitle_path: Optional[str] = Field(default="", description="原视频字幕路径") + original_subtitle_paths: Optional[List[str]] = Field(default=[], description="原视频字幕路径列表") video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例") video_language: Optional[str] = Field(default="zh-CN", description="视频语言") diff --git a/app/services/generate_video.py b/app/services/generate_video.py index d66166b..cca0c04 100644 --- a/app/services/generate_video.py +++ b/app/services/generate_video.py @@ -9,6 +9,10 @@ ''' import os +import json +import re +import shlex +import subprocess import traceback import tempfile from typing import Optional, Dict, Any @@ -49,6 +53,9 @@ SUBTITLE_MASK_DEFAULTS = { }, } +_FFMPEG_FILTER_CACHE: Dict[tuple[str, str], bool] = {} +_FFMPEG_ENCODER_CACHE: Dict[tuple[str, str], bool] = {} + def _clamp(value, minimum, maximum): return min(max(value, minimum), maximum) @@ -266,6 +273,924 @@ def is_valid_subtitle_file(subtitle_path: str) -> bool: return False +def _has_existing_file(file_path: Optional[str]) -> bool: + return bool(file_path and os.path.exists(file_path)) + + +def _get_ffmpeg_binary() -> str: + for env_name in ("NARRATO_FFMPEG_EXE", "IMAGEIO_FFMPEG_EXE"): + candidate 
= os.environ.get(env_name, "").strip() + if candidate and os.path.isfile(candidate): + return candidate + + try: + import imageio_ffmpeg + + candidate = imageio_ffmpeg.get_ffmpeg_exe() + if candidate and os.path.isfile(candidate): + return candidate + except Exception as e: + logger.debug(f"未找到 imageio-ffmpeg 二进制: {e}") + + return "ffmpeg" + + +def _get_ffprobe_binary(ffmpeg_binary: Optional[str] = None) -> str: + for env_name in ("NARRATO_FFPROBE_EXE", "IMAGEIO_FFPROBE_EXE"): + candidate = os.environ.get(env_name, "").strip() + if candidate and os.path.isfile(candidate): + return candidate + + if ffmpeg_binary: + sibling = os.path.join(os.path.dirname(ffmpeg_binary), "ffprobe") + if os.path.isfile(sibling): + return sibling + + return "ffprobe" + + +def _check_ffmpeg_binary(ffmpeg_binary: str) -> bool: + try: + subprocess.run( + [ffmpeg_binary, "-version"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) + return True + except (subprocess.SubprocessError, FileNotFoundError) as e: + logger.error(f"ffmpeg 不可用: {ffmpeg_binary}, {e}") + return False + + +def _format_ffmpeg_float(value: float) -> str: + return f"{float(value):.3f}".rstrip("0").rstrip(".") + + +def _quote_filter_value(value: str) -> str: + escaped = str(value).replace("\\", "\\\\").replace("'", "\\'") + return f"'{escaped}'" + + +def _probe_video(video_path: str) -> Dict[str, Any]: + ffmpeg_binary = _get_ffmpeg_binary() + ffprobe_binary = _get_ffprobe_binary(ffmpeg_binary) + cmd = [ + ffprobe_binary, + "-v", + "error", + "-print_format", + "json", + "-show_streams", + "-show_format", + video_path, + ] + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError(f"ffprobe 读取视频失败: {result.stderr.strip()}") + + data = json.loads(result.stdout or "{}") + streams = data.get("streams", []) + video_stream = next((stream for stream in streams if stream.get("codec_type") == "video"), 
None) + if not video_stream: + raise RuntimeError("ffprobe 未找到视频流") + + duration = ( + video_stream.get("duration") + or data.get("format", {}).get("duration") + or 0 + ) + duration = float(duration) + if duration <= 0: + raise RuntimeError("ffprobe 未获取到有效视频时长") + + return { + "width": int(video_stream["width"]), + "height": int(video_stream["height"]), + "duration": duration, + "has_audio": any(stream.get("codec_type") == "audio" for stream in streams), + } + + +def _ffmpeg_filter_available(filter_name: str) -> bool: + ffmpeg_binary = _get_ffmpeg_binary() + cache_key = (ffmpeg_binary, filter_name) + if cache_key in _FFMPEG_FILTER_CACHE: + return _FFMPEG_FILTER_CACHE[cache_key] + + try: + result = subprocess.run( + [ffmpeg_binary, "-hide_banner", "-filters"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + available = False + if result.returncode == 0: + for line in result.stdout.splitlines(): + parts = line.split() + if len(parts) >= 2 and parts[1] == filter_name: + available = True + break + _FFMPEG_FILTER_CACHE[cache_key] = available + return available + except Exception: + _FFMPEG_FILTER_CACHE[cache_key] = False + return False + + +def _ffmpeg_encoder_available(encoder_name: str) -> bool: + ffmpeg_binary = _get_ffmpeg_binary() + cache_key = (ffmpeg_binary, encoder_name) + if cache_key in _FFMPEG_ENCODER_CACHE: + return _FFMPEG_ENCODER_CACHE[cache_key] + + try: + result = subprocess.run( + [ffmpeg_binary, "-hide_banner", "-encoders"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + available = result.returncode == 0 and encoder_name in result.stdout + _FFMPEG_ENCODER_CACHE[cache_key] = available + return available + except Exception: + _FFMPEG_ENCODER_CACHE[cache_key] = False + return False + + +def _select_compatible_encoder(preferred_encoder: str) -> str: + if _ffmpeg_encoder_available(preferred_encoder): + return preferred_encoder + logger.warning(f"当前 ffmpeg 二进制不支持编码器 
{preferred_encoder},回退 libx264") + return "libx264" + + +def _srt_timestamp_to_seconds(timestamp: str) -> float: + match = re.match( + r"(?P\d{2}):(?P\d{2}):(?P\d{2}),(?P\d{3})", + timestamp.strip(), + ) + if not match: + raise ValueError(f"无效 SRT 时间戳: {timestamp}") + parts = {key: int(value) for key, value in match.groupdict().items()} + return ( + parts["hours"] * 3600 + + parts["minutes"] * 60 + + parts["seconds"] + + parts["millis"] / 1000 + ) + + +def _parse_srt_subtitles(subtitle_path: str) -> list[tuple[float, float, str]]: + with open(subtitle_path, "r", encoding="utf-8-sig") as file: + content = file.read().strip() + + if not content: + return [] + + subtitles = [] + blocks = re.split(r"\n\s*\n", content) + time_pattern = re.compile( + r"(?P\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*" + r"(?P\d{2}:\d{2}:\d{2},\d{3})" + ) + for block in blocks: + lines = [line.strip("\ufeff") for line in block.splitlines() if line.strip()] + if not lines: + continue + + time_index = next( + (index for index, line in enumerate(lines) if time_pattern.search(line)), + None, + ) + if time_index is None: + continue + + match = time_pattern.search(lines[time_index]) + if not match: + continue + + text = "\n".join(lines[time_index + 1:]).strip() + if not text: + continue + + subtitles.append( + ( + _srt_timestamp_to_seconds(match.group("start")), + _srt_timestamp_to_seconds(match.group("end")), + text, + ) + ) + return subtitles + + +def _normalize_hex_color(color: Optional[str], default: str) -> str: + color_names = { + "white": "#FFFFFF", + "black": "#000000", + "red": "#FF0000", + "green": "#008000", + "blue": "#0000FF", + "yellow": "#FFFF00", + "cyan": "#00FFFF", + "magenta": "#FF00FF", + } + value = (color or default or "").strip() + value = color_names.get(value.lower(), value) + + if not value.startswith("#"): + return default + value = value[1:] + if len(value) == 3: + value = "".join(char * 2 for char in value) + if len(value) != 6: + return default + try: + int(value, 16) + except 
ValueError: + return default + return f"#{value.upper()}" + + +def _css_color_to_ass(color: Optional[str], default: str) -> str: + hex_color = _normalize_hex_color(color, default)[1:] + red = int(hex_color[0:2], 16) + green = int(hex_color[2:4], 16) + blue = int(hex_color[4:6], 16) + return f"&H00{blue:02X}{green:02X}{red:02X}" + + +def _resolve_font_path(subtitle_font: str) -> Optional[str]: + if subtitle_font and os.path.isabs(subtitle_font) and os.path.exists(subtitle_font): + return subtitle_font + + if subtitle_font: + font_path = os.path.join(utils.font_dir(), subtitle_font) + if os.path.exists(font_path): + return font_path + + for candidate in [ + os.path.join(utils.font_dir(), "SourceHanSansCN-Regular.otf"), + os.path.join(utils.font_dir(), "SimHei.ttf"), + "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", + "/System/Library/Fonts/STHeiti Medium.ttc", + "/System/Library/Fonts/Hiragino Sans GB.ttc", + ]: + if os.path.exists(candidate): + return candidate + return None + + +def _resolve_font_family(font_path: Optional[str], subtitle_font: str) -> str: + if font_path: + try: + return ImageFont.truetype(font_path, 12).getname()[0] + except Exception: + pass + if subtitle_font: + return os.path.splitext(os.path.basename(subtitle_font))[0] + return "Arial" + + +def _estimate_subtitle_margin( + video_height: int, + font_size: int, + subtitle_position: str, + custom_position: float, + orientation_subtitle_y_percent: Optional[float], +) -> tuple[int, int]: + if subtitle_position == "top": + return 8, max(10, round(video_height * 0.05)) + if subtitle_position == "center": + return 5, 10 + + y_percent = orientation_subtitle_y_percent + if y_percent is None and subtitle_position == "custom": + y_percent = custom_position + + if y_percent is not None: + estimated_text_height = max(24, round(font_size * 1.35)) + y = (video_height - estimated_text_height) * (y_percent / 100) + margin = video_height - y - 
estimated_text_height + return 2, max(10, round(margin)) + + return 2, max(10, round(video_height * 0.05)) + + +def _build_subtitle_filter( + subtitle_path: str, + font_path: Optional[str], + subtitle_font: str, + subtitle_font_size: int, + subtitle_color: str, + stroke_color: str, + stroke_width: float, + video_width: int, + video_height: int, + subtitle_position: str, + custom_position: float, + orientation_subtitle_y_percent: Optional[float], +) -> str: + font_family = _resolve_font_family(font_path, subtitle_font) + alignment, margin_v = _estimate_subtitle_margin( + video_height=video_height, + font_size=subtitle_font_size, + subtitle_position=subtitle_position, + custom_position=custom_position, + orientation_subtitle_y_percent=orientation_subtitle_y_percent, + ) + force_style = ",".join( + [ + f"Fontname={font_family}", + f"Fontsize={subtitle_font_size}", + f"PrimaryColour={_css_color_to_ass(subtitle_color, '#FFFFFF')}", + f"OutlineColour={_css_color_to_ass(stroke_color, '#000000')}", + "BorderStyle=1", + f"Outline={stroke_width}", + "Shadow=0", + f"Alignment={alignment}", + f"MarginV={margin_v}", + ] + ) + + args = [f"filename={_quote_filter_value(subtitle_path)}"] + args.append(f"original_size={video_width}x{video_height}") + if font_path: + args.append(f"fontsdir={_quote_filter_value(os.path.dirname(font_path))}") + args.append(f"force_style={_quote_filter_value(force_style)}") + return f"subtitles={':'.join(args)}" + + +def _css_color_to_drawtext(color: Optional[str], default: str) -> str: + return f"0x{_normalize_hex_color(color, default)[1:]}" + + +def _escape_drawtext_text(text: str) -> str: + return ( + text.replace("\\", "\\\\") + .replace("%", "\\%") + .replace("\r\n", "\n") + .replace("\r", "\n") + .replace("\n", "\\n") + ) + + +def _resolve_drawtext_y_expression( + subtitle_position: str, + custom_position: float, + orientation_subtitle_y_percent: Optional[float], +) -> str: + if subtitle_position == "top": + return "h*0.05" + if subtitle_position 
== "center": + return "(h-text_h)/2" + + y_percent = orientation_subtitle_y_percent + if y_percent is None and subtitle_position == "custom": + y_percent = custom_position + + if y_percent is not None: + return f"(h-text_h)*{_format_ffmpeg_float(y_percent / 100)}" + return "h*0.95-text_h" + + +def _build_drawtext_filters( + subtitle_path: str, + font_path: Optional[str], + subtitle_font_size: int, + subtitle_color: str, + stroke_color: str, + stroke_width: float, + subtitle_position: str, + custom_position: float, + orientation_subtitle_y_percent: Optional[float], + video_width: int, +) -> list[str]: + subtitles = _parse_srt_subtitles(subtitle_path) + if not subtitles: + raise RuntimeError("SRT 字幕解析结果为空,无法使用 drawtext 快路径") + + y_expr = _resolve_drawtext_y_expression( + subtitle_position=subtitle_position, + custom_position=custom_position, + orientation_subtitle_y_percent=orientation_subtitle_y_percent, + ) + max_width = video_width * 0.9 + drawtext_filters = [] + + for start, end, text in subtitles: + wrapped_text = text + if font_path: + wrapped_text, _ = wrap_text( + text, + max_width=max_width, + font=font_path, + fontsize=subtitle_font_size, + ) + + args = [] + if font_path: + args.append(f"fontfile={_quote_filter_value(font_path)}") + args.extend( + [ + f"text={_quote_filter_value(_escape_drawtext_text(wrapped_text))}", + f"fontcolor={_css_color_to_drawtext(subtitle_color, '#FFFFFF')}", + f"fontsize={subtitle_font_size}", + f"borderw={stroke_width}", + f"bordercolor={_css_color_to_drawtext(stroke_color, '#000000')}", + "x=(w-text_w)/2", + f"y={y_expr}", + ( + "enable=" + f"{_quote_filter_value(f'between(t,{_format_ffmpeg_float(start)},{_format_ffmpeg_float(end)})')}" + ), + ] + ) + drawtext_filters.append(f"drawtext={':'.join(args)}") + + return drawtext_filters + + +def _hex_to_rgba(color: Optional[str], default: str, alpha: int = 255) -> tuple[int, int, int, int]: + hex_color = _normalize_hex_color(color, default)[1:] + return ( + int(hex_color[0:2], 16), + 
int(hex_color[2:4], 16), + int(hex_color[4:6], 16), + alpha, + ) + + +def _create_subtitle_png_file( + text: str, + font_path: Optional[str], + subtitle_font_size: int, + subtitle_color: str, + stroke_color: str, + stroke_width: float, + video_width: int, + output_dir: str, +) -> str: + font = ImageFont.truetype(font_path, subtitle_font_size) if font_path else ImageFont.load_default() + wrapped_text, _ = wrap_text( + text, + max_width=video_width * 0.9, + font=font_path or "Arial", + fontsize=subtitle_font_size, + ) + stroke_width_px = max(0, int(round(float(stroke_width)))) + padding = max(8, stroke_width_px * 3 + 6) + + probe = Image.new("RGBA", (1, 1), (0, 0, 0, 0)) + draw = ImageDraw.Draw(probe) + bbox = draw.multiline_textbbox( + (0, 0), + wrapped_text, + font=font, + spacing=4, + stroke_width=stroke_width_px, + align="center", + ) + text_width = max(1, bbox[2] - bbox[0]) + text_height = max(1, bbox[3] - bbox[1]) + image = Image.new( + "RGBA", + (text_width + padding * 2, text_height + padding * 2), + (0, 0, 0, 0), + ) + draw = ImageDraw.Draw(image) + draw.multiline_text( + (image.width / 2, padding - bbox[1]), + wrapped_text, + font=font, + fill=_hex_to_rgba(subtitle_color, "#FFFFFF"), + anchor="ma", + spacing=4, + align="center", + stroke_width=stroke_width_px, + stroke_fill=_hex_to_rgba(stroke_color, "#000000"), + ) + + temp_file = tempfile.NamedTemporaryFile( + suffix=".png", + prefix="subtitle_text_", + dir=output_dir, + delete=False, + ) + temp_file.close() + image.save(temp_file.name) + return temp_file.name + + +def _resolve_overlay_y_expression( + subtitle_position: str, + custom_position: float, + orientation_subtitle_y_percent: Optional[float], +) -> str: + if subtitle_position == "top": + return "main_h*0.05" + if subtitle_position == "center": + return "(main_h-overlay_h)/2" + + y_percent = orientation_subtitle_y_percent + if y_percent is None and subtitle_position == "custom": + y_percent = custom_position + + if y_percent is not None: + return 
f"(main_h-overlay_h)*{_format_ffmpeg_float(y_percent / 100)}" + return "main_h*0.95-overlay_h" + + +def _create_subtitle_mask_alpha_file(region: Dict[str, Any], output_dir: str) -> str: + alpha = _build_subtitle_mask_alpha(region) + temp_file = tempfile.NamedTemporaryFile( + suffix=".png", + prefix="subtitle_mask_", + dir=output_dir, + delete=False, + ) + temp_file.close() + alpha.save(temp_file.name) + return temp_file.name + + +def _build_mask_filter( + input_label: str, + mask_input_index: int, + region: Dict[str, Any], + output_label: str, +) -> list[str]: + blur_sigma = ( + max(4, round(region["blur_radius"] * (0.9 + region["opacity"] * 0.35))) + if region["blur_radius"] > 0 + else 0 + ) + brightness = 1.0 + 0.03 + region["opacity"] * 0.04 + contrast = 0.975 - region["opacity"] * 0.035 + saturation = 1.0 + region["opacity"] * 0.03 + obliterate_width = max(24, round(region["padded_width"] * 0.12)) + obliterate_height = max(12, round(region["padded_height"] * 0.18)) + + blur_chain = ( + f"[masksrc]crop={region['padded_width']}:{region['padded_height']}:" + f"{region['padded_x']}:{region['padded_y']}," + f"scale={obliterate_width}:{obliterate_height}:flags=bicubic," + f"scale={region['padded_width']}:{region['padded_height']}:flags=lanczos" + ) + if blur_sigma > 0: + blur_chain += f",gblur=sigma={blur_sigma}" + blur_chain += ( + ",boxblur=4," + f"eq=brightness={brightness - 1.0:.3f}:" + f"contrast={contrast:.3f}:saturation={saturation:.3f}," + "format=rgba[maskblur]" + ) + + return [ + f"{input_label}split[maskbase][masksrc]", + blur_chain, + ( + f"[{mask_input_index}:v]format=gray," + f"scale={region['padded_width']}:{region['padded_height']}[maskalpha]" + ), + "[maskblur][maskalpha]alphamerge[masked]", + ( + f"[maskbase][masked]overlay={region['padded_x']}:{region['padded_y']}:" + f"format=auto{output_label}" + ), + ] + + +def _build_video_encoder_args(encoder: str, threads: int) -> list[str]: + if encoder == "h264_vaapi": + logger.warning("当前合成滤镜链暂不使用 VAAPI 
编码,回退到 libx264") + encoder = "libx264" + + args = ["-c:v", encoder] + if encoder == "h264_nvenc": + args.extend(["-preset", "fast", "-cq", "23"]) + elif encoder == "h264_videotoolbox": + args.extend(["-q:v", "65"]) + elif encoder == "h264_qsv": + args.extend(["-preset", "veryfast", "-global_quality", "23"]) + elif encoder == "h264_amf": + args.extend(["-quality", "speed", "-qp_i", "23", "-qp_p", "23"]) + else: + args.extend(["-preset", "veryfast", "-crf", "23", "-threads", str(threads)]) + return args + + +def _build_moviepy_encoder_options() -> tuple[str, list[str]]: + from app.utils import ffmpeg_utils + + encoder = _select_compatible_encoder(ffmpeg_utils.get_optimal_ffmpeg_encoder()) + if encoder == "h264_vaapi": + logger.warning("MoviePy 兼容路径暂不使用 VAAPI 编码,回退到 libx264") + encoder = "libx264" + + if encoder == "h264_nvenc": + return encoder, ["-preset", "fast", "-cq", "23", "-pix_fmt", "yuv420p"] + if encoder == "h264_videotoolbox": + return encoder, ["-q:v", "65", "-pix_fmt", "yuv420p"] + if encoder == "h264_qsv": + return encoder, ["-preset", "veryfast", "-global_quality", "23", "-pix_fmt", "yuv420p"] + if encoder == "h264_amf": + return encoder, ["-quality", "speed", "-qp_i", "23", "-qp_p", "23", "-pix_fmt", "yuv420p"] + return "libx264", ["-preset", "veryfast", "-crf", "23", "-pix_fmt", "yuv420p"] + + +def _build_ffmpeg_merge_command( + video_path: str, + audio_path: str, + output_path: str, + subtitle_path: Optional[str], + bgm_path: Optional[str], + options: Dict[str, Any], +) -> tuple[list[str], list[str]]: + from app.utils import ffmpeg_utils + + video_meta = _probe_video(video_path) + output_dir = os.path.dirname(output_path) + duration = float(video_meta["duration"]) + duration_arg = _format_ffmpeg_float(duration) + video_width = int(video_meta["width"]) + video_height = int(video_meta["height"]) + + voice_volume = options.get("voice_volume", AudioVolumeDefaults.VOICE_VOLUME) + bgm_volume = options.get("bgm_volume", AudioVolumeDefaults.BGM_VOLUME) + 
original_audio_volume = options.get("original_audio_volume", AudioVolumeDefaults.ORIGINAL_VOLUME) + keep_original_audio = options.get("keep_original_audio", True) + subtitle_font = options.get("subtitle_font", "") + subtitle_font_size = int(options.get("subtitle_font_size", 40)) + subtitle_color = options.get("subtitle_color", "#FFFFFF") + subtitle_position = options.get("subtitle_position", "bottom") + custom_position = float(options.get("custom_position", 70)) + stroke_color = options.get("stroke_color", "#000000") + stroke_width = options.get("stroke_width", 1) + threads = int(options.get("threads", 2)) + fps = options.get("fps", 30) + subtitle_enabled = options.get("subtitle_enabled", True) + subtitle_mask_enabled = bool(options.get("subtitle_mask_enabled", False)) + + input_args = ["-i", video_path] + next_input_index = 1 + audio_filters = [] + audio_labels = [] + temp_files = [] + + if keep_original_audio and original_audio_volume > 0 and video_meta["has_audio"]: + label = f"a{len(audio_labels)}" + audio_filters.append( + f"[0:a]volume={original_audio_volume},atrim=0:{duration_arg}," + f"asetpts=PTS-STARTPTS[{label}]" + ) + audio_labels.append(f"[{label}]") + + if _has_existing_file(audio_path): + voice_input_index = next_input_index + next_input_index += 1 + input_args.extend(["-i", audio_path]) + label = f"a{len(audio_labels)}" + audio_filters.append( + f"[{voice_input_index}:a]volume={voice_volume},atrim=0:{duration_arg}," + f"asetpts=PTS-STARTPTS[{label}]" + ) + audio_labels.append(f"[{label}]") + + if _has_existing_file(bgm_path) and bgm_volume > 0: + bgm_input_index = next_input_index + next_input_index += 1 + input_args.extend(["-stream_loop", "-1", "-i", bgm_path]) + fade_start = max(0.0, duration - 3.0) + label = f"a{len(audio_labels)}" + audio_filters.append( + f"[{bgm_input_index}:a]volume={bgm_volume},atrim=0:{duration_arg}," + f"afade=t=out:st={_format_ffmpeg_float(fade_start)}:d=3," + f"asetpts=PTS-STARTPTS[{label}]" + ) + 
audio_labels.append(f"[{label}]") + + if len(audio_labels) == 1: + audio_filters.append( + f"{audio_labels[0]}atrim=0:{duration_arg},asetpts=PTS-STARTPTS[aout]" + ) + elif len(audio_labels) > 1: + audio_filters.append( + f"{''.join(audio_labels)}amix=inputs={len(audio_labels)}:" + f"duration=longest:dropout_transition=0:normalize=0," + f"atrim=0:{duration_arg},asetpts=PTS-STARTPTS[aout]" + ) + + valid_subtitle = bool( + subtitle_enabled + and subtitle_path + and is_valid_subtitle_file(subtitle_path) + ) + has_subtitles_filter = _ffmpeg_filter_available("subtitles") if valid_subtitle else False + has_drawtext_filter = _ffmpeg_filter_available("drawtext") if valid_subtitle else False + if valid_subtitle and not has_subtitles_filter and not has_drawtext_filter: + if not _ffmpeg_filter_available("overlay"): + raise RuntimeError("当前 ffmpeg 缺少 subtitles/drawtext/overlay 字幕处理滤镜") + logger.warning("当前 ffmpeg 缺少 subtitles/drawtext,改用 PNG 字幕叠加快路径") + + video_filters = [] + current_video_label = "[0:v]" + + if subtitle_enabled and subtitle_mask_enabled: + region = _resolve_subtitle_mask_region(video_width, video_height, options) + mask_path = _create_subtitle_mask_alpha_file(region, output_dir) + temp_files.append(mask_path) + mask_input_index = next_input_index + next_input_index += 1 + input_args.extend(["-loop", "1", "-t", duration_arg, "-i", mask_path]) + logger.info( + "ffmpeg 字幕遮罩已启用: " + f"{region['orientation']} x={region['x']} y={region['y']} " + f"w={region['width']} h={region['height']} blur={region['blur_radius']}" + ) + video_filters.extend( + _build_mask_filter( + input_label=current_video_label, + mask_input_index=mask_input_index, + region=region, + output_label="[v_masked]", + ) + ) + current_video_label = "[v_masked]" + + if valid_subtitle: + font_path = _resolve_font_path(subtitle_font) + if font_path: + logger.info(f"ffmpeg 使用字幕字体: {font_path}") + orientation_subtitle_y_percent = _resolve_orientation_subtitle_y_percent( + video_width, + video_height, + 
options, + ) + if has_drawtext_filter: + drawtext_filters = _build_drawtext_filters( + subtitle_path=subtitle_path, + font_path=font_path, + subtitle_font_size=subtitle_font_size, + subtitle_color=subtitle_color, + stroke_color=stroke_color, + stroke_width=stroke_width, + subtitle_position=subtitle_position, + custom_position=custom_position, + orientation_subtitle_y_percent=orientation_subtitle_y_percent, + video_width=video_width, + ) + for index, drawtext_filter in enumerate(drawtext_filters): + next_label = f"[v_drawtext_{index}]" + video_filters.append(f"{current_video_label}{drawtext_filter}{next_label}") + current_video_label = next_label + elif has_subtitles_filter: + subtitle_filter = _build_subtitle_filter( + subtitle_path=subtitle_path, + font_path=font_path, + subtitle_font=subtitle_font, + subtitle_font_size=subtitle_font_size, + subtitle_color=subtitle_color, + stroke_color=stroke_color, + stroke_width=stroke_width, + video_width=video_width, + video_height=video_height, + subtitle_position=subtitle_position, + custom_position=custom_position, + orientation_subtitle_y_percent=orientation_subtitle_y_percent, + ) + video_filters.append(f"{current_video_label}{subtitle_filter}[v_subtitled]") + current_video_label = "[v_subtitled]" + else: + y_expr = _resolve_overlay_y_expression( + subtitle_position=subtitle_position, + custom_position=custom_position, + orientation_subtitle_y_percent=orientation_subtitle_y_percent, + ) + for index, (start, end, text) in enumerate(_parse_srt_subtitles(subtitle_path)): + png_path = _create_subtitle_png_file( + text=text, + font_path=font_path, + subtitle_font_size=subtitle_font_size, + subtitle_color=subtitle_color, + stroke_color=stroke_color, + stroke_width=stroke_width, + video_width=video_width, + output_dir=output_dir, + ) + temp_files.append(png_path) + subtitle_input_index = next_input_index + next_input_index += 1 + input_args.extend(["-loop", "1", "-t", duration_arg, "-i", png_path]) + next_label = 
f"[v_subtitle_png_{index}]" + enable_expr = ( + f"between(t,{_format_ffmpeg_float(start)},{_format_ffmpeg_float(end)})" + ) + video_filters.append( + f"{current_video_label}[{subtitle_input_index}:v]" + f"overlay=x=(main_w-overlay_w)/2:y={y_expr}:" + f"enable={_quote_filter_value(enable_expr)}:format=auto{next_label}" + ) + current_video_label = next_label + elif subtitle_enabled and subtitle_path: + logger.warning(f"字幕文件无效或为空: {subtitle_path},ffmpeg 快路径跳过字幕") + + has_video_filter = bool(video_filters) + if has_video_filter: + final_video_filters = [] + if fps: + final_video_filters.append(f"fps={fps}") + final_video_filters.append("format=yuv420p") + video_filters.append( + f"{current_video_label}{','.join(final_video_filters)}[vout]" + ) + + filter_parts = [*video_filters, *audio_filters] + ffmpeg_binary = _get_ffmpeg_binary() + cmd = [ffmpeg_binary, "-y", "-hide_banner", "-loglevel", "error", *input_args] + if filter_parts: + cmd.extend(["-filter_complex", ";".join(filter_parts)]) + + if has_video_filter: + encoder = _select_compatible_encoder(ffmpeg_utils.get_optimal_ffmpeg_encoder()) + cmd.extend(["-map", "[vout]", *_build_video_encoder_args(encoder, threads)]) + else: + cmd.extend(["-map", "0:v:0", "-c:v", "copy"]) + + if audio_labels: + cmd.extend(["-map", "[aout]", "-c:a", "aac", "-b:a", "192k"]) + else: + cmd.append("-an") + + cmd.extend(["-t", duration_arg, "-movflags", "+faststart", output_path]) + return cmd, temp_files + + +def _merge_materials_with_ffmpeg( + video_path: str, + audio_path: str, + output_path: str, + subtitle_path: Optional[str] = None, + bgm_path: Optional[str] = None, + options: Optional[Dict[str, Any]] = None, +) -> bool: + ffmpeg_binary = _get_ffmpeg_binary() + if not _check_ffmpeg_binary(ffmpeg_binary): + return False + + options = options or {} + temp_files = [] + try: + cmd, temp_files = _build_ffmpeg_merge_command( + video_path=video_path, + audio_path=audio_path, + output_path=output_path, + subtitle_path=subtitle_path, + 
bgm_path=bgm_path, + options=options, + ) + logger.info(f"使用 ffmpeg 快速合并素材: {shlex.join(cmd)}") + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + if result.returncode != 0: + logger.warning(f"ffmpeg 快速合并失败,将回退 MoviePy: {result.stderr[-3000:]}") + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + return False + + logger.success(f"ffmpeg 素材合并完成: {output_path}") + return True + except Exception as e: + logger.warning(f"ffmpeg 快速合并不可用,将回退 MoviePy: {e}") + return False + finally: + for temp_file in temp_files: + try: + if os.path.exists(temp_file): + os.remove(temp_file) + except OSError: + pass + + def merge_materials( video_path: str, audio_path: str, @@ -364,6 +1289,41 @@ def merge_materials( if bgm_path: logger.info(f" ④ 背景音乐: {bgm_path}") logger.info(f" ⑤ 输出: {output_path}") + + merge_engine = str(options.get("merge_engine", "ffmpeg")).lower() + use_ffmpeg_merge = bool(options.get("use_ffmpeg_merge", True)) + if use_ffmpeg_merge and merge_engine != "moviepy": + ffmpeg_options = dict(options) + ffmpeg_options.update( + { + "voice_volume": voice_volume, + "bgm_volume": bgm_volume, + "original_audio_volume": original_audio_volume, + "keep_original_audio": keep_original_audio, + "subtitle_font": subtitle_font, + "subtitle_font_size": subtitle_font_size, + "subtitle_color": subtitle_color, + "subtitle_bg_color": subtitle_bg_color, + "subtitle_position": subtitle_position, + "custom_position": custom_position, + "stroke_color": stroke_color, + "stroke_width": stroke_width, + "threads": threads, + "fps": fps, + "subtitle_enabled": subtitle_enabled, + "subtitle_mask_enabled": subtitle_mask_enabled, + } + ) + if _merge_materials_with_ffmpeg( + video_path=video_path, + audio_path=audio_path, + output_path=output_path, + subtitle_path=subtitle_path, + bgm_path=bgm_path, + options=ffmpeg_options, + ): + return output_path + logger.warning("ffmpeg 快速合并失败,继续使用 MoviePy 
兼容路径") # 加载视频 try: @@ -406,7 +1366,7 @@ def merge_materials( temp_original_path = os.path.join(temp_dir, "temp_original.wav") # 保存原声到临时文件进行分析 - original_audio.write_audiofile(temp_original_path, verbose=False, logger=None) + original_audio.write_audiofile(temp_original_path, logger=None) # 计算智能音量调整 tts_adjustment, original_adjustment = normalizer.calculate_volume_adjustment( @@ -475,9 +1435,8 @@ def merge_materials( logger.warning("没有可用的音频轨道,输出视频将没有声音") # 处理字体路径 - font_path = None - if subtitle_path and subtitle_font: - font_path = os.path.join(utils.font_dir(), subtitle_font) + font_path = _resolve_font_path(subtitle_font) if subtitle_path else None + if font_path: if os.name == "nt": font_path = font_path.replace("\\", "/") logger.info(f"使用字体: {font_path}") @@ -508,24 +1467,28 @@ def merge_materials( # 创建文本片段 try: - _clip = TextClip( - text=wrapped_txt, - font=font_path, - font_size=subtitle_font_size, - color=subtitle_color, - bg_color=subtitle_bg_color, # 这里已经在前面处理过,None表示透明 - stroke_color=stroke_color, - stroke_width=stroke_width, - ) + text_clip_kwargs = { + "text": wrapped_txt, + "font_size": subtitle_font_size, + "color": subtitle_color, + "bg_color": subtitle_bg_color, # 这里已经在前面处理过,None表示透明 + "stroke_color": stroke_color, + "stroke_width": stroke_width, + } + if font_path: + text_clip_kwargs["font"] = font_path + _clip = TextClip(**text_clip_kwargs) except Exception as e: logger.error(f"创建字幕片段失败: {str(e)}, 使用简化参数重试") # 如果上面的方法失败,尝试使用更简单的参数 - _clip = TextClip( - text=wrapped_txt, - font=font_path, - font_size=subtitle_font_size, - color=subtitle_color, - ) + fallback_kwargs = { + "text": wrapped_txt, + "font_size": subtitle_font_size, + "color": subtitle_color, + } + if font_path: + fallback_kwargs["font"] = font_path + _clip = TextClip(**fallback_kwargs) # 设置字幕时间 duration = subtitle_item[0][1] - subtitle_item[0][0] @@ -561,12 +1524,14 @@ def merge_materials( # 创建TextClip工厂函数 def make_textclip(text): - return TextClip( - text=text, - font=font_path, - 
font_size=subtitle_font_size, - color=subtitle_color, - ) + text_clip_kwargs = { + "text": text, + "font_size": subtitle_font_size, + "color": subtitle_color, + } + if font_path: + text_clip_kwargs["font"] = font_path + return TextClip(**text_clip_kwargs) # 处理字幕 - 修复字幕开关bug和空字幕文件问题 if subtitle_enabled and subtitle_path: @@ -601,13 +1566,31 @@ def merge_materials( # 导出最终视频 try: - video_clip.write_videofile( - output_path, - audio_codec="aac", - temp_audiofile_path=output_dir, - threads=threads, - fps=fps, - ) + encoder, ffmpeg_params = _build_moviepy_encoder_options() + logger.info(f"MoviePy 导出编码器: {encoder}, 参数: {ffmpeg_params}") + try: + video_clip.write_videofile( + output_path, + codec=encoder, + audio_codec="aac", + temp_audiofile_path=output_dir, + threads=threads, + fps=fps, + ffmpeg_params=ffmpeg_params, + ) + except Exception: + if encoder == "libx264": + raise + logger.warning(f"MoviePy 使用 {encoder} 导出失败,回退 libx264: {traceback.format_exc()}") + video_clip.write_videofile( + output_path, + codec="libx264", + audio_codec="aac", + temp_audiofile_path=output_dir, + threads=threads, + fps=fps, + ffmpeg_params=["-preset", "veryfast", "-crf", "23", "-pix_fmt", "yuv420p"], + ) logger.success(f"素材合并完成: {output_path}") except Exception as e: logger.error(f"导出视频失败: {str(e)}") diff --git a/app/services/merger_video.py b/app/services/merger_video.py index c6ef84d..ddb9f64 100644 --- a/app/services/merger_video.py +++ b/app/services/merger_video.py @@ -9,6 +9,7 @@ ''' import os +import json import shutil import subprocess from enum import Enum @@ -127,6 +128,188 @@ def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> return concat_file_path +def _get_video_stream_signature(video_path: str) -> Optional[dict]: + """ + 获取用于判断 concat copy 是否安全的视频流关键参数。 + """ + probe_cmd = [ + 'ffprobe', '-v', 'error', + '-select_streams', 'v:0', + '-show_entries', + 
'stream=codec_name,profile,width,height,pix_fmt,r_frame_rate,avg_frame_rate,time_base,sample_aspect_ratio', + '-of', 'json', + video_path + ] + + try: + result = subprocess.run( + probe_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True + ) + streams = json.loads(result.stdout or "{}").get("streams", []) + if not streams: + logger.warning(f"视频没有可用的视频流,不能使用 copy 合并: {video_path}") + return None + + stream = streams[0] + return { + "codec_name": stream.get("codec_name"), + "profile": stream.get("profile"), + "width": stream.get("width"), + "height": stream.get("height"), + "pix_fmt": stream.get("pix_fmt"), + "r_frame_rate": stream.get("r_frame_rate"), + "avg_frame_rate": stream.get("avg_frame_rate"), + "time_base": stream.get("time_base"), + "sample_aspect_ratio": stream.get("sample_aspect_ratio", "1:1"), + } + except Exception as e: + logger.warning(f"探测视频流参数失败,不能使用 copy 合并: {video_path}, 错误: {str(e)}") + return None + + +def _can_concat_video_copy(video_paths: List[str]) -> bool: + """ + 判断所有片段的视频流参数是否一致,避免 concat copy 造成时间轴或封装异常。 + """ + if not video_paths: + return False + + signatures = [] + for video_path in video_paths: + signature = _get_video_stream_signature(video_path) + if not signature: + return False + signatures.append(signature) + + base_signature = signatures[0] + for video_path, signature in zip(video_paths[1:], signatures[1:]): + if signature != base_signature: + logger.warning( + "视频片段参数不一致,跳过 copy 合并并回退重编码: " + f"{video_path}, 基准={base_signature}, 当前={signature}" + ) + return False + + return True + + +def _get_media_duration(video_path: str) -> Optional[float]: + probe_cmd = [ + 'ffprobe', '-v', 'error', + '-show_entries', 'format=duration', + '-of', 'csv=p=0', + video_path + ] + + try: + result = subprocess.run( + probe_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True + ) + return float(result.stdout.strip()) + except Exception as e: + logger.warning(f"探测视频时长失败: {video_path}, 错误: 
{str(e)}") + return None + + +def _concat_duration_matches(video_paths: List[str], output_path: str) -> bool: + input_durations = [] + for video_path in video_paths: + duration = _get_media_duration(video_path) + if duration is None: + return False + input_durations.append(duration) + + output_duration = _get_media_duration(output_path) + if output_duration is None: + return False + + expected_duration = sum(input_durations) + diff = abs(expected_duration - output_duration) + tolerance = max(0.5, len(video_paths) * 0.04) + if diff > tolerance: + logger.warning( + "视频流 copy 合并后的时长偏差过大,将回退重编码: " + f"期望={expected_duration:.3f}s, 实际={output_duration:.3f}s, 偏差={diff:.3f}s" + ) + return False + + logger.info( + "视频流 copy 合并时长校验通过: " + f"期望={expected_duration:.3f}s, 实际={output_duration:.3f}s" + ) + return True + + +def _build_concat_video_copy_cmd(concat_file: str, output_path: str) -> List[str]: + return [ + 'ffmpeg', '-y', + '-f', 'concat', + '-safe', '0', + '-i', concat_file, + '-c:v', 'copy', + '-an', + '-movflags', '+faststart', + '-avoid_negative_ts', 'make_zero', + output_path + ] + + +def _build_concat_video_reencode_cmd(concat_file: str, output_path: str, threads: int) -> List[str]: + return [ + 'ffmpeg', '-y', + '-f', 'concat', + '-safe', '0', + '-i', concat_file, + '-c:v', 'libx264', + '-preset', 'medium', + '-profile:v', 'high', + '-an', + '-threads', str(threads), + output_path + ] + + +def _concat_video_streams( + video_paths: List[str], + concat_file: str, + output_path: str, + threads: int +) -> None: + """ + 优先使用无损 copy 合并视频流,失败时回退到原来的重编码合并。 + """ + if _can_concat_video_copy(video_paths): + copy_cmd = _build_concat_video_copy_cmd(concat_file, output_path) + try: + subprocess.run(copy_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if _concat_duration_matches(video_paths, output_path): + logger.info("视频流 copy 合并完成") + return + + if os.path.exists(output_path): + try: + os.remove(output_path) + except OSError as e: + logger.warning(f"删除 
copy 合并临时结果失败,将继续尝试重编码覆盖: {str(e)}") + except subprocess.CalledProcessError as e: + error_msg = e.stderr.decode() if e.stderr else str(e) + logger.warning(f"视频流 copy 合并失败,将回退重编码合并: {error_msg}") + else: + logger.info("视频流不满足 copy 合并条件,将使用重编码合并") + + reencode_cmd = _build_concat_video_reencode_cmd(concat_file, output_path, threads) + subprocess.run(reencode_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("视频流重编码合并完成") + + def process_single_video( input_path: str, output_path: str, @@ -474,22 +657,7 @@ def combine_clip_videos( concat_file = os.path.join(temp_dir, "concat_list.txt") create_ffmpeg_concat_file(video_paths_only, concat_file) - # 合并所有视频流,但不包含音频 - concat_cmd = [ - 'ffmpeg', '-y', - '-f', 'concat', - '-safe', '0', - '-i', concat_file, - '-c:v', 'libx264', - '-preset', 'medium', - '-profile:v', 'high', - '-an', # 不包含音频 - '-threads', str(threads), - video_concat_path - ] - - subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - logger.info("视频流合并完成") + _concat_video_streams(video_paths_only, concat_file, video_concat_path, threads) # 2. 
提取并合并有音频的片段 audio_segments = [video for video in processed_videos if video["keep_audio"]] diff --git a/app/services/script_subtitle.py b/app/services/script_subtitle.py index 2259580..0be21f6 100644 --- a/app/services/script_subtitle.py +++ b/app/services/script_subtitle.py @@ -5,12 +5,16 @@ from typing import Iterable, List, Optional, Sequence, Tuple from loguru import logger +from app.services.short_drama_narration_validation import build_subtitle_index +from app.services.subtitle_text import read_subtitle_text from app.utils import utils DEFAULT_SUBTITLE_OST_TYPES = (0, 2) +DEFAULT_ORIGINAL_SUBTITLE_OST_TYPES = (1,) DEFAULT_MAX_CHARS_PER_SUBTITLE = 12 SENTENCE_PART_RE = re.compile(r"[^。!?!?;;,,、\n]+[。!?!?;;,,、]?") +SubtitleEntry = Tuple[float, float, str] def _normalize_text(text: str) -> str: @@ -116,7 +120,95 @@ def _safe_ost_value(value) -> Optional[int]: return None +def _coerce_positive_int(value) -> Optional[int]: + try: + number = int(value) + except (TypeError, ValueError): + return None + return number if number > 0 else None + + +def _normalize_paths(paths) -> List[str]: + if isinstance(paths, str): + paths = [paths] + if not paths: + return [] + + normalized_paths = [] + seen = set() + for item in paths: + if not isinstance(item, str): + continue + item = item.strip() + if not item or item in seen: + continue + normalized_paths.append(item) + seen.add(item) + return normalized_paths + + +def _resolve_script_video_id(item: dict, video_origin_paths: Sequence[str]) -> int: + video_id = _coerce_positive_int(item.get("video_id") or item.get("video_index")) + if video_id is not None: + return video_id + + video_name = os.path.basename( + str(item.get("video_name") or item.get("source_video") or "").strip() + ) + if video_name: + for index, video_path in enumerate(video_origin_paths, start=1): + if os.path.basename(video_path) == video_name: + return index + + return 1 + + +def _read_subtitle_file(subtitle_path: str) -> str: + try: + return 
read_subtitle_text(subtitle_path).text + except Exception as e: + logger.warning(f"读取原片字幕失败: {subtitle_path}, {e}") + return "" + + +def _build_combined_original_subtitle_content( + original_subtitle_paths, + video_origin_paths=None, +) -> str: + subtitle_paths = _normalize_paths(original_subtitle_paths) + video_paths = _normalize_paths(video_origin_paths) + sections = [] + + for index, subtitle_path in enumerate(subtitle_paths, start=1): + if not os.path.exists(subtitle_path): + logger.warning(f"原片字幕文件不存在,跳过: {subtitle_path}") + continue + + content = _read_subtitle_file(subtitle_path) + if not content: + logger.warning(f"原片字幕文件为空,跳过: {subtitle_path}") + continue + + video_path = video_paths[index - 1] if index <= len(video_paths) else "" + if video_path: + header = ( + f"# 视频 {index}: {os.path.basename(video_path)}\n" + f"字幕文件: {os.path.basename(subtitle_path)}" + ) + else: + header = f"# 视频 {index}\n字幕文件: {os.path.basename(subtitle_path)}" + sections.append(f"{header}\n{content}".strip()) + + return "\n\n".join(sections) + + def _resolve_item_time_range(item: dict, current_time: float) -> Tuple[Optional[Tuple[float, float]], float]: + duration = float(item.get("duration", 0.0) or 0.0) + if duration > 0: + start = current_time + end = current_time + duration + return (start, end), end + edited_time_range = item.get("editedTimeRange") if edited_time_range: try: @@ -125,23 +217,16 @@ def _resolve_item_time_range(item: dict, current_time: float) -> Tuple[Optional[ except ValueError as e: logger.warning(f"解析 editedTimeRange 失败,将尝试使用 duration: {e}") - duration = float(item.get("duration", 0.0) or 0.0) - if duration <= 0: - return None, current_time - - start = current_time - end = current_time + duration - return (start, end), end + return None, current_time -def _build_srt_blocks( +def _build_narration_subtitle_entries( list_script: Sequence[dict], include_ost: Iterable[int], max_chars: int, -) -> List[str]: +) -> List[SubtitleEntry]: include_ost_set = {int(item) for 
item in include_ost} - blocks = [] - subtitle_index = 1 + entries: List[SubtitleEntry] = [] current_time = 0.0 for item in list_script: @@ -166,35 +251,158 @@ def _build_srt_blocks( for chunk_index, chunk in enumerate(chunks): chunk_start = start + chunk_duration * chunk_index chunk_end = end if chunk_index == len(chunks) - 1 else start + chunk_duration * (chunk_index + 1) - blocks.append( - "\n".join( - [ - str(subtitle_index), - f"{format_srt_time(chunk_start)} --> {format_srt_time(chunk_end)}", - chunk, - ] - ) + entries.append((chunk_start, chunk_end, chunk)) + + return entries + + +def _build_original_subtitle_entries( + list_script: Sequence[dict], + original_subtitle_paths=None, + video_origin_paths=None, + include_ost: Iterable[int] = DEFAULT_ORIGINAL_SUBTITLE_OST_TYPES, +) -> List[SubtitleEntry]: + original_subtitle_content = _build_combined_original_subtitle_content( + original_subtitle_paths, + video_origin_paths, + ) + if not original_subtitle_content: + return [] + + video_paths = _normalize_paths(video_origin_paths) + subtitle_index = build_subtitle_index(original_subtitle_content, video_paths) + if not subtitle_index: + logger.warning("原片字幕索引为空,无法为原声片段生成字幕") + return [] + + cues_by_video = {} + for cue in subtitle_index: + cues_by_video.setdefault(cue.video_id, []).append(cue) + + include_ost_set = {int(item) for item in include_ost} + entries: List[SubtitleEntry] = [] + current_time = 0.0 + + for item in list_script: + time_range, current_time = _resolve_item_time_range(item, current_time) + if not time_range: + continue + + ost = _safe_ost_value(item.get("OST")) + if ost not in include_ost_set: + continue + + source_time_range = item.get("sourceTimeRange") or item.get("timestamp") + try: + source_start, source_end = parse_time_range(source_time_range) + except ValueError as e: + logger.warning(f"解析原声片段源时间失败,跳过原片字幕: {e}") + continue + + target_start, target_end = time_range + source_duration = source_end - source_start + target_duration = target_end 
- target_start + if source_duration <= 0 or target_duration <= 0: + continue + + video_id = _resolve_script_video_id(item, video_paths) + video_cues = cues_by_video.get(video_id, []) + if not video_cues: + logger.warning(f"视频 {video_id} 未找到可用原片字幕,片段 {item.get('_id')} 跳过") + continue + + for cue in video_cues: + cue_start = cue.start_ms / 1000 + cue_end = cue.end_ms / 1000 + overlap_start = max(source_start, cue_start) + overlap_end = min(source_end, cue_end) + if overlap_end <= overlap_start: + continue + + text = clean_subtitle_text(cue.text) + if not text: + continue + + mapped_start = target_start + (overlap_start - source_start) + mapped_end = target_start + (overlap_end - source_start) + mapped_start = max(target_start, min(mapped_start, target_end)) + mapped_end = max(target_start, min(mapped_end, target_end)) + if mapped_end <= mapped_start: + continue + + entries.append((mapped_start, mapped_end, text)) + + return entries + + +def _subtitle_entries_to_blocks(entries: Sequence[SubtitleEntry]) -> List[str]: + blocks = [] + sorted_entries = sorted( + entries, + key=lambda entry: (entry[0], entry[1], entry[2]), + ) + + for subtitle_index, (start, end, text) in enumerate(sorted_entries, start=1): + blocks.append( + "\n".join( + [ + str(subtitle_index), + f"{format_srt_time(start)} --> {format_srt_time(end)}", + text, + ] ) - subtitle_index += 1 + ) return blocks +def _build_srt_blocks( + list_script: Sequence[dict], + include_ost: Iterable[int], + max_chars: int, +) -> List[str]: + entries = _build_narration_subtitle_entries( + list_script, + include_ost=include_ost, + max_chars=max_chars, + ) + return _subtitle_entries_to_blocks(entries) + + def create_script_subtitle_file( task_id: str, list_script: Sequence[dict], output_file: Optional[str] = None, include_ost: Optional[Iterable[int]] = None, max_chars: int = DEFAULT_MAX_CHARS_PER_SUBTITLE, + original_subtitle_paths=None, + video_origin_paths=None, + include_original_ost: Optional[Iterable[int]] = None, ) -> 
str: - """Create a full SRT file from script narration and edited timeline ranges.""" + """Create a full SRT file from script narration plus original-audio subtitles.""" if not list_script: return "" if include_ost is None: include_ost = DEFAULT_SUBTITLE_OST_TYPES + if include_original_ost is None: + include_original_ost = DEFAULT_ORIGINAL_SUBTITLE_OST_TYPES - blocks = _build_srt_blocks(list_script, include_ost=include_ost, max_chars=max_chars) + entries = _build_narration_subtitle_entries( + list_script, + include_ost=include_ost, + max_chars=max_chars, + ) + entries.extend( + _build_original_subtitle_entries( + list_script, + original_subtitle_paths=original_subtitle_paths, + video_origin_paths=video_origin_paths, + include_ost=include_original_ost, + ) + ) + + blocks = _subtitle_entries_to_blocks(entries) if not blocks: logger.warning("程序化字幕未生成内容") return "" diff --git a/app/services/task.py b/app/services/task.py index f5f60ce..d7aa1c9 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -38,6 +38,125 @@ def _get_auto_transcription_backend(params: VideoClipParams) -> str: return backend +def _get_original_subtitle_paths(params: VideoClipParams) -> list[str]: + subtitle_paths = getattr(params, "original_subtitle_paths", []) or [] + if isinstance(subtitle_paths, str): + subtitle_paths = [subtitle_paths] + + normalized_paths = [] + seen = set() + for subtitle_path in subtitle_paths: + if not isinstance(subtitle_path, str): + continue + subtitle_path = subtitle_path.strip() + if subtitle_path and subtitle_path not in seen: + normalized_paths.append(subtitle_path) + seen.add(subtitle_path) + + single_subtitle_path = str(getattr(params, "original_subtitle_path", "") or "").strip() + if single_subtitle_path and single_subtitle_path not in seen: + normalized_paths.insert(0, single_subtitle_path) + + if not normalized_paths: + normalized_paths = _find_original_subtitle_paths_for_videos(_get_video_origin_paths(params)) + + return normalized_paths + + +def 
_get_video_origin_paths(params: VideoClipParams) -> list[str]: + video_paths = getattr(params, "video_origin_paths", []) or [] + if isinstance(video_paths, str): + video_paths = [video_paths] + + normalized_paths = [] + seen = set() + for video_path in video_paths: + if not isinstance(video_path, str): + continue + video_path = video_path.strip() + if video_path and video_path not in seen: + normalized_paths.append(video_path) + seen.add(video_path) + + single_video_path = str(getattr(params, "video_origin_path", "") or "").strip() + if single_video_path and single_video_path not in seen: + normalized_paths.insert(0, single_video_path) + + return normalized_paths + + +def _video_stem_candidates(video_path: str) -> list[str]: + stem = path.splitext(path.basename(str(video_path or "").strip()))[0] + if not stem: + return [] + + candidates = [stem] + timestamp_stripped = re.sub(r"_[0-9]{14}$", "", stem) + if timestamp_stripped and timestamp_stripped not in candidates: + candidates.append(timestamp_stripped) + return candidates + + +def _find_original_subtitle_paths_for_videos(video_paths: list[str]) -> list[str]: + subtitle_dir = utils.subtitle_dir() + if not path.isdir(subtitle_dir): + return [] + + subtitle_files = [ + path.join(subtitle_dir, filename) + for filename in os.listdir(subtitle_dir) + if filename.lower().endswith(".srt") + ] + if not subtitle_files: + return [] + + resolved_paths = [] + seen = set() + for video_path in video_paths: + candidates = _video_stem_candidates(video_path) + if not candidates: + continue + + matches = [] + for subtitle_path in subtitle_files: + subtitle_stem = path.splitext(path.basename(subtitle_path))[0] + for candidate in candidates: + if subtitle_stem == candidate or subtitle_stem.startswith(f"{candidate}_"): + matches.append(subtitle_path) + break + + if not matches: + continue + + matches.sort(key=lambda item: path.getmtime(item), reverse=True) + selected_path = matches[0] + if selected_path not in seen: + 
resolved_paths.append(selected_path) + seen.add(selected_path) + + if resolved_paths: + logger.info(f"未从参数获取原片字幕,已按视频文件名自动匹配: {resolved_paths}") + return resolved_paths + + +def _create_programmatic_subtitle_file( + task_id: str, + list_script: list[dict], + params: VideoClipParams, +) -> str: + if not getattr(params, "subtitle_enabled", True): + return "" + + original_subtitle_paths = _get_original_subtitle_paths(params) + logger.info(f"程序化字幕使用原片字幕路径: {original_subtitle_paths or '未提供'}") + return script_subtitle.create_script_subtitle_file( + task_id=task_id, + list_script=list_script, + original_subtitle_paths=original_subtitle_paths, + video_origin_paths=_get_video_origin_paths(params), + ) + + def _build_subtitle_mask_options(params: VideoClipParams, enabled=None) -> dict: mask_configured = bool( getattr(params, "subtitle_enabled", True) @@ -279,7 +398,19 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di logger.info(f"音频文件合并成功->{merged_audio_path}") # 合并字幕文件 - merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list) + merged_subtitle_path = "" + if getattr(params, "subtitle_enabled", True): + try: + merged_subtitle_path = _create_programmatic_subtitle_file( + task_id, + new_script_list, + params, + ) + except Exception as e: + logger.warning(f"程序化字幕生成失败,将尝试合并TTS字幕: {e}") + + if not merged_subtitle_path and getattr(params, "subtitle_enabled", True): + merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list) if merged_subtitle_path: logger.info(f"字幕文件合并成功->{merged_subtitle_path}") else: @@ -296,6 +427,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di logger.warning("没有需要合并的音频/字幕") merged_audio_path = "" merged_subtitle_path = "" + if getattr(params, "subtitle_enabled", True): + try: + merged_subtitle_path = _create_programmatic_subtitle_file( + task_id, + new_script_list, + params, + ) + except Exception as e: + logger.warning(f"程序化字幕生成失败: {e}") """ 5. 
合并视频 @@ -574,9 +714,10 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): merged_subtitle_path = "" if getattr(params, "subtitle_enabled", True): try: - merged_subtitle_path = script_subtitle.create_script_subtitle_file( - task_id=task_id, - list_script=new_script_list, + merged_subtitle_path = _create_programmatic_subtitle_file( + task_id, + new_script_list, + params, ) except Exception as e: logger.warning(f"程序化字幕生成失败,将尝试合并TTS字幕: {e}") @@ -600,6 +741,15 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): logger.warning("没有需要合并的音频/字幕") merged_audio_path = "" merged_subtitle_path = "" + if getattr(params, "subtitle_enabled", True): + try: + merged_subtitle_path = _create_programmatic_subtitle_file( + task_id, + new_script_list, + params, + ) + except Exception as e: + logger.warning(f"程序化字幕生成失败: {e}") sm.state.update_task( task_id, state=const.TASK_STATE_PROCESSING, diff --git a/app/services/test_merger_video_concat_unittest.py b/app/services/test_merger_video_concat_unittest.py new file mode 100644 index 0000000..1297f76 --- /dev/null +++ b/app/services/test_merger_video_concat_unittest.py @@ -0,0 +1,120 @@ +import subprocess +import unittest +from unittest import mock + +from app.services import merger_video + + +class MergerVideoConcatTests(unittest.TestCase): + def test_can_concat_video_copy_when_signatures_match(self): + signature = { + "codec_name": "h264", + "profile": "High", + "width": 1080, + "height": 1920, + "pix_fmt": "yuv420p", + "r_frame_rate": "30/1", + "avg_frame_rate": "30/1", + "time_base": "1/15360", + "sample_aspect_ratio": "1:1", + } + + with mock.patch.object( + merger_video, + "_get_video_stream_signature", + side_effect=[signature, dict(signature)], + ): + self.assertTrue(merger_video._can_concat_video_copy(["1.mp4", "2.mp4"])) + + def test_can_concat_video_copy_rejects_mismatched_signature(self): + base_signature = { + "codec_name": "h264", + "profile": "High", + "width": 1080, + "height": 1920, + 
"pix_fmt": "yuv420p", + "r_frame_rate": "30/1", + "avg_frame_rate": "30/1", + "time_base": "1/15360", + "sample_aspect_ratio": "1:1", + } + mismatch_signature = dict(base_signature, r_frame_rate="24000/1001") + + with mock.patch.object( + merger_video, + "_get_video_stream_signature", + side_effect=[base_signature, mismatch_signature], + ): + self.assertFalse(merger_video._can_concat_video_copy(["1.mp4", "2.mp4"])) + + def test_concat_video_streams_prefers_copy_when_compatible(self): + completed = subprocess.CompletedProcess(args=["ffmpeg"], returncode=0) + + with ( + mock.patch.object(merger_video, "_can_concat_video_copy", return_value=True), + mock.patch.object(merger_video, "_concat_duration_matches", return_value=True), + mock.patch.object(merger_video.subprocess, "run", return_value=completed) as run_mock, + ): + merger_video._concat_video_streams( + ["1.mp4", "2.mp4"], + "concat.txt", + "video_concat.mp4", + threads=4, + ) + + cmd = run_mock.call_args.args[0] + self.assertEqual("copy", cmd[cmd.index("-c:v") + 1]) + self.assertNotIn("libx264", cmd) + + def test_concat_video_streams_falls_back_when_copy_duration_mismatches(self): + completed = subprocess.CompletedProcess(args=["ffmpeg"], returncode=0) + + with ( + mock.patch.object(merger_video, "_can_concat_video_copy", return_value=True), + mock.patch.object(merger_video, "_concat_duration_matches", return_value=False), + mock.patch.object(merger_video.os.path, "exists", return_value=False), + mock.patch.object(merger_video.subprocess, "run", return_value=completed) as run_mock, + ): + merger_video._concat_video_streams( + ["1.mp4", "2.mp4"], + "concat.txt", + "video_concat.mp4", + threads=6, + ) + + self.assertEqual(2, run_mock.call_count) + fallback_cmd = run_mock.call_args_list[1].args[0] + self.assertEqual("libx264", fallback_cmd[fallback_cmd.index("-c:v") + 1]) + self.assertEqual("6", fallback_cmd[fallback_cmd.index("-threads") + 1]) + + def 
test_concat_video_streams_falls_back_to_reencode_when_copy_fails(self): + copy_error = subprocess.CalledProcessError( + returncode=1, + cmd=["ffmpeg"], + stderr=b"copy failed", + ) + completed = subprocess.CompletedProcess(args=["ffmpeg"], returncode=0) + + with ( + mock.patch.object(merger_video, "_can_concat_video_copy", return_value=True), + mock.patch.object( + merger_video.subprocess, + "run", + side_effect=[copy_error, completed], + ) as run_mock, + ): + merger_video._concat_video_streams( + ["1.mp4", "2.mp4"], + "concat.txt", + "video_concat.mp4", + threads=8, + ) + + self.assertEqual(2, run_mock.call_count) + fallback_cmd = run_mock.call_args_list[1].args[0] + self.assertEqual("libx264", fallback_cmd[fallback_cmd.index("-c:v") + 1]) + self.assertEqual("8", fallback_cmd[fallback_cmd.index("-threads") + 1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/app/services/test_script_subtitle_unittest.py b/app/services/test_script_subtitle_unittest.py index a4eed37..7b8bdaf 100644 --- a/app/services/test_script_subtitle_unittest.py +++ b/app/services/test_script_subtitle_unittest.py @@ -89,6 +89,104 @@ class ScriptSubtitleTests(unittest.TestCase): self.assertIn("00:00:00,000 -->", content) self.assertIn("--> 00:00:03,000", content) + def test_create_script_subtitle_file_includes_original_audio_subtitles(self): + list_script = [ + { + "_id": 1, + "OST": 0, + "narration": "前情解说。", + "editedTimeRange": "00:00:00-00:00:02", + "duration": 2, + }, + { + "_id": 2, + "video_id": 1, + "video_name": "source.mp4", + "OST": 1, + "narration": "播放原片2", + "timestamp": "00:00:10,000-00:00:14,000", + "sourceTimeRange": "00:00:10,000-00:00:14,000", + "editedTimeRange": "00:00:02-00:00:06", + "duration": 4, + }, + ] + original_srt = """1 +00:00:09,000 --> 00:00:11,000 +开头会被裁掉一秒。 + +2 +00:00:11,500 --> 00:00:13,000 +这句原声对白应该出现! 
+ +3 +00:00:13,500 --> 00:00:15,000 +结尾只保留半秒。 +""" + + with tempfile.TemporaryDirectory() as temp_dir: + subtitle_file = Path(temp_dir) / "source.srt" + subtitle_file.write_text(original_srt, encoding="utf-8") + output_file = Path(temp_dir) / "script_subtitles.srt" + script_subtitle.create_script_subtitle_file( + task_id="test", + list_script=list_script, + output_file=str(output_file), + original_subtitle_paths=[str(subtitle_file)], + video_origin_paths=["source.mp4"], + max_chars=16, + ) + content = output_file.read_text(encoding="utf-8") + + self.assertIn("前情解说", content) + self.assertIn("开头会被裁掉一秒", content) + self.assertIn("这句原声对白应该出现", content) + self.assertIn("结尾只保留半秒", content) + self.assertIn("00:00:02,000 --> 00:00:03,000", content) + self.assertIn("00:00:03,500 --> 00:00:05,000", content) + self.assertIn("00:00:05,500 --> 00:00:06,000", content) + self.assertNotIn("播放原片2", content) + + def test_create_script_subtitle_file_uses_matching_video_id_for_original_subtitles(self): + list_script = [ + { + "_id": 1, + "video_id": 2, + "video_name": "second.mp4", + "OST": 1, + "narration": "播放原片1", + "timestamp": "00:00:01,000-00:00:03,000", + "sourceTimeRange": "00:00:01,000-00:00:03,000", + "editedTimeRange": "00:00:00-00:00:02", + "duration": 2, + }, + ] + first_srt = """1 +00:00:01,000 --> 00:00:03,000 +第一个视频的字幕不应该出现。 +""" + second_srt = """1 +00:00:01,000 --> 00:00:03,000 +第二个视频的字幕应该出现。 +""" + + with tempfile.TemporaryDirectory() as temp_dir: + first_file = Path(temp_dir) / "first.srt" + second_file = Path(temp_dir) / "second.srt" + output_file = Path(temp_dir) / "script_subtitles.srt" + first_file.write_text(first_srt, encoding="utf-8") + second_file.write_text(second_srt, encoding="utf-8") + script_subtitle.create_script_subtitle_file( + task_id="test", + list_script=list_script, + output_file=str(output_file), + original_subtitle_paths=[str(first_file), str(second_file)], + video_origin_paths=["first.mp4", "second.mp4"], + ) + content = 
output_file.read_text(encoding="utf-8") + + self.assertIn("第二个视频的字幕应该出现", content) + self.assertNotIn("第一个视频的字幕不应该出现", content) + if __name__ == "__main__": unittest.main() diff --git a/app/services/test_task_subtitle_resolution_unittest.py b/app/services/test_task_subtitle_resolution_unittest.py new file mode 100644 index 0000000..b7744b2 --- /dev/null +++ b/app/services/test_task_subtitle_resolution_unittest.py @@ -0,0 +1,46 @@ +import tempfile +import time +import unittest +from pathlib import Path + +from app.models.schema import VideoClipParams +from app.services import task + + +class TaskSubtitleResolutionTests(unittest.TestCase): + def test_get_original_subtitle_paths_falls_back_to_matching_video_name(self): + original_subtitle_dir = task.utils.subtitle_dir + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + older = temp_path / "01_1080p_fun_asr.srt" + newer = temp_path / "01_1080p_fun_asr_20260608010240.srt" + unrelated = temp_path / "other_fun_asr.srt" + older.write_text("older", encoding="utf-8") + unrelated.write_text("other", encoding="utf-8") + time.sleep(0.01) + newer.write_text("newer", encoding="utf-8") + + task.utils.subtitle_dir = lambda: str(temp_path) + params = VideoClipParams( + video_origin_path="/tmp/01_1080p_20260608113314.mp4", + ) + + try: + subtitle_paths = task._get_original_subtitle_paths(params) + finally: + task.utils.subtitle_dir = original_subtitle_dir + + self.assertEqual([str(newer)], subtitle_paths) + + def test_get_original_subtitle_paths_keeps_explicit_params(self): + params = VideoClipParams( + video_origin_path="/tmp/01_1080p_20260608113314.mp4", + original_subtitle_paths=["/tmp/provided.srt"], + ) + + self.assertEqual(["/tmp/provided.srt"], task._get_original_subtitle_paths(params)) + + +if __name__ == "__main__": + unittest.main() diff --git a/webui.py b/webui.py index 57f8eb7..bf9dd71 100644 --- a/webui.py +++ b/webui.py @@ -328,11 +328,22 @@ def get_jianying_export_params(draft_name=None) 
-> VideoClipParams: voice_name = get_voice_name_for_tts_engine(tts_engine) voice_rate = st.session_state.get('voice_rate', 1.0) voice_pitch = st.session_state.get('voice_pitch', 1.0) + subtitle_paths = st.session_state.get('subtitle_paths', []) + if isinstance(subtitle_paths, str): + subtitle_paths = [subtitle_paths] + subtitle_paths = [ + path for path in subtitle_paths + if isinstance(path, str) and path.strip() + ] + if not subtitle_paths and st.session_state.get('subtitle_path'): + subtitle_paths = [st.session_state.get('subtitle_path')] return VideoClipParams( video_clip_json_path=st.session_state['video_clip_json_path'], video_origin_path=st.session_state['video_origin_path'], video_origin_paths=st.session_state.get('video_origin_paths', []), + original_subtitle_path=subtitle_paths[0] if subtitle_paths else "", + original_subtitle_paths=subtitle_paths, tts_engine=tts_engine, voice_name=voice_name, voice_rate=voice_rate, diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index e57c42d..5d68dd6 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -1718,11 +1718,14 @@ def save_script_with_validation(tr, video_clip_json_details): def get_script_params(): """获取脚本参数""" + subtitle_paths = _selected_subtitle_paths() return { 'video_language': st.session_state.get('video_language', ''), 'video_clip_json_path': st.session_state.get('video_clip_json_path', ''), 'video_origin_path': st.session_state.get('video_origin_path', ''), 'video_origin_paths': _selected_video_paths(), + 'original_subtitle_path': subtitle_paths[0] if subtitle_paths else '', + 'original_subtitle_paths': subtitle_paths, 'video_name': st.session_state.get('video_name', ''), 'video_plot': st.session_state.get('video_plot', '') } From 7a5303aa209127dcd34a5f544d97a83917bb3d15 Mon Sep 17 00:00:00 2001 From: viccy Date: Mon, 8 Jun 2026 13:28:27 +0800 Subject: [PATCH 21/24] =?UTF-8?q?feat(ffmpeg,webui):=20=E6=96=B0=E5=A2=9E?= 
=?UTF-8?q?=20FFmpeg=20=E5=BC=95=E6=93=8E=E7=AE=A1=E7=90=86=E4=B8=8E?= =?UTF-8?q?=E6=A3=80=E6=B5=8B=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增配置项 ffmpeg_path 及路径应用逻辑,自动配置 FFmpeg 环境变量 - 实现全量 FFmpeg 引擎自动发现、能力检测工具链,支持多来源识别 - 添加 WebUI 系统设置面板,支持选择、测试和保存 FFmpeg 引擎 - 优化视频合并模块的 FFmpeg 调用,新增进度日志与流式输出处理 - 新增 FFmpeg 检测器单元测试覆盖核心功能 --- app/config/config.py | 39 +- app/services/generate_video.py | 146 +++++- app/utils/ffmpeg_detector.py | 493 +++++++++++++++++++++ app/utils/test_ffmpeg_detector_unittest.py | 76 ++++ config.example.toml | 4 + webui/components/system_settings.py | 160 +++++++ webui/i18n/en.json | 42 ++ webui/i18n/zh.json | 42 ++ 8 files changed, 986 insertions(+), 16 deletions(-) create mode 100644 app/utils/ffmpeg_detector.py create mode 100644 app/utils/test_ffmpeg_detector_unittest.py diff --git a/app/config/config.py b/app/config/config.py index de17645..b1f4ca9 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -173,8 +173,43 @@ imagemagick_path = app.get("imagemagick_path", "") if imagemagick_path and os.path.isfile(imagemagick_path): os.environ["IMAGEMAGICK_BINARY"] = imagemagick_path +_applied_ffmpeg_dir = None + + +def apply_ffmpeg_path(ffmpeg_binary: str = "") -> None: + """Apply the configured FFmpeg binary to this Python process.""" + global _applied_ffmpeg_dir + + if not ffmpeg_binary or not os.path.isfile(ffmpeg_binary): + return + + ffmpeg_binary = os.path.abspath(os.path.expanduser(ffmpeg_binary)) + ffmpeg_dir = os.path.dirname(ffmpeg_binary) + os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_binary + + current_paths = os.environ.get("PATH", "").split(os.pathsep) + normalized_ffmpeg_dir = os.path.normcase(os.path.abspath(ffmpeg_dir)) + normalized_previous_dir = ( + os.path.normcase(os.path.abspath(_applied_ffmpeg_dir)) + if _applied_ffmpeg_dir + else None + ) + filtered_paths = [] + for path_item in current_paths: + if not path_item: + continue + normalized_item = 
os.path.normcase(os.path.abspath(path_item)) + if normalized_item == normalized_ffmpeg_dir: + continue + if normalized_previous_dir and normalized_item == normalized_previous_dir: + continue + filtered_paths.append(path_item) + + os.environ["PATH"] = os.pathsep.join([ffmpeg_dir, *filtered_paths]) + _applied_ffmpeg_dir = ffmpeg_dir + + ffmpeg_path = app.get("ffmpeg_path", "") -if ffmpeg_path and os.path.isfile(ffmpeg_path): - os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_path +apply_ffmpeg_path(ffmpeg_path) logger.info(f"{project_name} v{project_version}") diff --git a/app/services/generate_video.py b/app/services/generate_video.py index cca0c04..0d2c11d 100644 --- a/app/services/generate_video.py +++ b/app/services/generate_video.py @@ -11,8 +11,8 @@ import os import json import re -import shlex import subprocess +import time import traceback import tempfile from typing import Optional, Dict, Any @@ -327,6 +327,16 @@ def _format_ffmpeg_float(value: float) -> str: return f"{float(value):.3f}".rstrip("0").rstrip(".") +def _format_duration(seconds: float) -> str: + seconds = max(0, float(seconds or 0)) + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + if hours: + return f"{hours:02d}:{minutes:02d}:{secs:02d}" + return f"{minutes:02d}:{secs:02d}" + + def _quote_filter_value(value: str) -> str: escaped = str(value).replace("\\", "\\\\").replace("'", "\\'") return f"'{escaped}'" @@ -435,6 +445,106 @@ def _select_compatible_encoder(preferred_encoder: str) -> str: return "libx264" +def _parse_ffmpeg_progress_time(progress: Dict[str, str]) -> float: + for key in ("out_time_us", "out_time_ms"): + value = progress.get(key) + if value: + try: + return max(0.0, int(value) / 1_000_000) + except ValueError: + pass + + value = progress.get("out_time") + if value: + match = re.match( + r"(?P<hours>\d+):(?P<minutes>\d{2}):(?P<seconds>\d{2})(?:\.(?P<fraction>\d+))?", + value, + ) + if match: + fraction = match.group("fraction") or "0" + return ( + int(match.group("hours")) 
* 3600 + + int(match.group("minutes")) * 60 + + int(match.group("seconds")) + + float(f"0.{fraction}") + ) + return 0.0 + + +def _run_ffmpeg_with_progress(cmd: list[str], duration: float) -> tuple[int, str]: + progress_keys = { + "frame", + "fps", + "stream_0_0_q", + "bitrate", + "total_size", + "out_time_us", + "out_time_ms", + "out_time", + "dup_frames", + "drop_frames", + "speed", + "progress", + } + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + progress: Dict[str, str] = {} + output_tail: list[str] = [] + last_log_time = 0.0 + last_logged_percent = -1.0 + + assert process.stdout is not None + for raw_line in process.stdout: + line = raw_line.strip() + if not line: + continue + + if "=" not in line: + output_tail.append(line) + output_tail = output_tail[-80:] + continue + + key, value = line.split("=", 1) + if key not in progress_keys: + output_tail.append(line) + output_tail = output_tail[-80:] + continue + + progress[key] = value + if key != "progress": + continue + + current = _parse_ffmpeg_progress_time(progress) + if value == "end": + current = duration + percent = min(100.0, (current / duration) * 100) if duration > 0 else 0.0 + now = time.monotonic() + should_log = ( + value == "end" + or now - last_log_time >= 5 + or percent - last_logged_percent >= 5 + ) + if should_log: + speed = progress.get("speed", "N/A") + logger.info( + "ffmpeg 合并进度: " + f"{percent:.1f}% " + f"({_format_duration(current)}/{_format_duration(duration)}), " + f"speed={speed}" + ) + last_log_time = now + last_logged_percent = percent + progress = {} + + return_code = process.wait() + return return_code, "\n".join(output_tail[-80:]) + + def _srt_timestamp_to_seconds(timestamp: str) -> float: match = re.match( r"(?P\d{2}):(?P\d{2}):(?P\d{2}),(?P\d{3})", @@ -917,7 +1027,7 @@ def _build_ffmpeg_merge_command( subtitle_path: Optional[str], bgm_path: Optional[str], options: Dict[str, Any], -) -> tuple[list[str], 
list[str]]: +) -> tuple[list[str], list[str], float]: from app.utils import ffmpeg_utils video_meta = _probe_video(video_path) @@ -1118,7 +1228,17 @@ def _build_ffmpeg_merge_command( filter_parts = [*video_filters, *audio_filters] ffmpeg_binary = _get_ffmpeg_binary() - cmd = [ffmpeg_binary, "-y", "-hide_banner", "-loglevel", "error", *input_args] + cmd = [ + ffmpeg_binary, + "-y", + "-hide_banner", + "-loglevel", + "error", + "-nostats", + "-progress", + "pipe:1", + *input_args, + ] if filter_parts: cmd.extend(["-filter_complex", ";".join(filter_parts)]) @@ -1134,7 +1254,7 @@ def _build_ffmpeg_merge_command( cmd.append("-an") cmd.extend(["-t", duration_arg, "-movflags", "+faststart", output_path]) - return cmd, temp_files + return cmd, temp_files, duration def _merge_materials_with_ffmpeg( @@ -1152,7 +1272,7 @@ def _merge_materials_with_ffmpeg( options = options or {} temp_files = [] try: - cmd, temp_files = _build_ffmpeg_merge_command( + cmd, temp_files, duration = _build_ffmpeg_merge_command( video_path=video_path, audio_path=audio_path, output_path=output_path, @@ -1160,16 +1280,14 @@ def _merge_materials_with_ffmpeg( bgm_path=bgm_path, options=options, ) - logger.info(f"使用 ffmpeg 快速合并素材: {shlex.join(cmd)}") - result = subprocess.run( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - check=False, + logger.info( + "使用 ffmpeg 快速合并素材: " + f"video={video_path}, audio={audio_path}, output={output_path}, " + f"duration={_format_duration(duration)}" ) - if result.returncode != 0: - logger.warning(f"ffmpeg 快速合并失败,将回退 MoviePy: {result.stderr[-3000:]}") + return_code, ffmpeg_output = _run_ffmpeg_with_progress(cmd, duration) + if return_code != 0: + logger.warning(f"ffmpeg 快速合并失败,将回退 MoviePy: {ffmpeg_output[-3000:]}") if os.path.exists(output_path): try: os.remove(output_path) diff --git a/app/utils/ffmpeg_detector.py b/app/utils/ffmpeg_detector.py new file mode 100644 index 0000000..8075d39 --- /dev/null +++ b/app/utils/ffmpeg_detector.py @@ -0,0 
+1,493 @@ +"""FFmpeg engine discovery and capability diagnostics.""" + +from __future__ import annotations + +import os +import platform +import re +import shutil +import subprocess +import sys +import tempfile +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +from loguru import logger + + +_FFMPEG_EXE = "ffmpeg.exe" if os.name == "nt" else "ffmpeg" +_FFPROBE_EXE = "ffprobe.exe" if os.name == "nt" else "ffprobe" +_SOURCE_PRIORITY = { + "Configured": 0, + "NarratoAI packaged runtime": 1, + "Integrated runtime": 2, + "System PATH": 3, + "Homebrew": 4, + "Python environment": 5, + "Python executable folder": 6, + "IMAGEIO_FFMPEG_EXE": 7, + "imageio-ffmpeg": 8, + "System": 9, +} + + +@dataclass(frozen=True) +class FFmpegEngine: + """A discovered FFmpeg executable.""" + + path: str + source: str + ffprobe_path: str + available: bool + version_line: str + + @property + def label(self) -> str: + status = "OK" if self.available else "Unavailable" + version = self.version_line.replace("ffmpeg version", "").strip() or "unknown version" + return f"{self.source} - {version} - {self.path} ({status})" + + def to_dict(self) -> dict[str, Any]: + payload = asdict(self) + payload["label"] = self.label + return payload + + +def _run_command(args: list[str], timeout: int = 10) -> subprocess.CompletedProcess[str]: + return subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + timeout=timeout, + ) + + +def _first_line(text: str) -> str: + for line in (text or "").splitlines(): + stripped = line.strip() + if stripped: + return stripped + return "" + + +def _is_executable(path: str) -> bool: + if not path: + return False + if os.name == "nt": + return os.path.isfile(path) + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def _normalize_path(path: str) -> str: + return str(Path(path).expanduser().resolve()) + + +def _ffmpeg_version_line(ffmpeg_path: str) -> tuple[bool, str]: + if 
not _is_executable(ffmpeg_path): + return False, "" + try: + result = _run_command([ffmpeg_path, "-version"], timeout=8) + except Exception as exc: + logger.debug(f"FFmpeg version check failed for {ffmpeg_path}: {exc}") + return False, "" + + output = result.stdout or result.stderr + return result.returncode == 0, _first_line(output) + + +def _paired_ffprobe_path(ffmpeg_path: str) -> str: + ffmpeg = Path(ffmpeg_path) + sibling = ffmpeg.with_name(_FFPROBE_EXE) + if _is_executable(str(sibling)): + return _normalize_path(str(sibling)) + + scoped_path = os.pathsep.join([str(ffmpeg.parent), os.environ.get("PATH", "")]) + discovered = shutil.which(_FFPROBE_EXE, path=scoped_path) + return _normalize_path(discovered) if discovered else "" + + +def _candidate_paths(root_dir: str = "", include_system: bool = True) -> list[tuple[str, str]]: + candidates: list[tuple[str, str]] = [] + root = Path(root_dir).expanduser().resolve() if root_dir else Path.cwd().resolve() + project_parent = root.parent + + candidates.extend( + [ + ("Integrated runtime", str(root / "runtime" / "python" / "bin" / _FFMPEG_EXE)), + ("Integrated runtime", str(root.parent / "runtime" / "python" / "bin" / _FFMPEG_EXE)), + ( + "NarratoAI packaged runtime", + str( + project_parent + / "NarratoAI-Pack" + / "dist" + / "NarratoAI-macos-arm64" + / "runtime" + / "python" + / "bin" + / _FFMPEG_EXE + ), + ), + ("Python environment", str(Path(sys.prefix) / "bin" / _FFMPEG_EXE)), + ("Python executable folder", str(Path(sys.executable).with_name(_FFMPEG_EXE))), + ] + ) + + env_ffmpeg = os.environ.get("IMAGEIO_FFMPEG_EXE", "") + if env_ffmpeg: + candidates.append(("IMAGEIO_FFMPEG_EXE", env_ffmpeg)) + + if include_system: + path_ffmpeg = shutil.which(_FFMPEG_EXE) + if path_ffmpeg: + candidates.append(("System PATH", path_ffmpeg)) + + for source, path in ( + ("Homebrew", f"/opt/homebrew/bin/{_FFMPEG_EXE}"), + ("Homebrew", f"/usr/local/bin/{_FFMPEG_EXE}"), + ("System", f"/usr/bin/{_FFMPEG_EXE}"), + ): + 
candidates.append((source, path)) + + try: + import imageio_ffmpeg + + candidates.append(("imageio-ffmpeg", imageio_ffmpeg.get_ffmpeg_exe())) + except Exception as exc: + logger.debug(f"imageio-ffmpeg discovery skipped: {exc}") + + return candidates + + +def discover_ffmpeg_engines( + configured_path: str = "", + root_dir: str = "", + include_system: bool = True, +) -> list[dict[str, Any]]: + """Discover available FFmpeg engines from config, packaged runtime and PATH.""" + + candidates: list[tuple[str, str]] = [] + if configured_path: + candidates.append(("Configured", configured_path)) + candidates.extend(_candidate_paths(root_dir=root_dir, include_system=include_system)) + + engines: list[FFmpegEngine] = [] + seen: set[str] = set() + for source, raw_path in candidates: + if not raw_path: + continue + try: + path = _normalize_path(raw_path) + except Exception: + path = str(Path(raw_path).expanduser()) + key = os.path.normcase(path) + if key in seen: + continue + seen.add(key) + + available, version_line = _ffmpeg_version_line(path) + if not available and source not in {"Configured", "IMAGEIO_FFMPEG_EXE"}: + continue + engines.append( + FFmpegEngine( + path=path, + source=source, + ffprobe_path=_paired_ffprobe_path(path), + available=available, + version_line=version_line, + ) + ) + + engines.sort( + key=lambda engine: ( + not engine.available, + _SOURCE_PRIORITY.get(engine.source, 99), + engine.path, + ) + ) + return [engine.to_dict() for engine in engines] + + +def _parse_hwaccels(output: str) -> list[str]: + values: list[str] = [] + for line in output.splitlines(): + item = line.strip().lower() + if not item or item.startswith("hardware acceleration"): + continue + if re.fullmatch(r"[a-z0-9_]+", item): + values.append(item) + return sorted(set(values)) + + +def _parse_ffmpeg_table_names(output: str) -> set[str]: + names: set[str] = set() + for line in output.splitlines(): + match = re.match(r"\s*[A-Z.]{2,}\s+([A-Za-z0-9_]+)\b", line) + if match: + 
names.add(match.group(1).lower()) + return names + + +def _run_optional(args: list[str], timeout: int = 15, max_output_chars: int = 1200) -> tuple[bool, str]: + try: + result = _run_command(args, timeout=timeout) + except subprocess.TimeoutExpired: + return False, "Command timed out" + except Exception as exc: + return False, str(exc) + + output = "\n".join(part for part in (result.stderr, result.stdout) if part) + if max_output_chars > 0: + output = output[-max_output_chars:] + return result.returncode == 0, output + + +def _hardware_candidates() -> list[tuple[str, str, list[str]]]: + system = platform.system().lower() + if system == "darwin": + return [ + ("videotoolbox", "h264_videotoolbox", ["-c:v", "h264_videotoolbox", "-q:v", "65"]), + ] + if system == "windows": + return [ + ("nvenc", "h264_nvenc", ["-c:v", "h264_nvenc", "-preset", "fast"]), + ("qsv", "h264_qsv", ["-c:v", "h264_qsv", "-preset", "fast"]), + ("amf", "h264_amf", ["-c:v", "h264_amf"]), + ] + return [ + ("nvenc", "h264_nvenc", ["-c:v", "h264_nvenc", "-preset", "fast"]), + ("qsv", "h264_qsv", ["-vf", "format=nv12", "-c:v", "h264_qsv"]), + ("vaapi", "h264_vaapi", ["-vf", "format=nv12,hwupload", "-c:v", "h264_vaapi"]), + ] + + +def _detect_hardware_encoding(ffmpeg_path: str, encoders: set[str]) -> dict[str, Any]: + tested: list[dict[str, Any]] = [] + for accel_type, encoder, encoder_args in _hardware_candidates(): + if encoder.lower() not in encoders: + tested.append( + { + "type": accel_type, + "encoder": encoder, + "available": False, + "message": "Encoder not listed by this FFmpeg build", + } + ) + continue + + cmd = [ + ffmpeg_path, + "-y", + "-hide_banner", + "-loglevel", + "error", + "-f", + "lavfi", + "-i", + "testsrc=duration=0.5:size=128x72:rate=15", + "-frames:v", + "5", + *encoder_args, + "-pix_fmt", + "yuv420p", + "-f", + "null", + "-", + ] + ok, message = _run_optional(cmd, timeout=18) + tested.append( + { + "type": accel_type, + "encoder": encoder, + "available": ok, + "message": 
"Hardware encode test passed" if ok else message, + } + ) + if ok: + return { + "available": True, + "type": accel_type, + "encoder": encoder, + "message": "Hardware encode test passed", + "tested": tested, + } + + return { + "available": False, + "type": None, + "encoder": None, + "message": "No hardware encoder passed the runtime test", + "tested": tested, + } + + +def _escape_filter_path(path: str) -> str: + return path.replace("\\", "\\\\").replace(":", "\\:").replace("'", "\\'") + + +def _test_subtitle_burn(ffmpeg_path: str, filters: set[str]) -> dict[str, Any]: + filter_status = { + "subtitles": "subtitles" in filters, + "ass": "ass" in filters, + "drawtext": "drawtext" in filters, + "overlay": "overlay" in filters, + } + + if filter_status["subtitles"]: + with tempfile.TemporaryDirectory() as tmp_dir: + srt_path = Path(tmp_dir) / "subtitle_test.srt" + srt_path.write_text( + "1\n00:00:00,000 --> 00:00:00,800\nNarratoAI FFmpeg subtitle test\n", + encoding="utf-8", + ) + ok, message = _run_optional( + [ + ffmpeg_path, + "-y", + "-hide_banner", + "-loglevel", + "error", + "-f", + "lavfi", + "-i", + "color=black:size=320x180:duration=1", + "-vf", + f"subtitles={_escape_filter_path(str(srt_path))}", + "-frames:v", + "1", + "-f", + "null", + "-", + ], + timeout=18, + ) + if ok: + return { + "available": True, + "method": "subtitles", + "message": "SRT subtitle burn-in test passed", + "filters": filter_status, + } + subtitles_error = message + else: + subtitles_error = "subtitles filter is not listed by this FFmpeg build" + + if filter_status["drawtext"]: + ok, message = _run_optional( + [ + ffmpeg_path, + "-y", + "-hide_banner", + "-loglevel", + "error", + "-f", + "lavfi", + "-i", + "color=black:size=320x180:duration=1", + "-vf", + "drawtext=text=NarratoAI:x=10:y=10:fontsize=18:fontcolor=white", + "-frames:v", + "1", + "-f", + "null", + "-", + ], + timeout=18, + ) + if ok: + return { + "available": True, + "method": "drawtext", + "message": "drawtext burn-in 
fallback test passed", + "filters": filter_status, + } + drawtext_error = message + else: + drawtext_error = "drawtext filter is not listed by this FFmpeg build" + + return { + "available": False, + "method": None, + "message": f"{subtitles_error}\n{drawtext_error}".strip(), + "filters": filter_status, + } + + +def validate_ffmpeg_engine(ffmpeg_path: str) -> dict[str, Any]: + """Run runtime checks for a selected FFmpeg engine.""" + + path = _normalize_path(ffmpeg_path) + report: dict[str, Any] = { + "path": path, + "ffmpeg_available": False, + "version_line": "", + "ffprobe_path": "", + "ffprobe_available": False, + "ffprobe_version_line": "", + "hwaccels": [], + "hardware_acceleration": { + "available": False, + "type": None, + "encoder": None, + "message": "", + "tested": [], + }, + "subtitle_burn": { + "available": False, + "method": None, + "message": "", + "filters": {}, + }, + "software_encoder_available": False, + "errors": [], + } + + available, version_line = _ffmpeg_version_line(path) + report["ffmpeg_available"] = available + report["version_line"] = version_line + if not available: + report["errors"].append("FFmpeg is not executable or failed to run -version") + return report + + ffprobe_path = _paired_ffprobe_path(path) + report["ffprobe_path"] = ffprobe_path + if ffprobe_path: + probe_available, probe_version = _ffmpeg_version_line(ffprobe_path) + report["ffprobe_available"] = probe_available + report["ffprobe_version_line"] = probe_version + + ok, hwaccel_output = _run_optional( + [path, "-hide_banner", "-hwaccels"], + timeout=10, + max_output_chars=0, + ) + if ok: + report["hwaccels"] = _parse_hwaccels(hwaccel_output) + else: + report["errors"].append(f"Failed to list hardware acceleration methods: {hwaccel_output}") + + ok, encoders_output = _run_optional( + [path, "-hide_banner", "-encoders"], + timeout=10, + max_output_chars=0, + ) + encoders = _parse_ffmpeg_table_names(encoders_output) if ok else set() + report["software_encoder_available"] = 
"libx264" in encoders or "libopenh264" in encoders + if not ok: + report["errors"].append(f"Failed to list encoders: {encoders_output}") + + ok, filters_output = _run_optional( + [path, "-hide_banner", "-filters"], + timeout=10, + max_output_chars=0, + ) + filters = _parse_ffmpeg_table_names(filters_output) if ok else set() + if not ok: + report["errors"].append(f"Failed to list filters: {filters_output}") + + report["hardware_acceleration"] = _detect_hardware_encoding(path, encoders) + report["subtitle_burn"] = _test_subtitle_burn(path, filters) + return report diff --git a/app/utils/test_ffmpeg_detector_unittest.py b/app/utils/test_ffmpeg_detector_unittest.py new file mode 100644 index 0000000..a8c9f61 --- /dev/null +++ b/app/utils/test_ffmpeg_detector_unittest.py @@ -0,0 +1,76 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from app.utils import ffmpeg_detector + + +class FFmpegDetectorTests(unittest.TestCase): + def _write_fake_binary(self, path: Path, first_line: str) -> None: + path.write_text( + "#!/bin/sh\n" + "if [ \"$1\" = \"-version\" ]; then\n" + f" echo \"{first_line}\"\n" + " exit 0\n" + "fi\n" + "if [ \"$2\" = \"-hwaccels\" ]; then\n" + " echo \"Hardware acceleration methods:\"\n" + " echo \"videotoolbox\"\n" + " exit 0\n" + "fi\n" + "if [ \"$2\" = \"-encoders\" ]; then\n" + " echo \" V....D h264_videotoolbox Apple VideoToolbox H.264\"\n" + " echo \" V....D h264_nvenc NVIDIA NVENC H.264\"\n" + " echo \" V....D h264_qsv Intel QSV H.264\"\n" + " echo \" V....D libx264 libx264 H.264\"\n" + " exit 0\n" + "fi\n" + "if [ \"$2\" = \"-filters\" ]; then\n" + " echo \" ... subtitles V->V Render text subtitles\"\n" + " echo \" ... drawtext V->V Draw text\"\n" + " echo \" ... 
overlay VV->V Overlay video\"\n" + " exit 0\n" + "fi\n" + "exit 0\n", + encoding="utf-8", + ) + path.chmod(0o755) + + @unittest.skipIf(os.name == "nt", "shell fake binaries are POSIX-only") + def test_discover_includes_configured_path(self): + with tempfile.TemporaryDirectory() as tmp_dir: + ffmpeg_path = Path(tmp_dir) / "ffmpeg" + ffprobe_path = Path(tmp_dir) / "ffprobe" + self._write_fake_binary(ffmpeg_path, "ffmpeg version fake-1.0") + self._write_fake_binary(ffprobe_path, "ffprobe version fake-1.0") + + engines = ffmpeg_detector.discover_ffmpeg_engines( + configured_path=str(ffmpeg_path), + root_dir=tmp_dir, + include_system=False, + ) + + self.assertEqual(engines[0]["path"], str(ffmpeg_path.resolve())) + self.assertEqual(engines[0]["ffprobe_path"], str(ffprobe_path.resolve())) + self.assertTrue(engines[0]["available"]) + + @unittest.skipIf(os.name == "nt", "shell fake binaries are POSIX-only") + def test_validate_reports_hardware_and_subtitle_support(self): + with tempfile.TemporaryDirectory() as tmp_dir: + ffmpeg_path = Path(tmp_dir) / "ffmpeg" + ffprobe_path = Path(tmp_dir) / "ffprobe" + self._write_fake_binary(ffmpeg_path, "ffmpeg version fake-1.0") + self._write_fake_binary(ffprobe_path, "ffprobe version fake-1.0") + + report = ffmpeg_detector.validate_ffmpeg_engine(str(ffmpeg_path)) + + self.assertTrue(report["ffmpeg_available"]) + self.assertTrue(report["ffprobe_available"]) + self.assertTrue(report["hardware_acceleration"]["available"]) + self.assertTrue(report["subtitle_burn"]["available"]) + self.assertEqual(report["subtitle_burn"]["method"], "subtitles") + + +if __name__ == "__main__": + unittest.main() diff --git a/config.example.toml b/config.example.toml index 3c815c3..b9b03bc 100644 --- a/config.example.toml +++ b/config.example.toml @@ -75,6 +75,10 @@ # WebUI 界面是否显示配置项 hide_config = true + # FFmpeg 引擎路径(可选) + # 为空时使用系统 PATH;也可以在系统设置中通过下拉框选择整合包或本机 ffmpeg。 + ffmpeg_path = "" + # 官方 OpenAI 默认端点(可选): # text_openai_base_url = 
"https://api.openai.com/v1" diff --git a/webui/components/system_settings.py b/webui/components/system_settings.py index 82e9592..733e230 100644 --- a/webui/components/system_settings.py +++ b/webui/components/system_settings.py @@ -3,6 +3,8 @@ import os import shutil from loguru import logger +from app.config import config +from app.utils import ffmpeg_detector, ffmpeg_utils from app.utils.utils import storage_dir @@ -27,6 +29,162 @@ def clear_directory(dir_path, tr): else: st.warning(tr("Directory does not exist")) + +def _format_engine_label(engines_by_path, tr): + def formatter(path): + engine = engines_by_path.get(path, {}) + source = engine.get("source", "") + source_key = f"FFmpeg source {source}" + translated_source = tr(source_key) + if translated_source == source_key: + translated_source = source + + version = str(engine.get("version_line", "")).replace("ffmpeg version", "").strip() + version = version or "unknown version" + status = _status_text(engine.get("available"), tr) + return f"{translated_source} - {version} - {path} ({status})" + + return formatter + + +def _status_text(value, tr): + return tr("Available") if value else tr("Unavailable") + + +def _render_ffmpeg_report(report, tr): + st.write(f"**{tr('FFmpeg detection details')}**") + st.caption(f"{tr('Path')}: {report.get('path', '')}") + if report.get("version_line"): + st.caption(f"{tr('Version')}: {report['version_line']}") + + col1, col2, col3, col4 = st.columns(4) + with col1: + st.metric("FFmpeg", _status_text(report.get("ffmpeg_available"), tr)) + with col2: + st.metric("FFprobe", _status_text(report.get("ffprobe_available"), tr)) + with col3: + hwaccel = report.get("hardware_acceleration", {}) + st.metric(tr("Hardware Acceleration"), _status_text(hwaccel.get("available"), tr)) + with col4: + subtitle_burn = report.get("subtitle_burn", {}) + st.metric(tr("Subtitle Burn-in"), _status_text(subtitle_burn.get("available"), tr)) + + if report.get("ffmpeg_available") and 
report.get("subtitle_burn", {}).get("available"): + if report.get("hardware_acceleration", {}).get("available"): + st.success(tr("FFmpeg engine passed all checks")) + else: + st.warning(tr("FFmpeg engine works but hardware acceleration is unavailable")) + else: + st.error(tr("FFmpeg engine check failed")) + + hwaccel = report.get("hardware_acceleration", {}) + subtitle_burn = report.get("subtitle_burn", {}) + col1, col2 = st.columns(2) + with col1: + st.write(f"**{tr('Hardware acceleration detail')}**") + st.write(f"- {tr('Type')}: {hwaccel.get('type') or '-'}") + st.write(f"- {tr('Encoder')}: {hwaccel.get('encoder') or '-'}") + st.write(f"- {tr('Message')}: {hwaccel.get('message') or '-'}") + hwaccels = report.get("hwaccels") or [] + st.write(f"- {tr('Supported Hardware Methods')}: {', '.join(hwaccels) if hwaccels else '-'}") + with col2: + filters = subtitle_burn.get("filters") or {} + st.write(f"**{tr('Subtitle burn-in detail')}**") + st.write(f"- {tr('Method')}: {subtitle_burn.get('method') or '-'}") + st.write(f"- {tr('Message')}: {subtitle_burn.get('message') or '-'}") + st.write( + "- " + + tr("Subtitle Filters") + + ": " + + ", ".join( + f"{name}={_status_text(enabled, tr)}" + for name, enabled in filters.items() + ) + ) + + errors = report.get("errors") or [] + if errors: + with st.expander(tr("FFmpeg errors")): + for error in errors: + st.write(f"- {error}") + + with st.expander(tr("Raw FFmpeg report")): + st.json(report) + + +def render_ffmpeg_engine_settings(tr): + """Render FFmpeg engine discovery, selection and diagnostics.""" + st.divider() + st.subheader(tr("FFmpeg Engine Detection")) + + engines = ffmpeg_detector.discover_ffmpeg_engines( + configured_path=config.app.get("ffmpeg_path", ""), + root_dir=config.root_dir, + ) + engines_by_path = {engine["path"]: engine for engine in engines} + engine_paths = list(engines_by_path.keys()) + + if not engine_paths: + st.warning(tr("No FFmpeg engines found")) + + current_path = config.app.get("ffmpeg_path", 
"") + selected_index = 0 + if current_path in engines_by_path: + selected_index = engine_paths.index(current_path) + + selected_path = "" + if engine_paths: + selected_path = st.selectbox( + tr("FFmpeg Engine"), + options=engine_paths, + index=selected_index, + format_func=_format_engine_label(engines_by_path, tr), + help=tr("FFmpeg Engine Help"), + ) + + custom_path = st.text_input( + tr("Custom FFmpeg Path"), + value="", + help=tr("Custom FFmpeg Path Help"), + placeholder="/path/to/ffmpeg", + ).strip() + effective_path = custom_path or selected_path + + active_path = config.app.get("ffmpeg_path", "") + if active_path: + st.caption(f"{tr('Current FFmpeg Engine')}: {active_path}") + + col1, col2 = st.columns(2) + with col1: + if st.button(tr("Save FFmpeg Engine"), use_container_width=True, disabled=not effective_path): + try: + if not os.path.isfile(effective_path): + st.error(tr("Selected FFmpeg path is invalid")) + else: + config.app["ffmpeg_path"] = effective_path + config.ffmpeg_path = effective_path + config.apply_ffmpeg_path(effective_path) + config.save_config() + ffmpeg_utils.reset_hwaccel_detection() + st.success(tr("FFmpeg engine saved")) + except Exception as e: + st.error(f"{tr('Failed to save config')}: {str(e)}") + logger.error(f"保存 FFmpeg 引擎失败: {e}") + + with col2: + if st.button(tr("Test Selected FFmpeg"), use_container_width=True, disabled=not effective_path): + with st.spinner(tr("Testing FFmpeg engine")): + try: + st.session_state["ffmpeg_engine_report"] = ffmpeg_detector.validate_ffmpeg_engine(effective_path) + except Exception as e: + st.error(f"{tr('FFmpeg engine check failed')}: {str(e)}") + logger.error(f"FFmpeg 引擎检测失败: {e}") + + report = st.session_state.get("ffmpeg_engine_report") + if report: + _render_ffmpeg_report(report, tr) + + def render_system_panel(tr): """渲染系统设置面板""" with st.expander(tr("System settings"), expanded=False): @@ -43,3 +201,5 @@ def render_system_panel(tr): with col3: if st.button(tr("Clear tasks"), 
use_container_width=True): clear_directory(os.path.join(storage_dir(), "tasks"), tr) + + render_ffmpeg_engine_settings(tr) diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 24b2f0a..1ce0df5 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -235,6 +235,48 @@ "Directory cleared": "Directory cleared", "Directory does not exist": "Directory does not exist", "Failed to clear directory": "Failed to clear directory", + "FFmpeg Engine Detection": "FFmpeg Engine Detection", + "FFmpeg Engine": "FFmpeg Engine", + "FFmpeg Engine Help": "Choose the ffmpeg executable this app should prefer; the packaged runtime and local PATH are discovered automatically", + "No FFmpeg engines found": "No FFmpeg engines found", + "Custom FFmpeg Path": "Custom FFmpeg Path", + "Custom FFmpeg Path Help": "Paste an absolute path to an ffmpeg executable if the target engine is not listed", + "Current FFmpeg Engine": "Current FFmpeg Engine", + "Save FFmpeg Engine": "Save Engine", + "Test Selected FFmpeg": "Test Selected FFmpeg", + "Testing FFmpeg engine": "Testing FFmpeg engine...", + "FFmpeg engine saved": "FFmpeg engine saved", + "Selected FFmpeg path is invalid": "Selected FFmpeg path is invalid", + "FFmpeg detection details": "FFmpeg detection details", + "FFmpeg source Configured": "Configured", + "FFmpeg source NarratoAI packaged runtime": "NarratoAI packaged runtime", + "FFmpeg source Integrated runtime": "Integrated runtime", + "FFmpeg source System PATH": "System PATH", + "FFmpeg source Homebrew": "Homebrew", + "FFmpeg source Python environment": "Python environment", + "FFmpeg source Python executable folder": "Python executable folder", + "FFmpeg source IMAGEIO_FFMPEG_EXE": "IMAGEIO_FFMPEG_EXE", + "FFmpeg source imageio-ffmpeg": "imageio-ffmpeg", + "FFmpeg source System": "System", + "Version": "Version", + "Path": "Path", + "Available": "Available", + "Unavailable": "Unavailable", + "Hardware Acceleration": "Hardware Acceleration", + "Subtitle Burn-in": "Subtitle 
Burn-in", + "FFmpeg engine passed all checks": "FFmpeg engine passed all checks: basic execution, hardware acceleration and subtitle burn-in are available", + "FFmpeg engine works but hardware acceleration is unavailable": "FFmpeg and subtitle burn-in work, but hardware acceleration is unavailable; software encoding will be used", + "FFmpeg engine check failed": "FFmpeg engine check failed", + "Hardware acceleration detail": "Hardware acceleration detail", + "Subtitle burn-in detail": "Subtitle burn-in detail", + "Type": "Type", + "Encoder": "Encoder", + "Message": "Message", + "Method": "Method", + "Supported Hardware Methods": "Supported hardware methods", + "Subtitle Filters": "Subtitle filters", + "FFmpeg errors": "FFmpeg errors", + "Raw FFmpeg report": "Raw FFmpeg report", "Subtitle Preview": "Subtitle Preview", "One-Click Transcribe": "One-Click Transcribe", "Transcribing...": "Transcribing...", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 539a6d1..321af09 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -223,6 +223,48 @@ "Directory cleared": "目录清理完成", "Directory does not exist": "目录不存在", "Failed to clear directory": "清理目录失败", + "FFmpeg Engine Detection": "FFmpeg 引擎检测", + "FFmpeg Engine": "FFmpeg 引擎", + "FFmpeg Engine Help": "选择当前应用优先使用的 ffmpeg 可执行文件;会自动发现整合包运行时和本机 PATH 中的 ffmpeg", + "No FFmpeg engines found": "未发现可用 FFmpeg 引擎", + "Custom FFmpeg Path": "自定义 FFmpeg 路径", + "Custom FFmpeg Path Help": "如果下拉框没有列出目标引擎,可以粘贴 ffmpeg 可执行文件的绝对路径", + "Current FFmpeg Engine": "当前生效引擎", + "Save FFmpeg Engine": "保存引擎", + "Test Selected FFmpeg": "检测所选 FFmpeg", + "Testing FFmpeg engine": "正在检测 FFmpeg 引擎...", + "FFmpeg engine saved": "FFmpeg 引擎已保存", + "Selected FFmpeg path is invalid": "所选 FFmpeg 路径无效", + "FFmpeg detection details": "FFmpeg 检测详情", + "FFmpeg source Configured": "已配置", + "FFmpeg source NarratoAI packaged runtime": "NarratoAI 整合包运行时", + "FFmpeg source Integrated runtime": "内置运行时", + "FFmpeg source System PATH": "系统 PATH", + "FFmpeg source 
Homebrew": "Homebrew", + "FFmpeg source Python environment": "Python 环境", + "FFmpeg source Python executable folder": "Python 可执行目录", + "FFmpeg source IMAGEIO_FFMPEG_EXE": "IMAGEIO_FFMPEG_EXE", + "FFmpeg source imageio-ffmpeg": "imageio-ffmpeg", + "FFmpeg source System": "系统路径", + "Version": "版本", + "Path": "路径", + "Available": "可用", + "Unavailable": "不可用", + "Hardware Acceleration": "硬件加速", + "Subtitle Burn-in": "字幕烧录", + "FFmpeg engine passed all checks": "FFmpeg 引擎检测通过:基础功能、硬件加速和字幕烧录均可用", + "FFmpeg engine works but hardware acceleration is unavailable": "FFmpeg 基础功能和字幕烧录可用,但硬件加速不可用,将使用软件编码", + "FFmpeg engine check failed": "FFmpeg 引擎检测失败", + "Hardware acceleration detail": "硬件加速详情", + "Subtitle burn-in detail": "字幕烧录详情", + "Type": "类型", + "Encoder": "编码器", + "Message": "信息", + "Method": "方式", + "Supported Hardware Methods": "支持的硬件加速方法", + "Subtitle Filters": "字幕滤镜", + "FFmpeg errors": "FFmpeg 错误", + "Raw FFmpeg report": "原始 FFmpeg 报告", "Subtitle Preview": "字幕预览", "One-Click Transcribe": "一键转录", "Transcribing...": "正在转录中...", From 7d4bd45f692df311cb29f828d330e7ab1828d5be Mon Sep 17 00:00:00 2001 From: viccy Date: Mon, 8 Jun 2026 16:02:20 +0800 Subject: [PATCH 22/24] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E7=94=9F=E6=88=90=E8=BF=9B=E5=BA=A6=E8=BF=BD=E8=B8=AA?= =?UTF-8?q?=E4=B8=8EWebUI=E5=B1=95=E7=A4=BA=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=89=AA=E6=98=A0=E5=AF=BC=E5=87=BA=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加FFmpeg视频合并进度回调支持,实现实时进度上报 - 为进度回调添加参数校验与异常捕获,避免回调失败影响主流程 - 重构任务状态更新逻辑,统一封装任务更新函数减少重复代码 - 重新划分视频生成的6个标准步骤,更新各阶段的状态上报逻辑 - 更新WebUI界面,支持展示当前生成步骤、总步骤与FFmpeg实时进度 - 优化剪映草稿导出功能,不再预先裁剪原视频,直接引用源素材时间戳 - 新增剪映草稿字幕生成功能,并补充对应的单元测试用例 --- app/services/generate_video.py | 36 ++- app/services/jianying_draft_builder.py | 255 ++++++++++++++++++-- app/services/jianying_task.py | 167 +++++++++++-- app/services/task.py | 109 ++++++--- 
app/services/test_jianying_task_unittest.py | 171 +++++++++++++ webui.py | 94 +++++++- 6 files changed, 762 insertions(+), 70 deletions(-) diff --git a/app/services/generate_video.py b/app/services/generate_video.py index 0d2c11d..1fe41fd 100644 --- a/app/services/generate_video.py +++ b/app/services/generate_video.py @@ -15,7 +15,7 @@ import subprocess import time import traceback import tempfile -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Callable from loguru import logger import numpy as np from moviepy import ( @@ -471,7 +471,23 @@ def _parse_ffmpeg_progress_time(progress: Dict[str, str]) -> float: return 0.0 -def _run_ffmpeg_with_progress(cmd: list[str], duration: float) -> tuple[int, str]: +def _emit_ffmpeg_progress( + progress_callback: Optional[Callable[[float], None]], + percent: float, +) -> None: + if not progress_callback: + return + try: + progress_callback(max(0.0, min(100.0, float(percent)))) + except Exception as e: + logger.debug(f"ffmpeg 进度回调失败: {e}") + + +def _run_ffmpeg_with_progress( + cmd: list[str], + duration: float, + progress_callback: Optional[Callable[[float], None]] = None, +) -> tuple[int, str]: progress_keys = { "frame", "fps", @@ -497,6 +513,7 @@ def _run_ffmpeg_with_progress(cmd: list[str], duration: float) -> tuple[int, str output_tail: list[str] = [] last_log_time = 0.0 last_logged_percent = -1.0 + _emit_ffmpeg_progress(progress_callback, 0) assert process.stdout is not None for raw_line in process.stdout: @@ -537,11 +554,14 @@ def _run_ffmpeg_with_progress(cmd: list[str], duration: float) -> tuple[int, str f"({_format_duration(current)}/{_format_duration(duration)}), " f"speed={speed}" ) + _emit_ffmpeg_progress(progress_callback, percent) last_log_time = now last_logged_percent = percent progress = {} return_code = process.wait() + if return_code == 0: + _emit_ffmpeg_progress(progress_callback, 100) return return_code, "\n".join(output_tail[-80:]) @@ -1264,6 +1284,7 @@ def 
_merge_materials_with_ffmpeg( subtitle_path: Optional[str] = None, bgm_path: Optional[str] = None, options: Optional[Dict[str, Any]] = None, + progress_callback: Optional[Callable[[float], None]] = None, ) -> bool: ffmpeg_binary = _get_ffmpeg_binary() if not _check_ffmpeg_binary(ffmpeg_binary): @@ -1285,7 +1306,11 @@ def _merge_materials_with_ffmpeg( f"video={video_path}, audio={audio_path}, output={output_path}, " f"duration={_format_duration(duration)}" ) - return_code, ffmpeg_output = _run_ffmpeg_with_progress(cmd, duration) + return_code, ffmpeg_output = _run_ffmpeg_with_progress( + cmd, + duration, + progress_callback=progress_callback, + ) if return_code != 0: logger.warning(f"ffmpeg 快速合并失败,将回退 MoviePy: {ffmpeg_output[-3000:]}") if os.path.exists(output_path): @@ -1315,7 +1340,8 @@ def merge_materials( output_path: str, subtitle_path: Optional[str] = None, bgm_path: Optional[str] = None, - options: Optional[Dict[str, Any]] = None + options: Optional[Dict[str, Any]] = None, + progress_callback: Optional[Callable[[float], None]] = None, ) -> str: """ 合并视频、音频、BGM和字幕素材生成最终视频 @@ -1342,6 +1368,7 @@ def merge_materials( - threads: 处理线程数,默认2 - fps: 输出帧率,默认30 - subtitle_enabled: 是否启用字幕,默认True + progress_callback: ffmpeg 快速合并进度回调,参数为 0-100 的百分比 返回: 输出视频的路径 @@ -1439,6 +1466,7 @@ def merge_materials( subtitle_path=subtitle_path, bgm_path=bgm_path, options=ffmpeg_options, + progress_callback=progress_callback, ): return output_path logger.warning("ffmpeg 快速合并失败,继续使用 MoviePy 兼容路径") diff --git a/app/services/jianying_draft_builder.py b/app/services/jianying_draft_builder.py index 3f00422..c998099 100644 --- a/app/services/jianying_draft_builder.py +++ b/app/services/jianying_draft_builder.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple from loguru import logger from app.models.schema import VideoClipParams +from app.services import script_subtitle MICROSECONDS = 1_000_000 @@ -567,6 +568,213 @@ def _create_audio_segment( } +def 
_normalize_hex_color(color: Optional[str], default: str = "#FFFFFF") -> str: + color = str(color or default).strip() + if not color.startswith("#"): + color = f"#{color}" + if re.fullmatch(r"#[0-9a-fA-F]{3}", color): + color = "#" + "".join(char * 2 for char in color[1:]) + if not re.fullmatch(r"#[0-9a-fA-F]{6}", color): + color = default + return color.upper() + + +def _hex_color_to_rgb_float(color: Optional[str], default: str = "#FFFFFF") -> Tuple[float, float, float]: + normalized = _normalize_hex_color(color, default) + return ( + int(normalized[1:3], 16) / 255, + int(normalized[3:5], 16) / 255, + int(normalized[5:7], 16) / 255, + ) + + +def _resolve_subtitle_text_size(params: VideoClipParams) -> float: + raw_size = getattr(params, "font_size", 60) or 60 + try: + font_size = float(raw_size) + except (TypeError, ValueError): + font_size = 60.0 + return max(4.0, min(10.0, font_size / 12.0)) + + +def _resolve_subtitle_transform_y(params: VideoClipParams) -> float: + subtitle_position = str(getattr(params, "subtitle_position", "bottom") or "bottom").lower() + if subtitle_position == "top": + return 0.82 + if subtitle_position == "center": + return 0.0 + if subtitle_position == "custom": + try: + y_percent = float(getattr(params, "custom_position", 85.0)) + except (TypeError, ValueError): + y_percent = 85.0 + y_percent = max(0.0, min(100.0, y_percent)) + return max(-0.92, min(0.92, 1.0 - 2.0 * (y_percent / 100.0))) + return -0.8 + + +def _create_text_material(text: str, params: VideoClipParams) -> Dict[str, Any]: + material_id = uuid.uuid4().hex + text = str(text or "") + text_color = _hex_color_to_rgb_float(getattr(params, "text_fore_color", "#FFFFFF"), "#FFFFFF") + stroke_color = _hex_color_to_rgb_float(getattr(params, "stroke_color", "#000000"), "#000000") + try: + stroke_width = float(getattr(params, "stroke_width", 1.5) or 0) + except (TypeError, ValueError): + stroke_width = 1.5 + + text_style = { + "fill": { + "alpha": 1.0, + "content": { + "render_type": 
"solid", + "solid": { + "alpha": 1.0, + "color": list(text_color), + }, + }, + }, + "range": [0, len(text)], + "size": _resolve_subtitle_text_size(params), + "bold": False, + "italic": False, + "underline": False, + "strokes": [], + } + check_flag = 7 + if stroke_width > 0: + text_style["strokes"] = [ + { + "content": { + "solid": { + "alpha": 1.0, + "color": list(stroke_color), + } + }, + "width": max(0.0, min(0.2, stroke_width / 100.0 * 0.2)), + } + ] + check_flag |= 8 + + return { + "id": material_id, + "content": json.dumps( + { + "styles": [text_style], + "text": text, + }, + ensure_ascii=False, + ), + "typesetting": 0, + "alignment": 1, + "letter_spacing": 0.0, + "line_spacing": 0.02, + "line_feed": 1, + "line_max_width": 0.82, + "force_apply_line_max_width": False, + "check_flag": check_flag, + "type": "subtitle", + "global_alpha": 1.0, + } + + +def _create_text_segment( + material_id: str, + start_us: int, + duration_us: int, + params: VideoClipParams, +) -> Dict[str, Any]: + return { + "id": uuid.uuid4().hex, + "material_id": material_id, + "target_timerange": {"start": start_us, "duration": duration_us}, + "source_timerange": None, + "speed": 1.0, + "volume": 1.0, + "extra_material_refs": [], + "is_tone_modify": False, + "clip": { + "alpha": 1.0, + "flip": {"horizontal": False, "vertical": False}, + "rotation": 0.0, + "scale": {"x": 1.0, "y": 1.0}, + "transform": {"x": 0.0, "y": _resolve_subtitle_transform_y(params)}, + }, + "uniform_scale": {"on": True, "value": 1.0}, + "render_index": 15000, + "common_keyframes": [], + } + + +def _parse_srt_entries(subtitle_path: str) -> List[Tuple[float, float, str]]: + if not subtitle_path or not os.path.exists(subtitle_path): + return [] + + with open(subtitle_path, "r", encoding="utf-8-sig") as f: + content = f.read().strip() + if not content: + return [] + + entries: List[Tuple[float, float, str]] = [] + for block in re.split(r"\n\s*\n", content): + lines = [line.strip() for line in block.splitlines() if 
line.strip()] + time_line_index = next( + (index for index, line in enumerate(lines) if "-->" in line), + None, + ) + if time_line_index is None or time_line_index + 1 >= len(lines): + continue + + try: + start_text, end_text = lines[time_line_index].split("-->", 1) + start = script_subtitle.parse_srt_like_time(start_text) + end = script_subtitle.parse_srt_like_time(end_text) + except Exception as e: + logger.warning(f"解析剪映字幕时间失败,跳过字幕块: {e}") + continue + + text = "\n".join(lines[time_line_index + 1:]).strip() + if end <= start or not text: + continue + entries.append((start, end, text)) + + return entries + + +def _add_subtitle_track_from_srt( + draft: Dict[str, Any], + subtitle_path: str, + params: VideoClipParams, +) -> int: + entries = _parse_srt_entries(subtitle_path) + if not entries: + return 0 + + text_track = _create_track("text", "字幕轨道") + text_track["is_default_name"] = False + max_end_us = 0 + for start, end, text in entries: + start_us = _seconds_to_microseconds(start) + duration_us = _seconds_to_microseconds(end - start) + if duration_us <= 0: + continue + + text_material = _create_text_material(text, params) + draft["materials"]["texts"].append(text_material) + text_track["segments"].append(_create_text_segment( + text_material["id"], + start_us, + duration_us, + params, + )) + max_end_us = max(max_end_us, start_us + duration_us) + + if text_track["segments"]: + draft["tracks"].append(text_track) + logger.info(f"已写入剪映字幕轨: {len(text_track['segments'])} 条, {subtitle_path}") + return max_end_us + + def _normalize_video_material(material: Dict[str, Any]) -> Dict[str, Any]: fallback_path = f"assets/video/{material.get('material_name') or 'source.mp4'}" result = { @@ -1313,6 +1521,7 @@ def write_plaintext_jianying_draft( new_script_list: List[Dict[str, Any]], params: VideoClipParams, output_dir: str, + subtitle_path: str = "", ) -> Tuple[str, str]: os.makedirs(jianying_draft_path, exist_ok=True) @@ -1332,13 +1541,16 @@ def write_plaintext_jianying_draft( 
metadata_cache: Dict[str, Tuple[int, int, int]] = {} used_asset_paths: Set[str] = set() asset_path_cache: Dict[str, str] = {} + video_material_cache: Dict[str, Dict[str, Any]] = {} current_time_us = 0 for item in new_script_list: start_time = float(item.get("start_time", 0.0) or 0.0) + source_start_time = float(item.get("source_start_time", start_time) or 0.0) requested_duration = float(item.get("duration", 0.0) or 0.0) timestamp = item.get("timestamp", "") ost = int(item.get("OST", 0) or 0) + use_source_timerange = bool(item.get("use_source_timerange", False)) logger.info( f"处理片段: OST={ost}, start_time={start_time}, " @@ -1346,15 +1558,15 @@ def write_plaintext_jianying_draft( ) video_file = item.get("video", "") - use_clipped_video = bool(video_file and os.path.exists(video_file)) - if not use_clipped_video: + use_clipped_video = bool(video_file and os.path.exists(video_file) and not use_source_timerange) + if not use_clipped_video and not video_file: video_file = params.video_origin_path if not video_file or not os.path.exists(video_file): logger.warning(f"视频素材不存在,跳过片段: {video_file or timestamp}") continue - source_start_time = 0.0 if use_clipped_video else start_time + source_start_time = 0.0 if use_clipped_video else source_start_time video_duration = _clamp_duration_to_media( requested_duration, video_file, @@ -1381,23 +1593,32 @@ def write_plaintext_jianying_draft( continue segment_duration_us = _seconds_to_microseconds(segment_duration) - video_material_duration_us, width, height = _get_video_metadata_ffprobe(video_file, metadata_cache) - video_relative_path = _register_asset( - video_file, - draft_path, - "assets/video", - f"video_{len(video_track['segments']) + 1}.mp4", - used_asset_paths, - asset_path_cache, + video_material_key = os.path.abspath(video_file) + video_material = video_material_cache.get(video_material_key) + if video_material is None: + video_material_duration_us, width, height = _get_video_metadata_ffprobe(video_file, metadata_cache) + 
video_relative_path = _register_asset( + video_file, + draft_path, + "assets/video", + f"video_{len(video_material_cache) + 1}.mp4", + used_asset_paths, + asset_path_cache, + ) + video_material = _create_video_material(video_relative_path, video_material_duration_us, width, height) + draft["materials"]["videos"].append(video_material) + video_material_cache[video_material_key] = video_material + video_volume = ( + 0.0 + if ost == 0 + else float(getattr(params, "original_volume", 1.0) or 1.0) ) - video_material = _create_video_material(video_relative_path, video_material_duration_us, width, height) - draft["materials"]["videos"].append(video_material) video_track["segments"].append(_create_video_segment( video_material["id"], _seconds_to_microseconds(_floor_duration_to_milliseconds(source_start_time)), segment_duration_us, current_time_us, - float(getattr(params, "original_volume", 1.0) or 1.0), + video_volume, )) if ost in [0, 2] and audio_file and os.path.exists(audio_file): @@ -1428,10 +1649,14 @@ def write_plaintext_jianying_draft( if not video_track["segments"]: raise ValueError("没有可写入剪映草稿的视频片段") + subtitle_end_us = 0 + if getattr(params, "subtitle_enabled", True) and subtitle_path: + subtitle_end_us = _add_subtitle_track_from_srt(draft, subtitle_path, params) + first_video = draft["materials"]["videos"][0] draft["canvas_config"]["width"] = int(first_video.get("width", 1920) or 1920) draft["canvas_config"]["height"] = int(first_video.get("height", 1080) or 1080) - draft["duration"] = current_time_us + draft["duration"] = max(current_time_us, subtitle_end_us) draft["update_time"] = int(time.time() * MICROSECONDS) asset_size = sum( diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index a24304c..21e2c01 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -9,7 +9,7 @@ from loguru import logger from app.config import config from app.models import const from app.models.schema import VideoClipParams -from 
app.services import voice, clip_video, update_script +from app.services import voice, clip_video, script_subtitle from app.services.jianying_draft_builder import write_plaintext_jianying_draft from app.services import state as sm from app.utils import utils @@ -141,6 +141,141 @@ def _normalize_indextts_reference_audio(params: VideoClipParams) -> None: raise ValueError(f"{display_name} 参考音频不存在,请在音频设置中上传或选择有效的参考音频") +def _index_tts_results(tts_results: list[Dict]) -> Dict: + indexed = {} + for tts_result in tts_results or []: + item_id = tts_result.get("_id") + timestamp = tts_result.get("timestamp") + if item_id is not None: + indexed[item_id] = tts_result + if timestamp: + indexed[timestamp] = tts_result + return indexed + + +def _get_video_source_paths(params: VideoClipParams) -> list[str]: + return clip_video._normalize_video_origin_paths( + getattr(params, "video_origin_path", ""), + getattr(params, "video_origin_paths", []), + ) + + +def _resolve_script_video_path(item: Dict, video_source_paths: list[str]) -> str: + if not video_source_paths: + return "" + return clip_video._resolve_script_video_path(item, video_source_paths) + + +def _resolve_tts_result(item: Dict, tts_map: Dict) -> Dict: + item_id = item.get("_id") + timestamp = item.get("timestamp") + if item_id is not None and item_id in tts_map: + return tts_map[item_id] + if timestamp in tts_map: + return tts_map[timestamp] + return {} + + +def _build_jianying_draft_script( + list_script: list[Dict], + params: VideoClipParams, + tts_results: list[Dict], +) -> list[Dict]: + video_source_paths = _get_video_source_paths(params) + if not video_source_paths: + raise ValueError("视频文件不能为空") + + tts_map = _index_tts_results(tts_results) + draft_script = [] + accumulated_duration = 0.0 + + for item in list_script: + item_copy = dict(item) + timestamp = item_copy.get("timestamp", "") + try: + source_start, source_end = script_subtitle.parse_time_range(timestamp) + except ValueError as e: + 
logger.warning(f"解析剪映片段时间戳失败,跳过片段 {item_copy.get('_id')}: {e}") + continue + + timestamp_duration = _floor_duration_to_milliseconds(source_end - source_start) + if timestamp_duration <= 0: + logger.warning(f"剪映片段时长无效,跳过片段 {item_copy.get('_id')}: {timestamp}") + continue + + ost = int(item_copy.get("OST", 0) or 0) + tts_result = _resolve_tts_result(item_copy, tts_map) if ost in [0, 2] else {} + item_duration = timestamp_duration + if tts_result.get("duration"): + item_duration = _floor_duration_to_milliseconds(float(tts_result.get("duration") or 0.0)) + if item_duration <= 0: + item_duration = timestamp_duration + + item_copy.update({ + "video": _resolve_script_video_path(item_copy, video_source_paths), + "audio": tts_result.get("audio_file", ""), + "subtitle": tts_result.get("subtitle_file", ""), + "sourceTimeRange": timestamp, + "start_time": source_start, + "source_start_time": source_start, + "duration": item_duration, + "use_source_timerange": True, + "editedTimeRange": ( + f"{script_subtitle.format_srt_time(accumulated_duration)}-" + f"{script_subtitle.format_srt_time(accumulated_duration + item_duration)}" + ), + }) + accumulated_duration += item_duration + draft_script.append(item_copy) + + if not draft_script: + raise ValueError("没有可写入剪映草稿的视频片段") + + return draft_script + + +def _get_original_subtitle_paths(params: VideoClipParams) -> list[str]: + subtitle_paths = getattr(params, "original_subtitle_paths", []) or [] + if isinstance(subtitle_paths, str): + subtitle_paths = [subtitle_paths] + + normalized_paths = [] + seen = set() + for subtitle_path in subtitle_paths: + if not isinstance(subtitle_path, str): + continue + subtitle_path = subtitle_path.strip() + if subtitle_path and subtitle_path not in seen: + normalized_paths.append(subtitle_path) + seen.add(subtitle_path) + + single_subtitle_path = str(getattr(params, "original_subtitle_path", "") or "").strip() + if single_subtitle_path and single_subtitle_path not in seen: + normalized_paths.insert(0, 
single_subtitle_path) + + return normalized_paths + + +def _create_jianying_subtitle_file( + task_id: str, + draft_script: list[Dict], + params: VideoClipParams, +) -> str: + if not getattr(params, "subtitle_enabled", True): + return "" + + try: + return script_subtitle.create_script_subtitle_file( + task_id=task_id, + list_script=draft_script, + original_subtitle_paths=_get_original_subtitle_paths(params), + video_origin_paths=_get_video_source_paths(params), + ) + except Exception as e: + logger.warning(f"剪映草稿字幕生成失败,将导出无字幕草稿: {e}") + return "" + + def start_export_jianying_draft(task_id: str, params: VideoClipParams): """ 导出到剪映草稿的后台任务 @@ -200,23 +335,15 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20) """ - 3. 统一视频裁剪 - 基于OST类型的差异化裁剪策略 + 3. 准备剪映草稿时间线 - 直接引用原视频素材和源时间戳 """ - logger.info("\n\n## 3. 统一视频裁剪(基于OST类型)") - video_clip_result = clip_video.clip_video_unified( - video_origin_path=params.video_origin_path, - video_origin_paths=getattr(params, "video_origin_paths", []), - script_list=list_script, - tts_results=tts_results - ) + logger.info("\n\n## 3. 
准备剪映草稿时间线(不裁剪视频)") + new_script_list = _build_jianying_draft_script(list_script, params, tts_results) + subtitle_path = _create_jianying_subtitle_file(task_id, new_script_list, params) - tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results} - subclip_clip_result = { - tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results - } - new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result) - - logger.info(f"统一裁剪完成,处理了 {len(video_clip_result)} 个视频片段") + logger.info(f"剪映草稿时间线准备完成,处理了 {len(new_script_list)} 个视频片段") + if subtitle_path: + logger.info(f"剪映草稿字幕文件: {subtitle_path}") sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60) @@ -245,15 +372,19 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): new_script_list=new_script_list, params=params, output_dir=output_dir, + subtitle_path=subtitle_path, ) logger.success(f"成功导出到剪映草稿: {draft_name}") logger.info(f"草稿已保存到: {draft_path}") # 更新任务状态 - sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, draft_path=draft_path, draft_name=draft_name) + task_kwargs = {"draft_path": draft_path, "draft_name": draft_name} + if subtitle_path: + task_kwargs["subtitles"] = [subtitle_path] + sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **task_kwargs) - return {"draft_path": draft_path, "draft_name": draft_name} + return task_kwargs except Exception as e: logger.error(f"导出到剪映草稿失败: {e}") import traceback diff --git a/app/services/task.py b/app/services/task.py index d7aa1c9..b23a5b9 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -24,6 +24,38 @@ from app.services import state as sm from app.utils import utils +VIDEO_GENERATION_TOTAL_STEPS = 6 + + +def _update_video_generation_task( + task_id: str, + progress: int, + message: str, + step_current: int = 0, + ffmpeg_progress: float | None = None, + 
state: int = const.TASK_STATE_PROCESSING, + **kwargs, +) -> None: + task_fields = { + "message": message, + "step_current": step_current, + "step_total": VIDEO_GENERATION_TOTAL_STEPS, + **kwargs, + } + if ffmpeg_progress is not None: + task_fields["ffmpeg_progress"] = round( + max(0.0, min(100.0, float(ffmpeg_progress))), + 1, + ) + + sm.state.update_task( + task_id, + state=state, + progress=progress, + **task_fields, + ) + + def _is_auto_transcription_enabled(params: VideoClipParams) -> bool: return bool( getattr(params, "subtitle_enabled", True) @@ -583,22 +615,22 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): global merged_audio_path, merged_subtitle_path logger.info(f"\n\n## 开始统一视频处理任务: {task_id}") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=0, message="正在初始化视频生成任务", + step_current=0, ) """ 1. 加载剪辑脚本 """ logger.info("\n\n## 1. 加载视频脚本") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=5, message="正在加载剪辑脚本", + step_current=1, ) video_script_path = path.join(params.video_clip_json_path) @@ -625,11 +657,11 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): 2. 使用 TTS 生成音频素材 """ logger.info("\n\n## 2. 根据OST设置生成音频列表") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=10, message="正在生成 TTS 配音", + step_current=2, ) # 只为OST=0 or 2的判断生成音频, OST=0 仅保留解说 OST=2 保留解说和原声 tts_segments = [ @@ -647,22 +679,22 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): voice_pitch=params.voice_pitch, ) - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=20, message="TTS 配音生成完成", + step_current=2, ) """ 3. 统一视频裁剪 - 基于OST类型的差异化裁剪策略 """ logger.info("\n\n## 3. 
统一视频裁剪(基于OST类型)") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=30, message="正在按脚本裁剪视频片段", + step_current=3, ) # 使用新的统一裁剪策略 @@ -682,22 +714,22 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): logger.info(f"统一裁剪完成,处理了 {len(video_clip_result)} 个视频片段") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=60, message="视频片段裁剪完成", + step_current=3, ) """ 4. 合并音频和字幕 """ logger.info("\n\n## 4. 合并音频和字幕") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=65, message="正在合并配音和字幕", + step_current=4, ) total_duration = sum([script["duration"] for script in new_script_list]) if tts_segments: @@ -750,11 +782,11 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): ) except Exception as e: logger.warning(f"程序化字幕生成失败: {e}") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=70, message="配音和字幕合并完成", + step_current=4, ) """ @@ -765,11 +797,11 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4") logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=75, message="正在合并视频片段", + step_current=5, ) # 使用统一裁剪后的视频片段 @@ -790,11 +822,11 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): video_aspect=params.video_aspect, threads=params.n_threads ) - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=80, message="视频片段合并完成", + step_current=5, ) """ @@ -810,11 +842,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): else output_video_path ) logger.info(f"\n\n## 6. 
最后一步: 合并字幕/BGM/配音/视频 -> {merge_output_video_path}") - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=85, message="正在合成最终视频", + step_current=6, + ffmpeg_progress=0, ) bgm_path = utils.get_bgm_file( @@ -858,30 +891,47 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): 'threads': params.n_threads, **_build_subtitle_mask_options(params, enabled=not auto_transcription_enabled), } + final_merge_progress_start = 85 + final_merge_progress_end = 89 if auto_transcription_enabled else 99 + + def update_final_merge_progress(ffmpeg_progress: float): + progress_span = final_merge_progress_end - final_merge_progress_start + overall_progress = final_merge_progress_start + int( + round((max(0.0, min(100.0, float(ffmpeg_progress))) / 100) * progress_span) + ) + _update_video_generation_task( + task_id, + progress=overall_progress, + message="正在合成最终视频", + step_current=6, + ffmpeg_progress=ffmpeg_progress, + ) + generate_video.merge_materials( video_path=combined_video_path, audio_path=merged_audio_path, subtitle_path=merged_subtitle_path, bgm_path=bgm_path, output_path=merge_output_video_path, - options=options + options=options, + progress_callback=update_final_merge_progress, ) auto_subtitle_path = "" if auto_transcription_enabled: - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=90, message="正在自动转录最终视频", + step_current=6, ) logger.info("\n\n## 7. 自动转录最终视频字幕") auto_subtitle_path = _transcribe_final_video(task_id, merge_output_video_path, params) - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_PROCESSING, progress=95, message="正在压入自动转录字幕", + step_current=6, ) logger.info(f"\n\n## 8. 
压入自动转录字幕 -> {output_video_path}") _merge_auto_transcribed_subtitles( @@ -902,11 +952,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): } if auto_subtitle_path: kwargs["subtitles"] = [auto_subtitle_path] - sm.state.update_task( + _update_video_generation_task( task_id, - state=const.TASK_STATE_COMPLETE, progress=100, message="视频生成完成", + step_current=VIDEO_GENERATION_TOTAL_STEPS, + state=const.TASK_STATE_COMPLETE, **kwargs ) return kwargs diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py index 0a1660f..e977242 100644 --- a/app/services/test_jianying_task_unittest.py +++ b/app/services/test_jianying_task_unittest.py @@ -193,6 +193,177 @@ class JianyingTaskTests(unittest.TestCase): self.assertEqual("NarratoAI_test", root_meta["all_draft_store"][0]["draft_name"]) self.assertEqual(str(draft_dir / "draft_info.json"), root_meta["all_draft_store"][0]["draft_json_file"]) + def test_write_plaintext_jianying_draft_uses_source_timerange_and_writes_subtitles(self): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) / "drafts" + output_dir = Path(temp_dir) / "task" + root_path.mkdir() + output_dir.mkdir() + video_path = output_dir / "source.mp4" + audio_path = output_dir / "audio_00_00_02,000-00_00_04,000.mp3" + subtitle_path = output_dir / "script_subtitles.srt" + video_path.write_bytes(b"fake source video") + audio_path.write_bytes(b"fake audio") + subtitle_path.write_text( + "1\n00:00:00,000 --> 00:00:01,500\n测试字幕\n", + encoding="utf-8", + ) + + params = VideoClipParams( + video_origin_path=str(video_path), + original_volume=0.4, + tts_volume=0.9, + subtitle_enabled=True, + font_size=60, + text_fore_color="#FFFFFF", + ) + script = [ + { + "OST": 0, + "start_time": 2.0, + "source_start_time": 2.0, + "duration": 3.0, + "timestamp": "00:00:02,000-00:00:05,000", + "video": str(video_path), + "audio": str(audio_path), + "use_source_timerange": True, + } + ] + + def 
fake_duration(file_path): + return 10.0 if file_path == str(video_path) else 3.0 + + with ( + patch.object(jianying_draft_builder, "_get_media_duration_ffprobe", side_effect=fake_duration), + patch.object( + jianying_draft_builder, + "_get_video_metadata_ffprobe", + return_value=(10_000_000, 1920, 1080), + ), + ): + draft_path, _ = jianying_draft_builder.write_plaintext_jianying_draft( + str(root_path), + "NarratoAI_source", + script, + params, + str(output_dir), + subtitle_path=str(subtitle_path), + ) + + draft_info = json.loads((Path(draft_path) / "draft_info.json").read_text(encoding="utf-8")) + self.assertEqual(1, len(draft_info["materials"]["videos"])) + self.assertEqual(1, len(draft_info["materials"]["texts"])) + self.assertIn("测试字幕", draft_info["materials"]["texts"][0]["content"]) + + video_segment = draft_info["tracks"][0]["segments"][0] + self.assertEqual(2_000_000, video_segment["source_timerange"]["start"]) + self.assertEqual(3_000_000, video_segment["source_timerange"]["duration"]) + self.assertEqual(0.0, video_segment["volume"]) + + text_tracks = [track for track in draft_info["tracks"] if track["type"] == "text"] + self.assertEqual(1, len(text_tracks)) + self.assertEqual(1, len(text_tracks[0]["segments"])) + self.assertEqual(1_500_000, text_tracks[0]["segments"][0]["target_timerange"]["duration"]) + + def test_build_jianying_draft_script_references_original_video(self): + with tempfile.TemporaryDirectory() as temp_dir: + video_one = Path(temp_dir) / "one.mp4" + video_two = Path(temp_dir) / "two.mp4" + audio_path = Path(temp_dir) / "audio.mp3" + video_one.write_bytes(b"one") + video_two.write_bytes(b"two") + audio_path.write_bytes(b"audio") + + params = VideoClipParams( + video_origin_path=str(video_one), + video_origin_paths=[str(video_one), str(video_two)], + ) + script = [ + { + "_id": 9, + "video_id": 2, + "timestamp": "00:00:05,000-00:00:07,000", + "narration": "解说", + "OST": 0, + } + ] + tts_results = [ + { + "_id": 9, + "timestamp": 
"00:00:05,000-00:00:07,000", + "audio_file": str(audio_path), + "subtitle_file": "", + "duration": 1.25, + } + ] + + draft_script = jianying_task._build_jianying_draft_script(script, params, tts_results) + + self.assertEqual(str(video_two), draft_script[0]["video"]) + self.assertEqual(str(audio_path), draft_script[0]["audio"]) + self.assertEqual(5.0, draft_script[0]["source_start_time"]) + self.assertEqual(1.25, draft_script[0]["duration"]) + self.assertTrue(draft_script[0]["use_source_timerange"]) + + def test_start_export_jianying_draft_does_not_clip_video(self): + with tempfile.TemporaryDirectory() as temp_dir: + root_path = Path(temp_dir) / "drafts" + task_dir = Path(temp_dir) / "task" + root_path.mkdir() + task_dir.mkdir() + video_path = Path(temp_dir) / "source.mp4" + audio_path = task_dir / "audio.mp3" + script_path = Path(temp_dir) / "script.json" + subtitle_path = task_dir / "script_subtitles.srt" + video_path.write_bytes(b"video") + audio_path.write_bytes(b"audio") + script_path.write_text( + json.dumps([ + { + "_id": 1, + "timestamp": "00:00:01,000-00:00:03,000", + "narration": "测试解说", + "OST": 0, + } + ], ensure_ascii=False), + encoding="utf-8", + ) + + params = VideoClipParams( + video_clip_json_path=str(script_path), + video_origin_path=str(video_path), + tts_engine="edge_tts", + voice_name="zh-CN-YunjianNeural", + subtitle_enabled=True, + draft_name="NarratoAI_no_clip", + ) + tts_results = [ + { + "_id": 1, + "timestamp": "00:00:01,000-00:00:03,000", + "audio_file": str(audio_path), + "subtitle_file": "", + "duration": 1.5, + } + ] + + with ( + patch.dict(jianying_task.config.ui, {"jianying_draft_path": str(root_path)}, clear=False), + patch.object(jianying_task.utils, "task_dir", return_value=str(task_dir)), + patch.object(jianying_task.voice, "tts_multiple", return_value=tts_results), + patch.object(jianying_task, "_create_jianying_subtitle_file", return_value=str(subtitle_path)), + patch.object(jianying_task, "write_plaintext_jianying_draft", 
return_value=(str(root_path / "draft"), "NarratoAI_no_clip")) as write_draft, + patch.object(jianying_task.clip_video, "clip_video_unified") as clip_video_unified, + ): + result = jianying_task.start_export_jianying_draft("task-id", params) + + clip_video_unified.assert_not_called() + write_kwargs = write_draft.call_args.kwargs + self.assertTrue(write_kwargs["new_script_list"][0]["use_source_timerange"]) + self.assertEqual(str(audio_path), write_kwargs["new_script_list"][0]["audio"]) + self.assertEqual(str(subtitle_path), write_kwargs["subtitle_path"]) + self.assertEqual(str(subtitle_path), result["subtitles"][0]) + if __name__ == "__main__": unittest.main() diff --git a/webui.py b/webui.py index bf9dd71..7897fbb 100644 --- a/webui.py +++ b/webui.py @@ -10,6 +10,7 @@ from webui.components import basic_settings, video_settings, audio_settings, sub # from webui.utils import cache, file_utils from app.utils import utils from app.utils import ffmpeg_utils +from app.models import const from app.models.schema import VideoClipParams, VideoAspect @@ -129,6 +130,77 @@ def tr(key): return loc.get("Translation", {}).get(key, key) +VIDEO_GENERATION_STEP_LABELS = [ + "正在加载剪辑脚本", + "正在生成 TTS 配音", + "正在按脚本裁剪视频片段", + "正在合并配音和字幕", + "正在合并视频片段", + "正在合成最终视频", +] + + +def _safe_int(value, default=0): + try: + return int(value) + except (TypeError, ValueError): + return default + + +def _format_optional_percent(value): + try: + percent = max(0.0, min(100.0, float(value))) + except (TypeError, ValueError): + return None + if percent.is_integer(): + return str(int(percent)) + return f"{percent:.1f}" + + +def _render_generation_status(task: dict | None) -> str: + task = task or {} + state = task.get("state") + current_step = _safe_int(task.get("step_current"), 0) + step_total = _safe_int(task.get("step_total"), len(VIDEO_GENERATION_STEP_LABELS)) + message = str(task.get("message") or "") + ffmpeg_percent = _format_optional_percent(task.get("ffmpeg_progress")) + + if current_step <= 0: + 
return f"
{escape(message or '正在生成视频,请稍候...')}
" + + lines = [] + for index, default_label in enumerate(VIDEO_GENERATION_STEP_LABELS, start=1): + is_current = index == current_step + is_complete = state == const.TASK_STATE_COMPLETE + is_done = is_complete or index < current_step + label = message if is_current and message else default_label + + suffix = f"{index}/{step_total}" + if ( + is_current + and index == step_total + and ffmpeg_percent is not None + and not is_complete + ): + suffix = f"{suffix},ffmpeg {ffmpeg_percent}%" + + color = "#262730" if is_current else "#8b9099" if is_done else "#b9bec7" + weight = "650" if is_current else "500" + lines.append( + "
" + f"{escape(label)} ({escape(suffix)})" + "
" + ) + + return "".join(lines) + + def get_help_text(): """返回带当前项目版本号的帮助文案""" return tr("Get Help").replace("🎉🎉🎉", f" v{config.project_version}") @@ -198,7 +270,12 @@ def render_generate_button(): progress_bar = st.progress(0) status_panel = st.status(tr("Generating Video"), expanded=True) - status_panel.write(tr("Generating Video")) + with status_panel: + status_placeholder = st.empty() + status_placeholder.markdown( + _render_generation_status(None), + unsafe_allow_html=True, + ) def run_task(): try: @@ -238,10 +315,19 @@ def render_generate_button(): # 更新进度条和阶段状态 progress_bar.progress(progress / 100) current_message = task.get("message") or f"Processing... {progress}%" - status_label = f"{current_message} ({progress}%)" - status_key = (state, progress, current_message) + status_key = ( + state, + progress, + current_message, + task.get("step_current"), + task.get("step_total"), + task.get("ffmpeg_progress"), + ) if status_key != last_status_key: - status_panel.write(status_label) + status_placeholder.markdown( + _render_generation_status(task), + unsafe_allow_html=True, + ) last_status_key = status_key if state == const.TASK_STATE_COMPLETE: From f6bda521b21d455b05d475af6f36b497b281cb46 Mon Sep 17 00:00:00 2001 From: viccy Date: Mon, 8 Jun 2026 16:23:10 +0800 Subject: [PATCH 23/24] =?UTF-8?q?feat(webui,=20jianying):=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E8=87=AA=E5=8A=A8=E5=AD=97=E5=B9=95=E5=8C=B9=E9=85=8D?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E5=B9=B6=E4=BF=AE=E5=A4=8Dwebui=E7=8A=B6?= =?UTF-8?q?=E6=80=81=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 为剪映任务模块新增自动根据视频文件名匹配对应字幕文件的逻辑,当未传入原始字幕路径时自动查找并选择最新的匹配字幕 - 修复webui脚本设置页的selectbox状态同步问题,改用session_state作为唯一状态源,避免同时传递index和key导致的冲突 - 更新webui脚本路径的特殊路径判断列表,新增MODE_FILE的特殊情况处理 - 新增两个单元测试用例验证自动字幕匹配和原片字幕导入功能 --- app/services/jianying_task.py | 58 +++++++++++++++++++++ app/services/test_jianying_task_unittest.py | 57 ++++++++++++++++++++ 
webui/components/script_settings.py | 13 +++-- 3 files changed, 124 insertions(+), 4 deletions(-) diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index 21e2c01..941a2c8 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -1,5 +1,6 @@ import json import os +import re import subprocess import time from os import path @@ -253,9 +254,66 @@ def _get_original_subtitle_paths(params: VideoClipParams) -> list[str]: if single_subtitle_path and single_subtitle_path not in seen: normalized_paths.insert(0, single_subtitle_path) + if not normalized_paths: + normalized_paths = _find_original_subtitle_paths_for_videos(_get_video_source_paths(params)) + return normalized_paths +def _video_stem_candidates(video_path: str) -> list[str]: + stem = path.splitext(path.basename(str(video_path or "").strip()))[0] + if not stem: + return [] + + candidates = [stem] + timestamp_stripped = re.sub(r"_[0-9]{14}$", "", stem) + if timestamp_stripped and timestamp_stripped not in candidates: + candidates.append(timestamp_stripped) + return candidates + + +def _find_original_subtitle_paths_for_videos(video_paths: list[str]) -> list[str]: + subtitle_dir = utils.subtitle_dir() + if not path.isdir(subtitle_dir): + return [] + + subtitle_files = [ + path.join(subtitle_dir, filename) + for filename in os.listdir(subtitle_dir) + if filename.lower().endswith(".srt") + ] + if not subtitle_files: + return [] + + resolved_paths = [] + seen = set() + for video_path in video_paths: + candidates = _video_stem_candidates(video_path) + if not candidates: + continue + + matches = [] + for subtitle_path in subtitle_files: + subtitle_stem = path.splitext(path.basename(subtitle_path))[0] + for candidate in candidates: + if subtitle_stem == candidate or subtitle_stem.startswith(f"{candidate}_"): + matches.append(subtitle_path) + break + + if not matches: + continue + + matches.sort(key=lambda item: path.getmtime(item), reverse=True) + selected_path = matches[0] + 
if selected_path not in seen: + resolved_paths.append(selected_path) + seen.add(selected_path) + + if resolved_paths: + logger.info(f"剪映导出未从参数获取原片字幕,已按视频文件名自动匹配: {resolved_paths}") + return resolved_paths + + def _create_jianying_subtitle_file( task_id: str, draft_script: list[Dict], diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py index e977242..d66afb9 100644 --- a/app/services/test_jianying_task_unittest.py +++ b/app/services/test_jianying_task_unittest.py @@ -305,6 +305,63 @@ class JianyingTaskTests(unittest.TestCase): self.assertEqual(1.25, draft_script[0]["duration"]) self.assertTrue(draft_script[0]["use_source_timerange"]) + def test_get_original_subtitle_paths_falls_back_to_matching_video_name(self): + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + video_path = temp_path / "episode_20260608010240.mp4" + older_subtitle = temp_path / "episode_fun_asr_20260608000100.srt" + newer_subtitle = temp_path / "episode_fun_asr_20260608010100.srt" + video_path.write_bytes(b"video") + older_subtitle.write_text("old", encoding="utf-8") + newer_subtitle.write_text("new", encoding="utf-8") + + params = VideoClipParams(video_origin_path=str(video_path)) + + with patch.object(jianying_task.utils, "subtitle_dir", return_value=str(temp_path)): + subtitle_paths = jianying_task._get_original_subtitle_paths(params) + + self.assertEqual([str(newer_subtitle)], subtitle_paths) + + def test_create_jianying_subtitle_file_includes_original_audio_subtitles(self): + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + task_dir = temp_path / "task" + task_dir.mkdir() + video_path = temp_path / "episode.mp4" + subtitle_path = temp_path / "episode.srt" + video_path.write_bytes(b"video") + subtitle_path.write_text( + "1\n00:00:05,000 --> 00:00:06,500\n原片对白\n", + encoding="utf-8", + ) + + params = VideoClipParams(video_origin_path=str(video_path), subtitle_enabled=True) + 
draft_script = jianying_task._build_jianying_draft_script( + [ + { + "_id": 1, + "timestamp": "00:00:05,000-00:00:07,000", + "narration": "播放原片1", + "OST": 1, + } + ], + params, + [], + ) + + with ( + patch.object(jianying_task.utils, "subtitle_dir", return_value=str(temp_path)), + patch.object(jianying_task.utils, "task_dir", return_value=str(task_dir)), + ): + output_path = jianying_task._create_jianying_subtitle_file( + "task-id", + draft_script, + params, + ) + + self.assertTrue(output_path) + self.assertIn("原片对白", Path(output_path).read_text(encoding="utf-8")) + def test_start_export_jianying_draft_does_not_clip_video(self): with tempfile.TemporaryDirectory() as temp_dir: root_path = Path(temp_dir) / "drafts" diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 5d68dd6..42b68d9 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -448,7 +448,7 @@ def render_script_file(tr, params): # 如果当前path是特殊值(auto/short/summary/film_summary),则重置为空 saved_script_path = ( current_path - if current_path not in [MODE_AUTO, MODE_SHORT, MODE_SHORT_SUMMARY, MODE_FILM_SUMMARY] + if current_path not in [MODE_FILE, MODE_AUTO, MODE_SHORT, MODE_SHORT_SUMMARY, MODE_FILM_SUMMARY] else "" ) @@ -458,13 +458,18 @@ def render_script_file(tr, params): selected_index = i break - # 如果找到了保存的脚本,同步更新 selectbox 的 key 状态 - if saved_script_path and selected_index > 0: + # 用 session_state 作为 selectbox 的唯一来源,避免同时传默认 index 和设置 key 状态。 + if ( + "script_file_selection" not in st.session_state + or st.session_state["script_file_selection"] >= len(script_list) + ): + st.session_state["script_file_selection"] = selected_index + elif saved_script_path and selected_index > 0: st.session_state['script_file_selection'] = selected_index selected_script_index = st.selectbox( tr("Script Files"), - index=selected_index, + index=None, options=range(len(script_list)), format_func=lambda x: script_list[x][0], key="script_file_selection" 
From 25ae35484f0ffa5f678794793ca8a854345c318b Mon Sep 17 00:00:00 2001 From: viccy Date: Wed, 10 Jun 2026 00:04:20 +0800 Subject: [PATCH 24/24] =?UTF-8?q?feat:=20=E6=9B=B4=E6=96=B0=E6=96=87?= =?UTF-8?q?=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 +++ README.md | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a1c25bd..fd920b4 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,6 @@ tests/* !tests/test_script_service_documentary_unittest.py !tests/test_generate_narration_script_documentary_unittest.py !tests/test_generate_script_docu_unittest.py + +docs/reddit-community +docs/wechat-0.8 \ No newline at end of file diff --git a/README.md b/README.md index 7edab9b..03e4dac 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ NarratoAI 是一款自动化影视解说工具,基于 LLM 实现文案撰写 本项目仅供学习和研究使用,不得商用。如需商业授权,请联系作者。 ## 最新资讯 +- 2026.06.10 发布新版本 0.8.1,**大版本更新**,优化多个核心流程 - 2026.04.27 发布新版本 0.7.9,新增 **Fun-ASR一键转录字幕** - 2026.04.03 发布新版本 0.7.8,重构纪录片逐帧分析链路,统一共享服务并优化抽帧、缓存、视觉并发与文案生成流程 - 2026.03.27 发布新版本 0.7.7,出于安全考虑,已移除 LiteLLM 依赖,统一使用 OpenAI 兼容请求链路 @@ -100,7 +101,7 @@ _**1. NarratoAI 是一款完全免费的软件,近期在社交媒体(抖音,B - [X] 支持短剧解说 - [ ] 主角人脸匹配 - [ ] 支持根据口播,文案,视频素材自动匹配 -- [ ] 支持更多 TTS 引擎 +- [X] 支持更多 TTS 引擎 - [ ] ... ## 快速启动 🚀