From 342fc15f3bf3beb515655de540d9a259bc9cf337 Mon Sep 17 00:00:00 2001 From: viccy Date: Sun, 7 Jun 2026 01:24:32 +0800 Subject: [PATCH] =?UTF-8?q?feat(tts,search,video):=20=E6=96=B0=E5=A2=9EOmn?= =?UTF-8?q?iVoice=20TTS=E3=80=81=E8=81=94=E7=BD=91=E6=90=9C=E7=B4=A2?= =?UTF-8?q?=E4=B8=8E=E5=A4=9A=E8=A7=86=E9=A2=91=E5=89=AA=E8=BE=91=E6=94=AF?= =?UTF-8?q?=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增OmniVoice语音合成引擎全流程支持,包含配置项、WebUI界面与服务实现 集成Tavily联网搜索能力,支持短剧剧情分析前自动检索剧情背景信息 新增多视频源剪辑支持,完善脚本校验规则并重构剪辑逻辑适配多视频路径 重构LLM剧情分析Prompt,优化输出格式适配多场景与联网检索结果 调整streamlit版本至1.56.0修复兼容性问题 新增相关单元测试与多语言翻译,更新配置示例文件 --- app/config/config.py | 5 + app/config/defaults.py | 3 + app/services/clip_video.py | 114 +++++++-- app/services/jianying_task.py | 9 +- app/services/llm/unified_service.py | 15 +- app/services/llm/validators.py | 12 + .../short_drama_narration/plot_analysis.py | 105 +++++---- .../script_generation.py | 25 +- app/services/task.py | 2 + app/services/tavily_search.py | 116 +++++++++ app/services/test_jianying_task_unittest.py | 17 ++ ...est_multi_video_script_sources_unittest.py | 84 +++++++ app/services/voice.py | 165 ++++++++++++- app/utils/check_script.py | 17 ++ config.example.toml | 32 ++- requirements.txt | 2 +- webui.py | 7 + webui/components/audio_settings.py | 192 ++++++++++++++- webui/components/basic_settings.py | 27 +++ webui/components/script_settings.py | 170 +++++++++++-- webui/components/subtitle_settings.py | 2 +- webui/i18n/en.json | 42 ++++ webui/i18n/zh.json | 42 ++++ webui/tools/generate_short_summary.py | 223 +++++++++++++++++- 24 files changed, 1320 insertions(+), 108 deletions(-) create mode 100644 app/services/tavily_search.py create mode 100644 app/services/test_multi_video_script_sources_unittest.py diff --git a/app/config/config.py b/app/config/config.py index ae19945..de17645 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -13,8 +13,11 @@ INDEXTTS_ENGINE = "indextts" INDEXTTS_DISPLAY_NAME = "IndexTTS-1.5" INDEXTTS2_ENGINE = "indextts2" INDEXTTS2_DISPLAY_NAME = "IndexTTS-2" +OMNIVOICE_ENGINE = "omnivoice" +OMNIVOICE_DISPLAY_NAME = "OmniVoice" INDEXTTS_VOICE_PREFIX = f"{INDEXTTS_ENGINE}:" INDEXTTS2_VOICE_PREFIX = f"{INDEXTTS2_ENGINE}:" +OMNIVOICE_VOICE_PREFIX = f"{OMNIVOICE_ENGINE}:" def normalize_tts_engine_name(tts_engine: str) -> str: @@ -131,6 +134,7 @@ def save_config(): _cfg["fun_asr"] = fun_asr _cfg["indextts"] = indextts _cfg["indextts2"] = indextts2 + _cfg["omnivoice"] = omnivoice _cfg["doubaotts"] = doubaotts f.write(toml.dumps(_cfg)) @@ -148,6 +152,7 @@ tts_qwen = _cfg.get("tts_qwen", {}) fun_asr = _cfg.get("fun_asr", {}) indextts = _cfg.get("indextts", {}) indextts2 = _cfg.get("indextts2", {}) +omnivoice = _cfg.get("omnivoice", {}) doubaotts = _cfg.get("doubaotts", {}) hostname = socket.gethostname() diff --git a/app/config/defaults.py b/app/config/defaults.py index a001978..9f648fa 100644 --- a/app/config/defaults.py +++ b/app/config/defaults.py @@ -35,6 +35,9 @@ DEFAULT_LLM_APP_CONFIG = { "text_openai_model_name": DEFAULT_TEXT_OPENAI_MODEL_NAME, "text_openai_api_key": "", "text_openai_base_url": DEFAULT_OPENAI_COMPATIBLE_BASE_URL, + "tavily_api_key": "", + "tavily_search_depth": "basic", + "tavily_max_results": 5, } DEFAULT_LLM_APP_CONFIG.update(DEFAULT_LLM_GENERATION_APP_CONFIG) diff --git a/app/services/clip_video.py b/app/services/clip_video.py index 8455703..93f9ddd 100644 --- a/app/services/clip_video.py +++ b/app/services/clip_video.py @@ -32,6 +32,82 @@ def parse_timestamp(timestamp: str) -> tuple: return start_time, end_time +def _normalize_video_origin_paths( + video_origin_path: str, + video_origin_paths: Optional[List[str]] = None, +) -> List[str]: + paths = [] + if video_origin_paths: + paths.extend(video_origin_paths) + if video_origin_path: + paths.insert(0, video_origin_path) + + normalized_paths = [] + seen = set() + for item in paths: + if not isinstance(item, str): + continue + item = item.strip() + if not item or item in seen: + continue + normalized_paths.append(item) + seen.add(item) + return normalized_paths + + +def _coerce_video_id(value) -> Optional[int]: + try: + video_id = int(value) + except (TypeError, ValueError): + return None + return video_id if video_id > 0 else None + + +def _match_video_id_by_name(video_name: str, video_origin_paths: List[str]) -> Optional[int]: + video_name = str(video_name or "").strip() + if not video_name: + return None + + expected_name = os.path.basename(video_name) + for index, video_path in enumerate(video_origin_paths, start=1): + if os.path.basename(video_path) == expected_name: + return index + return None + + +def _resolve_script_video_path(script_item: Dict, video_origin_paths: List[str]) -> str: + explicit_path = ( + script_item.get("source_video_path") + or script_item.get("video_origin_path") + or script_item.get("origin_video_path") + ) + if explicit_path and os.path.exists(explicit_path): + return explicit_path + + video_id = _coerce_video_id(script_item.get("video_id") or script_item.get("video_index")) + matched_video_id = _match_video_id_by_name( + script_item.get("video_name") or script_item.get("source_video"), + video_origin_paths, + ) + if matched_video_id: + video_id = matched_video_id + + if video_id is not None: + if video_id <= len(video_origin_paths): + return video_origin_paths[video_id - 1] + logger.warning( + f"片段 {script_item.get('_id')} 的 video_id={video_id} 超出视频数量 " + f"{len(video_origin_paths)},默认使用第一个视频" + ) + + return video_origin_paths[0] + + +def _safe_output_id(value) -> str: + safe_value = str(value if value is not None else "unknown") + return "".join(char if char.isalnum() or char in ("-", "_") else "_" for char in safe_value) + + def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str: """ 根据开始时间和持续时间计算结束时间 @@ -579,7 +655,7 @@ def _process_narration_only_segment( # 生成输出文件名 safe_start_time = start_time.replace(':', '-').replace(',', '-') safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-') - output_filename = f"ost0_vid_{safe_start_time}@{safe_end_time}.mp4" + output_filename = f"ost0_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4" output_path = os.path.join(output_dir, output_filename) # 构建FFmpeg命令 - 移除音频 @@ -622,7 +698,7 @@ def _process_original_audio_segment( # 生成输出文件名 safe_start_time = start_time.replace(':', '-').replace(',', '-') safe_end_time = end_time.replace(':', '-').replace(',', '-') - output_filename = f"ost1_vid_{safe_start_time}@{safe_end_time}.mp4" + output_filename = f"ost1_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4" output_path = os.path.join(output_dir, output_filename) # 构建FFmpeg命令 - 保持原声 @@ -674,7 +750,7 @@ def _process_mixed_segment( # 生成输出文件名 safe_start_time = start_time.replace(':', '-').replace(',', '-') safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-') - output_filename = f"ost2_vid_{safe_start_time}@{safe_end_time}.mp4" + output_filename = f"ost2_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4" output_path = os.path.join(output_dir, output_filename) # 构建FFmpeg命令 - 保持原声 @@ -782,28 +858,34 @@ def clip_video_unified( script_list: List[Dict], tts_results: List[Dict], output_dir: Optional[str] = None, - task_id: Optional[str] = None + task_id: Optional[str] = None, + video_origin_paths: Optional[List[str]] = None ) -> Dict[str, str]: """ 基于OST类型的统一视频裁剪策略 - 消除双重裁剪问题 Args: - video_origin_path: 原始视频的路径 + video_origin_path: 原始视频的路径;旧脚本或无 video_id 片段默认使用该视频 script_list: 完整的脚本列表,包含所有片段信息 tts_results: TTS结果列表,仅包含OST=0和OST=2的片段 output_dir: 输出目录路径,默认为None时会自动生成 task_id: 任务ID,用于生成唯一的输出目录,默认为None时会自动生成 + video_origin_paths: 多个原始视频路径,脚本片段可用 video_id/video_name 指定来源 Returns: Dict[str, str]: 片段ID到裁剪后视频路径的映射 """ - # 检查视频文件是否存在 - if not os.path.exists(video_origin_path): - raise FileNotFoundError(f"视频文件不存在: {video_origin_path}") + video_source_paths = _normalize_video_origin_paths(video_origin_path, video_origin_paths) + if not video_source_paths: + raise FileNotFoundError("视频文件不存在: 未提供原始视频路径") + + missing_video_paths = [item for item in video_source_paths if not os.path.exists(item)] + if missing_video_paths: + raise FileNotFoundError(f"视频文件不存在: {', '.join(missing_video_paths)}") # 如果未提供task_id,则根据输入生成一个唯一ID if task_id is None: - content_for_hash = f"{video_origin_path}_{json.dumps(script_list)}" + content_for_hash = f"{json.dumps(video_source_paths, ensure_ascii=False)}_{json.dumps(script_list, ensure_ascii=False)}" task_id = hashlib.md5(content_for_hash.encode()).hexdigest() # 设置输出目录 @@ -840,29 +922,33 @@ def clip_video_unified( failed_clips = [] success_count = 0 - logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段") + logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段,源视频{len(video_source_paths)}个") for i, script_item in enumerate(script_list, 1): _id = script_item.get("_id") ost = script_item.get("OST", 0) timestamp = script_item["timestamp"] + source_video_path = _resolve_script_video_path(script_item, video_source_paths) - logger.info(f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, 时间戳:{timestamp}") + logger.info( + f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, " + f"视频:{os.path.basename(source_video_path)}, 时间戳:{timestamp}" + ) try: if ost == 0: # 纯解说片段 output_path = _process_narration_only_segment( - video_origin_path, script_item, tts_map, output_dir, + source_video_path, script_item, tts_map, output_dir, encoder_config, hwaccel_args ) elif ost == 1: # 纯原声片段 output_path = _process_original_audio_segment( - video_origin_path, script_item, output_dir, + source_video_path, script_item, output_dir, encoder_config, hwaccel_args ) elif ost == 2: # 解说+原声混合片段 output_path = _process_mixed_segment( - video_origin_path, script_item, tts_map, output_dir, + source_video_path, script_item, tts_map, output_dir, encoder_config, hwaccel_args ) else: diff --git a/app/services/jianying_task.py b/app/services/jianying_task.py index 345f6b7..a24304c 100644 --- a/app/services/jianying_task.py +++ b/app/services/jianying_task.py @@ -107,7 +107,7 @@ def _clamp_duration_to_media( def _normalize_indextts_reference_audio(params: VideoClipParams) -> None: - """Ensure IndexTTS engines use the configured reference audio instead of a stale UI voice.""" + """Ensure local clone TTS engines use configured reference audio instead of a stale UI voice.""" params.tts_engine = config.normalize_tts_engine_name(params.tts_engine) if params.tts_engine == config.INDEXTTS_ENGINE: tts_config = config.indextts @@ -117,6 +117,12 @@ def _normalize_indextts_reference_audio(params: VideoClipParams) -> None: tts_config = config.indextts2 voice_prefix = config.INDEXTTS2_VOICE_PREFIX display_name = "IndexTTS-2" + elif params.tts_engine == config.OMNIVOICE_ENGINE: + tts_config = config.omnivoice + if tts_config.get("mode", "auto") != "voice_clone": + return + voice_prefix = config.OMNIVOICE_VOICE_PREFIX + display_name = "OmniVoice" else: return @@ -199,6 +205,7 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams): logger.info("\n\n## 3. 统一视频裁剪(基于OST类型)") video_clip_result = clip_video.clip_video_unified( video_origin_path=params.video_origin_path, + video_origin_paths=getattr(params, "video_origin_paths", []), script_list=list_script, tts_results=tts_results ) diff --git a/app/services/llm/unified_service.py b/app/services/llm/unified_service.py index 0c31b5a..63cc48f 100644 --- a/app/services/llm/unified_service.py +++ b/app/services/llm/unified_service.py @@ -12,6 +12,7 @@ from loguru import logger from .manager import LLMServiceManager from .validators import OutputValidator from .exceptions import LLMServiceError +from app.services.prompts import PromptManager # 提供商注册由 webui.py:main() 显式调用(见 LLM 提供商注册机制重构) # 这样更可靠,错误也更容易调试 @@ -181,12 +182,20 @@ class UnifiedLLMService: LLMServiceError: 服务调用失败时抛出 """ try: - # 构建分析提示词 - system_prompt = "你是一位专业的剧本分析师和剧情概括助手。请仔细分析字幕内容,提取关键剧情信息。" + prompt = PromptManager.get_prompt( + category="short_drama_narration", + name="plot_analysis", + parameters={"subtitle_content": subtitle_content}, + ) + prompt_object = PromptManager.get_prompt_object( + category="short_drama_narration", + name="plot_analysis", + ) + system_prompt = prompt_object.get_system_prompt() # 生成分析结果 result = await UnifiedLLMService.generate_text( - prompt=subtitle_content, + prompt=prompt, system_prompt=system_prompt, provider=provider, temperature=temperature, diff --git a/app/services/llm/validators.py b/app/services/llm/validators.py index 1614e14..1ef30e2 100644 --- a/app/services/llm/validators.py +++ b/app/services/llm/validators.py @@ -113,6 +113,8 @@ class OutputValidator: "required": ["_id", "timestamp", "picture", "narration"], "properties": { "_id": {"type": "number"}, + "video_id": {"type": "number"}, + "video_name": {"type": "string"}, "timestamp": {"type": "string"}, "picture": {"type": "string"}, "narration": {"type": "string"}, @@ -161,6 +163,16 @@ class OutputValidator: item_id = item.get("_id") if not isinstance(item_id, (int, float)) or item_id <= 0: raise ValidationError(f"第{index+1}项ID必须为正整数: {item_id}", "invalid_id") + + video_id = item.get("video_id") + if video_id not in (None, "") and ( + not isinstance(video_id, (int, float)) or video_id <= 0 + ): + raise ValidationError(f"第{index+1}项video_id必须为正整数: {video_id}", "invalid_video_id") + + video_name = item.get("video_name") + if video_name not in (None, "") and not isinstance(video_name, str): + raise ValidationError(f"第{index+1}项video_name必须为字符串: {video_name}", "invalid_video_name") @staticmethod def validate_subtitle_analysis(output: str) -> str: diff --git a/app/services/prompts/short_drama_narration/plot_analysis.py b/app/services/prompts/short_drama_narration/plot_analysis.py index 0f8ffb1..a50dbe7 100644 --- a/app/services/prompts/short_drama_narration/plot_analysis.py +++ b/app/services/prompts/short_drama_narration/plot_analysis.py @@ -19,72 +19,79 @@ class PlotAnalysisPrompt(TextPrompt): metadata = PromptMetadata( name="plot_analysis", category="short_drama_narration", - version="v1.0", - description="分析短剧字幕内容,提供详细的剧情分析和分段解析", + version="v1.1", + description="结合字幕和可选联网检索上下文,输出适合短剧解说脚本生成的结构化剧情理解", model_type=ModelType.TEXT, output_format=OutputFormat.TEXT, - tags=["短剧", "剧情分析", "字幕解析", "分段分析"], + tags=["短剧", "剧情分析", "字幕解析", "分段分析", "联网检索", "解说脚本素材"], parameters=["subtitle_content"] ) super().__init__(metadata) - self._system_prompt = "你是一位专业的剧本分析师和剧情概括助手。" + self._system_prompt = "你是一位专业的短剧解说策划和剧本分析师。请输出克制、结构化、可直接供下游解说脚本生成使用的剧情理解材料。" def get_template(self) -> str: return """# 角色 -你是一位专业的剧本分析师和剧情概括助手。 +你是一位专业的短剧解说策划和剧本分析师。你的输出不是给观众看的成片文案,而是给下游“短剧解说脚本生成器”使用的结构化剧情理解材料。 -# 任务 -我将为你提供一部短剧的完整字幕文本。请你基于这些字幕,完成以下任务: -1. **整体剧情分析**:简要概括整个短剧的核心剧情脉络、主要冲突和结局(如果有的话)。 -2. **分段剧情解析与时间戳定位**: - * 将整个短剧划分为若干个关键的剧情段落(例如:开端、发展、转折、高潮、结局,或根据具体情节自然划分)。 - * 段落数应该与字幕长度成正比。 - * 对于每一个剧情段落: - * **概括该段落的主要内容**:用简洁的语言描述这段剧情发生了什么。 - * **标注对应的时间戳范围**:明确指出该剧情段落对应的开始字幕时间戳和结束字幕时间戳。请直接从字幕中提取时间信息。 +# 输入说明 +下面的输入可能只包含一个视频的原始字幕,也可能包含多个视频文件的字幕;也可能同时包含 Tavily 联网检索结果和原始字幕。 +- 联网检索结果只能用于辅助识别短剧名称、人物关系、时代背景、公开剧情梗概。 +- 原始字幕是唯一可信的当前片段事实来源。 +- 如果联网检索结果与字幕冲突,必须以字幕为准。 +- 如果联网检索结果包含当前字幕尚未出现的后续剧情,只能放在“字幕未覆盖/需谨慎信息”中,不能写进当前剧情事实。 +- 多个视频字幕会以“视频 1: 文件名”“视频 2: 文件名”等标题分隔。时间戳均为对应视频内部时间,不是拼接后的累计时间。 -# 输入格式 -字幕内容通常包含时间戳和对话,例如: -``` -00:00:05,000 --> 00:00:10,000 -[角色A]: 你好吗? -00:00:10,500 --> 00:00:15,000 -[角色B]: 我很好,谢谢。发生了一些有趣的事情。 -... (更多字幕内容) ... -``` -我将把实际字幕粘贴在下方。 +# 核心任务 +请基于输入完成剧情理解,目标是帮助后续生成高质量短剧解说脚本: +1. 识别短剧名称、当前字幕范围、视频来源、联网检索辅助信息和字幕事实边界。 +2. 统一人物称呼,避免同一人物出现多个名字写法。 +3. 用 100-180 字概括当前字幕覆盖的剧情,不提前剧透字幕未出现的内容。 +4. 按视频来源和字幕时间顺序拆分关键剧情段落,并为每段标注准确 video_id / video_name / 时间戳。 +5. 提炼解说创作可用的钩子、冲突、爽点/泪点/悬念点和建议保留原声片段。 -# 输出格式要求 -请按照以下格式清晰地呈现分析结果: +# 强制输出规则 +1. 禁止输出寒暄、解释身份或“好的,我将……”等聊天式开场。 +2. 禁止编造字幕中没有的具体事件、对白、关系进展或结局。 +3. 时间戳必须直接来自对应视频字幕;无法确定时写“字幕未明确”,不要猜测。 +4. 多视频场景下必须明确每段来自哪个视频文件,禁止把不同视频的同名时间戳混在一起。 +5. 人名必须统一:优先采用联网检索中的正式名称;如果字幕写法不同,在人物表中保留“字幕称呼”。 +6. 内容要简洁、客观、可复用,避免散文化长段落。 +7. 必须严格按照下面的 Markdown 格式输出,不要添加额外章节。 -**一、整体剧情概括:** -[此处填写对整个短剧剧情的概括] +# 输出格式 +## 一、基础识别 +- 短剧名称:[如输入可判断则填写,否则写“未知”] +- 当前字幕范围:[开始时间戳] --> [结束时间戳];无法确定则写“字幕未明确” +- 视频来源:[列出视频编号、文件名和各自字幕时间范围;单视频也要写] +- 联网检索确认:[仅写可辅助理解的公开信息;没有联网结果则写“未启用/未提供”] +- 字幕内实际出现:[列出当前字幕真实出现的关键事实,2-4 条] +- 字幕未覆盖/需谨慎信息:[列出联网结果提到但当前字幕未发生的内容;没有则写“无”] -**二、分段剧情解析:** +## 二、人物与关系 +| 统一称呼 | 字幕称呼 | 身份/关系 | 当前剧情作用 | 确定性 | +|---|---|---|---|---| +| [人物名] | [字幕原文称呼] | [身份或关系] | [在当前片段中的作用] | 字幕明确/联网辅助/合理推断 | -**剧情段落 1:[段落主题/概括,例如:主角登场与背景介绍]** -* **时间戳:** [开始时间戳] --> [结束时间戳] -* **内容概要:** [对这段剧情的详细描述] +## 三、整体剧情概括 +[100-180 字,只概括当前字幕覆盖的剧情。必须包含核心冲突、人物动机和当前悬念。] -**剧情段落 2:[段落主题/概括,例如:第一个冲突出现]** -* **时间戳:** [开始时间戳] --> [结束时间戳] -* **内容概要:** [对这段剧情的详细描述] +## 四、分段剧情解析 +| 视频 | 时间戳 | 段落主题 | 剧情事件 | 情绪/冲突功能 | +|---|---|---|---|---| +| [video_id + video_name] | [开始] --> [结束] | [简短主题] | [当前段落发生了什么] | [铺垫/冲突升级/人物塑造/反转/悬念/情绪爆发等] | -... (根据实际剧情段落数量继续) ... +## 五、解说创作重点 +- 开场钩子:[用一句话指出最适合开场抓人的冲突或疑问] +- 核心冲突:[当前片段最主要的矛盾] +- 爽点/泪点/情绪点:[列 1-3 条,没有则写“无明显”] +- 悬念点:[当前片段留下的疑问或后续期待] +- 建议保留原声片段: + 1. [video_id + video_name + 时间戳]:[保留理由;如果没有合适原声,写“无明显”] -**剧情段落 N:[段落主题/概括,例如:结局与反思]** -* **时间戳:** [开始时间戳] --> [结束时间戳] -* **内容概要:** [对这段剧情的详细描述] +## 六、联网信息校验 +- 可用于辅助理解的信息:[联网结果中可帮助理解当前字幕的信息;没有则写“无”] +- 与字幕不一致或字幕未覆盖的信息:[必须列出,不要混入当前剧情事实;没有则写“无”] -# 注意事项 -* 请确保时间戳的准确性,直接引用字幕中的时间。 -* 剧情段落的划分应合乎逻辑,能够反映剧情的起承转合。 -* 语言表达应简洁、准确、客观。 - -# 限制 -1. 严禁输出与分析结果无关的内容 -2. 时间戳必须严格按照字幕中的实际时间 - -# 请处理以下字幕: +# 输入内容 ${subtitle_content}""" diff --git a/app/services/prompts/short_drama_narration/script_generation.py b/app/services/prompts/short_drama_narration/script_generation.py index 0184cb1..234fc98 100644 --- a/app/services/prompts/short_drama_narration/script_generation.py +++ b/app/services/prompts/short_drama_narration/script_generation.py @@ -43,11 +43,14 @@ class ScriptGenerationPrompt(ParameterizedPrompt): ${plot_analysis} -### 原始字幕(含精确时间戳) +### 原始字幕(含视频编号和精确时间戳) ${subtitle_content} +字幕可能来自多个视频文件。每个字幕分段标题会以“视频 1: 文件名”“视频 2: 文件名”等形式标识来源。 +生成脚本时必须把每个片段绑定到对应视频来源,时间戳表示该视频文件内部的局部时间,不是把多个视频拼接后的全局时间。 + ## 短剧解说创作核心要素 ### 1. 黄金开场(3秒法则) @@ -137,11 +140,18 @@ ${subtitle_content} ### 时间戳管理(绝对不能违反) - **时间戳绝对不能重叠**,确保剪辑后无重复画面 -- **时间段必须连续且不交叉**,严格按时间顺序排列 -- **每个时间戳都必须在原始字幕中找到对应范围** +- **同一个 video_id 内的时间段必须连续且不交叉**,严格按该视频内时间顺序排列 +- **跨视频可以切换 video_id**,但每个时间戳都必须来自对应视频字幕分段 +- **每个时间戳都必须在对应视频的原始字幕中找到对应范围** - 可以拆分原时间片段,但必须保持时间连续性 - 时间戳的格式必须与原始字幕中的格式完全一致 +### 多视频来源规范(多集/多文件必须遵守) +- **video_id**:必须填写,取字幕分段标题里的视频编号,例如“视频 3”就填 3 +- **video_name**:必须填写对应的视频文件名,例如“3_20260607002212.mp4” +- **timestamp**:只填写对应 video_id 内部的时间范围,不要换算成多个视频拼接后的累计时间 +- 如果剧情跨多个视频推进,脚本可以按故事顺序在不同 video_id 之间切换,但不得把视频 2 的时间戳写到 video_id=1 + ### 时长控制(1/3原则) - **解说视频总长度 = 原视频长度的 1/3** - 精确控制节奏和密度,既不能过短也不能过长 @@ -159,6 +169,8 @@ ${subtitle_content} ```json { "_id": 序号, + "video_id": 视频编号, + "video_name": "视频文件名", "timestamp": "开始时间-结束时间", "picture": "画面内容描述", "narration": "播放原片+序号", @@ -242,6 +254,8 @@ ${subtitle_content} "items": [ { "_id": 1, + "video_id": 1, + "video_name": "1.mp4", "timestamp": "00:00:01,000-00:00:05,500", "picture": "女主角林小雨慌张地道歉,男主角沈墨轩冷漠地看着她", "narration": "一个普通女孩的命运即将因为一杯咖啡彻底改变!她撞到的这个男人,竟然是...", @@ -249,6 +263,8 @@ ${subtitle_content} }, { "_id": 2, + "video_id": 1, + "video_name": "1.mp4", "timestamp": "00:00:05,500-00:00:08,000", "picture": "沈墨轩质问林小雨,语气冷厉威严", "narration": "播放原片2", @@ -256,6 +272,8 @@ ${subtitle_content} }, { "_id": 3, + "video_id": 2, + "video_name": "2.mp4", "timestamp": "00:00:08,000-00:00:12,000", "picture": "林小雨惊慌失措,沈墨轩眼中闪过一丝兴趣", "narration": "霸道总裁的经典开场!一杯咖啡引发的爱情故事就这样开始了...", @@ -281,6 +299,7 @@ ${subtitle_content} - **原声片段标识**:OST=1表示原声,OST=0表示解说 - **原声格式规范**:narration字段必须使用"播放原片+序号"格式 - **关键情绪点**:必须保留原片原声,增强观众代入感 +- **视频来源**:每个片段必须包含 video_id 和 video_name,用于定位多个上传视频中的源文件 - **时间戳精度**:精确到毫秒级别,确保与字幕完美匹配 - **逻辑连贯性**:严格遵循剧情发展顺序 diff --git a/app/services/task.py b/app/services/task.py index bf8c45b..28b05ea 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -225,6 +225,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di # 使用新的统一裁剪策略 video_clip_result = clip_video.clip_video_unified( video_origin_path=params.video_origin_path, + video_origin_paths=getattr(params, "video_origin_paths", []), script_list=list_script, tts_results=tts_results ) @@ -477,6 +478,7 @@ def start_subclip_unified(task_id: str, params: VideoClipParams): # 使用新的统一裁剪策略 video_clip_result = clip_video.clip_video_unified( video_origin_path=params.video_origin_path, + video_origin_paths=getattr(params, "video_origin_paths", []), script_list=list_script, tts_results=tts_results ) diff --git a/app/services/tavily_search.py b/app/services/tavily_search.py new file mode 100644 index 0000000..586a7ee --- /dev/null +++ b/app/services/tavily_search.py @@ -0,0 +1,116 @@ +"""Tavily-powered web search helpers for plot analysis.""" + +from __future__ import annotations + +import os +from typing import Any + +import requests +from loguru import logger + + +TAVILY_API_BASE_URL = "https://api.tavily.com" +DEFAULT_SEARCH_DEPTH = "basic" +DEFAULT_MAX_RESULTS = 5 +DEFAULT_TIMEOUT = 20 + + +class TavilySearchError(RuntimeError): + """Raised when Tavily search cannot be completed.""" + + +def _trim_text(value: Any, max_chars: int) -> str: + text = str(value or "").strip() + if len(text) <= max_chars: + return text + return f"{text[:max_chars].rstrip()}..." + + +def search_short_drama( + short_name: str, + api_key: str | None = None, + *, + search_depth: str = DEFAULT_SEARCH_DEPTH, + max_results: int = DEFAULT_MAX_RESULTS, + timeout: int = DEFAULT_TIMEOUT, +) -> dict[str, Any]: + """Search web context for a short drama name with Tavily.""" + short_name = str(short_name or "").strip() + if not short_name: + raise TavilySearchError("短剧名称不能为空") + + api_key = (api_key or os.getenv("TAVILY_API_KEY") or "").strip() + if not api_key: + raise TavilySearchError("Tavily API Key 未配置") + + query = f"{short_name} 短剧 剧情 介绍 人物 结局" + payload = { + "query": query, + "search_depth": search_depth or DEFAULT_SEARCH_DEPTH, + "topic": "general", + "max_results": max(1, min(int(max_results or DEFAULT_MAX_RESULTS), 10)), + "include_answer": True, + "include_raw_content": False, + "include_images": False, + } + + try: + response = requests.post( + f"{TAVILY_API_BASE_URL}/search", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json=payload, + timeout=timeout, + ) + except requests.RequestException as exc: + raise TavilySearchError(f"Tavily 请求失败: {exc}") from exc + + if response.status_code >= 400: + message = _trim_text(response.text, 500) + raise TavilySearchError(f"Tavily 请求失败: HTTP {response.status_code} {message}") + + try: + data = response.json() + except ValueError as exc: + raise TavilySearchError("Tavily 返回内容不是有效 JSON") from exc + + logger.info( + "Tavily 短剧检索完成: query={}, results={}", + query, + len(data.get("results") or []), + ) + return data + + +def format_search_context(search_data: dict[str, Any], *, max_chars: int = 6000) -> str: + """Format Tavily response into compact LLM context.""" + if not search_data: + return "" + + lines = [ + "# Tavily 联网检索结果", + f"检索 query: {search_data.get('query', '')}", + ] + + answer = _trim_text(search_data.get("answer"), 1200) + if answer: + lines.extend(["", "## 综合回答", answer]) + + results = search_data.get("results") or [] + if results: + lines.extend(["", "## 搜索来源"]) + for index, result in enumerate(results, start=1): + title = _trim_text(result.get("title"), 120) + url = _trim_text(result.get("url"), 240) + content = _trim_text(result.get("content") or result.get("raw_content"), 700) + lines.extend( + [ + f"{index}. 标题: {title}", + f" 来源: {url}", + f" 摘要: {content}", + ] + ) + + return _trim_text("\n".join(lines).strip(), max_chars) diff --git a/app/services/test_jianying_task_unittest.py b/app/services/test_jianying_task_unittest.py index 18897a4..0a1660f 100644 --- a/app/services/test_jianying_task_unittest.py +++ b/app/services/test_jianying_task_unittest.py @@ -51,6 +51,23 @@ class JianyingTaskTests(unittest.TestCase): self.assertEqual(f"indextts2:{ref_path}", params.voice_name) + def test_normalize_omnivoice_clone_uses_valid_param_reference(self): + with tempfile.NamedTemporaryFile(suffix=".wav") as ref: + params = VideoClipParams(tts_engine="omnivoice", voice_name=f"omnivoice:{ref.name}") + + with patch.dict(jianying_task.config.omnivoice, {"mode": "voice_clone"}, clear=False): + jianying_task._normalize_indextts_reference_audio(params) + + self.assertEqual(f"omnivoice:{ref.name}", params.voice_name) + + def test_normalize_omnivoice_auto_does_not_require_reference(self): + params = VideoClipParams(tts_engine="omnivoice", voice_name="omnivoice:auto") + + with patch.dict(jianying_task.config.omnivoice, {"mode": "auto", "reference_audio": ""}, clear=False): + jianying_task._normalize_indextts_reference_audio(params) + + self.assertEqual("omnivoice:auto", params.voice_name) + def test_normalize_indextts_requires_existing_reference_audio(self): params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural") diff --git a/app/services/test_multi_video_script_sources_unittest.py b/app/services/test_multi_video_script_sources_unittest.py new file mode 100644 index 0000000..dd6fce8 --- /dev/null +++ b/app/services/test_multi_video_script_sources_unittest.py @@ -0,0 +1,84 @@ +import json +import os +import tempfile +import unittest +from unittest import mock + +from app.services import clip_video +from app.utils import check_script + + +class TestMultiVideoScriptSources(unittest.TestCase): + def test_check_format_accepts_optional_video_source_fields(self): + script = [ + { + "_id": 1, + "video_id": 2, + "video_name": "2.mp4", + "timestamp": "00:00:00,000-00:00:03,000", + "picture": "画面", + "narration": "解说", + "OST": 0, + } + ] + + result = check_script.check_format(json.dumps(script, ensure_ascii=False)) + + self.assertTrue(result["success"]) + + def test_clip_video_unified_resolves_source_by_video_id_and_name(self): + with tempfile.TemporaryDirectory() as temp_dir: + video_1 = os.path.join(temp_dir, "1.mp4") + video_2 = os.path.join(temp_dir, "2.mp4") + for video_path in [video_1, video_2]: + with open(video_path, "wb") as file: + file.write(b"video") + + output_dir = os.path.join(temp_dir, "clips") + used_sources = [] + + def fake_process(source_video_path, script_item, output_dir_arg, *_args): + used_sources.append(source_video_path) + output_path = os.path.join(output_dir_arg, f"{script_item['_id']}.mp4") + with open(output_path, "wb") as file: + file.write(b"clip") + return output_path + + script_list = [ + { + "_id": 1, + "video_id": 2, + "timestamp": "00:00:00,000-00:00:03,000", + "picture": "视频2画面", + "narration": "播放原片1", + "OST": 1, + }, + { + "_id": 2, + "video_name": "1.mp4", + "timestamp": "00:00:03,000-00:00:06,000", + "picture": "视频1画面", + "narration": "播放原片2", + "OST": 1, + }, + ] + + with ( + mock.patch.object(clip_video, "check_hardware_acceleration", return_value=None), + mock.patch.object(clip_video, "_process_original_audio_segment", side_effect=fake_process), + ): + result = clip_video.clip_video_unified( + video_origin_path=video_1, + video_origin_paths=[video_1, video_2], + script_list=script_list, + tts_results=[], + output_dir=output_dir, + task_id="multi-video-test", + ) + + self.assertEqual([video_2, video_1], used_sources) + self.assertEqual({1, 2}, set(result.keys())) + + +if __name__ == "__main__": + unittest.main() diff --git a/app/services/voice.py b/app/services/voice.py index 2be5c87..476c2fe 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import re import json @@ -1298,6 +1300,10 @@ def tts( if tts_engine == config.INDEXTTS2_ENGINE: logger.info("分发到 IndexTTS-2") return indextts2_tts(text, voice_name, voice_file) + + if tts_engine == config.OMNIVOICE_ENGINE: + logger.info("分发到 OmniVoice") + return omnivoice_tts(text, voice_name, voice_file, speed=voice_rate) if tts_engine == "doubaotts": logger.info("分发到豆包语音 TTS") @@ -1783,7 +1789,11 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f voice_name = config.normalize_indextts_voice_prefix(parse_voice_name(voice_name)) output_dir = utils.task_dir(task_id) tts_results = [] - audio_extension = ".wav" if tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else ".mp3" + audio_extension = ".wav" if tts_engine in ( + config.INDEXTTS_ENGINE, + config.INDEXTTS2_ENGINE, + config.OMNIVOICE_ENGINE, + ) else ".mp3" for item in list_script: if item['OST'] != 1: @@ -1809,11 +1819,11 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"或者使用其他 tts 引擎") continue else: - # SoulVoice、Qwen3、IndexTTS、豆包语音 引擎不生成精确字幕文件 + # SoulVoice、Qwen3、IndexTTS、OmniVoice、豆包语音 引擎不生成精确字幕文件 if ( is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) - or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) + or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE, config.OMNIVOICE_ENGINE) or tts_engine == "doubaotts" ): # 获取实际音频文件的时长 @@ -2256,6 +2266,17 @@ def parse_indextts2_voice(voice_name: str) -> str: return voice_name +def parse_omnivoice_voice(voice_name: str) -> str: + """ + 解析 OmniVoice 语音名称 + 支持格式:omnivoice:reference_audio_path + 返回参考音频文件路径或模式名 + """ + if isinstance(voice_name, str) and voice_name.startswith(config.OMNIVOICE_VOICE_PREFIX): + return voice_name[len(config.OMNIVOICE_VOICE_PREFIX):] + return voice_name + + def indextts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: """ 使用 IndexTTS-1.5 API 进行零样本语音克隆 @@ -2493,3 +2514,141 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str) -> Union[SubMaker logger.error("IndexTTS-2 TTS 生成失败,已达到最大重试次数") return None + + +def _normalize_omnivoice_api_url(api_url: str) -> str: + api_url = (api_url or "http://127.0.0.1:7866/tts").strip() + if api_url.endswith("/tts"): + return api_url + if api_url.endswith("/tts/json"): + return f"{api_url[:-len('/tts/json')]}/tts" + return f"{api_url.rstrip('/')}/tts" + + +def _download_omnivoice_audio(response: requests.Response, api_url: str, voice_file: str, proxies: dict) -> bool: + content_type = response.headers.get("content-type", "").lower() + if "application/json" not in content_type: + with open(voice_file, "wb") as f: + f.write(response.content) + return os.path.getsize(voice_file) > 0 + + result = response.json() + audio_url = result.get("audio_url") if isinstance(result, dict) else "" + if not audio_url: + logger.error(f"OmniVoice API 响应中没有音频下载地址: {result}") + return False + + audio_response = requests.get(urljoin(api_url, audio_url), proxies=proxies, timeout=180) + if audio_response.status_code != 200: + logger.error(f"OmniVoice 音频下载失败: {audio_response.status_code} - {audio_response.text}") + return False + + with open(voice_file, "wb") as f: + f.write(audio_response.content) + return os.path.getsize(voice_file) > 0 + + +def _optional_omnivoice_generation_data(voice_speed: float) -> dict: + omnivoice_config = getattr(config, "omnivoice", {}) or {} + data = { + "speed": voice_speed or omnivoice_config.get("speed", 1.0), + } + + optional_fields = { + "num_step": omnivoice_config.get("num_step"), + "guidance_scale": omnivoice_config.get("guidance_scale"), + "duration": omnivoice_config.get("duration"), + } + for key, value in optional_fields.items(): + if value not in (None, ""): + data[key] = value + + for key in ("denoise", "postprocess_output", "preprocess_prompt"): + if key in omnivoice_config: + data[key] = str(bool(omnivoice_config.get(key))).lower() + + return data + + +def omnivoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]: + """ + 使用 OmniVoice-Pack FastAPI 服务进行语音合成。 + 支持自动音色、指令音色和参考音频克隆三种模式。 + """ + omnivoice_config = getattr(config, "omnivoice", {}) or {} + api_url = _normalize_omnivoice_api_url(omnivoice_config.get("api_url", "http://127.0.0.1:7866/tts")) + mode = omnivoice_config.get("mode", "auto") + language = (omnivoice_config.get("language", "zh") or "").strip() + instruct = (omnivoice_config.get("instruct", "") or "").strip() + ref_text = (omnivoice_config.get("ref_text", "") or "").strip() + parsed_voice = parse_omnivoice_voice(voice_name) + if mode != "voice_clone" and parsed_voice and os.path.isfile(parsed_voice): + mode = "voice_clone" + + reference_audio_path = "" + if mode == "voice_clone": + candidate = parsed_voice + if candidate and os.path.isfile(candidate): + reference_audio_path = candidate + else: + reference_audio_path = parse_omnivoice_voice(omnivoice_config.get("reference_audio", "") or "") + + if not reference_audio_path or not os.path.exists(reference_audio_path): + logger.error(f"OmniVoice 参考音频文件不存在: {reference_audio_path}") + return None + elif mode != "voice_design": + instruct = "" + + data = { + "text": text.strip(), + "language": language, + **_optional_omnivoice_generation_data(speed), + } + if mode == "voice_design" and instruct: + data["instruct"] = instruct + if mode == "voice_clone" and ref_text: + data["ref_text"] = ref_text + + proxies = _get_configured_proxies() + for attempt in range(3): + files = {} + try: + if reference_audio_path: + files["ref_audio"] = open(reference_audio_path, "rb") + + logger.info(f"第 {attempt + 1} 次调用 OmniVoice API: {api_url}, mode={mode}") + response = requests.post( + api_url, + files=files or None, + data=data, + proxies=proxies, + timeout=240, + ) + + if response.status_code == 200 and _download_omnivoice_audio(response, api_url, voice_file, proxies): + logger.info(f"OmniVoice 成功生成音频: {voice_file}, 大小: {os.path.getsize(voice_file)} 字节") + sub_maker = new_sub_maker() + duration = get_audio_duration_from_file(voice_file) + duration_ms = int(duration * 1000) if duration > 0 else max(1000, int(len(text) * 200)) + add_subtitle_event(sub_maker, 0, duration_ms * 10000, text) + return sub_maker + + logger.error(f"OmniVoice API 调用失败: {response.status_code} - {response.text}") + except requests.exceptions.Timeout: + logger.error(f"OmniVoice API 调用超时 (尝试 {attempt + 1}/3)") + except requests.exceptions.RequestException as e: + logger.error(f"OmniVoice API 网络错误: {str(e)} (尝试 {attempt + 1}/3)") + except Exception as e: + logger.error(f"OmniVoice TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)") + finally: + for file_obj in files.values(): + try: + file_obj.close() + except Exception: + pass + + if attempt < 2: + time.sleep(2) + + logger.error("OmniVoice TTS 生成失败,已达到最大重试次数") + return None diff --git a/app/utils/check_script.py b/app/utils/check_script.py index 9c745e6..0e6f692 100644 --- a/app/utils/check_script.py +++ b/app/utils/check_script.py @@ -57,6 +57,23 @@ def check_format(script_content: str) -> Dict[str, Any]: 'details': f'当前值: {clip["_id"]} (类型: {type(clip["_id"]).__name__})' } + # 验证可选视频来源字段。旧脚本可以不包含,新脚本用于多视频定位。 + if 'video_id' in clip and clip['video_id'] not in ("", None): + if not isinstance(clip['video_id'], int) or clip['video_id'] <= 0: + return { + 'success': False, + 'message': f'第{i+1}个片段的video_id必须是正整数', + 'details': f'当前值: {clip["video_id"]} (类型: {type(clip["video_id"]).__name__})' + } + + if 'video_name' in clip and clip['video_name'] not in ("", None): + if not isinstance(clip['video_name'], str): + return { + 'success': False, + 'message': f'第{i+1}个片段的video_name必须是字符串', + 'details': f'当前值: {clip["video_name"]} (类型: {type(clip["video_name"]).__name__})' + } + # 验证 timestamp 字段格式 timestamp_pattern = r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$' if not isinstance(clip['timestamp'], str) or not re.match(timestamp_pattern, clip['timestamp']): diff --git a/config.example.toml b/config.example.toml index 0b807e3..547724e 100644 --- a/config.example.toml +++ b/config.example.toml @@ -49,6 +49,12 @@ text_openai_max_tokens = 65536 text_openai_thinking_level = "auto" # auto/off/low/medium/high + # ===== Tavily 联网搜索配置 ===== + # 用于短剧剧情理解前,按短剧名称检索公开剧情/人物/分集信息 + tavily_api_key = "" # 获取地址:https://app.tavily.com + tavily_search_depth = "basic" # basic / advanced / fast / ultra-fast + tavily_max_results = 5 + # ===== API Keys 参考 ===== # 主流 LLM Providers API Key 获取地址: # @@ -171,6 +177,30 @@ repetition_penalty = 10.0 max_mel_tokens = 1500 +[omnivoice] + # OmniVoice-Pack 语音合成配置 + # 支持 OmniVoice-Pack FastAPI 接口:POST /tts + api_url = "http://127.0.0.1:7866/tts" + language = "zh" + + # 生成模式:auto / voice_design / voice_clone + mode = "auto" + instruct = "" + + # voice_clone 模式下使用,音色列表复用 IndexTTS-1.5 的资源目录 + reference_audio_source = "resource" + reference_audio = "" + ref_text = "" + + # 高级生成参数 + num_step = 32 + guidance_scale = 2.0 + speed = 1.0 + duration = "" + denoise = true + postprocess_output = true + preprocess_prompt = true + [doubaotts] # 豆包语音 TTS 配置 # 申请流程: @@ -189,7 +219,7 @@ silence_duration = 0.125 [ui] - # TTS引擎选择 (indextts, indextts2, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) + # TTS引擎选择 (indextts, indextts2, omnivoice, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech) tts_engine = "indextts" # Edge TTS 配置 diff --git a/requirements.txt b/requirements.txt index c6011de..be125ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ requests>=2.32.0 moviepy==2.1.1 edge-tts==7.2.7 -streamlit>=1.57.0 +streamlit==1.56.0 watchdog==6.0.0 loguru>=0.7.3 tomli>=2.2.1 diff --git a/webui.py b/webui.py index 68c24a7..5ba26a3 100644 --- a/webui.py +++ b/webui.py @@ -243,6 +243,12 @@ def get_voice_name_for_tts_engine(tts_engine: str) -> str: if reference_audio: return f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}" return config.ui.get('voice_name', '') + if tts_engine == config.OMNIVOICE_ENGINE: + mode = config.omnivoice.get('mode', 'auto') + reference_audio = config.omnivoice.get('reference_audio', '') + if mode == 'voice_clone' and reference_audio: + return f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}" + return f"{config.OMNIVOICE_VOICE_PREFIX}{mode}" if tts_engine == 'doubaotts': return config.ui.get('doubaotts_voice_type', 'BV700_streaming') if tts_engine == 'soulvoice': @@ -263,6 +269,7 @@ def get_jianying_export_params(draft_name=None) -> VideoClipParams: return VideoClipParams( video_clip_json_path=st.session_state['video_clip_json_path'], video_origin_path=st.session_state['video_origin_path'], + video_origin_paths=st.session_state.get('video_origin_paths', []), tts_engine=tts_engine, voice_name=voice_name, voice_rate=voice_rate, diff --git a/webui/components/audio_settings.py b/webui/components/audio_settings.py index c5ec08c..cab5413 100644 --- a/webui/components/audio_settings.py +++ b/webui/components/audio_settings.py @@ -40,6 +40,11 @@ BGM_RESOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/bgms-safe" BGM_TRACKS_JSON = os.path.join(BGM_RESOURCE_DIR, "tracks.json") BGM_UPLOAD_SUBDIR = "uploaded_bgms" BGM_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg") +LOCAL_TTS_ENGINES = { + config.INDEXTTS_ENGINE, + config.INDEXTTS2_ENGINE, + config.OMNIVOICE_ENGINE, +} def get_soulvoice_voices(): @@ -55,9 +60,10 @@ def get_soulvoice_voices(): def get_tts_engine_options(tr=lambda key: key): """获取TTS引擎选项""" - return { + engine_options = { config.INDEXTTS_ENGINE: config.INDEXTTS_DISPLAY_NAME, config.INDEXTTS2_ENGINE: config.INDEXTTS2_DISPLAY_NAME, + config.OMNIVOICE_ENGINE: config.OMNIVOICE_DISPLAY_NAME, "edge_tts": "Edge TTS", "qwen3_tts": tr("Tongyi Qwen3 TTS"), "tencent_tts": tr("Tencent Cloud TTS"), @@ -65,6 +71,25 @@ def get_tts_engine_options(tr=lambda key: key): "azure_speech": "Azure Speech Services" } + return { + engine: format_tts_engine_option(engine, display_name, tr) + for engine, display_name in engine_options.items() + } + + +def get_tts_engine_deployment_label(tts_engine, tr=lambda key: key): + """获取TTS引擎部署类型标签""" + if tts_engine in LOCAL_TTS_ENGINES: + return tr("Local Deployment") + + return tr("Cloud Service") + + +def format_tts_engine_option(tts_engine, display_name, tr=lambda key: key): + """格式化TTS引擎下拉显示名""" + deployment_label = get_tts_engine_deployment_label(tts_engine, tr) + return f"{display_name} [{deployment_label}]" + def get_tts_engine_descriptions(tr=lambda key: key): """获取TTS引擎详细描述""" @@ -105,6 +130,12 @@ def get_tts_engine_descriptions(tr=lambda key: key): "use_case": tr("IndexTTS2 use case"), "registration": None }, + config.OMNIVOICE_ENGINE: { + "title": config.OMNIVOICE_DISPLAY_NAME, + "features": tr("OmniVoice features"), + "use_case": tr("OmniVoice use case"), + "registration": None + }, "doubaotts": { "title": tr("Doubao TTS"), "features": tr("Doubao TTS features"), @@ -546,6 +577,8 @@ def render_tts_settings(tr): render_indextts_tts_settings(tr) elif selected_engine == config.INDEXTTS2_ENGINE: render_indextts2_tts_settings(tr) + elif selected_engine == config.OMNIVOICE_ENGINE: + render_omnivoice_tts_settings(tr) elif selected_engine == "doubaotts": render_doubaotts_settings(tr) @@ -1274,6 +1307,148 @@ def render_indextts2_tts_settings(tr): st.session_state['voice_pitch'] = 1.0 +def render_omnivoice_tts_settings(tr): + """渲染 OmniVoice TTS 设置""" + omnivoice_config = config.omnivoice + + api_url = st.text_input( + tr("API URL"), + value=omnivoice_config.get("api_url", "http://127.0.0.1:7866/tts"), + help=tr("OmniVoice API URL Help"), + ) + + language = st.text_input( + tr("OmniVoice Language Code"), + value=omnivoice_config.get("language", "zh"), + help=tr("OmniVoice Language Code Help"), + placeholder="zh", + ) + + mode_options = [ + ("auto", tr("OmniVoice Mode Auto")), + ("voice_design", tr("OmniVoice Mode Voice Design")), + ("voice_clone", tr("OmniVoice Mode Voice Clone")), + ] + mode_values = [item[0] for item in mode_options] + saved_mode = omnivoice_config.get("mode", "auto") + if saved_mode not in mode_values: + saved_mode = "auto" + + mode = mode_options[st.selectbox( + tr("OmniVoice Generation Mode"), + options=range(len(mode_options)), + index=mode_values.index(saved_mode), + format_func=lambda x: mode_options[x][1], + help=tr("OmniVoice Generation Mode Help"), + )][0] + + instruct = omnivoice_config.get("instruct", "") + reference_audio_source = omnivoice_config.get("reference_audio_source", "resource") + reference_audio = omnivoice_config.get("reference_audio", "") + ref_text = omnivoice_config.get("ref_text", "") + + if mode == "voice_design": + instruct = st.text_area( + tr("OmniVoice Instruct"), + value=instruct, + help=tr("OmniVoice Instruct Help"), + placeholder=tr("OmniVoice Instruct Placeholder"), + height=80, + ) + elif mode == "voice_clone": + reference_audio_source, reference_audio = render_indextts_reference_audio_selector( + tr, + omnivoice_config, + "omnivoice", + ) + ref_text = st.text_area( + tr("OmniVoice Reference Text"), + value=ref_text, + help=tr("OmniVoice Reference Text Help"), + placeholder=tr("OmniVoice Reference Text Placeholder"), + height=90, + ) + + with st.expander(tr("Advanced Parameters"), expanded=False): + col1, col2 = st.columns(2) + with col1: + num_step = st.slider( + "Num Step", + min_value=4, + max_value=64, + value=int(omnivoice_config.get("num_step", 32)), + step=1, + help=tr("OmniVoice Num Step Help"), + ) + guidance_scale = st.slider( + "Guidance Scale", + min_value=0.1, + max_value=10.0, + value=float(omnivoice_config.get("guidance_scale", 2.0)), + step=0.1, + help=tr("OmniVoice Guidance Scale Help"), + ) + voice_rate = st.slider( + tr("Voice Rate"), + min_value=0.5, + max_value=2.0, + value=float(omnivoice_config.get("speed", 1.0)), + step=0.1, + help=tr("Voice Rate Help 0.5-2.0"), + ) + with col2: + saved_duration = omnivoice_config.get("duration", "") + duration_value = float(saved_duration) if saved_duration not in (None, "") else 0.0 + duration = st.number_input( + tr("OmniVoice Duration"), + min_value=0.0, + max_value=120.0, + value=duration_value, + step=0.5, + help=tr("OmniVoice Duration Help"), + ) + denoise = st.checkbox( + tr("OmniVoice Denoise"), + value=bool(omnivoice_config.get("denoise", True)), + help=tr("OmniVoice Denoise Help"), + ) + postprocess_output = st.checkbox( + tr("OmniVoice Postprocess Output"), + value=bool(omnivoice_config.get("postprocess_output", True)), + help=tr("OmniVoice Postprocess Output Help"), + ) + preprocess_prompt = st.checkbox( + tr("OmniVoice Preprocess Prompt"), + value=bool(omnivoice_config.get("preprocess_prompt", True)), + help=tr("OmniVoice Preprocess Prompt Help"), + ) + + with st.expander(tr("OmniVoice Usage Instructions Title"), expanded=False): + st.markdown(tr("OmniVoice Usage Instructions")) + + config.omnivoice["api_url"] = api_url + config.omnivoice["language"] = language + config.omnivoice["mode"] = mode + config.omnivoice["instruct"] = instruct + config.omnivoice["reference_audio_source"] = reference_audio_source + config.omnivoice["reference_audio"] = reference_audio + config.omnivoice["ref_text"] = ref_text + config.omnivoice["num_step"] = num_step + config.omnivoice["guidance_scale"] = guidance_scale + config.omnivoice["speed"] = voice_rate + config.omnivoice["duration"] = duration if duration > 0 else "" + config.omnivoice["denoise"] = denoise + config.omnivoice["postprocess_output"] = postprocess_output + config.omnivoice["preprocess_prompt"] = preprocess_prompt + + if mode == "voice_clone" and reference_audio: + config.ui["voice_name"] = f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}" + else: + config.ui["voice_name"] = f"{config.OMNIVOICE_VOICE_PREFIX}{mode}" + st.session_state["voice_rate"] = voice_rate + st.session_state["voice_pitch"] = 1.0 + + def render_doubaotts_settings(tr): """渲染豆包语音 TTS 设置""" # AK 输入 @@ -1567,6 +1742,15 @@ def render_voice_preview_new(tr, selected_engine): voice_name = f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}" voice_rate = 1.0 # IndexTTS-2 使用自身生成参数 voice_pitch = 1.0 + elif selected_engine == config.OMNIVOICE_ENGINE: + mode = config.omnivoice.get("mode", "auto") + reference_audio = config.omnivoice.get("reference_audio", "") + if mode == "voice_clone" and reference_audio: + voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}" + else: + voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{mode}" + voice_rate = config.omnivoice.get("speed", 1.0) + voice_pitch = 1.0 elif selected_engine == "doubaotts": voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming") voice_name = voice_type @@ -1579,7 +1763,11 @@ def render_voice_preview_new(tr, selected_engine): with st.spinner(tr("Synthesizing Voice")): temp_dir = utils.storage_dir("temp", create=True) - audio_format = "audio/wav" if selected_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else "audio/mp3" + audio_format = "audio/wav" if selected_engine in ( + config.INDEXTTS_ENGINE, + config.INDEXTTS2_ENGINE, + config.OMNIVOICE_ENGINE, + ) else "audio/mp3" audio_extension = ".wav" if audio_format == "audio/wav" else ".mp3" audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}{audio_extension}") diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index a8185bc..1ea746c 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -260,6 +260,7 @@ def render_basic_settings(tr): with left_config_panel: render_language_settings(tr) render_proxy_settings(tr) + render_tavily_search_settings(tr) with middle_config_panel: render_vision_llm_settings(tr) # 视觉分析模型设置 @@ -345,6 +346,32 @@ def render_proxy_settings(tr): config.ui["jianying_draft_path"] = jianying_draft_path +def render_tavily_search_settings(tr): + """Render Tavily API key settings used by short drama web search.""" + st.subheader(tr("Tavily Search Settings")) + st.markdown( + f"{tr('API Key URL')}: " + "[https://app.tavily.com](https://app.tavily.com)" + ) + + tavily_api_key = st.text_input( + tr("Tavily API Key"), + value=config.app.get("tavily_api_key", ""), + type="password", + help=tr("Tavily API Key Help"), + key="tavily_api_key_input", + ) + + if update_app_config_if_changed("tavily_api_key", str(tavily_api_key or "").strip()): + try: + config.save_config() + st.session_state["tavily_api_key"] = str(tavily_api_key or "").strip() + st.success(tr("Tavily config saved")) + except Exception as e: + st.error(f"{tr('Failed to save config')}: {str(e)}") + logger.error(f"保存 Tavily 配置失败: {str(e)}") + + def test_vision_model_connection(api_key, base_url, model_name, provider, tr): """测试视觉模型连接 diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index 9b03457..d8b296e 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -17,7 +17,7 @@ from webui.tools.generate_script_short import generate_script_short from webui.tools.generate_short_summary import analyze_short_drama_plot, generate_script_short_sunmmary -SCRIPT_TABLE_BASE_COLUMNS = ["_id", "timestamp", "picture", "narration", "OST"] +SCRIPT_TABLE_BASE_COLUMNS = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"] VIDEO_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"] VIDEO_GLOB_PATTERNS = [f"*.{suffix}" for suffix in VIDEO_UPLOAD_TYPES] @@ -99,15 +99,24 @@ def _read_subtitle_file(path): return f.read() -def _build_combined_subtitle_content(subtitle_paths): +def _build_combined_subtitle_content(subtitle_paths, video_paths=None): sections = [] subtitle_contents = {} - for subtitle_path in subtitle_paths: + video_paths = _normalize_video_paths(video_paths) + for index, subtitle_path in enumerate(subtitle_paths, start=1): if not subtitle_path or not os.path.exists(subtitle_path): continue content = _read_subtitle_file(subtitle_path) subtitle_contents[subtitle_path] = content - sections.append(f"# {os.path.basename(subtitle_path)}\n{content}".strip()) + video_path = video_paths[index - 1] if index <= len(video_paths) else "" + if video_path: + header = ( + f"# 视频 {index}: {os.path.basename(video_path)}\n" + f"字幕文件: {os.path.basename(subtitle_path)}" + ) + else: + header = f"# 视频 {index}\n字幕文件: {os.path.basename(subtitle_path)}" + sections.append(f"{header}\n{content}".strip()) return "\n\n".join(sections), subtitle_contents @@ -120,7 +129,10 @@ def _selected_subtitle_paths(): def _set_subtitle_state(subtitle_paths): subtitle_paths = _normalize_video_paths(subtitle_paths) - subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths) + subtitle_content, subtitle_contents = _build_combined_subtitle_content( + subtitle_paths, + _selected_video_paths(), + ) st.session_state['subtitle_path'] = subtitle_paths[0] if subtitle_paths else None st.session_state['subtitle_paths'] = subtitle_paths st.session_state['subtitle_content'] = subtitle_content if subtitle_content else None @@ -128,6 +140,20 @@ def _set_subtitle_state(subtitle_paths): st.session_state['subtitle_file_processed'] = bool(subtitle_paths) +def _short_drama_plot_analysis_signature(subtitle_paths, video_theme, web_search_enabled, video_paths=None): + theme = str(video_theme or "").strip() if web_search_enabled else "" + return json.dumps( + { + "subtitle_paths": _normalize_video_paths(subtitle_paths), + "video_paths": _normalize_video_paths(video_paths), + "video_theme": theme, + "web_search_enabled": bool(web_search_enabled), + }, + ensure_ascii=False, + sort_keys=True, + ) + + def render_script_panel(tr): """渲染脚本配置面板""" with st.container(border=True): @@ -525,16 +551,71 @@ def short_drama_summary(tr): render_fun_asr_transcription(tr) render_subtitle_preview(tr) - current_subtitle_path = st.session_state.get('subtitle_path', '') - plot_analysis_source = st.session_state.get('short_drama_plot_analysis_subtitle_path') - if plot_analysis_source and plot_analysis_source != current_subtitle_path: - st.session_state['short_drama_plot_analysis'] = "" - st.session_state['short_drama_plot_analysis_subtitle_path'] = "" + current_subtitle_paths = _selected_subtitle_paths() + current_subtitle_path = current_subtitle_paths[0] if current_subtitle_paths else '' - name_cols = st.columns([4, 1.2], vertical_alignment="bottom") + st.markdown( + """ + + """, + unsafe_allow_html=True, + ) + + name_cols = st.columns([3.4, 1.1, 2], vertical_alignment="bottom") with name_cols[0]: video_theme = st.text_input(tr("短剧名称")) with name_cols[1]: + web_search_enabled = st.toggle( + tr("联网搜索"), + key="short_drama_web_search_enabled", + help=tr("Enable Web Search Help"), + disabled=not current_subtitle_path, + ) + with name_cols[2]: analyze_plot_clicked = st.button( tr("剧情理解"), key="short_drama_plot_analysis_button", @@ -543,17 +624,37 @@ def short_drama_summary(tr): ) st.session_state['video_theme'] = video_theme + current_signature = _short_drama_plot_analysis_signature( + current_subtitle_paths, + video_theme, + web_search_enabled, + _selected_video_paths(), + ) + saved_signature = st.session_state.get('short_drama_plot_analysis_signature') + legacy_source = st.session_state.get('short_drama_plot_analysis_subtitle_path') + if ( + (saved_signature and saved_signature != current_signature) + or (legacy_source and legacy_source != current_subtitle_path) + ): + st.session_state['short_drama_plot_analysis'] = "" + st.session_state['short_drama_plot_analysis_subtitle_path'] = "" + st.session_state['short_drama_plot_analysis_signature'] = "" + if analyze_plot_clicked: with st.spinner(tr("Analyzing plot...")): plot_analysis = analyze_short_drama_plot( - current_subtitle_path, + current_subtitle_paths, st.session_state.get('temperature', 0.7), tr, subtitle_content=st.session_state.get('subtitle_content', ''), + short_name=video_theme, + enable_web_search=web_search_enabled, + video_paths=_selected_video_paths(), ) if plot_analysis: st.session_state['short_drama_plot_analysis'] = plot_analysis st.session_state['short_drama_plot_analysis_subtitle_path'] = current_subtitle_path + st.session_state['short_drama_plot_analysis_signature'] = current_signature st.success(tr("Plot analysis completed")) if st.session_state.get('short_drama_plot_analysis'): @@ -575,7 +676,10 @@ def render_subtitle_preview(tr): subtitle_contents = {} if subtitle_paths and (not subtitle_content or not subtitle_contents): - subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths) + subtitle_content, subtitle_contents = _build_combined_subtitle_content( + subtitle_paths, + _selected_video_paths(), + ) st.session_state['subtitle_content'] = subtitle_content st.session_state['subtitle_contents'] = subtitle_contents @@ -724,7 +828,7 @@ def _normalize_script_table_value(column, value): if _is_blank_table_value(value): return "" - if column in {"_id", "OST"}: + if column in {"_id", "video_id", "OST"}: try: return int(value) except (TypeError, ValueError): @@ -783,6 +887,14 @@ def render_video_script_editor(tr): column_order=column_order, column_config={ "_id": st.column_config.NumberColumn(tr("Script Column ID"), step=1, format="%d", width=52), + "video_id": st.column_config.NumberColumn( + tr("Script Column Video ID"), + min_value=1, + step=1, + format="%d", + width=80, + ), + "video_name": st.column_config.TextColumn(tr("Script Column Video Name"), width=180), "timestamp": st.column_config.TextColumn(tr("Script Column Timestamp"), width=200), "picture": st.column_config.TextColumn(tr("Script Column Picture"), width=320), "narration": st.column_config.TextColumn(tr("Script Column Narration"), width=480), @@ -1057,7 +1169,10 @@ def render_fun_asr_transcription(tr): st.error(tr("Fun-ASR failed without subtitle file")) return - subtitle_content, subtitle_contents = _build_combined_subtitle_content(generated_paths) + subtitle_content, subtitle_contents = _build_combined_subtitle_content( + generated_paths, + media_paths, + ) if not subtitle_content.strip(): clear_fun_asr_subtitle_state() st.error(tr("Fun-ASR failed without subtitle file")) @@ -1112,20 +1227,35 @@ def render_script_buttons(tr, params): generate_script_short(tr, params, custom_clips) elif script_path == "summary": # 执行 短剧解说 脚本生成 - subtitle_path = st.session_state.get('subtitle_path') + subtitle_paths = _selected_subtitle_paths() + subtitle_path = subtitle_paths[0] if subtitle_paths else None video_theme = st.session_state.get('video_theme') temperature = st.session_state.get('temperature') + web_search_enabled = bool(st.session_state.get('short_drama_web_search_enabled', False)) + current_signature = _short_drama_plot_analysis_signature( + subtitle_paths, + video_theme, + web_search_enabled, + _selected_video_paths(), + ) plot_analysis = "" - if st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path: + if st.session_state.get('short_drama_plot_analysis_signature') == current_signature: + plot_analysis = st.session_state.get('short_drama_plot_analysis', '') + elif ( + not web_search_enabled + and st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path + ): plot_analysis = st.session_state.get('short_drama_plot_analysis', '') generate_script_short_sunmmary( params, - subtitle_path, + subtitle_paths, video_theme, temperature, tr, plot_analysis=plot_analysis, subtitle_content=st.session_state.get('subtitle_content', ''), + enable_web_search=web_search_enabled, + video_paths=_selected_video_paths(), ) else: load_script(tr, script_path) @@ -1172,6 +1302,8 @@ def save_script_with_validation(tr, video_clip_json_details): example_script = [ { "_id": 1, + "video_id": 1, + "video_name": "1.mp4", "timestamp": "00:00:00,600-00:00:07,559", "picture": "工地上,蔡晓艳奋力救人,场面混乱", "narration": "灾后重建,工地上险象环生!泼辣女工蔡晓艳挺身而出,救人第一!", @@ -1179,6 +1311,8 @@ def save_script_with_validation(tr, video_clip_json_details): }, { "_id": 2, + "video_id": 2, + "video_name": "2.mp4", "timestamp": "00:00:08,240-00:00:12,359", "picture": "领导视察,蔡晓艳不屑一顾", "narration": "播放原片4", diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py index f719d5e..96a7a7a 100644 --- a/webui/components/subtitle_settings.py +++ b/webui/components/subtitle_settings.py @@ -604,7 +604,7 @@ def render_font_settings(tr): def is_disabled_subtitle_settings(tts_engine:str)->bool: """是否禁用字幕设置""" - return tts_engine=="soulvoice" or tts_engine=="qwen3_tts" + return tts_engine=="soulvoice" or tts_engine=="qwen3_tts" or tts_engine==config.OMNIVOICE_ENGINE def render_position_settings(tr): """渲染位置设置""" diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 8e3356c..284d9a6 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -15,6 +15,8 @@ "Video script table help": "Edit the full script JSON as a table. You can add or delete rows; saving will validate and write the script file again.", "Raw JSON Preview": "Raw JSON Preview", "Script Column ID": "ID", + "Script Column Video ID": "Video", + "Script Column Video Name": "Video Name", "Script Column Timestamp": "Timestamp", "Script Column Picture": "Picture", "Script Column Narration": "Narration", @@ -286,7 +288,11 @@ "IndexTTS download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS2 features": "A locally or privately deployed IndexTTS-2 voice-cloning engine with emotion control and fuller generation parameters.", "IndexTTS2 use case": "Best for fixed voices, emotional narration, and local speech synthesis workflows that need finer sampling controls. Start the IndexTTS-2 API service before use.", + "OmniVoice features": "A locally or privately deployed OmniVoice-Pack multilingual TTS engine with automatic voice generation, voice design, and reference-audio cloning.", + "OmniVoice use case": "Best for local controllable multilingual narration, voice design, or reference-audio cloning. Start the OmniVoice-Pack API service before use.", "Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.", + "Local Deployment": "Local Deployment", + "Cloud Service": "Cloud Service", "Select TTS Engine": "Select TTS Engine", "Select TTS Engine Help": "Choose the text-to-speech engine you want to use.", "TTS Engine Details": "📋 {engine} Details", @@ -413,6 +419,16 @@ "Subtitle calibration succeeded for multiple files": "Subtitle calibration succeeded for {count} files: {files}", "Subtitle calibration failed": "Subtitle calibration failed", "Transcribed subtitles storage hint": "Previously transcribed subtitles are saved in {path}; drag a file from that folder to upload", + "Tavily Search Settings": "Tavily Web Search", + "Tavily API Key": "Tavily API Key", + "Tavily API Key Help": "Used for web search before short drama plot analysis. When Web Search is enabled, the app searches plot, character, and episode context by drama name, then combines it with subtitles.", + "Tavily config saved": "Tavily configuration saved", + "联网搜索": "Web Search", + "Enable Web Search Help": "When enabled, plot analysis searches the web with Tavily by short drama name before combining those results with subtitles.", + "Please configure Tavily API Key in Basic Settings": "Please configure the Tavily API Key in Basic Settings first", + "Please enter short drama name before web search": "Please enter the short drama name before enabling web search", + "Searching short drama with Tavily...": "Searching short drama context with Tavily...", + "Tavily search failed": "Tavily search failed", "剧情理解": "Plot Analysis", "剧情理解结果": "Plot Analysis Result", "Analyzing plot...": "Analyzing plot...", @@ -443,6 +459,30 @@ "API URL": "API URL", "IndexTTS API URL Help": "IndexTTS-1.5 API service URL", "IndexTTS2 API URL Help": "IndexTTS-2 API service URL. You can enter the service root or the full /tts endpoint.", + "OmniVoice API URL Help": "OmniVoice-Pack API service URL. You can enter the service root or the full /tts endpoint.", + "OmniVoice Language Code": "Synthesis Language", + "OmniVoice Language Code Help": "The language parameter sent to OmniVoice-Pack, such as zh or en.", + "OmniVoice Generation Mode": "Generation Mode", + "OmniVoice Generation Mode Help": "Automatic voice needs no extra fields; voice design uses an instruction; reference-audio cloning needs reference audio and matching text.", + "OmniVoice Mode Auto": "Automatic Voice", + "OmniVoice Mode Voice Design": "Voice Design", + "OmniVoice Mode Voice Clone": "Reference Audio Clone", + "OmniVoice Instruct": "Voice Instruction", + "OmniVoice Instruct Help": "Describe the desired voice, such as gender, pitch, accent, or style.", + "OmniVoice Instruct Placeholder": "e.g. female, low pitch, british accent", + "OmniVoice Reference Text": "Reference Audio Text", + "OmniVoice Reference Text Help": "The exact transcript of the reference audio. Required when the deployed service has ASR disabled.", + "OmniVoice Reference Text Placeholder": "Enter the text spoken in the reference audio", + "OmniVoice Num Step Help": "Diffusion generation steps. Higher values usually improve quality but slow generation.", + "OmniVoice Guidance Scale Help": "Controls how strongly text conditions guide generation.", + "OmniVoice Duration": "Target Duration (seconds)", + "OmniVoice Duration Help": "0 lets the model decide the duration automatically.", + "OmniVoice Denoise": "Enable Denoise", + "OmniVoice Denoise Help": "Ask OmniVoice-Pack to denoise the generated output.", + "OmniVoice Postprocess Output": "Postprocess Output", + "OmniVoice Postprocess Output Help": "Enable OmniVoice-Pack output post-processing.", + "OmniVoice Preprocess Prompt": "Preprocess Text", + "OmniVoice Preprocess Prompt Help": "Enable OmniVoice-Pack text preprocessing.", "Reference Audio Source": "Reference Audio Source", "Reference Audio Source Help": "Choose a reference audio from the resource directory or upload a new one.", "Select from Resource Directory": "Select from Resource Directory", @@ -502,6 +542,8 @@ "Max Mel Tokens Help": "Controls the maximum mel tokens generated in one request. Higher values can produce longer audio.", "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 Usage Instructions", "IndexTTS2 Usage Instructions": "**IndexTTS-2 voice cloning**\n\n1. **Choose a voice**: reuse IndexTTS-1.5 resource audio or upload a reference audio file\n2. **Set API URL**: for example http://192.168.3.6:7863/tts, or enter the service root\n3. **Tune emotion**: speaker is the default; switch to audio, vector, or text when needed\n4. **Tune generation**: temperature, top_p, top_k, num_beams, repetition_penalty, and max_mel_tokens are sent directly to the IndexTTS-2 API\n\n**Notes**:\n- Reference audio quality directly affects cloning quality\n- The first request may load the model and take longer\n- CPU deployments are much slower than GPU deployments", + "OmniVoice Usage Instructions Title": "OmniVoice Usage Instructions", + "OmniVoice Usage Instructions": "**OmniVoice-Pack speech synthesis**\n\n1. **Automatic voice**: set the API URL and language, then synthesize directly.\n2. **Voice design**: fill instruct with the desired gender, pitch, accent, or style.\n3. **Reference-audio clone**: upload or choose reference audio and fill its matching transcript.\n\n**Notes**:\n- The default service URL is http://127.0.0.1:7866/tts\n- Reference-audio cloning requires reference text when the service has no ASR model loaded\n- OmniVoice returns WAV audio, and NarratoAI estimates subtitle segment timing from the audio duration", "Volcengine Access Key Help": "Volcengine Access Key", "Volcengine Secret Key Help": "Volcengine Secret Key", "Doubao AppID Help": "Doubao TTS application AppID", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 4b16d7e..76872eb 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -159,6 +159,8 @@ "Video script table help": "在表格中编辑完整脚本 JSON。可新增、删除行;保存时会重新校验并写入脚本文件。", "Raw JSON Preview": "原始 JSON 预览", "Script Column ID": "序号", + "Script Column Video ID": "视频", + "Script Column Video Name": "视频文件", "Script Column Timestamp": "时间戳", "Script Column Picture": "画面描述", "Script Column Narration": "解说台词", @@ -267,7 +269,11 @@ "IndexTTS download link": "下载地址:https://pan.quark.cn/s/0767c9bcefd5", "IndexTTS2 features": "本地/私有部署的 IndexTTS-2 语音克隆引擎,支持情感控制和更完整的生成参数。", "IndexTTS2 use case": "适合需要固定音色、情绪化旁白或更细致采样控制的本地语音合成场景。使用前请先启动 IndexTTS-2 API 服务。", + "OmniVoice features": "本地/私有部署的 OmniVoice-Pack 多语种语音合成引擎,支持自动音色、指令音色和参考音频克隆。", + "OmniVoice use case": "适合需要本地可控、多语言旁白、音色设计或参考音频克隆的场景。使用前请先启动 OmniVoice-Pack API 服务。", "Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快", + "Local Deployment": "本地部署", + "Cloud Service": "云端服务", "Select TTS Engine": "选择 TTS 引擎", "Select TTS Engine Help": "选择您要使用的文本转语音引擎", "TTS Engine Details": "📋 {engine} 详细说明", @@ -395,6 +401,16 @@ "Subtitle calibration succeeded for multiple files": "字幕校准成功,共 {count} 个文件: {files}", "Subtitle calibration failed": "字幕校准失败", "Transcribed subtitles storage hint": "之前转录生成的字幕保存在 {path},可从该目录拖入上传", + "Tavily Search Settings": "Tavily 联网搜索", + "Tavily API Key": "Tavily API Key", + "Tavily API Key Help": "用于短剧剧情理解前的联网检索。开启“联网搜索”后,会先按短剧名称检索剧情、人物和分集信息,再结合字幕分析。", + "Tavily config saved": "Tavily 配置已保存", + "联网搜索": "联网搜索", + "Enable Web Search Help": "开启后,剧情理解会先使用 Tavily 按短剧名称联网检索,再结合检索结果和字幕分析剧情。", + "Please configure Tavily API Key in Basic Settings": "请先在基础设置中配置 Tavily API Key", + "Please enter short drama name before web search": "开启联网搜索前,请先填写短剧名称", + "Searching short drama with Tavily...": "正在使用 Tavily 检索短剧信息...", + "Tavily search failed": "Tavily 检索失败", "剧情理解": "剧情理解", "剧情理解结果": "剧情理解结果", "Analyzing plot...": "正在理解剧情...", @@ -425,6 +441,30 @@ "API URL": "API 地址", "IndexTTS API URL Help": "IndexTTS-1.5 API 服务地址", "IndexTTS2 API URL Help": "IndexTTS-2 API 服务地址,可填写服务根地址或完整 /tts 地址", + "OmniVoice API URL Help": "OmniVoice-Pack API 服务地址,可填写服务根地址或完整 /tts 地址", + "OmniVoice Language Code": "合成语言", + "OmniVoice Language Code Help": "传给 OmniVoice-Pack 的 language 参数,例如 zh、en。", + "OmniVoice Generation Mode": "生成模式", + "OmniVoice Generation Mode Help": "自动音色无需额外参数;指令音色使用描述词;参考音频克隆需要参考音频和对应文本。", + "OmniVoice Mode Auto": "自动音色", + "OmniVoice Mode Voice Design": "指令音色", + "OmniVoice Mode Voice Clone": "参考音频克隆", + "OmniVoice Instruct": "音色指令", + "OmniVoice Instruct Help": "描述希望生成的音色,例如性别、音高、口音或风格。", + "OmniVoice Instruct Placeholder": "例如:female, low pitch, british accent", + "OmniVoice Reference Text": "参考音频文本", + "OmniVoice Reference Text Help": "参考音频对应的逐字文本;当前部署未启用 ASR 时必须填写。", + "OmniVoice Reference Text Placeholder": "请输入参考音频中实际朗读的内容", + "OmniVoice Num Step Help": "扩散生成步数,值越大通常质量更高但速度更慢。", + "OmniVoice Guidance Scale Help": "控制文本条件的引导强度。", + "OmniVoice Duration": "目标时长(秒)", + "OmniVoice Duration Help": "0 表示由模型自动决定时长。", + "OmniVoice Denoise": "启用降噪", + "OmniVoice Denoise Help": "让 OmniVoice-Pack 对生成结果执行降噪处理。", + "OmniVoice Postprocess Output": "后处理输出", + "OmniVoice Postprocess Output Help": "启用 OmniVoice-Pack 的输出后处理。", + "OmniVoice Preprocess Prompt": "预处理文本", + "OmniVoice Preprocess Prompt Help": "启用 OmniVoice-Pack 的文本预处理。", "Reference Audio Source": "参考音频来源", "Reference Audio Source Help": "选择从资源目录选择参考音频,或上传新的参考音频", "Select from Resource Directory": "从资源目录选择", @@ -484,6 +524,8 @@ "Max Mel Tokens Help": "控制单次生成的最大 mel token 数,值越大可生成更长音频", "IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 使用说明", "IndexTTS2 Usage Instructions": "**IndexTTS-2 语音克隆**\n\n1. **选择音色**:复用 IndexTTS-1.5 的资源音频或上传参考音频\n2. **设置 API 地址**:例如 http://192.168.3.6:7863/tts,也可以填写服务根地址\n3. **调整情感参数**:默认使用 speaker,可按需切换到 audio、vector 或 text\n4. **调整生成参数**:temperature、top_p、top_k、num_beams、repetition_penalty 和 max_mel_tokens 会直接传给 IndexTTS-2 接口\n\n**注意事项**:\n- 参考音频质量会直接影响克隆效果\n- 首次请求可能需要加载模型,耗时更长\n- CPU 部署生成速度会明显慢于 GPU", + "OmniVoice Usage Instructions Title": "OmniVoice 使用说明", + "OmniVoice Usage Instructions": "**OmniVoice-Pack 语音合成**\n\n1. **自动音色**:只需要设置 API 地址和语言,可直接合成。\n2. **指令音色**:填写 instruct 描述想要的性别、音高、口音或风格。\n3. **参考音频克隆**:上传或选择参考音频,并填写该音频对应文本。\n\n**注意事项**:\n- 当前默认服务地址为 http://127.0.0.1:7866/tts\n- 参考音频克隆在服务未加载 ASR 模型时必须填写参考文本\n- OmniVoice 返回 WAV 音频,系统会按音频时长估算字幕段落", "Volcengine Access Key Help": "火山引擎 Access Key", "Volcengine Secret Key Help": "火山引擎 Secret Key", "Doubao AppID Help": "豆包语音应用 AppID", diff --git a/webui/tools/generate_short_summary.py b/webui/tools/generate_short_summary.py index eb42361..d06431c 100644 --- a/webui/tools/generate_short_summary.py +++ b/webui/tools/generate_short_summary.py @@ -17,12 +17,101 @@ from loguru import logger from app.config import config from app.services.SDE.short_drama_explanation import analyze_subtitle, generate_narration_script from app.services.subtitle_text import read_subtitle_text +from app.services.tavily_search import TavilySearchError, format_search_context, search_short_drama # 导入新的LLM服务模块 - 确保提供商被注册 import app.services.llm # 这会触发提供商注册 from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter import re +def _normalize_paths(paths): + if isinstance(paths, str): + paths = [paths] + if not paths: + return [] + + normalized_paths = [] + seen = set() + for path in paths: + if not isinstance(path, str): + continue + path = path.strip() + if not path or path in seen: + continue + normalized_paths.append(path) + seen.add(path) + return normalized_paths + + +def _build_combined_subtitle_content(subtitle_paths, video_paths=None): + sections = [] + video_paths = _normalize_paths(video_paths) + for index, subtitle_path in enumerate(_normalize_paths(subtitle_paths), start=1): + if not os.path.exists(subtitle_path): + continue + + video_path = video_paths[index - 1] if index <= len(video_paths) else "" + if video_path: + header = ( + f"# 视频 {index}: {os.path.basename(video_path)}\n" + f"字幕文件: {os.path.basename(subtitle_path)}" + ) + else: + header = f"# 视频 {index}\n字幕文件: {os.path.basename(subtitle_path)}" + sections.append(f"{header}\n{read_subtitle_text(subtitle_path).text}".strip()) + + return "\n\n".join(sections) + + +def _coerce_video_id(value): + try: + video_id = int(value) + except (TypeError, ValueError): + return None + return video_id if video_id > 0 else None + + +def _match_video_id_by_name(video_name, video_paths): + video_name = str(video_name or "").strip() + if not video_name: + return None + + for index, video_path in enumerate(video_paths, start=1): + if os.path.basename(video_path) == os.path.basename(video_name): + return index + return None + + +def _normalize_narration_items_video_sources(items, video_paths): + video_paths = _normalize_paths(video_paths) + if not video_paths: + return items + + normalized_items = [] + for item in items: + if not isinstance(item, dict): + normalized_items.append(item) + continue + + item_copy = item.copy() + video_id = _coerce_video_id(item_copy.get("video_id") or item_copy.get("video_index")) + matched_video_id = _match_video_id_by_name( + item_copy.get("video_name") or item_copy.get("source_video"), + video_paths, + ) + if matched_video_id: + video_id = matched_video_id + if video_id is None or video_id > len(video_paths): + logger.warning(f"片段 {item_copy.get('_id')} 未提供有效 video_id,默认使用视频 1") + video_id = 1 + + item_copy["video_id"] = video_id + item_copy["video_name"] = os.path.basename(video_paths[video_id - 1]) + normalized_items.append(item_copy) + + return normalized_items + + def parse_and_fix_json(json_string): """ 解析并修复JSON字符串 @@ -135,12 +224,83 @@ def parse_and_fix_json(json_string): return None -def analyze_short_drama_plot(subtitle_path, temperature, tr=lambda key: key, subtitle_content=None): +def _get_tavily_api_key() -> str: + return ( + st.session_state.get("tavily_api_key") + or config.app.get("tavily_api_key") + or "" + ).strip() + + +def _build_tavily_context(short_name: str, tr=lambda key: key) -> str | None: + short_name = str(short_name or "").strip() + if not short_name: + st.error(tr("Please enter short drama name before web search")) + return None + + api_key = _get_tavily_api_key() + if not api_key: + st.error(tr("Please configure Tavily API Key in Basic Settings")) + return None + + try: + search_data = search_short_drama( + short_name, + api_key, + search_depth=config.app.get("tavily_search_depth", "basic"), + max_results=config.app.get("tavily_max_results", 5), + ) + return format_search_context(search_data) + except TavilySearchError as e: + logger.error(f"Tavily 短剧检索失败: {str(e)}") + st.error(f"{tr('Tavily search failed')}: {str(e)}") + return None + except Exception as e: + logger.error(f"Tavily 短剧检索异常: {traceback.format_exc()}") + st.error(f"{tr('Tavily search failed')}: {str(e)}") + return None + + +def _build_plot_analysis_input( + subtitle_content: str, + short_name: str = "", + enable_web_search: bool = False, + tr=lambda key: key, +) -> str | None: + subtitle_content = str(subtitle_content or "").strip() + if not enable_web_search: + return subtitle_content + + tavily_context = _build_tavily_context(short_name, tr) + if tavily_context is None: + return None + + return f"""# 分析补充说明 +请先参考 Tavily 联网检索结果理解短剧名称、人物关系、剧情背景和公开剧情梗概,再结合原始字幕完成剧情理解。 +如果联网检索结果与字幕内容冲突,请以字幕内容为准;时间戳必须只从字幕内容中提取。 + +{tavily_context} + +# 原始字幕 +{subtitle_content}""" + + +def analyze_short_drama_plot( + subtitle_path, + temperature, + tr=lambda key: key, + subtitle_content=None, + short_name: str = "", + enable_web_search: bool = False, + video_paths=None, +): """仅执行短剧字幕剧情理解,返回可编辑的剧情分析文本。""" - if not subtitle_path: + subtitle_paths = _normalize_paths(subtitle_path) + if not subtitle_paths: st.error(tr("Please generate or upload subtitles first")) return None - if not os.path.exists(subtitle_path): + missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)] + if missing_subtitle_paths: st.error(tr("Subtitle file does not exist")) return None @@ -149,19 +309,31 @@ def analyze_short_drama_plot(subtitle_path, temperature, tr=lambda key: key, sub text_model = config.app.get(f'text_{text_provider}_model_name') text_base_url = config.app.get(f'text_{text_provider}_base_url') - subtitle_content = str(subtitle_content or "").strip() or read_subtitle_text(subtitle_path).text + subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content( + subtitle_paths, + video_paths, + ) if not subtitle_content: st.error(tr("Subtitle file is empty or unreadable")) return None + plot_analysis_input = _build_plot_analysis_input( + subtitle_content, + short_name=short_name, + enable_web_search=enable_web_search, + tr=tr, + ) + if plot_analysis_input is None: + return None + try: logger.info("使用新的LLM服务架构进行字幕分析") analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider) - analysis_result = analyzer.analyze_subtitle(subtitle_content) + analysis_result = analyzer.analyze_subtitle(plot_analysis_input) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") analysis_result = analyze_subtitle( - subtitle_content=subtitle_content, + subtitle_content=plot_analysis_input, api_key=text_api_key, model=text_model, base_url=text_base_url, @@ -186,6 +358,8 @@ def generate_script_short_sunmmary( tr=lambda key: key, plot_analysis=None, subtitle_content=None, + enable_web_search: bool = False, + video_paths=None, ): """ 生成 短剧解说 视频脚本 @@ -204,7 +378,12 @@ def generate_script_short_sunmmary( try: with st.spinner(tr("Generating script...")): - if not params.video_origin_path: + selected_video_paths = _normalize_paths( + video_paths + or getattr(params, "video_origin_paths", []) + or getattr(params, "video_origin_path", "") + ) + if not selected_video_paths: st.error(tr("Please select video file first")) return """ @@ -212,7 +391,9 @@ def generate_script_short_sunmmary( """ update_progress(30, tr("Parsing subtitles...")) # 判断字幕文件是否存在 - if not os.path.exists(subtitle_path): + subtitle_paths = _normalize_paths(subtitle_path) + missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)] + if not subtitle_paths or missing_subtitle_paths: st.error(tr("Subtitle file does not exist")) return @@ -225,7 +406,10 @@ def generate_script_short_sunmmary( text_base_url = config.app.get(f'text_{text_provider}_base_url') # 读取字幕文件内容(无论使用哪种实现都需要) - subtitle_content = str(subtitle_content or "").strip() or read_subtitle_text(subtitle_path).text + subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content( + subtitle_paths, + selected_video_paths, + ) if not subtitle_content: st.error(tr("Subtitle file is empty or unreadable")) return @@ -238,16 +422,27 @@ def generate_script_short_sunmmary( "analysis": str(plot_analysis).strip(), } else: + plot_analysis_input = subtitle_content + if enable_web_search: + update_progress(40, tr("Searching short drama with Tavily...")) + plot_analysis_input = _build_plot_analysis_input( + subtitle_content, + short_name=video_theme, + enable_web_search=True, + tr=tr, + ) + if plot_analysis_input is None: + return try: # 优先使用新的LLM服务架构 logger.info("使用新的LLM服务架构进行字幕分析") - analysis_result = analyzer.analyze_subtitle(subtitle_content) + analysis_result = analyzer.analyze_subtitle(plot_analysis_input) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") # 回退到旧的实现 analysis_result = analyze_subtitle( - subtitle_content=subtitle_content, + subtitle_content=plot_analysis_input, api_key=text_api_key, model=text_model, base_url=text_base_url, @@ -320,7 +515,11 @@ def generate_script_short_sunmmary( logger.error(f"JSON结构错误,缺少items字段: {narration_dict}") st.stop() - script = json.dumps(narration_dict['items'], ensure_ascii=False, indent=2) + narration_items = _normalize_narration_items_video_sources( + narration_dict['items'], + selected_video_paths, + ) + script = json.dumps(narration_items, ensure_ascii=False, indent=2) if script is None: st.error(tr("Script generation failed check logs"))