feat(tts,search,video): 新增OmniVoice TTS、联网搜索与多视频剪辑支持

新增OmniVoice语音合成引擎全流程支持,包含配置项、WebUI界面与服务实现
集成Tavily联网搜索能力,支持短剧剧情分析前自动检索剧情背景信息
新增多视频源剪辑支持,完善脚本校验规则并重构剪辑逻辑适配多视频路径
重构LLM剧情分析Prompt,优化输出格式适配多场景与联网检索结果
调整streamlit版本至1.56.0修复兼容性问题
新增相关单元测试与多语言翻译,更新配置示例文件
This commit is contained in:
viccy 2026-06-07 01:24:32 +08:00
parent d147fe66e4
commit 342fc15f3b
24 changed files with 1320 additions and 108 deletions

View File

@ -13,8 +13,11 @@ INDEXTTS_ENGINE = "indextts"
INDEXTTS_DISPLAY_NAME = "IndexTTS-1.5"
INDEXTTS2_ENGINE = "indextts2"
INDEXTTS2_DISPLAY_NAME = "IndexTTS-2"
OMNIVOICE_ENGINE = "omnivoice"
OMNIVOICE_DISPLAY_NAME = "OmniVoice"
INDEXTTS_VOICE_PREFIX = f"{INDEXTTS_ENGINE}:"
INDEXTTS2_VOICE_PREFIX = f"{INDEXTTS2_ENGINE}:"
OMNIVOICE_VOICE_PREFIX = f"{OMNIVOICE_ENGINE}:"
def normalize_tts_engine_name(tts_engine: str) -> str:
@ -131,6 +134,7 @@ def save_config():
_cfg["fun_asr"] = fun_asr
_cfg["indextts"] = indextts
_cfg["indextts2"] = indextts2
_cfg["omnivoice"] = omnivoice
_cfg["doubaotts"] = doubaotts
f.write(toml.dumps(_cfg))
@ -148,6 +152,7 @@ tts_qwen = _cfg.get("tts_qwen", {})
fun_asr = _cfg.get("fun_asr", {})
indextts = _cfg.get("indextts", {})
indextts2 = _cfg.get("indextts2", {})
omnivoice = _cfg.get("omnivoice", {})
doubaotts = _cfg.get("doubaotts", {})
hostname = socket.gethostname()

View File

@ -35,6 +35,9 @@ DEFAULT_LLM_APP_CONFIG = {
"text_openai_model_name": DEFAULT_TEXT_OPENAI_MODEL_NAME,
"text_openai_api_key": "",
"text_openai_base_url": DEFAULT_OPENAI_COMPATIBLE_BASE_URL,
"tavily_api_key": "",
"tavily_search_depth": "basic",
"tavily_max_results": 5,
}
DEFAULT_LLM_APP_CONFIG.update(DEFAULT_LLM_GENERATION_APP_CONFIG)

View File

@ -32,6 +32,82 @@ def parse_timestamp(timestamp: str) -> tuple:
return start_time, end_time
def _normalize_video_origin_paths(
video_origin_path: str,
video_origin_paths: Optional[List[str]] = None,
) -> List[str]:
paths = []
if video_origin_paths:
paths.extend(video_origin_paths)
if video_origin_path:
paths.insert(0, video_origin_path)
normalized_paths = []
seen = set()
for item in paths:
if not isinstance(item, str):
continue
item = item.strip()
if not item or item in seen:
continue
normalized_paths.append(item)
seen.add(item)
return normalized_paths
def _coerce_video_id(value) -> Optional[int]:
try:
video_id = int(value)
except (TypeError, ValueError):
return None
return video_id if video_id > 0 else None
def _match_video_id_by_name(video_name: str, video_origin_paths: List[str]) -> Optional[int]:
video_name = str(video_name or "").strip()
if not video_name:
return None
expected_name = os.path.basename(video_name)
for index, video_path in enumerate(video_origin_paths, start=1):
if os.path.basename(video_path) == expected_name:
return index
return None
def _resolve_script_video_path(script_item: Dict, video_origin_paths: List[str]) -> str:
explicit_path = (
script_item.get("source_video_path")
or script_item.get("video_origin_path")
or script_item.get("origin_video_path")
)
if explicit_path and os.path.exists(explicit_path):
return explicit_path
video_id = _coerce_video_id(script_item.get("video_id") or script_item.get("video_index"))
matched_video_id = _match_video_id_by_name(
script_item.get("video_name") or script_item.get("source_video"),
video_origin_paths,
)
if matched_video_id:
video_id = matched_video_id
if video_id is not None:
if video_id <= len(video_origin_paths):
return video_origin_paths[video_id - 1]
logger.warning(
f"片段 {script_item.get('_id')} 的 video_id={video_id} 超出视频数量 "
f"{len(video_origin_paths)},默认使用第一个视频"
)
return video_origin_paths[0]
def _safe_output_id(value) -> str:
safe_value = str(value if value is not None else "unknown")
return "".join(char if char.isalnum() or char in ("-", "_") else "_" for char in safe_value)
def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str:
"""
根据开始时间和持续时间计算结束时间
@ -579,7 +655,7 @@ def _process_narration_only_segment(
# 生成输出文件名
safe_start_time = start_time.replace(':', '-').replace(',', '-')
safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
output_filename = f"ost0_vid_{safe_start_time}@{safe_end_time}.mp4"
output_filename = f"ost0_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4"
output_path = os.path.join(output_dir, output_filename)
# 构建FFmpeg命令 - 移除音频
@ -622,7 +698,7 @@ def _process_original_audio_segment(
# 生成输出文件名
safe_start_time = start_time.replace(':', '-').replace(',', '-')
safe_end_time = end_time.replace(':', '-').replace(',', '-')
output_filename = f"ost1_vid_{safe_start_time}@{safe_end_time}.mp4"
output_filename = f"ost1_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4"
output_path = os.path.join(output_dir, output_filename)
# 构建FFmpeg命令 - 保持原声
@ -674,7 +750,7 @@ def _process_mixed_segment(
# 生成输出文件名
safe_start_time = start_time.replace(':', '-').replace(',', '-')
safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
output_filename = f"ost2_vid_{safe_start_time}@{safe_end_time}.mp4"
output_filename = f"ost2_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4"
output_path = os.path.join(output_dir, output_filename)
# 构建FFmpeg命令 - 保持原声
@ -782,28 +858,34 @@ def clip_video_unified(
script_list: List[Dict],
tts_results: List[Dict],
output_dir: Optional[str] = None,
task_id: Optional[str] = None
task_id: Optional[str] = None,
video_origin_paths: Optional[List[str]] = None
) -> Dict[str, str]:
"""
基于OST类型的统一视频裁剪策略 - 消除双重裁剪问题
Args:
video_origin_path: 原始视频的路径
video_origin_path: 原始视频的路径旧脚本或无 video_id 片段默认使用该视频
script_list: 完整的脚本列表包含所有片段信息
tts_results: TTS结果列表仅包含OST=0和OST=2的片段
output_dir: 输出目录路径默认为None时会自动生成
task_id: 任务ID用于生成唯一的输出目录默认为None时会自动生成
video_origin_paths: 多个原始视频路径脚本片段可用 video_id/video_name 指定来源
Returns:
Dict[str, str]: 片段ID到裁剪后视频路径的映射
"""
# 检查视频文件是否存在
if not os.path.exists(video_origin_path):
raise FileNotFoundError(f"视频文件不存在: {video_origin_path}")
video_source_paths = _normalize_video_origin_paths(video_origin_path, video_origin_paths)
if not video_source_paths:
raise FileNotFoundError("视频文件不存在: 未提供原始视频路径")
missing_video_paths = [item for item in video_source_paths if not os.path.exists(item)]
if missing_video_paths:
raise FileNotFoundError(f"视频文件不存在: {', '.join(missing_video_paths)}")
# 如果未提供task_id则根据输入生成一个唯一ID
if task_id is None:
content_for_hash = f"{video_origin_path}_{json.dumps(script_list)}"
content_for_hash = f"{json.dumps(video_source_paths, ensure_ascii=False)}_{json.dumps(script_list, ensure_ascii=False)}"
task_id = hashlib.md5(content_for_hash.encode()).hexdigest()
# 设置输出目录
@ -840,29 +922,33 @@ def clip_video_unified(
failed_clips = []
success_count = 0
logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段")
logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段,源视频{len(video_source_paths)}")
for i, script_item in enumerate(script_list, 1):
_id = script_item.get("_id")
ost = script_item.get("OST", 0)
timestamp = script_item["timestamp"]
source_video_path = _resolve_script_video_path(script_item, video_source_paths)
logger.info(f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, 时间戳:{timestamp}")
logger.info(
f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, "
f"视频:{os.path.basename(source_video_path)}, 时间戳:{timestamp}"
)
try:
if ost == 0: # 纯解说片段
output_path = _process_narration_only_segment(
video_origin_path, script_item, tts_map, output_dir,
source_video_path, script_item, tts_map, output_dir,
encoder_config, hwaccel_args
)
elif ost == 1: # 纯原声片段
output_path = _process_original_audio_segment(
video_origin_path, script_item, output_dir,
source_video_path, script_item, output_dir,
encoder_config, hwaccel_args
)
elif ost == 2: # 解说+原声混合片段
output_path = _process_mixed_segment(
video_origin_path, script_item, tts_map, output_dir,
source_video_path, script_item, tts_map, output_dir,
encoder_config, hwaccel_args
)
else:

View File

@ -107,7 +107,7 @@ def _clamp_duration_to_media(
def _normalize_indextts_reference_audio(params: VideoClipParams) -> None:
"""Ensure IndexTTS engines use the configured reference audio instead of a stale UI voice."""
"""Ensure local clone TTS engines use configured reference audio instead of a stale UI voice."""
params.tts_engine = config.normalize_tts_engine_name(params.tts_engine)
if params.tts_engine == config.INDEXTTS_ENGINE:
tts_config = config.indextts
@ -117,6 +117,12 @@ def _normalize_indextts_reference_audio(params: VideoClipParams) -> None:
tts_config = config.indextts2
voice_prefix = config.INDEXTTS2_VOICE_PREFIX
display_name = "IndexTTS-2"
elif params.tts_engine == config.OMNIVOICE_ENGINE:
tts_config = config.omnivoice
if tts_config.get("mode", "auto") != "voice_clone":
return
voice_prefix = config.OMNIVOICE_VOICE_PREFIX
display_name = "OmniVoice"
else:
return
@ -199,6 +205,7 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams):
logger.info("\n\n## 3. 统一视频裁剪基于OST类型")
video_clip_result = clip_video.clip_video_unified(
video_origin_path=params.video_origin_path,
video_origin_paths=getattr(params, "video_origin_paths", []),
script_list=list_script,
tts_results=tts_results
)

View File

@ -12,6 +12,7 @@ from loguru import logger
from .manager import LLMServiceManager
from .validators import OutputValidator
from .exceptions import LLMServiceError
from app.services.prompts import PromptManager
# 提供商注册由 webui.py:main() 显式调用(见 LLM 提供商注册机制重构)
# 这样更可靠,错误也更容易调试
@ -181,12 +182,20 @@ class UnifiedLLMService:
LLMServiceError: 服务调用失败时抛出
"""
try:
# 构建分析提示词
system_prompt = "你是一位专业的剧本分析师和剧情概括助手。请仔细分析字幕内容,提取关键剧情信息。"
prompt = PromptManager.get_prompt(
category="short_drama_narration",
name="plot_analysis",
parameters={"subtitle_content": subtitle_content},
)
prompt_object = PromptManager.get_prompt_object(
category="short_drama_narration",
name="plot_analysis",
)
system_prompt = prompt_object.get_system_prompt()
# 生成分析结果
result = await UnifiedLLMService.generate_text(
prompt=subtitle_content,
prompt=prompt,
system_prompt=system_prompt,
provider=provider,
temperature=temperature,

View File

@ -113,6 +113,8 @@ class OutputValidator:
"required": ["_id", "timestamp", "picture", "narration"],
"properties": {
"_id": {"type": "number"},
"video_id": {"type": "number"},
"video_name": {"type": "string"},
"timestamp": {"type": "string"},
"picture": {"type": "string"},
"narration": {"type": "string"},
@ -161,6 +163,16 @@ class OutputValidator:
item_id = item.get("_id")
if not isinstance(item_id, (int, float)) or item_id <= 0:
raise ValidationError(f"{index+1}项ID必须为正整数: {item_id}", "invalid_id")
video_id = item.get("video_id")
if video_id not in (None, "") and (
not isinstance(video_id, (int, float)) or video_id <= 0
):
raise ValidationError(f"{index+1}项video_id必须为正整数: {video_id}", "invalid_video_id")
video_name = item.get("video_name")
if video_name not in (None, "") and not isinstance(video_name, str):
raise ValidationError(f"{index+1}项video_name必须为字符串: {video_name}", "invalid_video_name")
@staticmethod
def validate_subtitle_analysis(output: str) -> str:

View File

@ -19,72 +19,79 @@ class PlotAnalysisPrompt(TextPrompt):
metadata = PromptMetadata(
name="plot_analysis",
category="short_drama_narration",
version="v1.0",
description="分析短剧字幕内容,提供详细的剧情分析和分段解析",
version="v1.1",
description="结合字幕和可选联网检索上下文,输出适合短剧解说脚本生成的结构化剧情理解",
model_type=ModelType.TEXT,
output_format=OutputFormat.TEXT,
tags=["短剧", "剧情分析", "字幕解析", "分段分析"],
tags=["短剧", "剧情分析", "字幕解析", "分段分析", "联网检索", "解说脚本素材"],
parameters=["subtitle_content"]
)
super().__init__(metadata)
self._system_prompt = "你是一位专业的剧本分析师和剧情概括助手"
self._system_prompt = "你是一位专业的短剧解说策划和剧本分析师。请输出克制、结构化、可直接供下游解说脚本生成使用的剧情理解材料"
def get_template(self) -> str:
return """# 角色
你是一位专业的剧本分析师和剧情概括助手
你是一位专业的短剧解说策划和剧本分析师你的输出不是给观众看的成片文案而是给下游短剧解说脚本生成器使用的结构化剧情理解材料
# 任务
我将为你提供一部短剧的完整字幕文本请你基于这些字幕完成以下任务
1. **整体剧情分析**简要概括整个短剧的核心剧情脉络主要冲突和结局如果有的话
2. **分段剧情解析与时间戳定位**
* 将整个短剧划分为若干个关键的剧情段落例如开端发展转折高潮结局或根据具体情节自然划分
* 段落数应该与字幕长度成正比
* 对于每一个剧情段落
* **概括该段落的主要内容**用简洁的语言描述这段剧情发生了什么
* **标注对应的时间戳范围**明确指出该剧情段落对应的开始字幕时间戳和结束字幕时间戳请直接从字幕中提取时间信息
# 输入说明
下面的输入可能只包含一个视频的原始字幕也可能包含多个视频文件的字幕也可能同时包含 Tavily 联网检索结果和原始字幕
- 联网检索结果只能用于辅助识别短剧名称人物关系时代背景公开剧情梗概
- 原始字幕是唯一可信的当前片段事实来源
- 如果联网检索结果与字幕冲突必须以字幕为准
- 如果联网检索结果包含当前字幕尚未出现的后续剧情只能放在字幕未覆盖/需谨慎信息不能写进当前剧情事实
- 多个视频字幕会以视频 1: 文件名视频 2: 文件名等标题分隔时间戳均为对应视频内部时间不是拼接后的累计时间
# 输入格式
字幕内容通常包含时间戳和对话例如
```
00:00:05,000 --> 00:00:10,000
[角色A]: 你好吗
00:00:10,500 --> 00:00:15,000
[角色B]: 我很好谢谢发生了一些有趣的事情
... (更多字幕内容) ...
```
我将把实际字幕粘贴在下方
# 核心任务
请基于输入完成剧情理解目标是帮助后续生成高质量短剧解说脚本
1. 识别短剧名称当前字幕范围视频来源联网检索辅助信息和字幕事实边界
2. 统一人物称呼避免同一人物出现多个名字写法
3. 100-180 字概括当前字幕覆盖的剧情不提前剧透字幕未出现的内容
4. 按视频来源和字幕时间顺序拆分关键剧情段落并为每段标注准确 video_id / video_name / 时间戳
5. 提炼解说创作可用的钩子冲突爽点/泪点/悬念点和建议保留原声片段
# 输出格式要求
请按照以下格式清晰地呈现分析结果
# 强制输出规则
1. 禁止输出寒暄解释身份或好的我将等聊天式开场
2. 禁止编造字幕中没有的具体事件对白关系进展或结局
3. 时间戳必须直接来自对应视频字幕无法确定时写字幕未明确不要猜测
4. 多视频场景下必须明确每段来自哪个视频文件禁止把不同视频的同名时间戳混在一起
5. 人名必须统一优先采用联网检索中的正式名称如果字幕写法不同在人物表中保留字幕称呼
6. 内容要简洁客观可复用避免散文化长段落
7. 必须严格按照下面的 Markdown 格式输出不要添加额外章节
**整体剧情概括**
[此处填写对整个短剧剧情的概括]
# 输出格式
## 一、基础识别
- 短剧名称[如输入可判断则填写否则写未知]
- 当前字幕范围[开始时间戳] --> [结束时间戳]无法确定则写字幕未明确
- 视频来源[列出视频编号文件名和各自字幕时间范围单视频也要写]
- 联网检索确认[仅写可辅助理解的公开信息没有联网结果则写未启用/未提供]
- 字幕内实际出现[列出当前字幕真实出现的关键事实2-4 ]
- 字幕未覆盖/需谨慎信息[列出联网结果提到但当前字幕未发生的内容没有则写]
**分段剧情解析**
## 二、人物与关系
| 统一称呼 | 字幕称呼 | 身份/关系 | 当前剧情作用 | 确定性 |
|---|---|---|---|---|
| [人物名] | [字幕原文称呼] | [身份或关系] | [在当前片段中的作用] | 字幕明确/联网辅助/合理推断 |
**剧情段落 1[段落主题/概括例如主角登场与背景介绍]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
## 三、整体剧情概括
[100-180 只概括当前字幕覆盖的剧情必须包含核心冲突人物动机和当前悬念]
**剧情段落 2[段落主题/概括例如第一个冲突出现]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
## 四、分段剧情解析
| 视频 | 时间戳 | 段落主题 | 剧情事件 | 情绪/冲突功能 |
|---|---|---|---|---|
| [video_id + video_name] | [开始] --> [结束] | [简短主题] | [当前段落发生了什么] | [铺垫/冲突升级/人物塑造/反转/悬念/情绪爆发等] |
... (根据实际剧情段落数量继续) ...
## 五、解说创作重点
- 开场钩子[用一句话指出最适合开场抓人的冲突或疑问]
- 核心冲突[当前片段最主要的矛盾]
- 爽点/泪点/情绪点[ 1-3 没有则写无明显]
- 悬念点[当前片段留下的疑问或后续期待]
- 建议保留原声片段
1. [video_id + video_name + 时间戳][保留理由如果没有合适原声无明显]
**剧情段落 N[段落主题/概括例如结局与反思]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
## 六、联网信息校验
- 可用于辅助理解的信息[联网结果中可帮助理解当前字幕的信息没有则写]
- 与字幕不一致或字幕未覆盖的信息[必须列出不要混入当前剧情事实没有则写]
# 注意事项
* 请确保时间戳的准确性直接引用字幕中的时间
* 剧情段落的划分应合乎逻辑能够反映剧情的起承转合
* 语言表达应简洁准确客观
# 限制
1. 严禁输出与分析结果无关的内容
2. 时间戳必须严格按照字幕中的实际时间
# 请处理以下字幕:
# 输入内容
${subtitle_content}"""

View File

@ -43,11 +43,14 @@ class ScriptGenerationPrompt(ParameterizedPrompt):
${plot_analysis}
</plot>
### 原始字幕(含精确时间戳)
### 原始字幕(含视频编号和精确时间戳)
<subtitles>
${subtitle_content}
</subtitles>
字幕可能来自多个视频文件每个字幕分段标题会以视频 1: 文件名视频 2: 文件名等形式标识来源
生成脚本时必须把每个片段绑定到对应视频来源时间戳表示该视频文件内部的局部时间不是把多个视频拼接后的全局时间
## 短剧解说创作核心要素
### 1. 黄金开场3秒法则
@ -137,11 +140,18 @@ ${subtitle_content}
### 时间戳管理(绝对不能违反)
- **时间戳绝对不能重叠**确保剪辑后无重复画面
- **时间段必须连续且不交叉**严格按时间顺序排列
- **每个时间戳都必须在原始字幕中找到对应范围**
- **同一个 video_id 内的时间段必须连续且不交叉**严格按该视频内时间顺序排列
- **跨视频可以切换 video_id**但每个时间戳都必须来自对应视频字幕分段
- **每个时间戳都必须在对应视频的原始字幕中找到对应范围**
- 可以拆分原时间片段但必须保持时间连续性
- 时间戳的格式必须与原始字幕中的格式完全一致
### 多视频来源规范(多集/多文件必须遵守)
- **video_id**必须填写取字幕分段标题里的视频编号例如视频 3就填 3
- **video_name**必须填写对应的视频文件名例如3_20260607002212.mp4
- **timestamp**只填写对应 video_id 内部的时间范围不要换算成多个视频拼接后的累计时间
- 如果剧情跨多个视频推进脚本可以按故事顺序在不同 video_id 之间切换但不得把视频 2 的时间戳写到 video_id=1
### 时长控制1/3原则
- **解说视频总长度 = 原视频长度的 1/3**
- 精确控制节奏和密度既不能过短也不能过长
@ -159,6 +169,8 @@ ${subtitle_content}
```json
{
"_id": 序号,
"video_id": 视频编号,
"video_name": "视频文件名",
"timestamp": "开始时间-结束时间",
"picture": "画面内容描述",
"narration": "播放原片+序号",
@ -242,6 +254,8 @@ ${subtitle_content}
"items": [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:01,000-00:00:05,500",
"picture": "女主角林小雨慌张地道歉,男主角沈墨轩冷漠地看着她",
"narration": "一个普通女孩的命运即将因为一杯咖啡彻底改变!她撞到的这个男人,竟然是...",
@ -249,6 +263,8 @@ ${subtitle_content}
},
{
"_id": 2,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:05,500-00:00:08,000",
"picture": "沈墨轩质问林小雨,语气冷厉威严",
"narration": "播放原片2",
@ -256,6 +272,8 @@ ${subtitle_content}
},
{
"_id": 3,
"video_id": 2,
"video_name": "2.mp4",
"timestamp": "00:00:08,000-00:00:12,000",
"picture": "林小雨惊慌失措,沈墨轩眼中闪过一丝兴趣",
"narration": "霸道总裁的经典开场!一杯咖啡引发的爱情故事就这样开始了...",
@ -281,6 +299,7 @@ ${subtitle_content}
- **原声片段标识**OST=1表示原声OST=0表示解说
- **原声格式规范**narration字段必须使用"播放原片+序号"格式
- **关键情绪点**必须保留原片原声增强观众代入感
- **视频来源**每个片段必须包含 video_id video_name用于定位多个上传视频中的源文件
- **时间戳精度**精确到毫秒级别确保与字幕完美匹配
- **逻辑连贯性**严格遵循剧情发展顺序

View File

@ -225,6 +225,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
# 使用新的统一裁剪策略
video_clip_result = clip_video.clip_video_unified(
video_origin_path=params.video_origin_path,
video_origin_paths=getattr(params, "video_origin_paths", []),
script_list=list_script,
tts_results=tts_results
)
@ -477,6 +478,7 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
# 使用新的统一裁剪策略
video_clip_result = clip_video.clip_video_unified(
video_origin_path=params.video_origin_path,
video_origin_paths=getattr(params, "video_origin_paths", []),
script_list=list_script,
tts_results=tts_results
)

View File

@ -0,0 +1,116 @@
"""Tavily-powered web search helpers for plot analysis."""
from __future__ import annotations
import os
from typing import Any
import requests
from loguru import logger
# Root URL of the Tavily HTTP API; the "/search" endpoint path is appended to it.
TAVILY_API_BASE_URL = "https://api.tavily.com"
# Search depth sent to Tavily when the caller does not supply one.
DEFAULT_SEARCH_DEPTH = "basic"
# Number of results requested when the caller does not supply one.
DEFAULT_MAX_RESULTS = 5
# Default value passed as the `timeout` argument of requests.post.
DEFAULT_TIMEOUT = 20
class TavilySearchError(RuntimeError):
    """Raised when a Tavily search cannot be completed.

    Within this module it is raised for an empty drama name, a missing API
    key, a failed HTTP request, an HTTP status of 400 or above, and a
    response body that is not valid JSON.
    """
def _trim_text(value: Any, max_chars: int) -> str:
text = str(value or "").strip()
if len(text) <= max_chars:
return text
return f"{text[:max_chars].rstrip()}..."
def search_short_drama(
    short_name: str,
    api_key: str | None = None,
    *,
    search_depth: str = DEFAULT_SEARCH_DEPTH,
    max_results: int = DEFAULT_MAX_RESULTS,
    timeout: int = DEFAULT_TIMEOUT,
) -> dict[str, Any]:
    """Search web context for a short drama name with Tavily.

    Args:
        short_name: Name of the short drama to look up.
        api_key: Tavily API key; falls back to the ``TAVILY_API_KEY``
            environment variable when empty.
        search_depth: Tavily ``search_depth`` value; an empty value falls
            back to ``DEFAULT_SEARCH_DEPTH``.
        max_results: Desired number of results. Clamped to the range 1-10;
            values that cannot be read as an integer fall back to
            ``DEFAULT_MAX_RESULTS`` instead of raising.
        timeout: Timeout passed to ``requests.post``.

    Returns:
        The decoded JSON object returned by the Tavily ``/search`` endpoint.

    Raises:
        TavilySearchError: If the name or API key is missing, the request
            fails, the server answers with HTTP status >= 400, or the body
            is not a JSON object.
    """
    short_name = str(short_name or "").strip()
    if not short_name:
        raise TavilySearchError("短剧名称不能为空")

    api_key = (api_key or os.getenv("TAVILY_API_KEY") or "").strip()
    if not api_key:
        raise TavilySearchError("Tavily API Key 未配置")

    # The value usually comes from user-editable config, so a stray string
    # such as "five" must not surface as a bare ValueError to the caller.
    try:
        result_limit = int(max_results or DEFAULT_MAX_RESULTS)
    except (TypeError, ValueError):
        logger.warning(
            "Tavily max_results 配置无效: {!r}, 使用默认值 {}",
            max_results,
            DEFAULT_MAX_RESULTS,
        )
        result_limit = DEFAULT_MAX_RESULTS
    result_limit = max(1, min(result_limit, 10))

    query = f"{short_name} 短剧 剧情 介绍 人物 结局"
    payload = {
        "query": query,
        "search_depth": search_depth or DEFAULT_SEARCH_DEPTH,
        "topic": "general",
        "max_results": result_limit,
        "include_answer": True,
        "include_raw_content": False,
        "include_images": False,
    }

    try:
        response = requests.post(
            f"{TAVILY_API_BASE_URL}/search",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        raise TavilySearchError(f"Tavily 请求失败: {exc}") from exc

    if response.status_code >= 400:
        message = _trim_text(response.text, 500)
        raise TavilySearchError(f"Tavily 请求失败: HTTP {response.status_code} {message}")

    try:
        data = response.json()
    except ValueError as exc:
        raise TavilySearchError("Tavily 返回内容不是有效 JSON") from exc

    # Callers (and the log line below) use dict access on the result; a JSON
    # array or scalar would otherwise fail later with an AttributeError.
    if not isinstance(data, dict):
        raise TavilySearchError("Tavily 返回内容不是 JSON 对象")

    logger.info(
        "Tavily 短剧检索完成: query={}, results={}",
        query,
        len(data.get("results") or []),
    )
    return data
def format_search_context(search_data: dict[str, Any], *, max_chars: int = 6000) -> str:
    """Render a Tavily response as compact text for use as LLM context.

    The output has a heading and the query line, then an optional answer
    section (trimmed to 1200 characters), then a numbered list of sources
    with title, URL and snippet. The whole text is finally trimmed to
    ``max_chars``.

    Args:
        search_data: Decoded Tavily response; falsy input yields ``""``.
        max_chars: Character limit applied to the final text.

    Returns:
        The formatted context string.
    """
    if not search_data:
        return ""

    sections = [
        "# Tavily 联网检索结果",
        f"检索 query: {search_data.get('query', '')}",
    ]

    summary = _trim_text(search_data.get("answer"), 1200)
    if summary:
        sections += ["", "## 综合回答", summary]

    hits = search_data.get("results") or []
    if hits:
        sections += ["", "## 搜索来源"]
    for position, hit in enumerate(hits, start=1):
        heading = _trim_text(hit.get("title"), 120)
        link = _trim_text(hit.get("url"), 240)
        # Prefer the short snippet; fall back to raw page content if absent.
        snippet = _trim_text(hit.get("content") or hit.get("raw_content"), 700)
        sections.append(f"{position}. 标题: {heading}")
        sections.append(f" 来源: {link}")
        sections.append(f" 摘要: {snippet}")

    return _trim_text("\n".join(sections).strip(), max_chars)

View File

@ -51,6 +51,23 @@ class JianyingTaskTests(unittest.TestCase):
self.assertEqual(f"indextts2:{ref_path}", params.voice_name)
def test_normalize_omnivoice_clone_uses_valid_param_reference(self):
with tempfile.NamedTemporaryFile(suffix=".wav") as ref:
params = VideoClipParams(tts_engine="omnivoice", voice_name=f"omnivoice:{ref.name}")
with patch.dict(jianying_task.config.omnivoice, {"mode": "voice_clone"}, clear=False):
jianying_task._normalize_indextts_reference_audio(params)
self.assertEqual(f"omnivoice:{ref.name}", params.voice_name)
def test_normalize_omnivoice_auto_does_not_require_reference(self):
params = VideoClipParams(tts_engine="omnivoice", voice_name="omnivoice:auto")
with patch.dict(jianying_task.config.omnivoice, {"mode": "auto", "reference_audio": ""}, clear=False):
jianying_task._normalize_indextts_reference_audio(params)
self.assertEqual("omnivoice:auto", params.voice_name)
def test_normalize_indextts_requires_existing_reference_audio(self):
params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural")

View File

@ -0,0 +1,84 @@
import json
import os
import tempfile
import unittest
from unittest import mock
from app.services import clip_video
from app.utils import check_script
class TestMultiVideoScriptSources(unittest.TestCase):
    """Tests for the optional per-segment video source fields in scripts."""

    def test_check_format_accepts_optional_video_source_fields(self):
        """A script item carrying ``video_id`` and ``video_name`` passes check_format."""
        script = [
            {
                "_id": 1,
                "video_id": 2,
                "video_name": "2.mp4",
                "timestamp": "00:00:00,000-00:00:03,000",
                "picture": "画面",
                "narration": "解说",
                "OST": 0,
            }
        ]
        result = check_script.check_format(json.dumps(script, ensure_ascii=False))
        self.assertTrue(result["success"])

    def test_clip_video_unified_resolves_source_by_video_id_and_name(self):
        """Segments are routed to the right source file via ``video_id`` or ``video_name``."""
        with tempfile.TemporaryDirectory() as temp_dir:
            video_1 = os.path.join(temp_dir, "1.mp4")
            video_2 = os.path.join(temp_dir, "2.mp4")
            # Placeholder files: only their existence and paths are inspected here.
            for video_path in [video_1, video_2]:
                with open(video_path, "wb") as file:
                    file.write(b"video")
            output_dir = os.path.join(temp_dir, "clips")
            used_sources = []

            def fake_process(source_video_path, script_item, output_dir_arg, *_args):
                # Stand-in for the real segment processor: records which source
                # was chosen and writes a dummy clip file instead of encoding.
                used_sources.append(source_video_path)
                output_path = os.path.join(output_dir_arg, f"{script_item['_id']}.mp4")
                with open(output_path, "wb") as file:
                    file.write(b"clip")
                return output_path

            script_list = [
                {
                    # Source selected by number only.
                    "_id": 1,
                    "video_id": 2,
                    "timestamp": "00:00:00,000-00:00:03,000",
                    "picture": "视频2画面",
                    "narration": "播放原片1",
                    "OST": 1,
                },
                {
                    # Source selected by file name only.
                    "_id": 2,
                    "video_name": "1.mp4",
                    "timestamp": "00:00:03,000-00:00:06,000",
                    "picture": "视频1画面",
                    "narration": "播放原片2",
                    "OST": 1,
                },
            ]
            # NOTE(review): the parenthesized multi-item `with` is only officially
            # supported from Python 3.10 — confirm the project's minimum version.
            with (
                mock.patch.object(clip_video, "check_hardware_acceleration", return_value=None),
                mock.patch.object(clip_video, "_process_original_audio_segment", side_effect=fake_process),
            ):
                result = clip_video.clip_video_unified(
                    video_origin_path=video_1,
                    video_origin_paths=[video_1, video_2],
                    script_list=script_list,
                    tts_results=[],
                    output_dir=output_dir,
                    task_id="multi-video-test",
                )
            self.assertEqual([video_2, video_1], used_sources)
            self.assertEqual({1, 2}, set(result.keys()))
if __name__ == "__main__":
unittest.main()

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import os
import re
import json
@ -1298,6 +1300,10 @@ def tts(
if tts_engine == config.INDEXTTS2_ENGINE:
logger.info("分发到 IndexTTS-2")
return indextts2_tts(text, voice_name, voice_file)
if tts_engine == config.OMNIVOICE_ENGINE:
logger.info("分发到 OmniVoice")
return omnivoice_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == "doubaotts":
logger.info("分发到豆包语音 TTS")
@ -1783,7 +1789,11 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
voice_name = config.normalize_indextts_voice_prefix(parse_voice_name(voice_name))
output_dir = utils.task_dir(task_id)
tts_results = []
audio_extension = ".wav" if tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else ".mp3"
audio_extension = ".wav" if tts_engine in (
config.INDEXTTS_ENGINE,
config.INDEXTTS2_ENGINE,
config.OMNIVOICE_ENGINE,
) else ".mp3"
for item in list_script:
if item['OST'] != 1:
@ -1809,11 +1819,11 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
f"或者使用其他 tts 引擎")
continue
else:
# SoulVoice、Qwen3、IndexTTS、豆包语音 引擎不生成精确字幕文件
# SoulVoice、Qwen3、IndexTTS、OmniVoice、豆包语音 引擎不生成精确字幕文件
if (
is_soulvoice_voice(voice_name)
or is_qwen_engine(tts_engine)
or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE)
or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE, config.OMNIVOICE_ENGINE)
or tts_engine == "doubaotts"
):
# 获取实际音频文件的时长
@ -2256,6 +2266,17 @@ def parse_indextts2_voice(voice_name: str) -> str:
return voice_name
def parse_omnivoice_voice(voice_name: str) -> str:
    """
    Parse an OmniVoice voice name.
    Supported format: omnivoice:reference_audio_path
    Returns the part after the prefix (a reference audio path or a mode
    name); values without the prefix, or non-string values, are returned
    unchanged.
    """
    if not isinstance(voice_name, str):
        return voice_name

    prefix = config.OMNIVOICE_VOICE_PREFIX
    if voice_name.startswith(prefix):
        return voice_name[len(prefix):]
    return voice_name
def indextts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
"""
使用 IndexTTS-1.5 API 进行零样本语音克隆
@ -2493,3 +2514,141 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str) -> Union[SubMaker
logger.error("IndexTTS-2 TTS 生成失败,已达到最大重试次数")
return None
def _normalize_omnivoice_api_url(api_url: str) -> str:
api_url = (api_url or "http://127.0.0.1:7866/tts").strip()
if api_url.endswith("/tts"):
return api_url
if api_url.endswith("/tts/json"):
return f"{api_url[:-len('/tts/json')]}/tts"
return f"{api_url.rstrip('/')}/tts"
def _download_omnivoice_audio(response: requests.Response, api_url: str, voice_file: str, proxies: dict) -> bool:
    """Save the audio produced by an OmniVoice API call to ``voice_file``.

    A non-JSON response body is written to disk as-is. A JSON response is
    expected to carry an ``audio_url`` field, which is resolved against
    ``api_url`` and downloaded in a second request.

    Args:
        response: Response of the synthesis request.
        api_url: Endpoint URL used as the base for a relative ``audio_url``.
        voice_file: Destination path for the audio data.
        proxies: Proxy mapping forwarded to the download request.

    Returns:
        ``True`` when a non-empty file was written, otherwise ``False``.
    """

    def _persist(source) -> bool:
        # Write the body of `source` to the target file and report non-emptiness.
        with open(voice_file, "wb") as sink:
            sink.write(source.content)
        return os.path.getsize(voice_file) > 0

    declared_type = response.headers.get("content-type", "").lower()
    if "application/json" not in declared_type:
        return _persist(response)

    result = response.json()
    audio_url = result.get("audio_url") if isinstance(result, dict) else ""
    if not audio_url:
        logger.error(f"OmniVoice API 响应中没有音频下载地址: {result}")
        return False

    audio_response = requests.get(urljoin(api_url, audio_url), proxies=proxies, timeout=180)
    if audio_response.status_code != 200:
        logger.error(f"OmniVoice 音频下载失败: {audio_response.status_code} - {audio_response.text}")
        return False

    return _persist(audio_response)
def _optional_omnivoice_generation_data(voice_speed: float) -> dict:
    """Build the tunable generation fields for an OmniVoice request.

    ``speed`` is always present: the given ``voice_speed`` when truthy,
    otherwise the configured ``speed`` (default 1.0). ``num_step``,
    ``guidance_scale`` and ``duration`` are copied from the config only when
    set to something other than ``None`` or ``""``. The boolean switches are
    sent as the strings ``"true"`` / ``"false"`` when present in the config.

    Args:
        voice_speed: Requested speech speed.

    Returns:
        Form fields to merge into the request data.
    """
    settings = getattr(config, "omnivoice", {}) or {}
    payload = {"speed": voice_speed or settings.get("speed", 1.0)}

    for field in ("num_step", "guidance_scale", "duration"):
        setting = settings.get(field)
        if setting in (None, ""):
            continue
        payload[field] = setting

    payload.update(
        {
            flag: "true" if settings.get(flag) else "false"
            for flag in ("denoise", "postprocess_output", "preprocess_prompt")
            if flag in settings
        }
    )
    return payload
def omnivoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Synthesize speech through the OmniVoice-Pack FastAPI service.
    Supports three modes: automatic voice, instructed voice design, and
    reference-audio voice cloning.

    Args:
        text: Text to synthesize.
        voice_name: Voice identifier, optionally prefixed with the OmniVoice
            voice prefix; the remainder may be a reference audio path.
        voice_file: Path the generated audio is written to.
        speed: Speech speed forwarded to the service.

    Returns:
        A SubMaker holding a single subtitle event spanning the whole audio,
        or None when the reference audio is missing or all three attempts fail.
    """
    omnivoice_config = getattr(config, "omnivoice", {}) or {}
    api_url = _normalize_omnivoice_api_url(omnivoice_config.get("api_url", "http://127.0.0.1:7866/tts"))
    mode = omnivoice_config.get("mode", "auto")
    language = (omnivoice_config.get("language", "zh") or "").strip()
    instruct = (omnivoice_config.get("instruct", "") or "").strip()
    ref_text = (omnivoice_config.get("ref_text", "") or "").strip()
    parsed_voice = parse_omnivoice_voice(voice_name)
    # A voice name that points at an existing file forces clone mode,
    # regardless of the configured mode.
    if mode != "voice_clone" and parsed_voice and os.path.isfile(parsed_voice):
        mode = "voice_clone"
    reference_audio_path = ""
    if mode == "voice_clone":
        # Prefer the file named by the voice; otherwise use the configured one.
        candidate = parsed_voice
        if candidate and os.path.isfile(candidate):
            reference_audio_path = candidate
        else:
            reference_audio_path = parse_omnivoice_voice(omnivoice_config.get("reference_audio", "") or "")
        if not reference_audio_path or not os.path.exists(reference_audio_path):
            logger.error(f"OmniVoice 参考音频文件不存在: {reference_audio_path}")
            return None
    elif mode != "voice_design":
        # The instruction text is only meaningful in voice_design mode.
        instruct = ""
    data = {
        "text": text.strip(),
        "language": language,
        **_optional_omnivoice_generation_data(speed),
    }
    if mode == "voice_design" and instruct:
        data["instruct"] = instruct
    if mode == "voice_clone" and ref_text:
        data["ref_text"] = ref_text
    proxies = _get_configured_proxies()
    # Up to three attempts, pausing two seconds between them.
    for attempt in range(3):
        files = {}
        try:
            if reference_audio_path:
                # Re-opened on every attempt; closed in the finally block below.
                files["ref_audio"] = open(reference_audio_path, "rb")
            logger.info(f"{attempt + 1} 次调用 OmniVoice API: {api_url}, mode={mode}")
            response = requests.post(
                api_url,
                files=files or None,
                data=data,
                proxies=proxies,
                timeout=240,
            )
            if response.status_code == 200 and _download_omnivoice_audio(response, api_url, voice_file, proxies):
                logger.info(f"OmniVoice 成功生成音频: {voice_file}, 大小: {os.path.getsize(voice_file)} 字节")
                sub_maker = new_sub_maker()
                duration = get_audio_duration_from_file(voice_file)
                # If the duration cannot be measured, estimate 200 ms per
                # character with a floor of one second.
                duration_ms = int(duration * 1000) if duration > 0 else max(1000, int(len(text) * 200))
                # NOTE(review): ms * 10000 presumably converts to the 100-ns
                # ticks used by the subtitle helper — confirm against add_subtitle_event.
                add_subtitle_event(sub_maker, 0, duration_ms * 10000, text)
                return sub_maker
            logger.error(f"OmniVoice API 调用失败: {response.status_code} - {response.text}")
        except requests.exceptions.Timeout:
            logger.error(f"OmniVoice API 调用超时 (尝试 {attempt + 1}/3)")
        except requests.exceptions.RequestException as e:
            logger.error(f"OmniVoice API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
        except Exception as e:
            logger.error(f"OmniVoice TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")
        finally:
            # Best-effort cleanup of the reference audio handle.
            for file_obj in files.values():
                try:
                    file_obj.close()
                except Exception:
                    pass
        if attempt < 2:
            time.sleep(2)
    logger.error("OmniVoice TTS 生成失败,已达到最大重试次数")
    return None

View File

@ -57,6 +57,23 @@ def check_format(script_content: str) -> Dict[str, Any]:
'details': f'当前值: {clip["_id"]} (类型: {type(clip["_id"]).__name__})'
}
# 验证可选视频来源字段。旧脚本可以不包含,新脚本用于多视频定位。
if 'video_id' in clip and clip['video_id'] not in ("", None):
if not isinstance(clip['video_id'], int) or clip['video_id'] <= 0:
return {
'success': False,
'message': f'{i+1}个片段的video_id必须是正整数',
'details': f'当前值: {clip["video_id"]} (类型: {type(clip["video_id"]).__name__})'
}
if 'video_name' in clip and clip['video_name'] not in ("", None):
if not isinstance(clip['video_name'], str):
return {
'success': False,
'message': f'{i+1}个片段的video_name必须是字符串',
'details': f'当前值: {clip["video_name"]} (类型: {type(clip["video_name"]).__name__})'
}
# 验证 timestamp 字段格式
timestamp_pattern = r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$'
if not isinstance(clip['timestamp'], str) or not re.match(timestamp_pattern, clip['timestamp']):

View File

@ -49,6 +49,12 @@
text_openai_max_tokens = 65536
text_openai_thinking_level = "auto" # auto/off/low/medium/high
# ===== Tavily 联网搜索配置 =====
# 用于短剧剧情理解前,按短剧名称检索公开剧情/人物/分集信息
tavily_api_key = "" # 获取地址https://app.tavily.com
tavily_search_depth = "basic" # basic / advanced / fast / ultra-fast
tavily_max_results = 5
# ===== API Keys 参考 =====
# 主流 LLM Providers API Key 获取地址:
#
@ -171,6 +177,30 @@
repetition_penalty = 10.0
max_mel_tokens = 1500
[omnivoice]
# OmniVoice-Pack 语音合成配置
# 支持 OmniVoice-Pack FastAPI 接口POST /tts
api_url = "http://127.0.0.1:7866/tts"
language = "zh"
# 生成模式auto / voice_design / voice_clone
mode = "auto"
instruct = ""
# voice_clone 模式下使用,音色列表复用 IndexTTS-1.5 的资源目录
reference_audio_source = "resource"
reference_audio = ""
ref_text = ""
# 高级生成参数
num_step = 32
guidance_scale = 2.0
speed = 1.0
duration = ""
denoise = true
postprocess_output = true
preprocess_prompt = true
[doubaotts]
# 豆包语音 TTS 配置
# 申请流程:
@ -189,7 +219,7 @@
silence_duration = 0.125
[ui]
# TTS引擎选择 (indextts, indextts2, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech)
# TTS引擎选择 (indextts, indextts2, omnivoice, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech)
tts_engine = "indextts"
# Edge TTS 配置

View File

@ -2,7 +2,7 @@
requests>=2.32.0
moviepy==2.1.1
edge-tts==7.2.7
streamlit>=1.57.0
streamlit==1.56.0
watchdog==6.0.0
loguru>=0.7.3
tomli>=2.2.1

View File

@ -243,6 +243,12 @@ def get_voice_name_for_tts_engine(tts_engine: str) -> str:
if reference_audio:
return f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}"
return config.ui.get('voice_name', '')
if tts_engine == config.OMNIVOICE_ENGINE:
mode = config.omnivoice.get('mode', 'auto')
reference_audio = config.omnivoice.get('reference_audio', '')
if mode == 'voice_clone' and reference_audio:
return f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}"
return f"{config.OMNIVOICE_VOICE_PREFIX}{mode}"
if tts_engine == 'doubaotts':
return config.ui.get('doubaotts_voice_type', 'BV700_streaming')
if tts_engine == 'soulvoice':
@ -263,6 +269,7 @@ def get_jianying_export_params(draft_name=None) -> VideoClipParams:
return VideoClipParams(
video_clip_json_path=st.session_state['video_clip_json_path'],
video_origin_path=st.session_state['video_origin_path'],
video_origin_paths=st.session_state.get('video_origin_paths', []),
tts_engine=tts_engine,
voice_name=voice_name,
voice_rate=voice_rate,

View File

@ -40,6 +40,11 @@ BGM_RESOURCE_DIR = "/Users/viccy/Downloads/tts-mp3-clone/bgms-safe"
BGM_TRACKS_JSON = os.path.join(BGM_RESOURCE_DIR, "tracks.json")
BGM_UPLOAD_SUBDIR = "uploaded_bgms"
BGM_AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg")
# Engine identifiers that get_tts_engine_deployment_label() reports as
# "Local Deployment"; any engine not listed here is reported as "Cloud Service".
LOCAL_TTS_ENGINES = {
    config.INDEXTTS_ENGINE,
    config.INDEXTTS2_ENGINE,
    config.OMNIVOICE_ENGINE,
}
def get_soulvoice_voices():
@ -55,9 +60,10 @@ def get_soulvoice_voices():
def get_tts_engine_options(tr=lambda key: key):
"""获取TTS引擎选项"""
return {
engine_options = {
config.INDEXTTS_ENGINE: config.INDEXTTS_DISPLAY_NAME,
config.INDEXTTS2_ENGINE: config.INDEXTTS2_DISPLAY_NAME,
config.OMNIVOICE_ENGINE: config.OMNIVOICE_DISPLAY_NAME,
"edge_tts": "Edge TTS",
"qwen3_tts": tr("Tongyi Qwen3 TTS"),
"tencent_tts": tr("Tencent Cloud TTS"),
@ -65,6 +71,25 @@ def get_tts_engine_options(tr=lambda key: key):
"azure_speech": "Azure Speech Services"
}
return {
engine: format_tts_engine_option(engine, display_name, tr)
for engine, display_name in engine_options.items()
}
def get_tts_engine_deployment_label(tts_engine, tr=lambda key: key):
    """Return the translated deployment-type label for a TTS engine.

    Engines contained in ``LOCAL_TTS_ENGINES`` map to the "Local Deployment"
    key and every other engine maps to "Cloud Service"; the chosen key is
    passed through ``tr`` before being returned.
    """
    label_key = "Local Deployment" if tts_engine in LOCAL_TTS_ENGINES else "Cloud Service"
    return tr(label_key)
def format_tts_engine_option(tts_engine, display_name, tr=lambda key: key):
    """Build the drop-down text for a TTS engine as ``"<display name> [<deployment label>]"``.

    The bracketed suffix comes from ``get_tts_engine_deployment_label``.
    """
    suffix = get_tts_engine_deployment_label(tts_engine, tr)
    return "{} [{}]".format(display_name, suffix)
def get_tts_engine_descriptions(tr=lambda key: key):
"""获取TTS引擎详细描述"""
@ -105,6 +130,12 @@ def get_tts_engine_descriptions(tr=lambda key: key):
"use_case": tr("IndexTTS2 use case"),
"registration": None
},
config.OMNIVOICE_ENGINE: {
"title": config.OMNIVOICE_DISPLAY_NAME,
"features": tr("OmniVoice features"),
"use_case": tr("OmniVoice use case"),
"registration": None
},
"doubaotts": {
"title": tr("Doubao TTS"),
"features": tr("Doubao TTS features"),
@ -546,6 +577,8 @@ def render_tts_settings(tr):
render_indextts_tts_settings(tr)
elif selected_engine == config.INDEXTTS2_ENGINE:
render_indextts2_tts_settings(tr)
elif selected_engine == config.OMNIVOICE_ENGINE:
render_omnivoice_tts_settings(tr)
elif selected_engine == "doubaotts":
render_doubaotts_settings(tr)
@ -1274,6 +1307,148 @@ def render_indextts2_tts_settings(tr):
st.session_state['voice_pitch'] = 1.0
def _omnivoice_bounded_number(raw_value, default, lower, upper, cast=float):
    """Coerce a saved config value with ``cast`` and clamp it into ``[lower, upper]``.

    ``default`` is used when ``raw_value`` cannot be converted (wrong type,
    unparsable text, overflow) or converts to NaN.
    """
    try:
        number = cast(raw_value)
    except (TypeError, ValueError, OverflowError):
        number = cast(default)
    if number != number:  # NaN is the only value that differs from itself
        number = cast(default)
    return max(lower, min(upper, number))


def render_omnivoice_tts_settings(tr):
    """Render the OmniVoice TTS settings widgets and store the selections.

    Saved values are read from ``config.omnivoice`` to seed the widgets. After
    rendering, every setting is written back to ``config.omnivoice``,
    ``config.ui["voice_name"]`` is set to the OmniVoice-prefixed reference
    audio (voice_clone mode with an audio selected) or the prefixed mode name,
    and ``st.session_state`` receives ``voice_rate`` and a fixed
    ``voice_pitch`` of 1.0.

    Saved numeric values are coerced and clamped into each widget's range
    before being used as defaults: Streamlit raises when a default lies
    outside ``min_value``/``max_value``, so a hand-edited or stale config
    would otherwise break the whole settings panel.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    omnivoice_config = config.omnivoice

    api_url = st.text_input(
        tr("API URL"),
        value=omnivoice_config.get("api_url", "http://127.0.0.1:7866/tts"),
        help=tr("OmniVoice API URL Help"),
    )

    language = st.text_input(
        tr("OmniVoice Language Code"),
        value=omnivoice_config.get("language", "zh"),
        help=tr("OmniVoice Language Code Help"),
        placeholder="zh",
    )

    mode_options = [
        ("auto", tr("OmniVoice Mode Auto")),
        ("voice_design", tr("OmniVoice Mode Voice Design")),
        ("voice_clone", tr("OmniVoice Mode Voice Clone")),
    ]
    mode_values = [item[0] for item in mode_options]
    saved_mode = omnivoice_config.get("mode", "auto")
    # An unknown saved mode falls back to "auto" so index() below cannot fail.
    if saved_mode not in mode_values:
        saved_mode = "auto"
    mode = mode_options[st.selectbox(
        tr("OmniVoice Generation Mode"),
        options=range(len(mode_options)),
        index=mode_values.index(saved_mode),
        format_func=lambda x: mode_options[x][1],
        help=tr("OmniVoice Generation Mode Help"),
    )][0]

    # Start from the saved values; only the widgets of the active mode may
    # overwrite them, so settings of the inactive modes are preserved.
    instruct = omnivoice_config.get("instruct", "")
    reference_audio_source = omnivoice_config.get("reference_audio_source", "resource")
    reference_audio = omnivoice_config.get("reference_audio", "")
    ref_text = omnivoice_config.get("ref_text", "")

    if mode == "voice_design":
        instruct = st.text_area(
            tr("OmniVoice Instruct"),
            value=instruct,
            help=tr("OmniVoice Instruct Help"),
            placeholder=tr("OmniVoice Instruct Placeholder"),
            height=80,
        )
    elif mode == "voice_clone":
        reference_audio_source, reference_audio = render_indextts_reference_audio_selector(
            tr,
            omnivoice_config,
            "omnivoice",
        )
        ref_text = st.text_area(
            tr("OmniVoice Reference Text"),
            value=ref_text,
            help=tr("OmniVoice Reference Text Help"),
            placeholder=tr("OmniVoice Reference Text Placeholder"),
            height=90,
        )

    with st.expander(tr("Advanced Parameters"), expanded=False):
        col1, col2 = st.columns(2)
        with col1:
            num_step = st.slider(
                "Num Step",
                min_value=4,
                max_value=64,
                value=_omnivoice_bounded_number(
                    omnivoice_config.get("num_step", 32), 32, 4, 64, cast=int
                ),
                step=1,
                help=tr("OmniVoice Num Step Help"),
            )
            guidance_scale = st.slider(
                "Guidance Scale",
                min_value=0.1,
                max_value=10.0,
                value=_omnivoice_bounded_number(
                    omnivoice_config.get("guidance_scale", 2.0), 2.0, 0.1, 10.0
                ),
                step=0.1,
                help=tr("OmniVoice Guidance Scale Help"),
            )
            voice_rate = st.slider(
                tr("Voice Rate"),
                min_value=0.5,
                max_value=2.0,
                value=_omnivoice_bounded_number(
                    omnivoice_config.get("speed", 1.0), 1.0, 0.5, 2.0
                ),
                step=0.1,
                help=tr("Voice Rate Help 0.5-2.0"),
            )
        with col2:
            # An empty/None/unparsable saved duration is shown as 0, which the
            # save step below stores back as "".
            duration_value = _omnivoice_bounded_number(
                omnivoice_config.get("duration", ""), 0.0, 0.0, 120.0
            )
            duration = st.number_input(
                tr("OmniVoice Duration"),
                min_value=0.0,
                max_value=120.0,
                value=duration_value,
                step=0.5,
                help=tr("OmniVoice Duration Help"),
            )
            denoise = st.checkbox(
                tr("OmniVoice Denoise"),
                value=bool(omnivoice_config.get("denoise", True)),
                help=tr("OmniVoice Denoise Help"),
            )
            postprocess_output = st.checkbox(
                tr("OmniVoice Postprocess Output"),
                value=bool(omnivoice_config.get("postprocess_output", True)),
                help=tr("OmniVoice Postprocess Output Help"),
            )
            preprocess_prompt = st.checkbox(
                tr("OmniVoice Preprocess Prompt"),
                value=bool(omnivoice_config.get("preprocess_prompt", True)),
                help=tr("OmniVoice Preprocess Prompt Help"),
            )

    with st.expander(tr("OmniVoice Usage Instructions Title"), expanded=False):
        st.markdown(tr("OmniVoice Usage Instructions"))

    config.omnivoice["api_url"] = api_url
    config.omnivoice["language"] = language
    config.omnivoice["mode"] = mode
    config.omnivoice["instruct"] = instruct
    config.omnivoice["reference_audio_source"] = reference_audio_source
    config.omnivoice["reference_audio"] = reference_audio
    config.omnivoice["ref_text"] = ref_text
    config.omnivoice["num_step"] = num_step
    config.omnivoice["guidance_scale"] = guidance_scale
    config.omnivoice["speed"] = voice_rate
    # A duration of 0 is stored as an empty string rather than a number.
    config.omnivoice["duration"] = duration if duration > 0 else ""
    config.omnivoice["denoise"] = denoise
    config.omnivoice["postprocess_output"] = postprocess_output
    config.omnivoice["preprocess_prompt"] = preprocess_prompt

    if mode == "voice_clone" and reference_audio:
        config.ui["voice_name"] = f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}"
    else:
        config.ui["voice_name"] = f"{config.OMNIVOICE_VOICE_PREFIX}{mode}"
    st.session_state["voice_rate"] = voice_rate
    st.session_state["voice_pitch"] = 1.0
def render_doubaotts_settings(tr):
"""渲染豆包语音 TTS 设置"""
# AK 输入
@ -1567,6 +1742,15 @@ def render_voice_preview_new(tr, selected_engine):
voice_name = f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}"
voice_rate = 1.0 # IndexTTS-2 使用自身生成参数
voice_pitch = 1.0
elif selected_engine == config.OMNIVOICE_ENGINE:
mode = config.omnivoice.get("mode", "auto")
reference_audio = config.omnivoice.get("reference_audio", "")
if mode == "voice_clone" and reference_audio:
voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}"
else:
voice_name = f"{config.OMNIVOICE_VOICE_PREFIX}{mode}"
voice_rate = config.omnivoice.get("speed", 1.0)
voice_pitch = 1.0
elif selected_engine == "doubaotts":
voice_type = config.ui.get("doubaotts_voice_type", "BV700_streaming")
voice_name = voice_type
@ -1579,7 +1763,11 @@ def render_voice_preview_new(tr, selected_engine):
with st.spinner(tr("Synthesizing Voice")):
temp_dir = utils.storage_dir("temp", create=True)
audio_format = "audio/wav" if selected_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE) else "audio/mp3"
audio_format = "audio/wav" if selected_engine in (
config.INDEXTTS_ENGINE,
config.INDEXTTS2_ENGINE,
config.OMNIVOICE_ENGINE,
) else "audio/mp3"
audio_extension = ".wav" if audio_format == "audio/wav" else ".mp3"
audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}{audio_extension}")

View File

@ -260,6 +260,7 @@ def render_basic_settings(tr):
with left_config_panel:
render_language_settings(tr)
render_proxy_settings(tr)
render_tavily_search_settings(tr)
with middle_config_panel:
render_vision_llm_settings(tr) # 视觉分析模型设置
@ -345,6 +346,32 @@ def render_proxy_settings(tr):
config.ui["jianying_draft_path"] = jianying_draft_path
def render_tavily_search_settings(tr):
    """Render the Tavily API key input and persist the key when it changes.

    The stripped key is handed to ``update_app_config_if_changed``; when that
    reports a change the config is saved, the key is mirrored into
    ``st.session_state["tavily_api_key"]`` and a success message is shown.
    A failure while saving is shown in the UI and logged.
    """
    st.subheader(tr("Tavily Search Settings"))
    key_url_label = tr('API Key URL')
    st.markdown(f"{key_url_label}: [https://app.tavily.com](https://app.tavily.com)")

    entered_key = st.text_input(
        tr("Tavily API Key"),
        value=config.app.get("tavily_api_key", ""),
        type="password",
        help=tr("Tavily API Key Help"),
        key="tavily_api_key_input",
    )
    cleaned_key = str(entered_key or "").strip()

    # Nothing to persist when the stored key already matches the input.
    if not update_app_config_if_changed("tavily_api_key", cleaned_key):
        return

    try:
        config.save_config()
        st.session_state["tavily_api_key"] = cleaned_key
        st.success(tr("Tavily config saved"))
    except Exception as e:
        error_text = str(e)
        st.error(f"{tr('Failed to save config')}: {error_text}")
        logger.error(f"保存 Tavily 配置失败: {error_text}")
def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
"""测试视觉模型连接

View File

@ -17,7 +17,7 @@ from webui.tools.generate_script_short import generate_script_short
from webui.tools.generate_short_summary import analyze_short_drama_plot, generate_script_short_sunmmary
SCRIPT_TABLE_BASE_COLUMNS = ["_id", "timestamp", "picture", "narration", "OST"]
SCRIPT_TABLE_BASE_COLUMNS = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"]
VIDEO_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"]
VIDEO_GLOB_PATTERNS = [f"*.{suffix}" for suffix in VIDEO_UPLOAD_TYPES]
@ -99,15 +99,24 @@ def _read_subtitle_file(path):
return f.read()
def _build_combined_subtitle_content(subtitle_paths):
def _build_combined_subtitle_content(subtitle_paths, video_paths=None):
sections = []
subtitle_contents = {}
for subtitle_path in subtitle_paths:
video_paths = _normalize_video_paths(video_paths)
for index, subtitle_path in enumerate(subtitle_paths, start=1):
if not subtitle_path or not os.path.exists(subtitle_path):
continue
content = _read_subtitle_file(subtitle_path)
subtitle_contents[subtitle_path] = content
sections.append(f"# {os.path.basename(subtitle_path)}\n{content}".strip())
video_path = video_paths[index - 1] if index <= len(video_paths) else ""
if video_path:
header = (
f"# 视频 {index}: {os.path.basename(video_path)}\n"
f"字幕文件: {os.path.basename(subtitle_path)}"
)
else:
header = f"# 视频 {index}\n字幕文件: {os.path.basename(subtitle_path)}"
sections.append(f"{header}\n{content}".strip())
return "\n\n".join(sections), subtitle_contents
@ -120,7 +129,10 @@ def _selected_subtitle_paths():
def _set_subtitle_state(subtitle_paths):
subtitle_paths = _normalize_video_paths(subtitle_paths)
subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths)
subtitle_content, subtitle_contents = _build_combined_subtitle_content(
subtitle_paths,
_selected_video_paths(),
)
st.session_state['subtitle_path'] = subtitle_paths[0] if subtitle_paths else None
st.session_state['subtitle_paths'] = subtitle_paths
st.session_state['subtitle_content'] = subtitle_content if subtitle_content else None
@ -128,6 +140,20 @@ def _set_subtitle_state(subtitle_paths):
st.session_state['subtitle_file_processed'] = bool(subtitle_paths)
def _short_drama_plot_analysis_signature(subtitle_paths, video_theme, web_search_enabled, video_paths=None):
    """Serialize the inputs of a plot analysis into a comparable JSON string.

    The string covers the normalized subtitle and video paths, the web-search
    flag and the stripped drama name. The name is only included while web
    search is enabled; otherwise it is recorded as an empty string.
    """
    if web_search_enabled:
        theme = str(video_theme or "").strip()
    else:
        theme = ""

    payload = {
        "subtitle_paths": _normalize_video_paths(subtitle_paths),
        "video_paths": _normalize_video_paths(video_paths),
        "video_theme": theme,
        "web_search_enabled": bool(web_search_enabled),
    }
    return json.dumps(payload, ensure_ascii=False, sort_keys=True)
def render_script_panel(tr):
"""渲染脚本配置面板"""
with st.container(border=True):
@ -525,16 +551,71 @@ def short_drama_summary(tr):
render_fun_asr_transcription(tr)
render_subtitle_preview(tr)
current_subtitle_path = st.session_state.get('subtitle_path', '')
plot_analysis_source = st.session_state.get('short_drama_plot_analysis_subtitle_path')
if plot_analysis_source and plot_analysis_source != current_subtitle_path:
st.session_state['short_drama_plot_analysis'] = ""
st.session_state['short_drama_plot_analysis_subtitle_path'] = ""
current_subtitle_paths = _selected_subtitle_paths()
current_subtitle_path = current_subtitle_paths[0] if current_subtitle_paths else ''
name_cols = st.columns([4, 1.2], vertical_alignment="bottom")
st.markdown(
"""
<style>
.st-key-short_drama_web_search_enabled [data-testid="stMarkdownContainer"] {
display: none;
}
.st-key-short_drama_web_search_enabled [data-testid="stWidgetLabel"] {
min-width: 0;
transform: translateX(-1.2rem);
}
.st-key-short_drama_web_search_enabled label {
align-items: center;
gap: 0.45rem;
}
.st-key-short_drama_web_search_enabled label > div:first-child {
width: 3rem !important;
min-width: 3rem !important;
height: 1.55rem !important;
border-radius: 999px !important;
border: 1px solid #d1d5db !important;
background: #e5e7eb !important;
box-shadow: inset 0 1px 2px rgba(15, 23, 42, 0.08) !important;
transition: background 160ms ease, border-color 160ms ease, box-shadow 160ms ease !important;
}
.st-key-short_drama_web_search_enabled label:hover > div:first-child {
background: #dbe3ef !important;
border-color: #b8c2d3 !important;
}
.st-key-short_drama_web_search_enabled label:has(input[aria-checked="true"]) > div:first-child {
border-color: transparent !important;
background: linear-gradient(135deg, #2563eb, #14b8a6) !important;
box-shadow: 0 6px 14px rgba(37, 99, 235, 0.22) !important;
}
.st-key-short_drama_web_search_enabled label > div:first-child > div {
width: 1.05rem !important;
height: 1.05rem !important;
border-radius: 999px !important;
background: #ffffff !important;
box-shadow: 0 2px 6px rgba(15, 23, 42, 0.24) !important;
}
.st-key-short_drama_web_search_enabled button[aria-label^="Help for"] {
color: #6b7280 !important;
}
.st-key-short_drama_web_search_enabled button[aria-label^="Help for"]:hover {
color: #2563eb !important;
}
</style>
""",
unsafe_allow_html=True,
)
name_cols = st.columns([3.4, 1.1, 2], vertical_alignment="bottom")
with name_cols[0]:
video_theme = st.text_input(tr("短剧名称"))
with name_cols[1]:
web_search_enabled = st.toggle(
tr("联网搜索"),
key="short_drama_web_search_enabled",
help=tr("Enable Web Search Help"),
disabled=not current_subtitle_path,
)
with name_cols[2]:
analyze_plot_clicked = st.button(
tr("剧情理解"),
key="short_drama_plot_analysis_button",
@ -543,17 +624,37 @@ def short_drama_summary(tr):
)
st.session_state['video_theme'] = video_theme
current_signature = _short_drama_plot_analysis_signature(
current_subtitle_paths,
video_theme,
web_search_enabled,
_selected_video_paths(),
)
saved_signature = st.session_state.get('short_drama_plot_analysis_signature')
legacy_source = st.session_state.get('short_drama_plot_analysis_subtitle_path')
if (
(saved_signature and saved_signature != current_signature)
or (legacy_source and legacy_source != current_subtitle_path)
):
st.session_state['short_drama_plot_analysis'] = ""
st.session_state['short_drama_plot_analysis_subtitle_path'] = ""
st.session_state['short_drama_plot_analysis_signature'] = ""
if analyze_plot_clicked:
with st.spinner(tr("Analyzing plot...")):
plot_analysis = analyze_short_drama_plot(
current_subtitle_path,
current_subtitle_paths,
st.session_state.get('temperature', 0.7),
tr,
subtitle_content=st.session_state.get('subtitle_content', ''),
short_name=video_theme,
enable_web_search=web_search_enabled,
video_paths=_selected_video_paths(),
)
if plot_analysis:
st.session_state['short_drama_plot_analysis'] = plot_analysis
st.session_state['short_drama_plot_analysis_subtitle_path'] = current_subtitle_path
st.session_state['short_drama_plot_analysis_signature'] = current_signature
st.success(tr("Plot analysis completed"))
if st.session_state.get('short_drama_plot_analysis'):
@ -575,7 +676,10 @@ def render_subtitle_preview(tr):
subtitle_contents = {}
if subtitle_paths and (not subtitle_content or not subtitle_contents):
subtitle_content, subtitle_contents = _build_combined_subtitle_content(subtitle_paths)
subtitle_content, subtitle_contents = _build_combined_subtitle_content(
subtitle_paths,
_selected_video_paths(),
)
st.session_state['subtitle_content'] = subtitle_content
st.session_state['subtitle_contents'] = subtitle_contents
@ -724,7 +828,7 @@ def _normalize_script_table_value(column, value):
if _is_blank_table_value(value):
return ""
if column in {"_id", "OST"}:
if column in {"_id", "video_id", "OST"}:
try:
return int(value)
except (TypeError, ValueError):
@ -783,6 +887,14 @@ def render_video_script_editor(tr):
column_order=column_order,
column_config={
"_id": st.column_config.NumberColumn(tr("Script Column ID"), step=1, format="%d", width=52),
"video_id": st.column_config.NumberColumn(
tr("Script Column Video ID"),
min_value=1,
step=1,
format="%d",
width=80,
),
"video_name": st.column_config.TextColumn(tr("Script Column Video Name"), width=180),
"timestamp": st.column_config.TextColumn(tr("Script Column Timestamp"), width=200),
"picture": st.column_config.TextColumn(tr("Script Column Picture"), width=320),
"narration": st.column_config.TextColumn(tr("Script Column Narration"), width=480),
@ -1057,7 +1169,10 @@ def render_fun_asr_transcription(tr):
st.error(tr("Fun-ASR failed without subtitle file"))
return
subtitle_content, subtitle_contents = _build_combined_subtitle_content(generated_paths)
subtitle_content, subtitle_contents = _build_combined_subtitle_content(
generated_paths,
media_paths,
)
if not subtitle_content.strip():
clear_fun_asr_subtitle_state()
st.error(tr("Fun-ASR failed without subtitle file"))
@ -1112,20 +1227,35 @@ def render_script_buttons(tr, params):
generate_script_short(tr, params, custom_clips)
elif script_path == "summary":
# 执行 短剧解说 脚本生成
subtitle_path = st.session_state.get('subtitle_path')
subtitle_paths = _selected_subtitle_paths()
subtitle_path = subtitle_paths[0] if subtitle_paths else None
video_theme = st.session_state.get('video_theme')
temperature = st.session_state.get('temperature')
web_search_enabled = bool(st.session_state.get('short_drama_web_search_enabled', False))
current_signature = _short_drama_plot_analysis_signature(
subtitle_paths,
video_theme,
web_search_enabled,
_selected_video_paths(),
)
plot_analysis = ""
if st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path:
if st.session_state.get('short_drama_plot_analysis_signature') == current_signature:
plot_analysis = st.session_state.get('short_drama_plot_analysis', '')
elif (
not web_search_enabled
and st.session_state.get('short_drama_plot_analysis_subtitle_path') == subtitle_path
):
plot_analysis = st.session_state.get('short_drama_plot_analysis', '')
generate_script_short_sunmmary(
params,
subtitle_path,
subtitle_paths,
video_theme,
temperature,
tr,
plot_analysis=plot_analysis,
subtitle_content=st.session_state.get('subtitle_content', ''),
enable_web_search=web_search_enabled,
video_paths=_selected_video_paths(),
)
else:
load_script(tr, script_path)
@ -1172,6 +1302,8 @@ def save_script_with_validation(tr, video_clip_json_details):
example_script = [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:00,600-00:00:07,559",
"picture": "工地上,蔡晓艳奋力救人,场面混乱",
"narration": "灾后重建,工地上险象环生!泼辣女工蔡晓艳挺身而出,救人第一!",
@ -1179,6 +1311,8 @@ def save_script_with_validation(tr, video_clip_json_details):
},
{
"_id": 2,
"video_id": 2,
"video_name": "2.mp4",
"timestamp": "00:00:08,240-00:00:12,359",
"picture": "领导视察,蔡晓艳不屑一顾",
"narration": "播放原片4",

View File

@ -604,7 +604,7 @@ def render_font_settings(tr):
def is_disabled_subtitle_settings(tts_engine:str)->bool:
"""是否禁用字幕设置"""
return tts_engine=="soulvoice" or tts_engine=="qwen3_tts"
return tts_engine=="soulvoice" or tts_engine=="qwen3_tts" or tts_engine==config.OMNIVOICE_ENGINE
def render_position_settings(tr):
"""渲染位置设置"""

View File

@ -15,6 +15,8 @@
"Video script table help": "Edit the full script JSON as a table. You can add or delete rows; saving will validate and write the script file again.",
"Raw JSON Preview": "Raw JSON Preview",
"Script Column ID": "ID",
"Script Column Video ID": "Video",
"Script Column Video Name": "Video Name",
"Script Column Timestamp": "Timestamp",
"Script Column Picture": "Picture",
"Script Column Narration": "Narration",
@ -286,7 +288,11 @@
"IndexTTS download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5",
"IndexTTS2 features": "A locally or privately deployed IndexTTS-2 voice-cloning engine with emotion control and fuller generation parameters.",
"IndexTTS2 use case": "Best for fixed voices, emotional narration, and local speech synthesis workflows that need finer sampling controls. Start the IndexTTS-2 API service before use.",
"OmniVoice features": "A locally or privately deployed OmniVoice-Pack multilingual TTS engine with automatic voice generation, voice design, and reference-audio cloning.",
"OmniVoice use case": "Best for local controllable multilingual narration, voice design, or reference-audio cloning. Start the OmniVoice-Pack API service before use.",
"Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.",
"Local Deployment": "Local Deployment",
"Cloud Service": "Cloud Service",
"Select TTS Engine": "Select TTS Engine",
"Select TTS Engine Help": "Choose the text-to-speech engine you want to use.",
"TTS Engine Details": "📋 {engine} Details",
@ -413,6 +419,16 @@
"Subtitle calibration succeeded for multiple files": "Subtitle calibration succeeded for {count} files: {files}",
"Subtitle calibration failed": "Subtitle calibration failed",
"Transcribed subtitles storage hint": "Previously transcribed subtitles are saved in {path}; drag a file from that folder to upload",
"Tavily Search Settings": "Tavily Web Search",
"Tavily API Key": "Tavily API Key",
"Tavily API Key Help": "Used for web search before short drama plot analysis. When Web Search is enabled, the app searches plot, character, and episode context by drama name, then combines it with subtitles.",
"Tavily config saved": "Tavily configuration saved",
"联网搜索": "Web Search",
"Enable Web Search Help": "When enabled, plot analysis searches the web with Tavily by short drama name before combining those results with subtitles.",
"Please configure Tavily API Key in Basic Settings": "Please configure the Tavily API Key in Basic Settings first",
"Please enter short drama name before web search": "Please enter the short drama name before enabling web search",
"Searching short drama with Tavily...": "Searching short drama context with Tavily...",
"Tavily search failed": "Tavily search failed",
"剧情理解": "Plot Analysis",
"剧情理解结果": "Plot Analysis Result",
"Analyzing plot...": "Analyzing plot...",
@ -443,6 +459,30 @@
"API URL": "API URL",
"IndexTTS API URL Help": "IndexTTS-1.5 API service URL",
"IndexTTS2 API URL Help": "IndexTTS-2 API service URL. You can enter the service root or the full /tts endpoint.",
"OmniVoice API URL Help": "OmniVoice-Pack API service URL. You can enter the service root or the full /tts endpoint.",
"OmniVoice Language Code": "Synthesis Language",
"OmniVoice Language Code Help": "The language parameter sent to OmniVoice-Pack, such as zh or en.",
"OmniVoice Generation Mode": "Generation Mode",
"OmniVoice Generation Mode Help": "Automatic voice needs no extra fields; voice design uses an instruction; reference-audio cloning needs reference audio and matching text.",
"OmniVoice Mode Auto": "Automatic Voice",
"OmniVoice Mode Voice Design": "Voice Design",
"OmniVoice Mode Voice Clone": "Reference Audio Clone",
"OmniVoice Instruct": "Voice Instruction",
"OmniVoice Instruct Help": "Describe the desired voice, such as gender, pitch, accent, or style.",
"OmniVoice Instruct Placeholder": "e.g. female, low pitch, british accent",
"OmniVoice Reference Text": "Reference Audio Text",
"OmniVoice Reference Text Help": "The exact transcript of the reference audio. Required when the deployed service has ASR disabled.",
"OmniVoice Reference Text Placeholder": "Enter the text spoken in the reference audio",
"OmniVoice Num Step Help": "Diffusion generation steps. Higher values usually improve quality but slow generation.",
"OmniVoice Guidance Scale Help": "Controls how strongly text conditions guide generation.",
"OmniVoice Duration": "Target Duration (seconds)",
"OmniVoice Duration Help": "0 lets the model decide the duration automatically.",
"OmniVoice Denoise": "Enable Denoise",
"OmniVoice Denoise Help": "Ask OmniVoice-Pack to denoise the generated output.",
"OmniVoice Postprocess Output": "Postprocess Output",
"OmniVoice Postprocess Output Help": "Enable OmniVoice-Pack output post-processing.",
"OmniVoice Preprocess Prompt": "Preprocess Text",
"OmniVoice Preprocess Prompt Help": "Enable OmniVoice-Pack text preprocessing.",
"Reference Audio Source": "Reference Audio Source",
"Reference Audio Source Help": "Choose a reference audio from the resource directory or upload a new one.",
"Select from Resource Directory": "Select from Resource Directory",
@ -502,6 +542,8 @@
"Max Mel Tokens Help": "Controls the maximum mel tokens generated in one request. Higher values can produce longer audio.",
"IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 Usage Instructions",
"IndexTTS2 Usage Instructions": "**IndexTTS-2 voice cloning**\n\n1. **Choose a voice**: reuse IndexTTS-1.5 resource audio or upload a reference audio file\n2. **Set API URL**: for example http://192.168.3.6:7863/tts, or enter the service root\n3. **Tune emotion**: speaker is the default; switch to audio, vector, or text when needed\n4. **Tune generation**: temperature, top_p, top_k, num_beams, repetition_penalty, and max_mel_tokens are sent directly to the IndexTTS-2 API\n\n**Notes**:\n- Reference audio quality directly affects cloning quality\n- The first request may load the model and take longer\n- CPU deployments are much slower than GPU deployments",
"OmniVoice Usage Instructions Title": "OmniVoice Usage Instructions",
"OmniVoice Usage Instructions": "**OmniVoice-Pack speech synthesis**\n\n1. **Automatic voice**: set the API URL and language, then synthesize directly.\n2. **Voice design**: fill instruct with the desired gender, pitch, accent, or style.\n3. **Reference-audio clone**: upload or choose reference audio and fill its matching transcript.\n\n**Notes**:\n- The default service URL is http://127.0.0.1:7866/tts\n- Reference-audio cloning requires reference text when the service has no ASR model loaded\n- OmniVoice returns WAV audio, and NarratoAI estimates subtitle segment timing from the audio duration",
"Volcengine Access Key Help": "Volcengine Access Key",
"Volcengine Secret Key Help": "Volcengine Secret Key",
"Doubao AppID Help": "Doubao TTS application AppID",

View File

@ -159,6 +159,8 @@
"Video script table help": "在表格中编辑完整脚本 JSON。可新增、删除行；保存时会重新校验并写入脚本文件。",
"Raw JSON Preview": "原始 JSON 预览",
"Script Column ID": "序号",
"Script Column Video ID": "视频",
"Script Column Video Name": "视频文件",
"Script Column Timestamp": "时间戳",
"Script Column Picture": "画面描述",
"Script Column Narration": "解说台词",
@ -267,7 +269,11 @@
"IndexTTS download link": "下载地址https://pan.quark.cn/s/0767c9bcefd5",
"IndexTTS2 features": "本地/私有部署的 IndexTTS-2 语音克隆引擎,支持情感控制和更完整的生成参数。",
"IndexTTS2 use case": "适合需要固定音色、情绪化旁白或更细致采样控制的本地语音合成场景。使用前请先启动 IndexTTS-2 API 服务。",
"OmniVoice features": "本地/私有部署的 OmniVoice-Pack 多语种语音合成引擎,支持自动音色、指令音色和参考音频克隆。",
"OmniVoice use case": "适合需要本地可控、多语言旁白、音色设计或参考音频克隆的场景。使用前请先启动 OmniVoice-Pack API 服务。",
"Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快",
"Local Deployment": "本地部署",
"Cloud Service": "云端服务",
"Select TTS Engine": "选择 TTS 引擎",
"Select TTS Engine Help": "选择您要使用的文本转语音引擎",
"TTS Engine Details": "📋 {engine} 详细说明",
@ -395,6 +401,16 @@
"Subtitle calibration succeeded for multiple files": "字幕校准成功,共 {count} 个文件: {files}",
"Subtitle calibration failed": "字幕校准失败",
"Transcribed subtitles storage hint": "之前转录生成的字幕保存在 {path},可从该目录拖入上传",
"Tavily Search Settings": "Tavily 联网搜索",
"Tavily API Key": "Tavily API Key",
"Tavily API Key Help": "用于短剧剧情理解前的联网检索。开启“联网搜索”后,会先按短剧名称检索剧情、人物和分集信息,再结合字幕分析。",
"Tavily config saved": "Tavily 配置已保存",
"联网搜索": "联网搜索",
"Enable Web Search Help": "开启后,剧情理解会先使用 Tavily 按短剧名称联网检索,再结合检索结果和字幕分析剧情。",
"Please configure Tavily API Key in Basic Settings": "请先在基础设置中配置 Tavily API Key",
"Please enter short drama name before web search": "开启联网搜索前,请先填写短剧名称",
"Searching short drama with Tavily...": "正在使用 Tavily 检索短剧信息...",
"Tavily search failed": "Tavily 检索失败",
"剧情理解": "剧情理解",
"剧情理解结果": "剧情理解结果",
"Analyzing plot...": "正在理解剧情...",
@ -425,6 +441,30 @@
"API URL": "API 地址",
"IndexTTS API URL Help": "IndexTTS-1.5 API 服务地址",
"IndexTTS2 API URL Help": "IndexTTS-2 API 服务地址,可填写服务根地址或完整 /tts 地址",
"OmniVoice API URL Help": "OmniVoice-Pack API 服务地址,可填写服务根地址或完整 /tts 地址",
"OmniVoice Language Code": "合成语言",
"OmniVoice Language Code Help": "传给 OmniVoice-Pack 的 language 参数,例如 zh、en。",
"OmniVoice Generation Mode": "生成模式",
"OmniVoice Generation Mode Help": "自动音色无需额外参数;指令音色使用描述词;参考音频克隆需要参考音频和对应文本。",
"OmniVoice Mode Auto": "自动音色",
"OmniVoice Mode Voice Design": "指令音色",
"OmniVoice Mode Voice Clone": "参考音频克隆",
"OmniVoice Instruct": "音色指令",
"OmniVoice Instruct Help": "描述希望生成的音色,例如性别、音高、口音或风格。",
"OmniVoice Instruct Placeholder": "例如：female, low pitch, british accent",
"OmniVoice Reference Text": "参考音频文本",
"OmniVoice Reference Text Help": "参考音频对应的逐字文本;当前部署未启用 ASR 时必须填写。",
"OmniVoice Reference Text Placeholder": "请输入参考音频中实际朗读的内容",
"OmniVoice Num Step Help": "扩散生成步数,值越大通常质量更高但速度更慢。",
"OmniVoice Guidance Scale Help": "控制文本条件的引导强度。",
"OmniVoice Duration": "目标时长(秒)",
"OmniVoice Duration Help": "0 表示由模型自动决定时长。",
"OmniVoice Denoise": "启用降噪",
"OmniVoice Denoise Help": "让 OmniVoice-Pack 对生成结果执行降噪处理。",
"OmniVoice Postprocess Output": "后处理输出",
"OmniVoice Postprocess Output Help": "启用 OmniVoice-Pack 的输出后处理。",
"OmniVoice Preprocess Prompt": "预处理文本",
"OmniVoice Preprocess Prompt Help": "启用 OmniVoice-Pack 的文本预处理。",
"Reference Audio Source": "参考音频来源",
"Reference Audio Source Help": "选择从资源目录选择参考音频,或上传新的参考音频",
"Select from Resource Directory": "从资源目录选择",
@ -484,6 +524,8 @@
"Max Mel Tokens Help": "控制单次生成的最大 mel token 数,值越大可生成更长音频",
"IndexTTS2 Usage Instructions Title": "💡 IndexTTS-2 使用说明",
"IndexTTS2 Usage Instructions": "**IndexTTS-2 语音克隆**\n\n1. **选择音色**:复用 IndexTTS-1.5 的资源音频或上传参考音频\n2. **设置 API 地址**:例如 http://192.168.3.6:7863/tts也可以填写服务根地址\n3. **调整情感参数**:默认使用 speaker可按需切换到 audio、vector 或 text\n4. **调整生成参数**temperature、top_p、top_k、num_beams、repetition_penalty 和 max_mel_tokens 会直接传给 IndexTTS-2 接口\n\n**注意事项**\n- 参考音频质量会直接影响克隆效果\n- 首次请求可能需要加载模型,耗时更长\n- CPU 部署生成速度会明显慢于 GPU",
"OmniVoice Usage Instructions Title": "OmniVoice 使用说明",
"OmniVoice Usage Instructions": "**OmniVoice-Pack 语音合成**\n\n1. **自动音色**:只需要设置 API 地址和语言,可直接合成。\n2. **指令音色**:填写 instruct 描述想要的性别、音高、口音或风格。\n3. **参考音频克隆**:上传或选择参考音频,并填写该音频对应文本。\n\n**注意事项**\n- 当前默认服务地址为 http://127.0.0.1:7866/tts\n- 参考音频克隆在服务未加载 ASR 模型时必须填写参考文本\n- OmniVoice 返回 WAV 音频,系统会按音频时长估算字幕段落",
"Volcengine Access Key Help": "火山引擎 Access Key",
"Volcengine Secret Key Help": "火山引擎 Secret Key",
"Doubao AppID Help": "豆包语音应用 AppID",

View File

@ -17,12 +17,101 @@ from loguru import logger
from app.config import config
from app.services.SDE.short_drama_explanation import analyze_subtitle, generate_narration_script
from app.services.subtitle_text import read_subtitle_text
from app.services.tavily_search import TavilySearchError, format_search_context, search_short_drama
# 导入新的LLM服务模块 - 确保提供商被注册
import app.services.llm # 这会触发提供商注册
from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter
import re
def _normalize_paths(paths):
if isinstance(paths, str):
paths = [paths]
if not paths:
return []
normalized_paths = []
seen = set()
for path in paths:
if not isinstance(path, str):
continue
path = path.strip()
if not path or path in seen:
continue
normalized_paths.append(path)
seen.add(path)
return normalized_paths
def _build_combined_subtitle_content(subtitle_paths, video_paths=None):
    """Merge several subtitle files into one text with a header per video.

    Args:
        subtitle_paths: One subtitle path or an iterable of subtitle paths.
        video_paths: Optional video path(s); the N-th video is paired with
            the N-th subtitle (after both lists are normalized).

    Returns:
        The sections joined by a blank line. Each section starts with a
        ``# 视频 N`` header (including the video file name when a paired
        video exists), followed by the subtitle file name and its text.
        Subtitle paths missing on disk are skipped but keep their number.
    """
    known_videos = _normalize_paths(video_paths)
    combined = []
    for position, subtitle_file in enumerate(_normalize_paths(subtitle_paths), start=1):
        if not os.path.exists(subtitle_file):
            continue

        paired_video = known_videos[position - 1] if position <= len(known_videos) else ""
        title = f"# 视频 {position}"
        if paired_video:
            title = f"{title}: {os.path.basename(paired_video)}"
        header = f"{title}\n字幕文件: {os.path.basename(subtitle_file)}"

        subtitle_text = read_subtitle_text(subtitle_file).text
        combined.append(f"{header}\n{subtitle_text}".strip())
    return "\n\n".join(combined)
def _coerce_video_id(value):
try:
video_id = int(value)
except (TypeError, ValueError):
return None
return video_id if video_id > 0 else None
def _match_video_id_by_name(video_name, video_paths):
video_name = str(video_name or "").strip()
if not video_name:
return None
for index, video_path in enumerate(video_paths, start=1):
if os.path.basename(video_path) == os.path.basename(video_name):
return index
return None
def _normalize_narration_items_video_sources(items, video_paths):
    """Attach a valid ``video_id`` / ``video_name`` to every script item.

    Args:
        items: Iterable of narration script items (dicts are processed,
            anything else is passed through untouched).
        video_paths: The selected source video path(s).

    Returns:
        ``items`` unchanged when no video paths are available; otherwise a
        new list in which each dict item is a shallow copy carrying a
        1-based ``video_id`` and the matching file name in ``video_name``.
    """
    sources = _normalize_paths(video_paths)
    if not sources:
        return items

    def _with_video_source(entry):
        # Non-dict entries cannot carry video metadata; keep them as they are.
        if not isinstance(entry, dict):
            return entry

        updated = entry.copy()
        resolved_id = _coerce_video_id(updated.get("video_id") or updated.get("video_index"))
        # A file-name match takes precedence over the numeric id.
        id_from_name = _match_video_id_by_name(
            updated.get("video_name") or updated.get("source_video"),
            sources,
        )
        if id_from_name:
            resolved_id = id_from_name

        if resolved_id is None or resolved_id > len(sources):
            logger.warning(f"片段 {updated.get('_id')} 未提供有效 video_id默认使用视频 1")
            resolved_id = 1

        updated["video_id"] = resolved_id
        updated["video_name"] = os.path.basename(sources[resolved_id - 1])
        return updated

    return [_with_video_source(entry) for entry in items]
def parse_and_fix_json(json_string):
"""
解析并修复JSON字符串
@ -135,12 +224,83 @@ def parse_and_fix_json(json_string):
return None
def analyze_short_drama_plot(subtitle_path, temperature, tr=lambda key: key, subtitle_content=None):
def _get_tavily_api_key() -> str:
    """Return the Tavily API key, stripped of surrounding whitespace.

    The Streamlit session state is consulted first; when it holds no
    (truthy) key, the ``tavily_api_key`` entry of the app config is used.
    An empty string is returned when neither source provides a key.
    """
    api_key = st.session_state.get("tavily_api_key")
    if not api_key:
        api_key = config.app.get("tavily_api_key")
    if not api_key:
        api_key = ""
    return api_key.strip()
def _build_tavily_context(short_name: str, tr=lambda key: key) -> str | None:
    """Search Tavily for a short drama and return the formatted context.

    Args:
        short_name: Name of the short drama to search for.
        tr: Translation callable applied to UI message keys.

    Returns:
        The result of ``format_search_context`` for the search data, or
        ``None`` after reporting an error through ``st.error`` when the
        name is empty, no API key is configured, or the search fails.
    """
    drama_name = str(short_name or "").strip()
    if not drama_name:
        st.error(tr("Please enter short drama name before web search"))
        return None

    tavily_key = _get_tavily_api_key()
    if not tavily_key:
        st.error(tr("Please configure Tavily API Key in Basic Settings"))
        return None

    try:
        results = search_short_drama(
            drama_name,
            tavily_key,
            search_depth=config.app.get("tavily_search_depth", "basic"),
            max_results=config.app.get("tavily_max_results", 5),
        )
        return format_search_context(results)
    except Exception as e:
        # Known search failures log only the message; anything unexpected
        # logs the full traceback. Both are surfaced to the user the same way.
        if isinstance(e, TavilySearchError):
            logger.error(f"Tavily 短剧检索失败: {e!s}")
        else:
            logger.error(f"Tavily 短剧检索异常: {traceback.format_exc()}")
        st.error(f"{tr('Tavily search failed')}: {e!s}")
        return None
def _build_plot_analysis_input(
subtitle_content: str,
short_name: str = "",
enable_web_search: bool = False,
tr=lambda key: key,
) -> str | None:
subtitle_content = str(subtitle_content or "").strip()
if not enable_web_search:
return subtitle_content
tavily_context = _build_tavily_context(short_name, tr)
if tavily_context is None:
return None
return f"""# 分析补充说明
请先参考 Tavily 联网检索结果理解短剧名称人物关系剧情背景和公开剧情梗概再结合原始字幕完成剧情理解
如果联网检索结果与字幕内容冲突请以字幕内容为准时间戳必须只从字幕内容中提取
{tavily_context}
# 原始字幕
{subtitle_content}"""
def analyze_short_drama_plot(
subtitle_path,
temperature,
tr=lambda key: key,
subtitle_content=None,
short_name: str = "",
enable_web_search: bool = False,
video_paths=None,
):
"""仅执行短剧字幕剧情理解,返回可编辑的剧情分析文本。"""
if not subtitle_path:
subtitle_paths = _normalize_paths(subtitle_path)
if not subtitle_paths:
st.error(tr("Please generate or upload subtitles first"))
return None
if not os.path.exists(subtitle_path):
missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)]
if missing_subtitle_paths:
st.error(tr("Subtitle file does not exist"))
return None
@ -149,19 +309,31 @@ def analyze_short_drama_plot(subtitle_path, temperature, tr=lambda key: key, sub
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
subtitle_content = str(subtitle_content or "").strip() or read_subtitle_text(subtitle_path).text
subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content(
subtitle_paths,
video_paths,
)
if not subtitle_content:
st.error(tr("Subtitle file is empty or unreadable"))
return None
plot_analysis_input = _build_plot_analysis_input(
subtitle_content,
short_name=short_name,
enable_web_search=enable_web_search,
tr=tr,
)
if plot_analysis_input is None:
return None
try:
logger.info("使用新的LLM服务架构进行字幕分析")
analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider)
analysis_result = analyzer.analyze_subtitle(subtitle_content)
analysis_result = analyzer.analyze_subtitle(plot_analysis_input)
except Exception as e:
logger.warning(f"使用新LLM服务失败回退到旧实现: {str(e)}")
analysis_result = analyze_subtitle(
subtitle_content=subtitle_content,
subtitle_content=plot_analysis_input,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
@ -186,6 +358,8 @@ def generate_script_short_sunmmary(
tr=lambda key: key,
plot_analysis=None,
subtitle_content=None,
enable_web_search: bool = False,
video_paths=None,
):
"""
生成 短剧解说 视频脚本
@ -204,7 +378,12 @@ def generate_script_short_sunmmary(
try:
with st.spinner(tr("Generating script...")):
if not params.video_origin_path:
selected_video_paths = _normalize_paths(
video_paths
or getattr(params, "video_origin_paths", [])
or getattr(params, "video_origin_path", "")
)
if not selected_video_paths:
st.error(tr("Please select video file first"))
return
"""
@ -212,7 +391,9 @@ def generate_script_short_sunmmary(
"""
update_progress(30, tr("Parsing subtitles..."))
# 判断字幕文件是否存在
if not os.path.exists(subtitle_path):
subtitle_paths = _normalize_paths(subtitle_path)
missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)]
if not subtitle_paths or missing_subtitle_paths:
st.error(tr("Subtitle file does not exist"))
return
@ -225,7 +406,10 @@ def generate_script_short_sunmmary(
text_base_url = config.app.get(f'text_{text_provider}_base_url')
# 读取字幕文件内容(无论使用哪种实现都需要)
subtitle_content = str(subtitle_content or "").strip() or read_subtitle_text(subtitle_path).text
subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content(
subtitle_paths,
selected_video_paths,
)
if not subtitle_content:
st.error(tr("Subtitle file is empty or unreadable"))
return
@ -238,16 +422,27 @@ def generate_script_short_sunmmary(
"analysis": str(plot_analysis).strip(),
}
else:
plot_analysis_input = subtitle_content
if enable_web_search:
update_progress(40, tr("Searching short drama with Tavily..."))
plot_analysis_input = _build_plot_analysis_input(
subtitle_content,
short_name=video_theme,
enable_web_search=True,
tr=tr,
)
if plot_analysis_input is None:
return
try:
# 优先使用新的LLM服务架构
logger.info("使用新的LLM服务架构进行字幕分析")
analysis_result = analyzer.analyze_subtitle(subtitle_content)
analysis_result = analyzer.analyze_subtitle(plot_analysis_input)
except Exception as e:
logger.warning(f"使用新LLM服务失败回退到旧实现: {str(e)}")
# 回退到旧的实现
analysis_result = analyze_subtitle(
subtitle_content=subtitle_content,
subtitle_content=plot_analysis_input,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
@ -320,7 +515,11 @@ def generate_script_short_sunmmary(
logger.error(f"JSON结构错误缺少items字段: {narration_dict}")
st.stop()
script = json.dumps(narration_dict['items'], ensure_ascii=False, indent=2)
narration_items = _normalize_narration_items_video_sources(
narration_dict['items'],
selected_video_paths,
)
script = json.dumps(narration_items, ensure_ascii=False, indent=2)
if script is None:
st.error(tr("Script generation failed check logs"))