Merge pull request #250 from linyqh/develop

Release: NarratoAI v0.8.x
This commit is contained in:
viccy 2026-06-10 00:08:22 +08:00 committed by GitHub
commit 2fd397e05a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
68 changed files with 16708 additions and 1438 deletions

3
.gitignore vendored
View File

@ -51,3 +51,6 @@ tests/*
!tests/test_script_service_documentary_unittest.py
!tests/test_generate_narration_script_documentary_unittest.py
!tests/test_generate_script_docu_unittest.py
docs/reddit-community
docs/wechat-0.8

View File

@ -41,10 +41,11 @@ NarratoAI 是一款自动化影视解说工具,基于 LLM 实现文案撰写
本项目仅供学习和研究使用,不得商用。如需商业授权,请联系作者。
## 最新资讯
- 2026.06.10 发布新版本 0.8.1,**大版本更新**,优化多个核心流程
- 2026.04.27 发布新版本 0.7.9,新增 **Fun-ASR一键转录字幕**
- 2026.04.03 发布新版本 0.7.8,重构纪录片逐帧分析链路,统一共享服务并优化抽帧、缓存、视觉并发与文案生成流程
- 2026.03.27 发布新版本 0.7.7,出于安全考虑,已移除 LiteLLM 依赖,统一使用 OpenAI 兼容请求链路
- 2025.11.20 发布新版本 0.7.5,新增 [IndexTTS2](https://github.com/index-tts/index-tts) 语音克隆支持
- 2025.11.20 发布新版本 0.7.5,新增 [IndexTTS-1.5](https://github.com/index-tts/index-tts) 语音克隆支持
- 2025.10.15 发布新版本 0.7.3,升级大模型供应商管理能力
- 2025.09.10 发布新版本 0.7.2,新增腾讯云 TTS
- 2025.08.18 发布新版本 0.7.1,支持 **语音克隆** 和 最新大模型
@ -100,7 +101,7 @@ _**1. NarratoAI 是一款完全免费的软件,近期在社交媒体(抖音,B
- [X] 支持短剧解说
- [ ] 主角人脸匹配
- [ ] 支持根据口播,文案,视频素材自动匹配
- [ ] 支持更多 TTS 引擎
- [X] 支持更多 TTS 引擎
- [ ] ...
## 快速启动 🚀

View File

@ -9,6 +9,56 @@ from app.config.defaults import build_default_app_config, merge_missing_app_defa
root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
config_file = f"{root_dir}/config.toml"
version_file = f"{root_dir}/project_version"
# Config-section keys and human-readable display names for the TTS engines.
INDEXTTS_ENGINE = "indextts"
INDEXTTS_DISPLAY_NAME = "IndexTTS-1.5"
INDEXTTS2_ENGINE = "indextts2"
INDEXTTS2_DISPLAY_NAME = "IndexTTS-2"
OMNIVOICE_ENGINE = "omnivoice"
OMNIVOICE_DISPLAY_NAME = "OmniVoice"
# "<engine>:" prefixes; migrate_indextts_config compares the IndexTTS ones
# against ui["voice_name"].
INDEXTTS_VOICE_PREFIX = f"{INDEXTTS_ENGINE}:"
INDEXTTS2_VOICE_PREFIX = f"{INDEXTTS2_ENGINE}:"
OMNIVOICE_VOICE_PREFIX = f"{OMNIVOICE_ENGINE}:"
def normalize_tts_engine_name(tts_engine: str) -> str:
    """Return the TTS engine name exactly as given.

    This is currently an identity pass-through: no aliasing or rewriting of
    the engine name is performed.
    """
    return tts_engine
def normalize_indextts_voice_prefix(voice_name: str) -> str:
    """Return the voice name exactly as given.

    This is currently an identity pass-through: the engine prefix of the
    voice name is left untouched.
    """
    return voice_name
def _is_legacy_indextts2_config(indextts2_config) -> bool:
    """Tell whether an ``indextts2`` config section is a legacy one.

    A section counts as legacy when it is a dict whose ``api_url`` contains
    the text ``8081`` and which carries none of the IndexTTS-2 specific keys
    listed below. Anything that is not a dict is never legacy.
    """
    if not isinstance(indextts2_config, dict):
        return False

    api_url = str(indextts2_config.get("api_url", ""))

    indextts2_only_keys = (
        "emotion_mode",
        "emotion_alpha",
        "max_text_tokens_per_segment",
        "max_mel_tokens",
        "vec_calm",
    )
    for key in indextts2_only_keys:
        if key in indextts2_config:
            # Any IndexTTS-2 specific option marks a genuine IndexTTS-2 section.
            return False

    return "8081" in api_url
def migrate_indextts_config(config_data):
    """Migrate a legacy ``indextts2`` section to the ``indextts`` engine in place.

    When the ``indextts2`` section is recognised as legacy (see
    ``_is_legacy_indextts2_config``) it is moved under the ``indextts`` key,
    unless an ``indextts`` section already exists, in which case the existing
    one is kept. The legacy ``indextts2`` section is removed either way.

    The ``ui`` section, when it is a dict, is updated to match: a selected
    ``indextts2`` engine becomes ``indextts`` after a legacy migration, and a
    ``voice_name`` carrying the ``indextts2:`` prefix is re-prefixed with
    ``indextts:`` whenever the selected engine is ``indextts``.

    Args:
        config_data: The loaded configuration mapping; it is mutated.

    Returns:
        The same ``config_data`` object.
    """
    legacy_section = config_data.get(INDEXTTS2_ENGINE)
    migrated_legacy_indextts2 = _is_legacy_indextts2_config(legacy_section)
    if migrated_legacy_indextts2:
        # Use the shared constant rather than a hard-coded "indextts" literal.
        if INDEXTTS_ENGINE not in config_data:
            config_data[INDEXTTS_ENGINE] = legacy_section
        config_data.pop(INDEXTTS2_ENGINE, None)

    ui_config = config_data.get("ui")
    if not isinstance(ui_config, dict):
        return config_data

    if migrated_legacy_indextts2 and ui_config.get("tts_engine") == INDEXTTS2_ENGINE:
        ui_config["tts_engine"] = INDEXTTS_ENGINE

    voice_name = ui_config.get("voice_name", "")
    # Guard against a non-string voice_name, which would otherwise raise
    # AttributeError on .startswith and abort config loading.
    if (
        isinstance(voice_name, str)
        and voice_name.startswith(INDEXTTS2_VOICE_PREFIX)
        and ui_config.get("tts_engine") == INDEXTTS_ENGINE
    ):
        reference = voice_name[len(INDEXTTS2_VOICE_PREFIX):]
        ui_config["voice_name"] = f"{INDEXTTS_VOICE_PREFIX}{reference}"

    return config_data
def get_version_from_file():
@ -32,13 +82,13 @@ def load_config():
_config_ = build_default_config()
write_config_file(_config_)
logger.info("create config.toml with shared defaults")
return _config_
return migrate_indextts_config(_config_)
logger.info(f"load config from file: {config_file}")
_config_ = load_toml_file(config_file)
_config_["app"] = merge_missing_app_defaults(_config_.get("app", {}))
return _config_
return migrate_indextts_config(_config_)
def load_toml_file(file_path):
@ -60,7 +110,7 @@ def build_default_config():
config_data = load_toml_file(example_file)
config_data["app"] = build_default_app_config(config_data.get("app", {}))
return config_data
return migrate_indextts_config(config_data)
def write_config_file(config_data):
@ -82,7 +132,9 @@ def save_config():
_cfg["ui"] = ui
_cfg["tts_qwen"] = tts_qwen
_cfg["fun_asr"] = fun_asr
_cfg["indextts"] = indextts
_cfg["indextts2"] = indextts2
_cfg["omnivoice"] = omnivoice
_cfg["doubaotts"] = doubaotts
f.write(toml.dumps(_cfg))
@ -98,7 +150,9 @@ ui = _cfg.get("ui", {})
frames = _cfg.get("frames", {})
tts_qwen = _cfg.get("tts_qwen", {})
fun_asr = _cfg.get("fun_asr", {})
indextts = _cfg.get("indextts", {})
indextts2 = _cfg.get("indextts2", {})
omnivoice = _cfg.get("omnivoice", {})
doubaotts = _cfg.get("doubaotts", {})
hostname = socket.gethostname()
@ -119,8 +173,43 @@ imagemagick_path = app.get("imagemagick_path", "")
if imagemagick_path and os.path.isfile(imagemagick_path):
os.environ["IMAGEMAGICK_BINARY"] = imagemagick_path
_applied_ffmpeg_dir = None


def apply_ffmpeg_path(ffmpeg_binary: str = "") -> None:
    """Apply the configured FFmpeg binary to this Python process.

    Sets ``IMAGEIO_FFMPEG_EXE`` to the absolute path of the binary and puts
    its directory at the front of ``PATH``. Any existing ``PATH`` entry for
    that directory, or for the directory applied by a previous call, is
    dropped so repeated calls do not accumulate duplicates. Empty ``PATH``
    entries are discarded.

    Args:
        ffmpeg_binary: Path to the FFmpeg executable; ``~`` is expanded.
            An empty value or a path that is not an existing file is
            ignored and leaves the environment untouched.
    """
    global _applied_ffmpeg_dir

    if not ffmpeg_binary:
        return

    # Expand "~" before the existence check; checking the raw string first
    # rejected every "~/..." path and made the expansion unreachable.
    ffmpeg_binary = os.path.abspath(os.path.expanduser(ffmpeg_binary))
    if not os.path.isfile(ffmpeg_binary):
        return

    ffmpeg_dir = os.path.dirname(ffmpeg_binary)
    os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_binary

    def _comparable(path_item: str) -> str:
        # normcase makes the comparison case-insensitive where the OS is.
        return os.path.normcase(os.path.abspath(path_item))

    stale_dirs = {_comparable(ffmpeg_dir)}
    if _applied_ffmpeg_dir:
        stale_dirs.add(_comparable(_applied_ffmpeg_dir))

    kept_paths = [
        path_item
        for path_item in os.environ.get("PATH", "").split(os.pathsep)
        if path_item and _comparable(path_item) not in stale_dirs
    ]

    os.environ["PATH"] = os.pathsep.join([ffmpeg_dir, *kept_paths])
    _applied_ffmpeg_dir = ffmpeg_dir
ffmpeg_path = app.get("ffmpeg_path", "")
if ffmpeg_path and os.path.isfile(ffmpeg_path):
os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_path
apply_ffmpeg_path(ffmpeg_path)
logger.info(f"{project_name} v{project_version}")

View File

@ -11,6 +11,21 @@ DEFAULT_VISION_OPENAI_MODEL_NAME = "Qwen/Qwen3.5-122B-A10B"
DEFAULT_TEXT_LLM_PROVIDER = DEFAULT_OPENAI_COMPATIBLE_PROVIDER
DEFAULT_TEXT_OPENAI_MODEL_NAME = "Pro/zai-org/GLM-5"
# Default generation parameters; expanded below for both the vision and the
# text model settings.
DEFAULT_LLM_GENERATION_CONFIG = {
"temperature": 1.0,
"top_p": 0.95,
"max_tokens": 65536,
"thinking_level": "auto",
}
# NOTE(review): presumably the accepted values for "thinking_level" — confirm
# against the code that consumes this list.
DEFAULT_LLM_THINKING_LEVELS = ["auto", "off", "low", "medium", "high"]
# Flattens the defaults into app-config keys named
# "<model_type>_openai_<param_name>", e.g. "vision_openai_temperature" and
# "text_openai_top_p".
DEFAULT_LLM_GENERATION_APP_CONFIG = {
f"{model_type}_openai_{param_name}": value
for model_type in ("vision", "text")
for param_name, value in DEFAULT_LLM_GENERATION_CONFIG.items()
}
DEFAULT_LLM_APP_CONFIG = {
"vision_llm_provider": DEFAULT_VISION_LLM_PROVIDER,
"vision_openai_model_name": DEFAULT_VISION_OPENAI_MODEL_NAME,
@ -20,7 +35,11 @@ DEFAULT_LLM_APP_CONFIG = {
"text_openai_model_name": DEFAULT_TEXT_OPENAI_MODEL_NAME,
"text_openai_api_key": "",
"text_openai_base_url": DEFAULT_OPENAI_COMPATIBLE_BASE_URL,
"tavily_api_key": "",
"tavily_search_depth": "basic",
"tavily_max_results": 5,
}
DEFAULT_LLM_APP_CONFIG.update(DEFAULT_LLM_GENERATION_APP_CONFIG)
def build_default_app_config(app_config: dict | None = None) -> dict:

View File

@ -53,13 +53,53 @@ hide_config = true
self.assertEqual("openai", config_data["app"]["vision_llm_provider"])
self.assertEqual("Qwen/Qwen3.5-122B-A10B", config_data["app"]["vision_openai_model_name"])
self.assertEqual("https://api.siliconflow.cn/v1", config_data["app"]["vision_openai_base_url"])
self.assertEqual(1.0, config_data["app"]["vision_openai_temperature"])
self.assertEqual(0.95, config_data["app"]["vision_openai_top_p"])
self.assertEqual("openai", config_data["app"]["text_llm_provider"])
self.assertEqual("Pro/zai-org/GLM-5", config_data["app"]["text_openai_model_name"])
self.assertEqual("https://api.siliconflow.cn/v1", config_data["app"]["text_openai_base_url"])
self.assertEqual(1.0, config_data["app"]["text_openai_temperature"])
self.assertEqual(0.95, config_data["app"]["text_openai_top_p"])
self.assertEqual("Qwen/Qwen3.5-122B-A10B", saved_config["app"]["vision_openai_model_name"])
self.assertEqual("Pro/zai-org/GLM-5", saved_config["app"]["text_openai_model_name"])
self.assertTrue(saved_config["app"]["hide_config"])
def test_legacy_indextts2_config_is_migrated_to_indextts_15(self):
    """A legacy ``indextts2`` section and its UI selection move to ``indextts``."""
    legacy_api_url = "http://127.0.0.1:8081/tts"
    legacy_config = {
        "indextts2": {"api_url": legacy_api_url},
        "ui": {
            "tts_engine": "indextts2",
            "voice_name": "indextts2:/tmp/reference.wav",
        },
    }

    migrated = cfg.migrate_indextts_config(legacy_config)
    migrated_ui = migrated["ui"]

    self.assertEqual(legacy_api_url, migrated["indextts"]["api_url"])
    self.assertNotIn("indextts2", migrated)
    self.assertEqual("indextts", migrated_ui["tts_engine"])
    self.assertEqual("indextts:/tmp/reference.wav", migrated_ui["voice_name"])
def test_indextts2_config_is_kept_as_separate_engine(self):
    """A genuine IndexTTS-2 section is left alone next to ``indextts``."""
    indextts_url = "http://127.0.0.1:8081/tts"
    indextts2_url = "http://192.168.3.6:7863/tts"
    selected_voice = "indextts2:/tmp/reference.wav"
    config_with_both_engines = {
        "indextts": {"api_url": indextts_url},
        "indextts2": {
            "api_url": indextts2_url,
            "emotion_mode": "speaker",
        },
        "ui": {
            "tts_engine": "indextts2",
            "voice_name": selected_voice,
        },
    }

    migrated = cfg.migrate_indextts_config(config_with_both_engines)
    migrated_ui = migrated["ui"]

    self.assertEqual(indextts_url, migrated["indextts"]["api_url"])
    self.assertEqual(indextts2_url, migrated["indextts2"]["api_url"])
    self.assertEqual("indextts2", migrated_ui["tts_engine"])
    self.assertEqual(selected_voice, migrated_ui["voice_name"])
class OpenAICompatibleModelDefaultsTests(unittest.TestCase):
def test_ui_keeps_full_model_name_and_openai_provider(self):

View File

@ -164,6 +164,9 @@ class VideoClipParams(BaseModel):
video_clip_json: Optional[list] = Field(default=[], description="LLM 生成的视频剪辑脚本内容")
video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径")
video_origin_path: Optional[str] = Field(default="", description="原视频路径")
video_origin_paths: Optional[List[str]] = Field(default=[], description="原视频路径列表")
original_subtitle_path: Optional[str] = Field(default="", description="原视频字幕路径")
original_subtitle_paths: Optional[List[str]] = Field(default=[], description="原视频字幕路径列表")
video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例")
video_language: Optional[str] = Field(default="zh-CN", description="视频语言")
@ -182,6 +185,28 @@ class VideoClipParams(BaseModel):
bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
subtitle_enabled: bool = True
subtitle_mask_enabled: bool = False
subtitle_mask_landscape_x_percent: float = 10.0
subtitle_mask_landscape_y_percent: float = 78.0
subtitle_mask_landscape_width_percent: float = 80.0
subtitle_mask_landscape_height_percent: float = 14.0
subtitle_mask_landscape_blur_radius: int = 18
subtitle_mask_landscape_opacity_percent: int = 82
subtitle_mask_portrait_x_percent: float = 8.0
subtitle_mask_portrait_y_percent: float = 79.0
subtitle_mask_portrait_width_percent: float = 84.0
subtitle_mask_portrait_height_percent: float = 16.0
subtitle_mask_portrait_blur_radius: int = 26
subtitle_mask_portrait_opacity_percent: int = 84
subtitle_position_landscape_y_percent: float = 85.0
subtitle_position_portrait_y_percent: float = 82.0
subtitle_auto_transcribe_enabled: bool = False
subtitle_auto_transcribe_backend: str = "local"
subtitle_auto_transcribe_api_url: str = ""
subtitle_auto_transcribe_firered_api_url: str = ""
subtitle_auto_transcribe_api_key: str = ""
subtitle_auto_transcribe_hotword: str = ""
subtitle_auto_transcribe_enable_spk: bool = False
font_name: str = "SimHei" # 默认使用黑体
font_size: int = 36
text_fore_color: str = "white" # 文本前景色
@ -206,4 +231,3 @@ class SubtitlePosition(str, Enum):
TOP = "top"
CENTER = "center"
BOTTOM = "bottom"

View File

@ -11,7 +11,7 @@
import os
import json
import requests
from typing import Dict, Any, Optional
from typing import Dict, Any, Optional, Tuple
from loguru import logger
from app.config import config
from app.utils.utils import get_uuid, storage_dir
@ -31,6 +31,7 @@ class SubtitleAnalyzer:
custom_prompt: Optional[str] = None,
temperature: Optional[float] = 1.0,
provider: Optional[str] = None,
prompt_category: str = "short_drama_narration",
):
"""
初始化字幕分析器
@ -49,6 +50,7 @@ class SubtitleAnalyzer:
self.base_url = base_url
self.temperature = temperature
self.provider = provider or self._detect_provider()
self.prompt_category = prompt_category or "short_drama_narration"
# 设置自定义提示词(如果提供)
self.custom_prompt = custom_prompt
@ -94,7 +96,7 @@ class SubtitleAnalyzer:
else:
# 使用新的提示词管理系统,正确传入参数
prompt = PromptManager.get_prompt(
category="short_drama_narration",
category=self.prompt_category,
name="plot_analysis",
parameters={"subtitle_content": subtitle_content}
)
@ -363,7 +365,179 @@ class SubtitleAnalyzer:
logger.error(f"保存分析结果时发生错误: {str(e)}")
return ""
def generate_narration_script(self, short_name: str, plot_analysis: str, subtitle_content: str = "", temperature: float = 0.7) -> Dict[str, Any]:
def _render_prompt(self, name: str, parameters: Dict[str, Any]) -> Tuple[str, Optional[str]]:
    """Render the prompt ``name`` from this analyzer's prompt category.

    Args:
        name: Prompt name inside ``self.prompt_category``.
        parameters: Values passed to the prompt manager for rendering.

    Returns:
        A ``(prompt, system_prompt)`` tuple: the rendered prompt text and
        the system prompt reported by the prompt object.
    """
    category = self.prompt_category
    rendered_prompt = PromptManager.get_prompt(
        category=category,
        name=name,
        parameters=parameters,
    )
    system_prompt = PromptManager.get_prompt_object(
        category=category,
        name=name,
    ).get_system_prompt()
    return rendered_prompt, system_prompt
def _generate_json_text(
    self,
    prompt: str,
    system_prompt: Optional[str],
    temperature: float,
) -> Dict[str, Any]:
    """Send ``prompt`` to the configured backend and request JSON output.

    Uses the native Gemini request path when ``self.is_native_gemini`` is
    truthy and the OpenAI-compatible path otherwise.
    """
    backend = (
        self._generate_narration_with_native_gemini
        if self.is_native_gemini
        else self._generate_narration_with_openai_compatible
    )
    return backend(prompt, temperature, system_prompt, json_output=True)
def _generate_plain_text(
    self,
    prompt: str,
    system_prompt: Optional[str],
    temperature: float,
) -> Dict[str, Any]:
    """Send ``prompt`` to the configured backend and request plain text.

    On a successful result the stripped text of ``narration_script`` is
    additionally stored under ``narration_copy``; other results are
    returned untouched.
    """
    backend = (
        self._generate_narration_with_native_gemini
        if self.is_native_gemini
        else self._generate_narration_with_openai_compatible
    )
    result = backend(prompt, temperature, system_prompt, json_output=False)
    if result.get("status") != "success":
        return result

    result["narration_copy"] = str(result.get("narration_script", "")).strip()
    return result
def generate_narration_copy(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str = "",
    temperature: float = 0.7,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
) -> Dict[str, Any]:
    """Generate narration body text for the user to review and edit.

    Renders the ``narration_copy`` prompt and requests plain-text output.

    Returns:
        The backend result dict, or on any exception a dict with
        ``status`` set to ``"error"``, the exception ``message`` and the
        requested ``temperature``.
    """
    try:
        prompt_parameters = {
            "drama_name": short_name,
            "drama_genre": drama_genre,
            "plot_analysis": plot_analysis,
            "subtitle_content": subtitle_content,
            "narration_language": narration_language,
        }
        prompt, system_prompt = self._render_prompt("narration_copy", prompt_parameters)
        return self._generate_plain_text(prompt, system_prompt, temperature)
    except Exception as e:
        logger.error(f"解说文案正文生成过程中发生错误: {str(e)}")
        return dict(status="error", message=str(e), temperature=temperature)
def match_narration_copy_to_script(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str,
    narration_copy: str,
    temperature: float = 0.3,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
    original_sound_ratio: int = 30,
) -> Dict[str, Any]:
    """Match the user-reviewed narration copy to subtitle timestamps.

    Renders the ``script_matching`` prompt and requests JSON output. The
    temperature sent to the backend is capped at 0.3.

    Returns:
        The backend result dict, or on any exception a dict with
        ``status`` set to ``"error"``, the exception ``message`` and the
        requested (uncapped) ``temperature``.
    """
    try:
        prompt_parameters = {
            "drama_name": short_name,
            "drama_genre": drama_genre,
            "plot_analysis": plot_analysis,
            "subtitle_content": subtitle_content,
            "narration_copy": narration_copy,
            "narration_language": narration_language,
            "original_sound_ratio": int(original_sound_ratio),
        }
        prompt, system_prompt = self._render_prompt("script_matching", prompt_parameters)
        capped_temperature = min(float(temperature), 0.3)
        return self._generate_json_text(prompt, system_prompt, capped_temperature)
    except Exception as e:
        logger.error(f"解说文案画面匹配过程中发生错误: {str(e)}")
        return dict(status="error", message=str(e), temperature=temperature)
def plan_narration_segments(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str = "",
    temperature: float = 0.3,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
) -> Dict[str, Any]:
    """Plan the narration segments, producing only segment sources and intent.

    Renders the ``segment_planning`` prompt and requests JSON output. The
    temperature sent to the backend is capped at 0.3.

    Returns:
        The backend result dict, or on any exception a dict with
        ``status`` set to ``"error"``, the exception ``message`` and the
        requested (uncapped) ``temperature``.
    """
    try:
        prompt_parameters = {
            "drama_name": short_name,
            "drama_genre": drama_genre,
            "plot_analysis": plot_analysis,
            "subtitle_content": subtitle_content,
            "narration_language": narration_language,
        }
        prompt, system_prompt = self._render_prompt("segment_planning", prompt_parameters)
        capped_temperature = min(float(temperature), 0.3)
        return self._generate_json_text(prompt, system_prompt, capped_temperature)
    except Exception as e:
        logger.error(f"片段规划过程中发生错误: {str(e)}")
        return dict(status="error", message=str(e), temperature=temperature)
def repair_narration_script(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str,
    invalid_script: str,
    validation_errors: str,
    temperature: float = 0.3,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
) -> Dict[str, Any]:
    """Repair a narration script based on deterministic validation errors.

    Renders the ``script_repair`` prompt from the invalid script and its
    validation errors and requests JSON output. The temperature sent to
    the backend is capped at 0.3.

    Returns:
        The backend result dict, or on any exception a dict with
        ``status`` set to ``"error"``, the exception ``message`` and the
        requested (uncapped) ``temperature``.
    """
    try:
        prompt_parameters = {
            "drama_name": short_name,
            "drama_genre": drama_genre,
            "plot_analysis": plot_analysis,
            "subtitle_content": subtitle_content,
            "invalid_script": invalid_script,
            "validation_errors": validation_errors,
            "narration_language": narration_language,
        }
        prompt, system_prompt = self._render_prompt("script_repair", prompt_parameters)
        capped_temperature = min(float(temperature), 0.3)
        return self._generate_json_text(prompt, system_prompt, capped_temperature)
    except Exception as e:
        logger.error(f"解说文案修复过程中发生错误: {str(e)}")
        return dict(status="error", message=str(e), temperature=temperature)
def generate_narration_script(
self,
short_name: str,
plot_analysis: str,
subtitle_content: str = "",
temperature: float = 0.7,
narration_language: str = "简体中文(中国)",
drama_genre: str = "逆袭/复仇",
) -> Dict[str, Any]:
"""
根据剧情分析生成解说文案
@ -372,28 +546,36 @@ class SubtitleAnalyzer:
plot_analysis: 剧情分析内容
subtitle_content: 原始字幕内容用于提供准确的时间戳信息
temperature: 生成温度控制创造性默认0.7
narration_language: 解说台词目标语言
Returns:
Dict[str, Any]: 包含生成结果的字典
"""
try:
# 使用新的提示词管理系统构建提示词
prompt = PromptManager.get_prompt(
category="short_drama_narration",
name="script_generation",
parameters={
segment_plan_result = self.plan_narration_segments(
short_name=short_name,
plot_analysis=plot_analysis,
subtitle_content=subtitle_content,
temperature=temperature,
narration_language=narration_language,
drama_genre=drama_genre,
)
if segment_plan_result["status"] != "success":
return segment_plan_result
prompt, system_prompt = self._render_prompt(
"script_generation",
{
"drama_name": short_name,
"drama_genre": drama_genre,
"plot_analysis": plot_analysis,
"subtitle_content": subtitle_content
}
"subtitle_content": subtitle_content,
"segment_plan": segment_plan_result["narration_script"],
"narration_language": narration_language,
},
)
if self.is_native_gemini:
# 使用原生Gemini API格式
return self._generate_narration_with_native_gemini(prompt, temperature)
else:
# 使用OpenAI兼容格式
return self._generate_narration_with_openai_compatible(prompt, temperature)
return self._generate_json_text(prompt, system_prompt, temperature)
except Exception as e:
logger.error(f"解说文案生成过程中发生错误: {str(e)}")
@ -403,16 +585,35 @@ class SubtitleAnalyzer:
"temperature": self.temperature
}
def _generate_narration_with_native_gemini(self, prompt: str, temperature: float) -> Dict[str, Any]:
def _generate_narration_with_native_gemini(
self,
prompt: str,
temperature: float,
system_prompt: Optional[str] = None,
json_output: bool = True,
) -> Dict[str, Any]:
"""使用原生Gemini API生成解说文案"""
try:
# 构建原生Gemini API请求数据
# 为了确保JSON输出在提示词中添加更强的约束
enhanced_prompt = f"{prompt}\n\n请确保输出严格的JSON格式不要包含任何其他文字或标记。"
enhanced_prompt = (
f"{prompt}\n\n请确保输出严格的JSON格式不要包含任何其他文字或标记。"
if json_output
else prompt
)
payload = {
"systemInstruction": {
"parts": [{"text": "你是一位专业的短视频解说脚本撰写专家。你必须严格按照JSON格式输出不能包含任何其他文字、说明或代码块标记。"}]
"parts": [
{
"text": system_prompt
or (
"你必须严格按照JSON格式输出不能包含任何其他文字、说明或代码块标记。"
if json_output
else "你是一位专业的短剧解说文案助手。"
)
}
]
},
"contents": [{
"parts": [{"text": enhanced_prompt}]
@ -423,7 +624,6 @@ class SubtitleAnalyzer:
"topP": 0.95,
"maxOutputTokens": 64000,
"candidateCount": 1,
"stopSequences": ["```", "注意", "说明"]
},
"safetySettings": [
{
@ -444,6 +644,8 @@ class SubtitleAnalyzer:
}
]
}
if json_output:
payload["generationConfig"]["stopSequences"] = ["```", "注意", "说明"]
# 构建请求URL
url = f"{self.base_url}/models/{self.model}:generateContent"
@ -523,21 +725,27 @@ class SubtitleAnalyzer:
"temperature": temperature
}
def _generate_narration_with_openai_compatible(self, prompt: str, temperature: float) -> Dict[str, Any]:
def _generate_narration_with_openai_compatible(
self,
prompt: str,
temperature: float,
system_prompt: Optional[str] = None,
json_output: bool = True,
) -> Dict[str, Any]:
"""使用OpenAI兼容API生成解说文案"""
try:
# 构建OpenAI格式的请求数据
payload = {
"model": self.model,
"messages": [
{"role": "system", "content": "你是一位专业的短视频解说脚本撰写专家。"},
{"role": "system", "content": system_prompt or ("你必须严格按照JSON格式输出。" if json_output else "你是一位专业的短剧解说文案助手。")},
{"role": "user", "content": prompt}
],
"temperature": temperature
}
# 对特定模型添加响应格式设置
if self.model not in ["deepseek-reasoner"]:
if json_output and self.model not in ["deepseek-reasoner"]:
payload["response_format"] = {"type": "json_object"}
# 构建请求地址
@ -632,7 +840,8 @@ def analyze_subtitle(
temperature: float = 1.0,
save_result: bool = False,
output_path: Optional[str] = None,
provider: Optional[str] = None
provider: Optional[str] = None,
prompt_category: str = "short_drama_narration",
) -> Dict[str, Any]:
"""
分析字幕内容的便捷函数
@ -659,7 +868,8 @@ def analyze_subtitle(
model=model,
base_url=base_url,
custom_prompt=custom_prompt,
provider=provider
provider=provider,
prompt_category=prompt_category,
)
logger.debug(f"使用模型: {analyzer.model} 开始分析, 温度: {analyzer.temperature}")
# 分析字幕
@ -691,7 +901,10 @@ def generate_narration_script(
temperature: float = 1.0,
save_result: bool = False,
output_path: Optional[str] = None,
provider: Optional[str] = None
provider: Optional[str] = None,
narration_language: str = "简体中文(中国)",
drama_genre: str = "逆袭/复仇",
prompt_category: str = "short_drama_narration",
) -> Dict[str, Any]:
"""
根据剧情分析生成解说文案的便捷函数
@ -707,6 +920,7 @@ def generate_narration_script(
save_result: 是否保存结果到文件
output_path: 输出文件路径
provider: 提供商类型
narration_language: 解说台词目标语言
Returns:
Dict[str, Any]: 包含生成结果的字典
@ -717,11 +931,19 @@ def generate_narration_script(
api_key=api_key,
model=model,
base_url=base_url,
provider=provider
provider=provider,
prompt_category=prompt_category,
)
# 生成解说文案
result = analyzer.generate_narration_script(short_name, plot_analysis, subtitle_content or "", temperature)
result = analyzer.generate_narration_script(
short_name,
plot_analysis,
subtitle_content or "",
temperature,
narration_language,
drama_genre,
)
# 保存结果
if save_result and result["status"] == "success":
@ -730,6 +952,113 @@ def generate_narration_script(
return result
def generate_narration_copy(
    short_name: str = None,
    plot_analysis: str = None,
    subtitle_content: str = None,
    api_key: Optional[str] = None,
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 0.7,
    provider: Optional[str] = None,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
    prompt_category: str = "short_drama_narration",
) -> Dict[str, Any]:
    """Generate narration body text that the user can review and edit.

    Convenience wrapper: builds a ``SubtitleAnalyzer`` from the connection
    arguments and delegates to its ``generate_narration_copy`` method.
    Missing ``plot_analysis`` / ``subtitle_content`` are passed as empty
    strings.
    """
    analyzer_options = dict(
        temperature=temperature,
        api_key=api_key,
        model=model,
        base_url=base_url,
        provider=provider,
        prompt_category=prompt_category,
    )
    request = dict(
        short_name=short_name,
        plot_analysis=plot_analysis or "",
        subtitle_content=subtitle_content or "",
        temperature=temperature,
        narration_language=narration_language,
        drama_genre=drama_genre,
    )
    return SubtitleAnalyzer(**analyzer_options).generate_narration_copy(**request)
def match_narration_copy_to_script(
    short_name: str = None,
    plot_analysis: str = None,
    subtitle_content: str = None,
    narration_copy: str = None,
    api_key: Optional[str] = None,
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 0.3,
    provider: Optional[str] = None,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
    original_sound_ratio: int = 30,
    prompt_category: str = "short_drama_narration",
) -> Dict[str, Any]:
    """Match the user-reviewed narration copy to subtitle timestamps.

    Convenience wrapper: builds a ``SubtitleAnalyzer`` from the connection
    arguments and delegates to its ``match_narration_copy_to_script``
    method. Missing text arguments other than ``short_name`` are passed as
    empty strings.
    """
    analyzer_options = dict(
        temperature=temperature,
        api_key=api_key,
        model=model,
        base_url=base_url,
        provider=provider,
        prompt_category=prompt_category,
    )
    request = dict(
        short_name=short_name,
        plot_analysis=plot_analysis or "",
        subtitle_content=subtitle_content or "",
        narration_copy=narration_copy or "",
        temperature=temperature,
        narration_language=narration_language,
        drama_genre=drama_genre,
        original_sound_ratio=original_sound_ratio,
    )
    return SubtitleAnalyzer(**analyzer_options).match_narration_copy_to_script(**request)
def repair_narration_script(
    short_name: str = None,
    plot_analysis: str = None,
    subtitle_content: str = None,
    invalid_script: str = None,
    validation_errors: str = None,
    api_key: Optional[str] = None,
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 0.3,
    provider: Optional[str] = None,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
    prompt_category: str = "short_drama_narration",
) -> Dict[str, Any]:
    """Repair a narration script according to its validation errors.

    Convenience wrapper: builds a ``SubtitleAnalyzer`` from the connection
    arguments and delegates to its ``repair_narration_script`` method.
    Missing text arguments other than ``short_name`` are passed as empty
    strings.
    """
    analyzer_options = dict(
        temperature=temperature,
        api_key=api_key,
        model=model,
        base_url=base_url,
        provider=provider,
        prompt_category=prompt_category,
    )
    request = dict(
        short_name=short_name,
        plot_analysis=plot_analysis or "",
        subtitle_content=subtitle_content or "",
        invalid_script=invalid_script or "",
        validation_errors=validation_errors or "",
        temperature=temperature,
        narration_language=narration_language,
        drama_genre=drama_genre,
    )
    return SubtitleAnalyzer(**analyzer_options).repair_narration_script(**request)
if __name__ == '__main__':
text_api_key = "skxxxx"
text_model = "gemini-2.0-flash"

View File

@ -32,6 +32,108 @@ def parse_timestamp(timestamp: str) -> tuple:
return start_time, end_time
def _ffmpeg_time_to_seconds(time_value: str) -> float:
    """Convert a time string to seconds.

    Accepts ``HH:MM:SS``, ``MM:SS`` or a plain number of seconds; the
    seconds field may carry a fraction written with either ``.`` or ``,``.
    Any other shape is handed to ``float`` and raises ``ValueError`` when
    it is not a number.
    """
    text = str(time_value).strip().replace(",", ".")
    fields = text.split(":")

    if len(fields) == 3:
        weights = (3600, 60)
    elif len(fields) == 2:
        weights = (60,)
    else:
        return float(text)

    *whole_fields, seconds_field = fields
    # Sum the integer hour/minute part first, then add the fractional seconds.
    whole_seconds = sum(int(field) * weight for field, weight in zip(whole_fields, weights))
    return whole_seconds + float(seconds_field)
def _calculate_ffmpeg_duration(start_time: str, end_time: str) -> str:
    """Return the clip length between two time strings, formatted for ``-t``.

    The duration is rendered with at most three decimals, with trailing
    zeros and a trailing decimal point removed.

    Raises:
        ValueError: If ``end_time`` is not after ``start_time``.
    """
    end_seconds = _ffmpeg_time_to_seconds(end_time)
    start_seconds = _ffmpeg_time_to_seconds(start_time)
    duration = end_seconds - start_seconds
    if duration <= 0:
        raise ValueError(f"无效的视频裁剪时间范围: {start_time} -> {end_time}")

    formatted = format(duration, ".3f")
    return formatted.rstrip("0").rstrip(".")
def _append_fast_seek_input(cmd: List[str], input_path: str, start_time: str, end_time: str) -> None:
    """Append ``-ss <start> -i <input> -t <duration>`` to ``cmd`` in place.

    Placing ``-ss`` before ``-i`` seeks on the input, so a long video is not
    decoded from the beginning up to the target segment.

    Args:
        cmd: FFmpeg argument list to extend.
        input_path: Path of the source video.
        start_time: Clip start time.
        end_time: Clip end time; only used to compute the duration.

    Raises:
        ValueError: If ``end_time`` is not after ``start_time``.
    """
    duration = _calculate_ffmpeg_duration(start_time, end_time)
    # FFmpeg's time syntax only accepts "." as the decimal separator. The
    # duration helper tolerates an SRT-style ",", so normalise the seek
    # position the same way instead of passing it through raw.
    seek_position = str(start_time).strip().replace(",", ".")
    cmd.extend(["-ss", seek_position, "-i", input_path, "-t", duration])
def _normalize_video_origin_paths(
    video_origin_path: str,
    video_origin_paths: Optional[List[str]] = None,
) -> List[str]:
    """Merge the single source path and the path list into one clean list.

    ``video_origin_path`` (when truthy) comes first, followed by the
    entries of ``video_origin_paths``. Non-string entries are dropped,
    strings are stripped, and empty or repeated paths are skipped while
    the first-seen order is kept.
    """
    candidates = [video_origin_path] if video_origin_path else []
    if video_origin_paths:
        candidates += video_origin_paths

    stripped = (item.strip() for item in candidates if isinstance(item, str))
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(path for path in stripped if path))
def _coerce_video_id(value) -> Optional[int]:
    """Convert ``value`` to a positive integer video id.

    Returns ``None`` when ``int(value)`` raises ``TypeError`` or
    ``ValueError``, or when the result is not greater than zero.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError):
        return None

    if parsed <= 0:
        return None
    return parsed
def _match_video_id_by_name(video_name: str, video_origin_paths: List[str]) -> Optional[int]:
    """Find the 1-based position of the source video with the given file name.

    Only base names are compared, so ``video_name`` may be a bare file
    name or a full path. Returns ``None`` for an empty name or when no
    source path has that base name.
    """
    cleaned_name = str(video_name or "").strip()
    if cleaned_name:
        target = os.path.basename(cleaned_name)
        positions = (
            position
            for position, candidate in enumerate(video_origin_paths, start=1)
            if os.path.basename(candidate) == target
        )
        return next(positions, None)
    return None
def _resolve_script_video_path(script_item: Dict, video_origin_paths: List[str]) -> str:
    """Pick the source video for one script segment.

    Resolution order:
      1. an explicit path stored on the segment, if that path exists;
      2. a source video whose file name matches the segment's
         ``video_name`` / ``source_video``;
      3. the segment's 1-based ``video_id`` / ``video_index``;
      4. the first source video.

    An id beyond the number of source videos is logged as a warning and
    falls back to the first source video.
    """
    explicit_keys = ("source_video_path", "video_origin_path", "origin_video_path")
    explicit_path = next(
        (script_item.get(key) for key in explicit_keys if script_item.get(key)),
        None,
    )
    if explicit_path and os.path.exists(explicit_path):
        return explicit_path

    declared_id = _coerce_video_id(script_item.get("video_id") or script_item.get("video_index"))
    named_id = _match_video_id_by_name(
        script_item.get("video_name") or script_item.get("source_video"),
        video_origin_paths,
    )
    # A file-name match takes precedence over the declared id.
    video_id = named_id if named_id else declared_id

    if video_id is None:
        return video_origin_paths[0]
    if video_id <= len(video_origin_paths):
        return video_origin_paths[video_id - 1]

    logger.warning(
        f"片段 {script_item.get('_id')} 的 video_id={video_id} 超出视频数量 "
        f"{len(video_origin_paths)},默认使用第一个视频"
    )
    return video_origin_paths[0]
def _safe_output_id(value) -> str:
    """Turn a segment id into text that is safe inside a file name.

    ``None`` becomes ``"unknown"``. Every character that is neither
    alphanumeric nor ``-`` / ``_`` is replaced with ``_``.
    """
    text = "unknown" if value is None else str(value)
    sanitized = []
    for char in text:
        keep = char.isalnum() or char in "-_"
        sanitized.append(char if keep else "_")
    return "".join(sanitized)
def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str:
"""
根据开始时间和持续时间计算结束时间
@ -177,11 +279,8 @@ def build_ffmpeg_command(
# 对于其他编码器,可以使用硬件解码参数
cmd.extend(hwaccel_args)
# 输入文件
cmd.extend(["-i", input_path])
# 时间范围
cmd.extend(["-ss", start_time, "-to", end_time])
# 快速定位输入文件,避免长视频从头解码到目标片段
_append_fast_seek_input(cmd, input_path, start_time, end_time)
# 编码器设置
cmd.extend(["-c:v", encoder_config["video_codec"]])
@ -363,11 +462,12 @@ def try_compatibility_fallback(
bool: 是否成功
"""
# 兼容性模式:避免所有可能的滤镜链问题
duration = _calculate_ffmpeg_duration(start_time, end_time)
fallback_cmd = [
"ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
"-i", input_path,
"-ss", start_time,
"-to", end_time,
"-i", input_path,
"-t", duration,
"-c:v", "libx264",
"-c:a", "aac",
"-pix_fmt", "yuv420p", # 明确指定像素格式
@ -404,11 +504,12 @@ def try_software_fallback(
bool: 是否成功
"""
# 纯软件编码
duration = _calculate_ffmpeg_duration(start_time, end_time)
fallback_cmd = [
"ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
"-i", input_path,
"-ss", start_time,
"-to", end_time,
"-i", input_path,
"-t", duration,
"-c:v", "libx264",
"-c:a", "aac",
"-pix_fmt", "yuv420p",
@ -444,11 +545,12 @@ def try_basic_fallback(
bool: 是否成功
"""
# 最基本的编码参数
duration = _calculate_ffmpeg_duration(start_time, end_time)
fallback_cmd = [
"ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
"-i", input_path,
"-ss", start_time,
"-to", end_time,
"-i", input_path,
"-t", duration,
"-c:v", "libx264",
"-c:a", "aac",
"-pix_fmt", "yuv420p",
@ -527,11 +629,12 @@ def try_fallback_encoding(
bool: 是否成功
"""
# 最简单的软件编码命令
duration = _calculate_ffmpeg_duration(start_time, end_time)
fallback_cmd = [
"ffmpeg", "-y",
"-i", input_path,
"-ss", start_time,
"-to", end_time,
"-i", input_path,
"-t", duration,
"-c:v", "libx264",
"-c:a", "aac",
"-pix_fmt", "yuv420p",
@ -579,7 +682,7 @@ def _process_narration_only_segment(
# 生成输出文件名
safe_start_time = start_time.replace(':', '-').replace(',', '-')
safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
output_filename = f"ost0_vid_{safe_start_time}@{safe_end_time}.mp4"
output_filename = f"ost0_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4"
output_path = os.path.join(output_dir, output_filename)
# 构建FFmpeg命令 - 移除音频
@ -622,7 +725,7 @@ def _process_original_audio_segment(
# 生成输出文件名
safe_start_time = start_time.replace(':', '-').replace(',', '-')
safe_end_time = end_time.replace(':', '-').replace(',', '-')
output_filename = f"ost1_vid_{safe_start_time}@{safe_end_time}.mp4"
output_filename = f"ost1_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4"
output_path = os.path.join(output_dir, output_filename)
# 构建FFmpeg命令 - 保持原声
@ -674,7 +777,7 @@ def _process_mixed_segment(
# 生成输出文件名
safe_start_time = start_time.replace(':', '-').replace(',', '-')
safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
output_filename = f"ost2_vid_{safe_start_time}@{safe_end_time}.mp4"
output_filename = f"ost2_{_safe_output_id(_id)}_vid_{safe_start_time}@{safe_end_time}.mp4"
output_path = os.path.join(output_dir, output_filename)
# 构建FFmpeg命令 - 保持原声
@ -725,11 +828,8 @@ def _build_ffmpeg_command_with_audio_control(
elif hwaccel_args:
cmd.extend(hwaccel_args)
# 输入文件
cmd.extend(["-i", input_path])
# 时间范围
cmd.extend(["-ss", start_time, "-to", end_time])
# 快速定位输入文件,避免长视频从头解码到目标片段
_append_fast_seek_input(cmd, input_path, start_time, end_time)
# 视频编码器设置
cmd.extend(["-c:v", encoder_config["video_codec"]])
@ -782,28 +882,34 @@ def clip_video_unified(
script_list: List[Dict],
tts_results: List[Dict],
output_dir: Optional[str] = None,
task_id: Optional[str] = None
task_id: Optional[str] = None,
video_origin_paths: Optional[List[str]] = None
) -> Dict[str, str]:
"""
基于OST类型的统一视频裁剪策略 - 消除双重裁剪问题
Args:
video_origin_path: 原始视频的路径
video_origin_path: 原始视频的路径旧脚本或无 video_id 片段默认使用该视频
script_list: 完整的脚本列表包含所有片段信息
tts_results: TTS结果列表仅包含OST=0和OST=2的片段
output_dir: 输出目录路径默认为None时会自动生成
task_id: 任务ID用于生成唯一的输出目录默认为None时会自动生成
video_origin_paths: 多个原始视频路径脚本片段可用 video_id/video_name 指定来源
Returns:
Dict[str, str]: 片段ID到裁剪后视频路径的映射
"""
# 检查视频文件是否存在
if not os.path.exists(video_origin_path):
raise FileNotFoundError(f"视频文件不存在: {video_origin_path}")
video_source_paths = _normalize_video_origin_paths(video_origin_path, video_origin_paths)
if not video_source_paths:
raise FileNotFoundError("视频文件不存在: 未提供原始视频路径")
missing_video_paths = [item for item in video_source_paths if not os.path.exists(item)]
if missing_video_paths:
raise FileNotFoundError(f"视频文件不存在: {', '.join(missing_video_paths)}")
# 如果未提供task_id则根据输入生成一个唯一ID
if task_id is None:
content_for_hash = f"{video_origin_path}_{json.dumps(script_list)}"
content_for_hash = f"{json.dumps(video_source_paths, ensure_ascii=False)}_{json.dumps(script_list, ensure_ascii=False)}"
task_id = hashlib.md5(content_for_hash.encode()).hexdigest()
# 设置输出目录
@ -840,29 +946,33 @@ def clip_video_unified(
failed_clips = []
success_count = 0
logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段")
logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段,源视频{len(video_source_paths)}")
for i, script_item in enumerate(script_list, 1):
_id = script_item.get("_id")
ost = script_item.get("OST", 0)
timestamp = script_item["timestamp"]
source_video_path = _resolve_script_video_path(script_item, video_source_paths)
logger.info(f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, 时间戳:{timestamp}")
logger.info(
f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, "
f"视频:{os.path.basename(source_video_path)}, 时间戳:{timestamp}"
)
try:
if ost == 0: # 纯解说片段
output_path = _process_narration_only_segment(
video_origin_path, script_item, tts_map, output_dir,
source_video_path, script_item, tts_map, output_dir,
encoder_config, hwaccel_args
)
elif ost == 1: # 纯原声片段
output_path = _process_original_audio_segment(
video_origin_path, script_item, output_dir,
source_video_path, script_item, output_dir,
encoder_config, hwaccel_args
)
elif ost == 2: # 解说+原声混合片段
output_path = _process_mixed_segment(
video_origin_path, script_item, tts_map, output_dir,
source_video_path, script_item, tts_map, output_dir,
encoder_config, hwaccel_args
)
else:

View File

@ -1,15 +1,17 @@
"""Aliyun Bailian Fun-ASR subtitle transcription helpers.
"""Fun-ASR subtitle transcription helpers.
This module intentionally uses the REST API because the official Fun-ASR
The Bailian path intentionally uses the REST API because the official Fun-ASR
recorded-file API supports temporary `oss://` resources only through REST.
"""
from __future__ import annotations
import os
import shutil
import time
from dataclasses import dataclass
from typing import Any, Optional
from urllib.parse import urljoin, urlparse, urlunparse
import requests
from loguru import logger
@ -21,6 +23,8 @@ UPLOAD_POLICY_URL = f"{DASHSCOPE_BASE_URL}/api/v1/uploads"
TRANSCRIPTION_URL = f"{DASHSCOPE_BASE_URL}/api/v1/services/audio/asr/transcription"
TASK_URL_TEMPLATE = f"{DASHSCOPE_BASE_URL}/api/v1/tasks/{{task_id}}"
MODEL_NAME = "fun-asr"
LOCAL_FUN_ASR_API_URL = "http://127.0.0.1:7860"
LOCAL_FIRERED_ASR_API_URL = "http://127.0.0.1:7867"
TERMINAL_FAILED_STATUSES = {"FAILED", "CANCELED", "UNKNOWN"}
PUNCTUATION_BREAKS = set(",。!?;,.!?;")
@ -89,6 +93,93 @@ def _session_post(session, url: str, **kwargs):
return session.post(url, **kwargs)
def _require_local_file(local_file: str) -> None:
if not os.path.isfile(local_file):
raise FunAsrError(f"待转写文件不存在: {local_file}")
def _normalize_local_api_url(api_url: str = "") -> str:
api_url = (api_url or LOCAL_FUN_ASR_API_URL).strip().rstrip("/")
if not api_url:
raise FunAsrError("请先填写本地 FunASR-Pack API 地址")
if "://" not in api_url:
api_url = f"http://{api_url}"
return api_url
def _local_base_url(api_url: str = "") -> str:
    """Return the service root URL for a local ASR API address.

    The address is normalized first; a trailing ``/asr`` path segment is
    removed, and params, query and fragment are discarded. The result never
    ends with a slash.
    """
    parts = urlparse(_normalize_local_api_url(api_url))
    base_path = parts.path.rstrip("/")
    if base_path.endswith("/asr"):
        base_path = base_path[: -len("/asr")].rstrip("/")
    root = parts._replace(path=base_path, params="", query="", fragment="")
    return urlunparse(root).rstrip("/")
def _local_asr_url(api_url: str = "") -> str:
    """Return the full ``/asr`` endpoint URL for a local ASR service.

    Args:
        api_url: Service address; either the service root or an address that
            already points at the ``/asr`` endpoint.

    Returns:
        The normalized address unchanged when its path already ends with
        ``/asr``; otherwise the address with ``/asr`` appended to its path.
    """
    api_url = _normalize_local_api_url(api_url)
    parsed = urlparse(api_url)
    endpoint_path = parsed.path.rstrip("/")
    if endpoint_path.endswith("/asr"):
        return api_url
    # Append to the path component, not to the raw string: a plain string
    # concat would put "/asr" after any query string or fragment.
    return urlunparse(parsed._replace(path=f"{endpoint_path}/asr"))
def _absolute_local_download_url(api_url: str, download_url: str) -> str:
download_url = (download_url or "").strip()
if not download_url:
return ""
if urlparse(download_url).scheme:
return download_url
return urljoin(f"{_local_base_url(api_url)}/", download_url)
def _raise_for_local_http(
response: requests.Response,
action: str,
service_name: str = "本地 FunASR-Pack 服务",
) -> None:
status_code = getattr(response, "status_code", 200)
if status_code and status_code >= 400:
detail = ""
try:
data = response.json()
if isinstance(data, dict):
detail = str(data.get("detail") or "")
except Exception:
detail = ""
suffix = f": {detail}" if detail else ""
raise FunAsrError(f"{action}失败{suffix},请确认{service_name}可用")
try:
response.raise_for_status()
except Exception as exc:
raise FunAsrError(f"{action}失败,请确认{service_name}可用") from exc
def _local_json(
    response: requests.Response,
    action: str,
    service_name: str = "本地 FunASR-Pack 服务",
) -> dict[str, Any]:
    """Validate a local ASR response and return its JSON object.

    Raises:
        FunAsrError: If the HTTP status indicates failure, the body is not
            valid JSON, or the decoded JSON is not an object.
    """
    _raise_for_local_http(response, action, service_name=service_name)
    try:
        payload = response.json()
    except Exception as exc:
        raise FunAsrError(f"{action}返回了无效 JSON") from exc
    if isinstance(payload, dict):
        return payload
    raise FunAsrError(f"{action}返回格式无效")
def _response_text(response: requests.Response) -> str:
text = getattr(response, "text", None)
if isinstance(text, str):
return text
content = getattr(response, "content", b"")
if isinstance(content, bytes):
return content.decode("utf-8")
return str(content)
def request_upload_policy(api_key: str, model: str = MODEL_NAME, session=requests) -> UploadPolicy:
"""Request Bailian temporary-storage upload policy for the target model."""
api_key = _require_api_key(api_key)
@ -418,6 +509,357 @@ def write_srt_file(srt_content: str, subtitle_file: str = "") -> str:
return subtitle_file
def copy_srt_file(source_file: str, subtitle_file: str = "") -> str:
    """Copy an existing SRT file to the requested subtitle location.

    Args:
        source_file: Path of the SRT file produced by the local service.
        subtitle_file: Destination path. When empty, a timestamped file name
            inside ``utils.subtitle_dir()`` is used.

    Returns:
        The destination path.

    Raises:
        FunAsrError: If ``source_file`` does not exist.
    """
    if not os.path.isfile(source_file):
        raise FunAsrError(f"本地 FunASR-Pack 返回的字幕文件不存在: {source_file}")

    target = subtitle_file or os.path.join(
        utils.subtitle_dir(), f"fun_asr_local_{int(time.time())}.srt"
    )
    target_dir = os.path.dirname(target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Copying a file onto itself would fail, so skip when both paths match.
    same_file = os.path.abspath(source_file) == os.path.abspath(target)
    if not same_file:
        shutil.copyfile(source_file, target)
    return target
def request_local_fun_asr_health(api_url: str = LOCAL_FUN_ASR_API_URL, session=requests) -> dict[str, Any]:
    """Query the ``/health`` endpoint of the local FunASR-Pack service.

    Returns the decoded JSON object; failures surface as ``FunAsrError``
    through ``_local_json``.
    """
    health_url = f"{_local_base_url(api_url)}/health"
    reply = _session_get(session, health_url, timeout=10)
    return _local_json(reply, "检查本地 FunASR-Pack 服务")
def request_local_firered_asr_health(
    api_url: str = LOCAL_FIRERED_ASR_API_URL,
    session=requests,
) -> dict[str, Any]:
    """Query the ``/health`` endpoint of the local FireRedASR2-AED-Pack service.

    Returns the decoded JSON object; failures surface as ``FunAsrError``
    through ``_local_json``.
    """
    health_url = f"{_local_base_url(api_url)}/health"
    reply = _session_get(session, health_url, timeout=10)
    action = "检查本地 FireRedASR2-AED-Pack 服务"
    return _local_json(reply, action, service_name="本地 FireRedASR2-AED-Pack 服务")
def request_local_fun_asr(
    local_file: str,
    api_url: str = LOCAL_FUN_ASR_API_URL,
    hotword: str = "",
    enable_spk: Optional[bool] = None,
    timeout: float = 600.0,
    session=requests,
) -> dict[str, Any]:
    """Upload a media file to the local FunASR-Pack ``/asr`` API.

    Args:
        local_file: Path of the file to transcribe.
        api_url: Local service address.
        hotword: Optional hotword text; sent only when non-blank.
        enable_spk: Optional flag sent as ``"true"``/``"false"``; omitted
            from the form when ``None``.
        timeout: Request timeout passed to the HTTP call.
        session: Object used to perform the POST request.

    Returns:
        The JSON object returned by the service.

    Raises:
        FunAsrError: If the file is missing or the service reply is invalid.
    """
    _require_local_file(local_file)

    form: dict[str, str] = {}
    keyword = hotword.strip()
    if keyword:
        form["hotword"] = keyword
    if enable_spk is not None:
        form["enable_spk"] = "true" if enable_spk else "false"

    # Keep the file handle open for the whole duration of the upload.
    with open(local_file, "rb") as stream:
        upload = {"file": (_safe_upload_name(local_file), stream)}
        reply = _session_post(
            session,
            _local_asr_url(api_url),
            data=form,
            files=upload,
            timeout=timeout,
        )
    return _local_json(reply, "调用本地 FunASR-Pack ASR API")
def request_local_firered_asr(
    local_file: str,
    api_url: str = LOCAL_FIRERED_ASR_API_URL,
    enable_vad: Optional[bool] = True,
    enable_lid: Optional[bool] = True,
    enable_punc: Optional[bool] = True,
    return_timestamp: Optional[bool] = True,
    timeout: float = 600.0,
    session=requests,
) -> dict[str, Any]:
    """Upload a media file to the local FireRedASR2-AED-Pack ``/asr`` API.

    Each boolean option is sent as the form value ``"true"`` or ``"false"``;
    an option set to ``None`` is left out of the request.

    Returns:
        The JSON object returned by the service.

    Raises:
        FunAsrError: If the file is missing or the service reply is invalid.
    """
    _require_local_file(local_file)

    switches = (
        ("enable_vad", enable_vad),
        ("enable_lid", enable_lid),
        ("enable_punc", enable_punc),
        ("return_timestamp", return_timestamp),
    )
    form: dict[str, str] = {
        name: ("true" if flag else "false")
        for name, flag in switches
        if flag is not None
    }

    # Keep the file handle open for the whole duration of the upload.
    with open(local_file, "rb") as stream:
        upload = {"file": (_safe_upload_name(local_file), stream)}
        reply = _session_post(
            session,
            _local_asr_url(api_url),
            data=form,
            files=upload,
            timeout=timeout,
        )
    return _local_json(
        reply,
        "调用本地 FireRedASR2-AED-Pack ASR API",
        service_name="本地 FireRedASR2-AED-Pack 服务",
    )
def download_local_srt(
    download_url: str,
    api_url: str = LOCAL_FUN_ASR_API_URL,
    subtitle_file: str = "",
    session=requests,
    service_name: str = "本地 FunASR-Pack 服务",
) -> str:
    """Download an SRT exposed by a local ASR service and save it.

    Args:
        download_url: Absolute or service-relative link to the SRT.
        api_url: Local service address used to resolve relative links.
        subtitle_file: Destination path forwarded to ``write_srt_file``.
        session: Object used to perform the GET request.
        service_name: Service label used in error messages.

    Returns:
        The path returned by ``write_srt_file``.

    Raises:
        FunAsrError: If the link is blank, the download fails, or the
            downloaded SRT is empty.
    """
    resolved_url = _absolute_local_download_url(api_url, download_url)
    if not resolved_url:
        raise FunAsrError("本地 FunASR-Pack 结果缺少 SRT 下载地址")

    reply = _session_get(session, resolved_url, timeout=60)
    _raise_for_local_http(reply, "下载本地 SRT", service_name=service_name)

    body = _response_text(reply)
    if body.strip():
        return write_srt_file(body, subtitle_file)
    raise FunAsrError(f"{service_name}返回了空 SRT")
def _local_result_items(result_json: dict[str, Any]):
raw = result_json.get("raw")
if isinstance(raw, dict):
yield raw
elif isinstance(raw, list):
for item in raw:
if isinstance(item, dict):
yield item
elif result_json.get("text"):
yield result_json
def _blocks_from_local_timestamp(item: dict[str, Any], max_chars: int, max_duration: float) -> list[dict[str, Any]]:
    """Split one recognized item into subtitle blocks using its timestamps.

    Characters of ``item["text"]`` are paired in order with the entries of
    ``item["timestamp"]`` (each expected to be a ``[start, end]`` pair). A
    block is closed when adding a character would exceed ``max_chars`` or
    ``max_duration``, and after every punctuation character.

    Args:
        item: Result entry with ``text`` and ``timestamp`` fields.
        max_chars: Maximum number of characters per block.
        max_duration: Maximum block duration in seconds.

    Returns:
        The collected blocks, or an empty list when there is no text or
        ``timestamp`` is not a list.
    """
    text = str(item.get("text") or "").strip()
    timestamps = item.get("timestamp") or []
    if not text or not isinstance(timestamps, list):
        return []

    non_space_chars = [char for char in text if char.strip()]
    # If there are at least as many timestamps as visible characters, then
    # punctuation presumably has its own timestamp entry as well.
    consume_punctuation = len(timestamps) >= len(non_space_chars)
    blocks: list[dict[str, Any]] = []
    current: Optional[dict[str, Any]] = None
    timestamp_index = 0
    last_end = 0.0
    max_duration_ms = max_duration * 1000

    for char in non_space_chars:
        is_punctuation = char in PUNCTUATION_BREAKS
        pair = None
        if (consume_punctuation or not is_punctuation) and timestamp_index < len(timestamps):
            pair = timestamps[timestamp_index]
            timestamp_index += 1

        if isinstance(pair, (list, tuple)) and len(pair) >= 2:
            start_ms = _timestamp_ms(pair[0], "local.timestamp.start")
            end_ms = _timestamp_ms(pair[1], "local.timestamp.end")
        else:
            # Missing or malformed timestamp: keep the character and give it
            # synthetic timing instead of dropping it from the subtitle.
            start_ms = last_end
            end_ms = last_end if is_punctuation else last_end + 200
        last_end = end_ms

        if current is None:
            current = {"start": start_ms, "end": end_ms, "text": char}
        else:
            should_split_before = (
                len(current["text"] + char) > max_chars
                or (end_ms - current["start"]) > max_duration_ms
            )
            if should_split_before:
                _flush_block(blocks, current)
                current = {"start": start_ms, "end": end_ms, "text": char}
            else:
                current["text"] += char
                current["end"] = end_ms

        if current and is_punctuation:
            # Punctuation always ends the running block.
            _flush_block(blocks, current)
            current = None

    if current:
        _flush_block(blocks, current)
    return blocks
def local_fun_asr_result_to_srt(
    result_json: dict[str, Any],
    max_chars: int = 20,
    max_duration: float = 3.5,
) -> str:
    """Build SRT text from a FunASR-Pack JSON result.

    Every result item is split by its timestamps. Items that yield no blocks
    but do have text are split with ``_blocks_from_sentence`` over an
    estimated time span starting at 0.

    Args:
        result_json: JSON object returned by the local service.
        max_chars: Maximum number of characters per subtitle block.
        max_duration: Maximum block duration in seconds.

    Returns:
        SRT content ending with a single newline.

    Raises:
        FunAsrError: If no subtitle block could be produced.
    """
    blocks: list[dict[str, Any]] = []
    for entry in _local_result_items(result_json):
        entry_blocks = _blocks_from_local_timestamp(entry, max_chars, max_duration)
        if not entry_blocks:
            plain_text = str(entry.get("text") or "").strip()
            if plain_text:
                # No usable timestamps: estimate the span from text length.
                estimated_sentence = {
                    "begin_time": 0,
                    "end_time": max(1500, len(plain_text) * 180),
                    "text": plain_text,
                }
                entry_blocks = _blocks_from_sentence(estimated_sentence, max_chars=max_chars)
        blocks.extend(entry_blocks)

    if not blocks:
        raise FunAsrError("本地 FunASR-Pack 转写结果为空:未找到可用字幕内容")

    rendered = [
        _srt_block(number, block["start"], block["end"], block["text"])
        for number, block in enumerate(blocks, start=1)
    ]
    return "\n".join(rendered).rstrip() + "\n"
def firered_asr_result_to_srt(result_json: dict[str, Any]) -> str:
    """Build SRT text from a FireRedASR2-AED-Pack JSON result.

    Uses the ``sentences`` list when it is present: each dict entry with text
    becomes one block. The start is read from ``start_ms``, ``begin_time`` or
    ``start_time`` (default 0) and the end from ``end_ms`` or ``end_time``
    (default: start + 500). When no block results, the conversion is
    delegated to ``local_fun_asr_result_to_srt``.
    """
    blocks: list[dict[str, Any]] = []
    sentences = result_json.get("sentences")
    for sentence in sentences if isinstance(sentences, list) else ():
        if not isinstance(sentence, dict):
            continue
        text = str(sentence.get("text") or "").strip()
        if not text:
            continue

        raw_start = sentence.get(
            "start_ms",
            sentence.get("begin_time", sentence.get("start_time", 0)),
        )
        raw_end = sentence.get("end_ms", sentence.get("end_time"))
        start_ms = _timestamp_ms(raw_start, "firered.sentence.start_ms")
        if raw_end is not None:
            end_ms = _timestamp_ms(raw_end, "firered.sentence.end_ms")
        else:
            end_ms = start_ms + 500
        blocks.append({"start": start_ms, "end": end_ms, "text": text})

    if not blocks:
        return local_fun_asr_result_to_srt(result_json)

    rendered = [
        _srt_block(number, block["start"], block["end"], block["text"])
        for number, block in enumerate(blocks, start=1)
    ]
    return "\n".join(rendered).rstrip() + "\n"
def _get_local_srt_download_url(result_json: dict[str, Any]) -> str:
downloads = result_json.get("downloads") or {}
if isinstance(downloads, dict):
download_url = downloads.get("srt")
if download_url:
return str(download_url)
for key in ("srt_url", "srt_download_url", "download_url"):
download_url = result_json.get(key)
if download_url:
return str(download_url)
return ""
def create_with_local_fun_asr(
    local_file: str,
    subtitle_file: str = "",
    api_url: str = LOCAL_FUN_ASR_API_URL,
    hotword: str = "",
    enable_spk: Optional[bool] = None,
    timeout: float = 600.0,
    session=requests,
) -> Optional[str]:
    """Create an SRT file through a locally running FunASR-Pack API.

    The subtitle is taken from the first available source: an ``srt_file``
    path in the result that exists on this machine, a download link in the
    result, or an SRT rendered from the JSON result itself.

    Returns:
        Path of the written subtitle file.

    Raises:
        FunAsrError: On any failure; unexpected exceptions are wrapped.
    """
    try:
        result_json = request_local_fun_asr(
            local_file=local_file,
            api_url=api_url,
            hotword=hotword,
            enable_spk=enable_spk,
            timeout=timeout,
            session=session,
        )

        local_srt = result_json.get("srt_file")
        has_local_srt = isinstance(local_srt, str) and local_srt and os.path.isfile(local_srt)
        if has_local_srt:
            output_file = copy_srt_file(local_srt, subtitle_file)
        else:
            download_url = _get_local_srt_download_url(result_json)
            if not download_url:
                rendered_srt = local_fun_asr_result_to_srt(result_json)
                output_file = write_srt_file(rendered_srt, subtitle_file)
            else:
                output_file = download_local_srt(
                    download_url,
                    api_url=api_url,
                    subtitle_file=subtitle_file,
                    session=session,
                )

        logger.info(f"本地 FunASR-Pack 字幕文件已生成: {output_file}")
        return output_file
    except FunAsrError:
        # Already a domain error with a user-facing message.
        raise
    except Exception as exc:
        raise FunAsrError("本地 FunASR-Pack 字幕转写失败,请检查服务地址、文件或模型状态") from exc
def create_with_local_firered_asr(
    local_file: str,
    subtitle_file: str = "",
    api_url: str = LOCAL_FIRERED_ASR_API_URL,
    enable_vad: Optional[bool] = True,
    enable_lid: Optional[bool] = True,
    enable_punc: Optional[bool] = True,
    return_timestamp: Optional[bool] = True,
    timeout: float = 600.0,
    session=requests,
) -> Optional[str]:
    """Create an SRT file through a locally running FireRedASR2-AED-Pack API.

    The subtitle is taken from the first available source: an ``srt_file``
    path in the result that exists on this machine, a download link in the
    result, or an SRT rendered from the JSON result itself.

    Returns:
        Path of the written subtitle file.

    Raises:
        FunAsrError: On any failure; unexpected exceptions are wrapped.
    """
    service_name = "本地 FireRedASR2-AED-Pack 服务"
    try:
        result_json = request_local_firered_asr(
            local_file=local_file,
            api_url=api_url,
            enable_vad=enable_vad,
            enable_lid=enable_lid,
            enable_punc=enable_punc,
            return_timestamp=return_timestamp,
            timeout=timeout,
            session=session,
        )

        local_srt = result_json.get("srt_file")
        has_local_srt = isinstance(local_srt, str) and local_srt and os.path.isfile(local_srt)
        if has_local_srt:
            output_file = copy_srt_file(local_srt, subtitle_file)
        else:
            download_url = _get_local_srt_download_url(result_json)
            if not download_url:
                rendered_srt = firered_asr_result_to_srt(result_json)
                output_file = write_srt_file(rendered_srt, subtitle_file)
            else:
                output_file = download_local_srt(
                    download_url,
                    api_url=api_url,
                    subtitle_file=subtitle_file,
                    session=session,
                    service_name=service_name,
                )

        logger.info(f"本地 FireRedASR2-AED-Pack 字幕文件已生成: {output_file}")
        return output_file
    except FunAsrError:
        # Already a domain error with a user-facing message.
        raise
    except Exception as exc:
        raise FunAsrError("本地ASR字幕转写失败请检查 FireRedASR2-AED-Pack 服务地址、文件或模型状态") from exc
def create_with_fun_asr(
local_file: str,
subtitle_file: str = "",

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,27 +1,30 @@
import json
import os
import re
import subprocess
import time
from os import path
from typing import Dict
from loguru import logger
from app.config import config
from app.models import const
from app.models.schema import VideoClipParams
from app.services import voice, clip_video, update_script
from app.services import voice, clip_video, script_subtitle
from app.services.jianying_draft_builder import write_plaintext_jianying_draft
from app.services import state as sm
from app.utils import utils
def get_audio_duration_ffprobe(audio_file: str) -> float:
def get_media_duration_ffprobe(media_file: str) -> float:
"""
使用ffprobe获取音频文件的精确时长
使用ffprobe获取媒体文件的精确时长
Args:
audio_file: 音频文件路径
media_file: 媒体文件路径
Returns:
float: 音频时长精确到微秒
float: 媒体时长精确到微秒
"""
try:
cmd = [
@ -29,20 +32,308 @@ def get_audio_duration_ffprobe(audio_file: str) -> float:
'-v', 'error',
'-show_entries', 'format=duration',
'-of', 'csv=p=0',
audio_file
media_file
]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
duration = float(result.stdout.strip())
logger.debug(f"使用ffprobe获取音频时长: {duration:.6f}")
logger.debug(f"使用ffprobe获取媒体时长: {duration:.6f}, 文件: {media_file}")
return duration
except subprocess.CalledProcessError as e:
logger.error(f"ffprobe执行失败: {e.stderr}")
raise
except Exception as e:
logger.error(f"获取音频时长失败: {str(e)}")
logger.error(f"获取媒体时长失败: {str(e)}")
raise
def get_audio_duration_ffprobe(audio_file: str) -> float:
    """Return the duration of an audio file in seconds.

    Thin alias kept under the older name; it delegates to
    ``get_media_duration_ffprobe``.
    """
    duration = get_media_duration_ffprobe(audio_file)
    return duration
def _strip_tts_voice_prefix(voice_name: str, prefix: str) -> str:
voice_name = voice_name or ""
if voice_name.startswith(prefix):
return voice_name[len(prefix):]
return voice_name
def _strip_indextts_prefix(voice_name: str) -> str:
    """Return the IndexTTS voice value without its engine prefix.

    The name is first passed through ``config.normalize_indextts_voice_prefix``
    and then stripped of ``config.INDEXTTS_VOICE_PREFIX``.
    """
    normalized_name = config.normalize_indextts_voice_prefix(voice_name or "")
    return _strip_tts_voice_prefix(normalized_name, config.INDEXTTS_VOICE_PREFIX)
def _floor_duration_to_milliseconds(duration: float) -> float:
return int(duration * 1000) / 1000.0
def _format_seconds_for_trange(seconds: float) -> str:
return f"{seconds:.3f}s"
def _get_cached_media_duration(media_file: str, duration_cache: Dict[str, float]) -> float:
if media_file not in duration_cache:
duration_cache[media_file] = _floor_duration_to_milliseconds(
get_media_duration_ffprobe(media_file)
)
return duration_cache[media_file]
def _clamp_duration_to_media(
    requested_duration: float,
    media_file: str,
    duration_cache: Dict[str, float],
    media_label: str,
    source_start_time: float = 0.0,
) -> float:
    """Limit a requested duration to what the media file can provide.

    Args:
        requested_duration: Desired duration in seconds; negatives count as 0.
        media_file: Path of the media file whose length is the upper bound.
        duration_cache: Cache of probed durations keyed by file path.
        media_label: Label used in log messages.
        source_start_time: Offset into the media where the segment starts;
            negatives count as 0.

    Returns:
        The smaller of the requested duration and the duration remaining
        after ``source_start_time``, both truncated to milliseconds.
    """
    requested_duration = _floor_duration_to_milliseconds(max(requested_duration, 0.0))
    actual_duration = _get_cached_media_duration(media_file, duration_cache)
    start_offset = max(source_start_time, 0.0)
    available_duration = _floor_duration_to_milliseconds(
        max(actual_duration - start_offset, 0.0)
    )
    safe_duration = min(requested_duration, available_duration)

    logger.info(
        f"{media_label}实际时长: {actual_duration:.6f}秒, "
        f"可用时长: {available_duration:.6f}秒, 请求时长: {requested_duration:.3f}"
    )
    was_shortened = safe_duration < requested_duration
    if was_shortened:
        logger.warning(
            f"{media_label}短于脚本时长,已将剪映片段时长从 "
            f"{requested_duration:.3f}秒 调整为 {safe_duration:.3f}"
        )
    return safe_duration
def _normalize_indextts_reference_audio(params: VideoClipParams) -> None:
    """Point clone-style TTS engines at an existing reference audio file.

    ``params.tts_engine`` is normalized first. For IndexTTS-1.5, IndexTTS-2
    and OmniVoice in ``voice_clone`` mode, ``params.voice_name`` is rewritten
    to ``<engine prefix><path>`` using the first path that exists on disk:
    the current voice name, then the engine's configured ``reference_audio``.
    Other engines and other OmniVoice modes are left untouched.

    Raises:
        ValueError: If neither candidate is an existing file.
    """
    params.tts_engine = config.normalize_tts_engine_name(params.tts_engine)
    engine = params.tts_engine

    if engine == config.INDEXTTS_ENGINE:
        tts_config = config.indextts
        voice_prefix = config.INDEXTTS_VOICE_PREFIX
        display_name = "IndexTTS-1.5"
    elif engine == config.INDEXTTS2_ENGINE:
        tts_config = config.indextts2
        voice_prefix = config.INDEXTTS2_VOICE_PREFIX
        display_name = "IndexTTS-2"
    elif engine == config.OMNIVOICE_ENGINE:
        tts_config = config.omnivoice
        # Only the cloning mode of OmniVoice needs a reference audio.
        if tts_config.get("mode", "auto") != "voice_clone":
            return
        voice_prefix = config.OMNIVOICE_VOICE_PREFIX
        display_name = "OmniVoice"
    else:
        return

    ui_voice = getattr(params, "voice_name", "") or ""
    candidate = _strip_tts_voice_prefix(ui_voice, voice_prefix)
    if candidate and os.path.isfile(candidate):
        params.voice_name = f"{voice_prefix}{candidate}"
        logger.info(f"{display_name} 使用参考音频: {candidate}")
        return

    configured_ref = _strip_tts_voice_prefix(
        tts_config.get("reference_audio", "") or "", voice_prefix
    )
    if configured_ref and os.path.isfile(configured_ref):
        params.voice_name = f"{voice_prefix}{configured_ref}"
        logger.info(f"{display_name} 使用配置中的参考音频: {configured_ref}")
        return

    raise ValueError(f"{display_name} 参考音频不存在,请在音频设置中上传或选择有效的参考音频")
def _index_tts_results(tts_results: list[Dict]) -> Dict:
indexed = {}
for tts_result in tts_results or []:
item_id = tts_result.get("_id")
timestamp = tts_result.get("timestamp")
if item_id is not None:
indexed[item_id] = tts_result
if timestamp:
indexed[timestamp] = tts_result
return indexed
def _get_video_source_paths(params: VideoClipParams) -> list[str]:
    """Return the normalized source video paths declared on ``params``.

    Reads ``video_origin_path`` and ``video_origin_paths`` (missing attributes
    default to ``""`` and ``[]``) and delegates the normalization to
    ``clip_video._normalize_video_origin_paths``.
    """
    single_path = getattr(params, "video_origin_path", "")
    multiple_paths = getattr(params, "video_origin_paths", [])
    return clip_video._normalize_video_origin_paths(single_path, multiple_paths)
def _resolve_script_video_path(item: Dict, video_source_paths: list[str]) -> str:
if not video_source_paths:
return ""
return clip_video._resolve_script_video_path(item, video_source_paths)
def _resolve_tts_result(item: Dict, tts_map: Dict) -> Dict:
item_id = item.get("_id")
timestamp = item.get("timestamp")
if item_id is not None and item_id in tts_map:
return tts_map[item_id]
if timestamp in tts_map:
return tts_map[timestamp]
return {}
def _build_jianying_draft_script(
    list_script: list[Dict],
    params: VideoClipParams,
    tts_results: list[Dict],
) -> list[Dict]:
    """Build the Jianying draft timeline from script items.

    Each item with a parseable, positive-length timestamp is copied and
    extended with its source video, TTS audio/subtitle files, source time
    range and its position on the edited timeline. For OST 0 and 2 a positive
    TTS ``duration`` replaces the timestamp-derived duration. Items with an
    invalid timestamp or a non-positive length are skipped with a warning.

    Returns:
        The enriched copies, in script order.

    Raises:
        ValueError: If no source video is configured or no item is usable.
    """
    video_source_paths = _get_video_source_paths(params)
    if not video_source_paths:
        raise ValueError("视频文件不能为空")

    tts_map = _index_tts_results(tts_results)
    draft_script = []
    timeline_cursor = 0.0  # Start of the next segment on the edited timeline.

    for item in list_script:
        item_copy = dict(item)
        timestamp = item_copy.get("timestamp", "")
        try:
            source_start, source_end = script_subtitle.parse_time_range(timestamp)
        except ValueError as e:
            logger.warning(f"解析剪映片段时间戳失败,跳过片段 {item_copy.get('_id')}: {e}")
            continue

        timestamp_duration = _floor_duration_to_milliseconds(source_end - source_start)
        if timestamp_duration <= 0:
            logger.warning(f"剪映片段时长无效,跳过片段 {item_copy.get('_id')}: {timestamp}")
            continue

        ost = int(item_copy.get("OST", 0) or 0)
        # Only narrated segments (OST 0 and 2) carry a TTS result.
        tts_result = _resolve_tts_result(item_copy, tts_map) if ost in [0, 2] else {}

        item_duration = timestamp_duration
        if tts_result.get("duration"):
            item_duration = _floor_duration_to_milliseconds(
                float(tts_result.get("duration") or 0.0)
            )
        if item_duration <= 0:
            item_duration = timestamp_duration

        segment_end = timeline_cursor + item_duration
        edited_range = (
            f"{script_subtitle.format_srt_time(timeline_cursor)}-"
            f"{script_subtitle.format_srt_time(segment_end)}"
        )
        item_copy.update({
            "video": _resolve_script_video_path(item_copy, video_source_paths),
            "audio": tts_result.get("audio_file", ""),
            "subtitle": tts_result.get("subtitle_file", ""),
            "sourceTimeRange": timestamp,
            "start_time": source_start,
            "source_start_time": source_start,
            "duration": item_duration,
            "use_source_timerange": True,
            "editedTimeRange": edited_range,
        })
        timeline_cursor += item_duration
        draft_script.append(item_copy)

    if not draft_script:
        raise ValueError("没有可写入剪映草稿的视频片段")
    return draft_script
def _get_original_subtitle_paths(params: VideoClipParams) -> list[str]:
    """Collect the original subtitle file paths for an export.

    Combines ``params.original_subtitle_paths`` (a string is treated as a
    one-item list; blank and non-string entries are dropped, duplicates
    removed) with ``params.original_subtitle_path``, which is placed first
    when not already listed. When nothing is configured, subtitles are
    looked up by the source video file names.
    """
    configured = getattr(params, "original_subtitle_paths", []) or []
    if isinstance(configured, str):
        configured = [configured]

    normalized_paths = []
    seen = set()
    for entry in configured:
        if not isinstance(entry, str):
            continue
        cleaned = entry.strip()
        if not cleaned or cleaned in seen:
            continue
        normalized_paths.append(cleaned)
        seen.add(cleaned)

    single_subtitle_path = str(getattr(params, "original_subtitle_path", "") or "").strip()
    if single_subtitle_path and single_subtitle_path not in seen:
        normalized_paths.insert(0, single_subtitle_path)

    if normalized_paths:
        return normalized_paths
    return _find_original_subtitle_paths_for_videos(_get_video_source_paths(params))
def _video_stem_candidates(video_path: str) -> list[str]:
stem = path.splitext(path.basename(str(video_path or "").strip()))[0]
if not stem:
return []
candidates = [stem]
timestamp_stripped = re.sub(r"_[0-9]{14}$", "", stem)
if timestamp_stripped and timestamp_stripped not in candidates:
candidates.append(timestamp_stripped)
return candidates
def _find_original_subtitle_paths_for_videos(video_paths: list[str]) -> list[str]:
    """Match ``.srt`` files in the subtitle directory to videos by file name.

    A subtitle matches a video when its stem equals one of the video's stem
    candidates or starts with ``<candidate>_``. For each video the most
    recently modified match is chosen; a subtitle is listed only once.

    Returns:
        The selected subtitle paths in video order; empty when the subtitle
        directory is missing or nothing matches.
    """
    subtitle_dir = utils.subtitle_dir()
    if not path.isdir(subtitle_dir):
        return []

    subtitle_files = [
        path.join(subtitle_dir, filename)
        for filename in os.listdir(subtitle_dir)
        if filename.lower().endswith(".srt")
    ]
    if not subtitle_files:
        return []

    resolved_paths = []
    seen = set()
    for video_path in video_paths:
        candidates = _video_stem_candidates(video_path)
        if not candidates:
            continue

        matches = []
        for subtitle_path in subtitle_files:
            subtitle_stem = path.splitext(path.basename(subtitle_path))[0]
            is_match = any(
                subtitle_stem == candidate or subtitle_stem.startswith(f"{candidate}_")
                for candidate in candidates
            )
            if is_match:
                matches.append(subtitle_path)
        if not matches:
            continue

        # Newest file wins when several subtitles match the same video.
        selected_path = sorted(matches, key=path.getmtime, reverse=True)[0]
        if selected_path in seen:
            continue
        resolved_paths.append(selected_path)
        seen.add(selected_path)

    if resolved_paths:
        logger.info(f"剪映导出未从参数获取原片字幕,已按视频文件名自动匹配: {resolved_paths}")
    return resolved_paths
def _create_jianying_subtitle_file(
    task_id: str,
    draft_script: list[Dict],
    params: VideoClipParams,
) -> str:
    """Create the subtitle file for a Jianying draft, best-effort.

    Returns:
        The subtitle file path, or an empty string when subtitles are
        disabled on ``params`` or generation fails (failure is logged as a
        warning so the draft can still be exported without subtitles).
    """
    subtitles_wanted = getattr(params, "subtitle_enabled", True)
    if not subtitles_wanted:
        return ""

    try:
        subtitle_file = script_subtitle.create_script_subtitle_file(
            task_id=task_id,
            list_script=draft_script,
            original_subtitle_paths=_get_original_subtitle_paths(params),
            video_origin_paths=_get_video_source_paths(params),
        )
    except Exception as e:
        logger.warning(f"剪映草稿字幕生成失败,将导出无字幕草稿: {e}")
        return ""
    return subtitle_file
def start_export_jianying_draft(task_id: str, params: VideoClipParams):
"""
导出到剪映草稿的后台任务
@ -83,6 +374,7 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams):
2. 使用 TTS 生成音频素材
"""
logger.info("\n\n## 2. 根据OST设置生成音频列表")
_normalize_indextts_reference_audio(params)
tts_segments = [
segment for segment in list_script
if segment['OST'] in [0, 2]
@ -101,22 +393,15 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams):
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
"""
3. 统一视频裁剪 - 基于OST类型的差异化裁剪策略
3. 准备剪映草稿时间线 - 直接引用原视频素材和源时间戳
"""
logger.info("\n\n## 3. 统一视频裁剪基于OST类型")
video_clip_result = clip_video.clip_video_unified(
video_origin_path=params.video_origin_path,
script_list=list_script,
tts_results=tts_results
)
logger.info("\n\n## 3. 准备剪映草稿时间线(不裁剪视频)")
new_script_list = _build_jianying_draft_script(list_script, params, tts_results)
subtitle_path = _create_jianying_subtitle_file(task_id, new_script_list, params)
tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
subclip_clip_result = {
tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
}
new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result)
logger.info(f"统一裁剪完成,处理了 {len(video_clip_result)} 个视频片段")
logger.info(f"剪映草稿时间线准备完成,处理了 {len(new_script_list)} 个视频片段")
if subtitle_path:
logger.info(f"剪映草稿字幕文件: {subtitle_path}")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)
@ -126,114 +411,38 @@ def start_export_jianying_draft(task_id: str, params: VideoClipParams):
logger.info("\n\n## 4. 导出到剪映草稿")
try:
import pyJianYingDraft
from pyJianYingDraft import DraftFolder, VideoSegment, AudioSegment, trange, TrackType
jianying_draft_path = config.ui.get("jianying_draft_path", "")
if not jianying_draft_path:
raise ValueError("剪映草稿路径未配置")
# 创建DraftFolder实例
draft_folder = DraftFolder(jianying_draft_path)
# 使用从参数中获取的草稿名称,如果为空则使用默认名称
draft_name = getattr(params, 'draft_name', "")
logger.debug(f"从params获取的草稿名称: '{draft_name}' (类型: {type(draft_name)})")
if not draft_name:
draft_name = f"NarratoAI_{int(time.time())}"
logger.debug(f"使用默认草稿名称: '{draft_name}'")
# 创建新草稿
script = draft_folder.create_draft(draft_name, 1920, 1080)
# 添加视频轨道和音频轨道
script.add_track(TrackType.video, '视频轨道')
script.add_track(TrackType.audio, '音频轨道')
# 处理脚本数据
current_time = 0
output_dir = utils.task_dir(task_id)
for item in new_script_list:
# 获取时间信息
start_time = float(item.get('start_time', 0.0))
duration = float(item.get('duration', 0.0))
timestamp = item.get('timestamp', '')
logger.info(f"处理片段: OST={item['OST']}, start_time={start_time}, duration={duration}, timestamp={timestamp}")
# 生成音频文件路径
audio_file = ""
if timestamp:
timestamp_formatted = timestamp.replace(':', '_')
audio_file = os.path.join(
output_dir,
f"audio_{timestamp_formatted}.mp3"
)
# 检查是否有裁剪后的视频文件
video_file = item.get('video', '')
if video_file and not os.path.exists(video_file):
video_file = ""
# 添加视频片段
if video_file:
# 使用裁剪后的视频文件
# 对于裁剪后的视频target_timerange的第二个参数是持续时间
video_segment = VideoSegment(
video_file,
trange(f"{current_time}s", f"{duration}s")
)
else:
# 使用原始视频文件
# source_timerange是从原始视频中截取的部分
# target_timerange是片段在时间轴上的位置
video_segment = VideoSegment(
params.video_origin_path,
trange(f"{current_time}s", f"{duration}s"),
source_timerange=trange(f"{start_time}s", f"{duration}s")
)
script.add_segment(video_segment, '视频轨道')
# 处理音频
if item['OST'] in [0, 2]: # 需要TTS的片段
if os.path.exists(audio_file):
# 使用ffprobe获取精确的音频时长避免因TTS引擎差异导致时长不匹配
actual_audio_duration = get_audio_duration_ffprobe(audio_file)
logger.info(f"音频文件实际时长: {actual_audio_duration:.6f}秒, 脚本时长(视频): {duration:.3f}")
# 使用音频实际时长和视频时长中的较小值,确保不超过素材时长
# 当TTS语速调整时音频可能比视频长或短取较小值可以避免超出素材
safe_duration = min(actual_audio_duration, duration)
logger.info(f"使用时长: {safe_duration:.6f}秒 (取音频和视频时长的较小值)")
audio_segment = AudioSegment(
audio_file,
trange(f"{current_time}s", f"{safe_duration}s")
)
script.add_segment(audio_segment, '音频轨道')
else:
logger.warning(f"音频文件不存在: {audio_file}")
# OST=1的片段保留原声不需要添加额外音频
# 更新当前时间
current_time += duration
# 保存草稿
script.save()
draft_path = os.path.join(jianying_draft_path, draft_name)
draft_path, draft_name = write_plaintext_jianying_draft(
jianying_draft_path=jianying_draft_path,
draft_name=draft_name,
new_script_list=new_script_list,
params=params,
output_dir=output_dir,
subtitle_path=subtitle_path,
)
logger.success(f"成功导出到剪映草稿: {draft_name}")
logger.info(f"草稿已保存到: {draft_path}")
# 更新任务状态
sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, draft_path=draft_path, draft_name=draft_name)
task_kwargs = {"draft_path": draft_path, "draft_name": draft_name}
if subtitle_path:
task_kwargs["subtitles"] = [subtitle_path]
sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **task_kwargs)
return {"draft_path": draft_path, "draft_name": draft_name}
except ImportError as e:
logger.error(f"导入pyJianYingDraft失败: {e}")
raise ImportError(f"pyJianYingDraft库导入失败: {e}\n请确保已正确安装该库")
return task_kwargs
except Exception as e:
logger.error(f"导出到剪映草稿失败: {e}")
import traceback

View File

@ -178,6 +178,27 @@ class TextModelProvider(BaseLLMProvider):
生成的文本内容
"""
pass
async def generate_text_stream(
    self,
    prompt: str,
    system_prompt: Optional[str] = None,
    temperature: float = 1.0,
    max_tokens: Optional[int] = None,
    response_format: Optional[str] = None,
    on_chunk=None,
    **kwargs,
) -> str:
    """Generate text, reporting progress through ``on_chunk`` when given.

    This default implementation does not stream: it awaits
    ``generate_text`` and then delivers the complete result as a single
    ``{"type": "content", "text": ...}`` event.

    Args:
        prompt: User prompt forwarded to ``generate_text``.
        system_prompt: Optional system prompt.
        temperature: Sampling temperature.
        max_tokens: Optional output token limit.
        response_format: Optional response format hint (e.g. ``"json"``).
        on_chunk: Optional callable receiving one event dict.
        **kwargs: Extra keyword arguments forwarded to ``generate_text``.

    Returns:
        The full generated text.
    """
    text = await self.generate_text(
        prompt=prompt,
        system_prompt=system_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format=response_format,
        **kwargs,
    )
    if not on_chunk:
        return text
    # Fallback "stream": one event that carries the whole result.
    on_chunk({"type": "content", "text": text})
    return text
def _build_messages(self, prompt: str, system_prompt: Optional[str] = None) -> List[Dict[str, str]]:
"""构建消息列表"""

View File

@ -198,11 +198,19 @@ class VisionAnalyzerAdapter:
class SubtitleAnalyzerAdapter:
"""字幕分析器适配器"""
def __init__(self, api_key: str, model: str, base_url: str, provider: str = None):
def __init__(
self,
api_key: str,
model: str,
base_url: str,
provider: str = None,
prompt_category: str = "short_drama_narration",
):
self.api_key = api_key
self.model = model
self.base_url = base_url
self.provider = provider or "openai"
self.prompt_category = prompt_category or "short_drama_narration"
def _run_async_safely(self, coro_func, *args, **kwargs):
"""安全地运行异步协程"""
@ -225,6 +233,229 @@ class SubtitleAnalyzerAdapter:
output = output.strip()
return output
def _render_prompt(self, name: str, parameters: Dict[str, Any]) -> tuple[str, Optional[str]]:
    """Render a prompt of this adapter's category and fetch its system prompt.

    Args:
        name: Prompt name inside ``self.prompt_category``.
        parameters: Values passed to ``PromptManager.get_prompt`` for rendering.

    Returns:
        A ``(rendered_prompt, system_prompt)`` tuple; the system prompt is
        whatever the prompt object's ``get_system_prompt()`` returns.
    """
    category = self.prompt_category
    rendered = PromptManager.get_prompt(
        category=category,
        name=name,
        parameters=parameters,
    )
    template = PromptManager.get_prompt_object(category=category, name=name)
    return rendered, template.get_system_prompt()
def _generate_json_text(
    self,
    prompt: str,
    system_prompt: Optional[str],
    temperature: float,
    stream_callback=None,
) -> str:
    """Request a JSON-formatted completion and return the cleaned output.

    Uses ``UnifiedLLMService.generate_text_stream`` (passing the callback as
    ``on_chunk``) when ``stream_callback`` is truthy, otherwise
    ``UnifiedLLMService.generate_text``.

    Args:
        prompt: Rendered user prompt.
        system_prompt: Optional system prompt.
        temperature: Sampling temperature.
        stream_callback: Optional callable receiving streaming events.

    Returns:
        The model output after ``self._clean_json_output``.
    """
    request = {
        "prompt": prompt,
        "system_prompt": system_prompt,
        "provider": self.provider,
        "temperature": temperature,
        "response_format": "json",
        "api_key": self.api_key,
        "api_base": self.base_url,
    }
    if stream_callback:
        llm_call = UnifiedLLMService.generate_text_stream
        request["on_chunk"] = stream_callback
    else:
        llm_call = UnifiedLLMService.generate_text
    raw_output = self._run_async_safely(llm_call, **request)
    return self._clean_json_output(raw_output)
def _generate_plain_text(self, prompt: str, system_prompt: Optional[str], temperature: float) -> str:
    """Request a free-form completion and return it stripped.

    No ``response_format`` is sent. A falsy result becomes ``""``.
    """
    request = {
        "prompt": prompt,
        "system_prompt": system_prompt,
        "provider": self.provider,
        "temperature": temperature,
        "api_key": self.api_key,
        "api_base": self.base_url,
    }
    raw_output = self._run_async_safely(UnifiedLLMService.generate_text, **request)
    text = raw_output or ""
    return str(text).strip()
def generate_narration_copy(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str = "",
    temperature: float = 0.7,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
) -> Dict[str, Any]:
    """Generate editable narration copy before timeline matching.

    Renders the ``narration_copy`` prompt and requests plain text.

    Returns:
        On success: ``status="success"`` with ``narration_copy``, ``model``
        and ``temperature``. On any exception: ``status="error"`` with
        ``message`` and ``temperature``; the error is logged, not raised.
    """
    try:
        template_values = {
            "drama_name": short_name,
            "drama_genre": drama_genre,
            "plot_analysis": plot_analysis,
            "subtitle_content": subtitle_content,
            "narration_language": narration_language,
        }
        prompt, system_prompt = self._render_prompt("narration_copy", template_values)
        copy_text = self._generate_plain_text(prompt, system_prompt, temperature)
        outcome = {"status": "success", "narration_copy": copy_text}
        outcome["model"] = self.model
        outcome["temperature"] = temperature
        return outcome
    except Exception as e:
        logger.error(f"解说文案正文生成失败: {str(e)}")
        return {"status": "error", "message": str(e), "temperature": temperature}
def match_narration_copy_to_script(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str,
    narration_copy: str,
    temperature: float = 0.3,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
    original_sound_ratio: int = 30,
    stream_callback=None,
) -> Dict[str, Any]:
    """Match reviewed narration copy to source footage and return JSON script.

    Renders the ``script_matching`` prompt and requests JSON output. The
    temperature sent to the model is capped at 0.3, while the returned
    ``temperature`` field echoes the value the caller passed in.

    Returns:
        On success: ``status="success"`` with ``narration_script``, ``model``
        and ``temperature``. On any exception: ``status="error"`` with
        ``message`` and ``temperature``; the error is logged, not raised.
    """
    try:
        template_values = dict(
            drama_name=short_name,
            drama_genre=drama_genre,
            plot_analysis=plot_analysis,
            subtitle_content=subtitle_content,
            narration_copy=narration_copy,
            narration_language=narration_language,
            original_sound_ratio=int(original_sound_ratio),
        )
        prompt, system_prompt = self._render_prompt("script_matching", template_values)
        capped_temperature = min(float(temperature), 0.3)
        script_json = self._generate_json_text(
            prompt,
            system_prompt,
            capped_temperature,
            stream_callback=stream_callback,
        )
        outcome = {"status": "success", "narration_script": script_json}
        outcome["model"] = self.model
        outcome["temperature"] = temperature
        return outcome
    except Exception as e:
        logger.error(f"解说文案画面匹配失败: {str(e)}")
        return {"status": "error", "message": str(e), "temperature": temperature}
def plan_narration_segments(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str = "",
    temperature: float = 0.3,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
) -> str:
    """Plan source segments before generating final copy.

    Renders the ``segment_planning`` prompt and requests JSON output with the
    temperature capped at 0.3. Exceptions are not caught here.

    Returns:
        The cleaned JSON text returned by ``_generate_json_text``.
    """
    template_values = dict(
        drama_name=short_name,
        drama_genre=drama_genre,
        plot_analysis=plot_analysis,
        subtitle_content=subtitle_content,
        narration_language=narration_language,
    )
    prompt, system_prompt = self._render_prompt("segment_planning", template_values)
    capped_temperature = min(float(temperature), 0.3)
    return self._generate_json_text(prompt, system_prompt, capped_temperature)
def generate_narration_script_from_plan(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str,
    segment_plan: str,
    temperature: float = 0.7,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
) -> str:
    """Generate the narration script JSON from a previously planned segment list.

    Renders the ``script_generation`` prompt and requests JSON output at the
    given temperature (no cap is applied). Exceptions are not caught here.

    Returns:
        The cleaned JSON text returned by ``_generate_json_text``.
    """
    template_values = dict(
        drama_name=short_name,
        drama_genre=drama_genre,
        plot_analysis=plot_analysis,
        subtitle_content=subtitle_content,
        segment_plan=segment_plan,
        narration_language=narration_language,
    )
    prompt, system_prompt = self._render_prompt("script_generation", template_values)
    return self._generate_json_text(prompt, system_prompt, temperature)
def repair_narration_script(
    self,
    short_name: str,
    plot_analysis: str,
    subtitle_content: str,
    invalid_script: str,
    validation_errors: str,
    temperature: float = 0.3,
    narration_language: str = "简体中文(中国)",
    drama_genre: str = "逆袭/复仇",
    stream_callback=None,
) -> Dict[str, Any]:
    """Repair a generated script once after deterministic validation fails.

    Renders the ``script_repair`` prompt and requests JSON output. The
    temperature sent to the model is capped at 0.3, while the returned
    ``temperature`` field echoes the value the caller passed in.

    Returns:
        On success: ``status="success"`` with ``narration_script``, ``model``
        and ``temperature``. On any exception: ``status="error"`` with
        ``message`` and ``temperature``; the error is logged, not raised.
    """
    try:
        template_values = dict(
            drama_name=short_name,
            drama_genre=drama_genre,
            plot_analysis=plot_analysis,
            subtitle_content=subtitle_content,
            invalid_script=invalid_script,
            validation_errors=validation_errors,
            narration_language=narration_language,
        )
        prompt, system_prompt = self._render_prompt("script_repair", template_values)
        capped_temperature = min(float(temperature), 0.3)
        fixed_script = self._generate_json_text(
            prompt,
            system_prompt,
            capped_temperature,
            stream_callback=stream_callback,
        )
        outcome = {"status": "success", "narration_script": fixed_script}
        outcome["model"] = self.model
        outcome["temperature"] = temperature
        return outcome
    except Exception as e:
        logger.error(f"解说文案修复失败: {str(e)}")
        return {"status": "error", "message": str(e), "temperature": temperature}
def analyze_subtitle(self, subtitle_content: str) -> Dict[str, Any]:
"""
@ -243,6 +474,7 @@ class SubtitleAnalyzerAdapter:
subtitle_content=subtitle_content,
provider=self.provider,
temperature=1.0,
prompt_category=self.prompt_category,
api_key=self.api_key,
api_base=self.base_url
)
@ -262,7 +494,15 @@ class SubtitleAnalyzerAdapter:
"temperature": 1.0
}
def generate_narration_script(self, short_name: str, plot_analysis: str, subtitle_content: str = "", temperature: float = 0.7) -> Dict[str, Any]:
def generate_narration_script(
self,
short_name: str,
plot_analysis: str,
subtitle_content: str = "",
temperature: float = 0.7,
narration_language: str = "简体中文(中国)",
drama_genre: str = "逆袭/复仇",
) -> Dict[str, Any]:
"""
生成解说文案 - 兼容原有接口
@ -271,36 +511,30 @@ class SubtitleAnalyzerAdapter:
plot_analysis: 剧情分析内容
subtitle_content: 原始字幕内容用于提供准确的时间戳信息
temperature: 生成温度
narration_language: 解说台词目标语言
Returns:
生成结果字典
"""
try:
# 使用新的提示词管理系统构建提示词
prompt = PromptManager.get_prompt(
category="short_drama_narration",
name="script_generation",
parameters={
"drama_name": short_name,
"plot_analysis": plot_analysis,
"subtitle_content": subtitle_content
}
)
# 使用统一服务生成文案
result = self._run_async_safely(
UnifiedLLMService.generate_text,
prompt=prompt,
system_prompt="你是一位专业的短视频解说脚本撰写专家。",
provider=self.provider,
segment_plan = self.plan_narration_segments(
short_name=short_name,
plot_analysis=plot_analysis,
subtitle_content=subtitle_content,
temperature=temperature,
response_format="json",
api_key=self.api_key,
api_base=self.base_url
narration_language=narration_language,
drama_genre=drama_genre,
)
cleaned_result = self.generate_narration_script_from_plan(
short_name=short_name,
plot_analysis=plot_analysis,
subtitle_content=subtitle_content,
segment_plan=segment_plan,
temperature=temperature,
narration_language=narration_language,
drama_genre=drama_genre,
)
# 清理JSON输出
cleaned_result = self._clean_json_output(result)
# 新的提示词系统返回的是包含items数组的JSON格式
# 为了保持向后兼容我们需要直接返回这个JSON字符串

View File

@ -22,7 +22,7 @@ from openai import (
)
from app.config import config
from app.config.defaults import normalize_openai_compatible_model_name
from app.config.defaults import DEFAULT_LLM_GENERATION_CONFIG, normalize_openai_compatible_model_name
from .base import TextModelProvider, VisionModelProvider
from .exceptions import APICallError, AuthenticationError, ContentFilterError, RateLimitError
@ -68,18 +68,59 @@ class _OpenAICompatibleBase:
# SDK client 按请求参数动态构建,这里无需初始化全局状态。
pass
def _generation_config_value(self, model_type: str, param_name: str, override: Any = None) -> Any:
if override is not None:
return override
return config.app.get(
f"{model_type}_openai_{param_name}",
DEFAULT_LLM_GENERATION_CONFIG[param_name],
)
def _build_chat_completion_options(
self,
model_type: str,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
**kwargs,
) -> Dict[str, Any]:
"""Build common OpenAI-compatible generation options from config and overrides."""
options: Dict[str, Any] = {
"temperature": float(self._generation_config_value(model_type, "temperature", temperature)),
}
top_p = float(self._generation_config_value(model_type, "top_p", kwargs.get("top_p")))
options["top_p"] = top_p
configured_max_tokens = self._generation_config_value(model_type, "max_tokens", max_tokens)
if configured_max_tokens is not None and int(configured_max_tokens) > 0:
options["max_tokens"] = int(configured_max_tokens)
extra_body: Dict[str, Any] = {}
thinking_level = str(
self._generation_config_value(model_type, "thinking_level", kwargs.get("thinking_level")) or "auto"
)
if thinking_level in {"low", "medium", "high"}:
extra_body["reasoning_effort"] = thinking_level
if extra_body:
options["extra_body"] = extra_body
return options
def _build_client(
self,
api_key_override: Optional[str] = None,
base_url_override: Optional[str] = None,
timeout_override: Optional[float] = None,
max_retries_override: Optional[int] = None,
) -> AsyncOpenAI:
"""按请求构建 AsyncOpenAI 客户端,支持动态覆盖 api_key / base_url。"""
api_key = api_key_override or self.api_key
base_url = base_url_override or self.base_url or None
timeout_seconds: float = timeout_override or config.app.get("llm_text_timeout", 180)
max_retries: int = config.app.get("llm_max_retries", 3)
max_retries: int = max_retries_override or config.app.get("llm_max_retries", 3)
return AsyncOpenAI(
api_key=api_key,
@ -147,11 +188,17 @@ class OpenAICompatibleVisionProvider(_OpenAICompatibleBase, VisionModelProvider)
)
try:
generation_overrides = dict(kwargs)
completion_options = self._build_chat_completion_options(
"vision",
temperature=generation_overrides.pop("temperature", None),
max_tokens=generation_overrides.pop("max_tokens", None),
**generation_overrides,
)
response = await client.chat.completions.create(
model=model_name,
messages=messages,
temperature=kwargs.get("temperature", 1.0),
max_tokens=kwargs.get("max_tokens", 4000),
**completion_options,
)
if response.choices and response.choices[0].message and response.choices[0].message.content:
return response.choices[0].message.content
@ -186,6 +233,61 @@ class OpenAICompatibleVisionProvider(_OpenAICompatibleBase, VisionModelProvider)
class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider):
"""OpenAI 兼容文本模型提供商。"""
def _build_text_completion_kwargs(
    self,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: Optional[int],
    response_format: Optional[str],
    kwargs: Dict[str, Any],
) -> Dict[str, Any]:
    """Assemble the keyword arguments for a chat-completions request.

    Args:
        messages: Chat messages; the same list object is placed in the result.
        temperature: Temperature from the caller's signature. The value
            ``1.0`` is not forwarded as an override, so the configured
            temperature applies in that case.
        max_tokens: Token limit, used unless ``kwargs`` carries ``max_tokens``.
        response_format: ``"json"`` requests ``{"type": "json_object"}``.
        kwargs: Extra caller keyword arguments; copied, never mutated.

    Returns:
        A dict with ``model``, ``messages``, the options produced by
        ``_build_chat_completion_options`` and, optionally, ``response_format``.
    """
    overrides = dict(kwargs)

    explicit_temperature = overrides.pop("temperature", None)
    if explicit_temperature is None and temperature != 1.0:
        explicit_temperature = temperature
    token_limit = overrides.pop("max_tokens", max_tokens)

    request: Dict[str, Any] = {
        "model": _normalize_model_name(self.model_name),
        "messages": messages,
        **self._build_chat_completion_options(
            "text",
            temperature=explicit_temperature,
            max_tokens=token_limit,
            **overrides,
        ),
    }
    if response_format == "json":
        request["response_format"] = {"type": "json_object"}
    return request
@staticmethod
def _emit_stream_chunk(on_chunk, chunk_type: str, text: str):
if not on_chunk or not text:
return
try:
on_chunk({"type": chunk_type, "text": text})
except Exception as exc:
logger.debug(f"流式回调更新失败: {exc}")
@staticmethod
def _extract_reasoning_delta(delta: Any) -> str:
if delta is None:
return ""
if hasattr(delta, "reasoning_content"):
value = getattr(delta, "reasoning_content")
if value:
return str(value)
if hasattr(delta, "model_dump"):
data = delta.model_dump(exclude_none=True)
for key in ("reasoning_content", "reasoning", "thinking"):
value = data.get(key)
if value:
return str(value)
return ""
async def generate_text(
self,
prompt: str,
@ -196,7 +298,6 @@ class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider):
**kwargs,
) -> str:
messages = self._build_messages(prompt, system_prompt)
model_name = _normalize_model_name(self.model_name)
client = self._build_client(
api_key_override=kwargs.get("api_key"),
@ -204,15 +305,13 @@ class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider):
timeout_override=config.app.get("llm_text_timeout", 180),
)
completion_kwargs: Dict[str, Any] = {
"model": model_name,
"messages": messages,
"temperature": temperature,
}
if max_tokens:
completion_kwargs["max_tokens"] = max_tokens
if response_format == "json":
completion_kwargs["response_format"] = {"type": "json_object"}
completion_kwargs = self._build_text_completion_kwargs(
messages,
temperature,
max_tokens,
response_format,
kwargs,
)
try:
response = await client.chat.completions.create(**completion_kwargs)
@ -250,5 +349,81 @@ class OpenAICompatibleTextProvider(_OpenAICompatibleBase, TextModelProvider):
logger.error(f"OpenAI 兼容接口调用失败: {exc}")
raise APICallError(f"调用失败: {exc}")
async def generate_text_stream(
    self,
    prompt: str,
    system_prompt: Optional[str] = None,
    temperature: float = 1.0,
    max_tokens: Optional[int] = None,
    response_format: Optional[str] = None,
    on_chunk=None,
    **kwargs,
) -> str:
    """Stream a chat completion, forwarding deltas to ``on_chunk``.

    Reasoning deltas are emitted as ``"reasoning"`` events and content deltas
    as ``"content"`` events via ``_emit_stream_chunk``. When the joined,
    stripped content is non-empty, a ``"done"`` event is requested and the
    text is returned.

    If the gateway rejects ``response_format`` for a JSON request, that
    argument is dropped, a JSON-only instruction is appended to the last
    message, the stream is retried once and the result goes through
    ``_clean_json_output``.

    Raises:
        ContentFilterError: The bad-request error is a content-filter block.
        AuthenticationError: The API rejected the credentials.
        RateLimitError: The API reported rate limiting.
        APICallError: Any other failure, including an empty response.
    """
    messages = self._build_messages(prompt, system_prompt)

    client = self._build_client(
        api_key_override=kwargs.get("api_key"),
        base_url_override=kwargs.get("api_base"),
        timeout_override=config.app.get("llm_text_timeout", 180),
    )

    request = self._build_text_completion_kwargs(
        messages,
        temperature,
        max_tokens,
        response_format,
        kwargs,
    )
    request["stream"] = True

    async def _drain_stream() -> str:
        """Run the request once and return the accumulated content."""
        pieces: List[str] = []
        async for event in await client.chat.completions.create(**request):
            if not getattr(event, "choices", None):
                continue
            delta = event.choices[0].delta

            thought = self._extract_reasoning_delta(delta)
            if thought:
                self._emit_stream_chunk(on_chunk, "reasoning", thought)

            piece = None if delta is None else getattr(delta, "content", None)
            if piece:
                pieces.append(piece)
                self._emit_stream_chunk(on_chunk, "content", piece)

        text = "".join(pieces).strip()
        if not text:
            raise APICallError("OpenAI 兼容接口返回空响应")
        self._emit_stream_chunk(on_chunk, "done", "")
        return text

    try:
        return await _drain_stream()
    except OpenAIBadRequestError as exc:
        error_msg = str(exc)
        if response_format == "json" and _is_response_format_error(error_msg):
            logger.warning("目标网关不支持流式 response_format回退为提示词约束 JSON 输出")
            request.pop("response_format", None)
            # `request["messages"]` is this same list, so the retry sees the hint.
            messages[-1]["content"] += "\n\n请确保输出严格的JSON格式不要包含任何其他文字或标记。"
            return _clean_json_output(await _drain_stream())
        if _is_content_filter_error(error_msg):
            raise ContentFilterError(f"内容被安全过滤器阻止: {error_msg}")
        raise APICallError(f"请求错误: {error_msg}")
    except OpenAIAuthError as exc:
        logger.error(f"OpenAI 兼容接口认证失败: {exc}")
        raise AuthenticationError(str(exc))
    except OpenAIRateLimitError as exc:
        logger.error(f"OpenAI 兼容接口速率限制: {exc}")
        raise RateLimitError(str(exc))
    except OpenAIAPIError as exc:
        logger.error(f"OpenAI 兼容接口 API 错误: {exc}")
        raise APICallError(f"API 错误: {exc}")
    except Exception as exc:
        logger.error(f"OpenAI 兼容接口流式调用失败: {exc}")
        raise APICallError(f"流式调用失败: {exc}")
async def _make_api_call(self, payload: Dict[str, Any]) -> Dict[str, Any]:
return payload

View File

@ -8,7 +8,7 @@ from app.config import config
from app.services.llm.base import TextModelProvider
from app.services.llm.manager import LLMServiceManager
from app.services.llm.migration_adapter import LegacyLLMAdapter, VisionAnalyzerAdapter
from app.services.llm.openai_compatible_provider import OpenAICompatibleVisionProvider
from app.services.llm.openai_compatible_provider import OpenAICompatibleTextProvider, OpenAICompatibleVisionProvider
from app.services.llm.providers import register_all_providers
@ -116,6 +116,59 @@ class OpenAICompatVisionConcurrencyTests(unittest.IsolatedAsyncioTestCase):
self.assertEqual(2, max_in_flight)
class OpenAICompatGenerationOptionTests(unittest.TestCase):
    """Tests for ``_build_chat_completion_options`` on the text provider.

    Each test may mutate ``config.app``; the original contents are
    snapshotted in ``setUp`` and restored in ``tearDown``.
    """

    def setUp(self):
        self._original_app = dict(config.app)

    def tearDown(self):
        config.app.clear()
        config.app.update(self._original_app)

    def test_build_options_uses_generation_defaults(self):
        provider = OpenAICompatibleTextProvider(api_key="k", model_name="m")
        # Remove any per-model settings so the built-in defaults apply.
        for suffix in ("temperature", "top_p", "max_tokens", "thinking_level"):
            config.app.pop(f"text_openai_{suffix}", None)

        built = provider._build_chat_completion_options("text")

        self.assertEqual(1.0, built["temperature"])
        self.assertEqual(0.95, built["top_p"])
        self.assertEqual(65536, built["max_tokens"])
        self.assertNotIn("extra_body", built)

    def test_build_options_uses_per_model_generation_config(self):
        provider = OpenAICompatibleTextProvider(api_key="k", model_name="m")
        config.app.update(
            text_openai_temperature=0.3,
            text_openai_top_p=0.8,
            text_openai_max_tokens=2048,
            text_openai_thinking_level="high",
        )

        built = provider._build_chat_completion_options("text")

        self.assertEqual(0.3, built["temperature"])
        self.assertEqual(0.8, built["top_p"])
        self.assertEqual(2048, built["max_tokens"])
        self.assertEqual({"reasoning_effort": "high"}, built["extra_body"])

    def test_explicit_generation_options_override_config(self):
        provider = OpenAICompatibleTextProvider(api_key="k", model_name="m")
        config.app["text_openai_temperature"] = 0.3

        built = provider._build_chat_completion_options(
            "text",
            temperature=0.9,
            max_tokens=512,
        )

        self.assertEqual(0.9, built["temperature"])
        self.assertEqual(512, built["max_tokens"])
class ExplicitVisionAdapterSettingsTests(unittest.IsolatedAsyncioTestCase):
class _CapturingVisionProvider:
last_init: tuple[str, str, str | None] | None = None

View File

@ -0,0 +1,241 @@
import json
import unittest
from unittest import mock
from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter
from app.services.llm.unified_service import UnifiedLLMService
from app.services.prompts import PromptManager
class SubtitleAnalyzerAdapterPipelineTests(unittest.TestCase):
    """Pipeline tests for ``SubtitleAnalyzerAdapter``.

    ``_run_async_safely`` is patched in every LLM-facing test, so no network
    call is made; assertions inspect the arguments it was called with.
    """

    @staticmethod
    def _new_adapter(**extra):
        """Create an adapter with dummy credentials plus any extra kwargs."""
        settings = {
            "api_key": "sk-test",
            "model": "test-model",
            "base_url": "https://example.test/v1",
            "provider": "openai",
        }
        settings.update(extra)
        return SubtitleAnalyzerAdapter(**settings)

    def test_generate_narration_copy_uses_plain_text_prompt_with_selected_type(self):
        adapter = self._new_adapter()
        reply = "她被家人逼到绝路,反击从这一刻开始。"

        with mock.patch.object(adapter, "_run_async_safely", return_value=reply) as call:
            result = adapter.generate_narration_copy(
                short_name="测试短剧",
                plot_analysis="女主被家人误会后反击。",
                subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n女主被误会。",
                temperature=0.7,
                narration_language="简体中文(中国)",
                drama_genre="家庭伦理",
            )

        sent = call.call_args.kwargs
        self.assertEqual("success", result["status"])
        self.assertIn("反击", result["narration_copy"])
        self.assertIn("家庭伦理", sent["prompt"])
        self.assertNotIn("response_format", sent)

    def test_generate_narration_copy_can_use_film_tv_prompt_category(self):
        self.assertTrue(PromptManager.exists("film_tv_narration", "narration_copy"))
        adapter = self._new_adapter(prompt_category="film_tv_narration")
        reply = "他发现证据不对,真正的凶手另有其人。"

        with mock.patch.object(adapter, "_run_async_safely", return_value=reply) as call:
            result = adapter.generate_narration_copy(
                short_name="测试电影",
                plot_analysis="主角发现证据疑点。",
                subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n证据不对。",
                temperature=0.7,
                narration_language="简体中文(中国)",
                drama_genre="悬疑/犯罪",
            )

        sent_prompt = call.call_args.kwargs["prompt"]
        self.assertEqual("success", result["status"])
        self.assertIn("影视解说正文创作任务", sent_prompt)
        self.assertIn("用户选择的影视类型", sent_prompt)
        self.assertNotIn("短剧解说正文创作任务", sent_prompt)

    def test_film_tv_script_prompts_exclude_intro_outro_and_ads(self):
        shared = {
            "drama_name": "测试电影",
            "drama_genre": "悬疑/犯罪",
            "plot_analysis": "主角发现证据疑点。",
            "subtitle_content": "# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n证据不对。",
            "narration_language": "简体中文(中国)",
        }
        cases = {
            "segment_planning": shared,
            "script_matching": dict(
                shared,
                narration_copy="他发现证据不对,真正的凶手另有其人。",
                original_sound_ratio=30,
            ),
            "script_generation": dict(shared, segment_plan='{"segments": []}'),
            "script_repair": dict(
                shared,
                invalid_script='{"items": []}',
                validation_errors="片段包含广告",
            ),
        }

        for prompt_name, parameters in cases.items():
            with self.subTest(prompt_name=prompt_name):
                prompt = PromptManager.get_prompt(
                    category="film_tv_narration",
                    name=prompt_name,
                    parameters=parameters,
                )
                for expected in ("片头", "片尾", "广告", "绝对不能"):
                    self.assertIn(expected, prompt)

    def test_match_narration_copy_to_script_uses_json_prompt_with_selected_type(self):
        adapter = self._new_adapter()
        item = {
            "_id": 1,
            "video_id": 1,
            "video_name": "1.mp4",
            "timestamp": "00:00:01,000-00:00:04,000",
            "picture": "女主被家人误会",
            "narration": "她被家人逼到绝路,反击从这一刻开始。",
            "OST": 0,
        }
        matched = json.dumps({"items": [item]}, ensure_ascii=False)

        with mock.patch.object(adapter, "_run_async_safely", return_value=matched) as call:
            result = adapter.match_narration_copy_to_script(
                short_name="测试短剧",
                plot_analysis="女主被家人误会后反击。",
                subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n女主被误会。",
                narration_copy="她被家人逼到绝路,反击从这一刻开始。",
                temperature=0.7,
                narration_language="简体中文(中国)",
                drama_genre="家庭伦理",
                original_sound_ratio=60,
            )

        sent = call.call_args.kwargs
        self.assertEqual("success", result["status"])
        self.assertEqual(1, json.loads(result["narration_script"])["items"][0]["_id"])
        self.assertIn("家庭伦理", sent["prompt"])
        self.assertIn("60%", sent["prompt"])
        self.assertEqual("json", sent["response_format"])

    def test_match_narration_copy_to_script_uses_streaming_when_callback_exists(self):
        adapter = self._new_adapter()
        matched = json.dumps({"items": []}, ensure_ascii=False)

        with mock.patch.object(adapter, "_run_async_safely", return_value=matched) as call:
            result = adapter.match_narration_copy_to_script(
                short_name="测试短剧",
                plot_analysis="女主被家人误会后反击。",
                subtitle_content="# 视频 1: 1.mp4",
                narration_copy="她被家人逼到绝路,反击从这一刻开始。",
                stream_callback=lambda _event: None,
            )

        self.assertEqual("success", result["status"])
        self.assertIs(UnifiedLLMService.generate_text_stream, call.call_args.args[0])
        self.assertIn("on_chunk", call.call_args.kwargs)

    def test_generate_narration_script_plans_segments_before_copywriting(self):
        adapter = self._new_adapter()
        plan_reply = json.dumps(
            {
                "segments": [
                    {
                        "_id": 1,
                        "video_id": 1,
                        "video_name": "1.mp4",
                        "timestamp": "00:00:01,000-00:00:04,000",
                        "OST": 0,
                        "intent": "开场钩子",
                    }
                ]
            },
            ensure_ascii=False,
        )
        script_reply = json.dumps(
            {
                "items": [
                    {
                        "_id": 1,
                        "video_id": 1,
                        "video_name": "1.mp4",
                        "timestamp": "00:00:01,000-00:00:04,000",
                        "picture": "女主被误会",
                        "narration": "她被所有人误会,真正的反击却刚刚开始。",
                        "OST": 0,
                    }
                ]
            },
            ensure_ascii=False,
        )

        # The first LLM call gets the segment plan, the second the final script.
        with mock.patch.object(
            adapter,
            "_run_async_safely",
            side_effect=[plan_reply, script_reply],
        ) as call:
            result = adapter.generate_narration_script(
                short_name="测试短剧",
                plot_analysis="女主被误会后反击。",
                subtitle_content="# 视频 1: 1.mp4\n00:00:01,000 --> 00:00:04,000\n女主被误会。",
                temperature=0.7,
                narration_language="简体中文(中国)",
            )

        self.assertEqual("success", result["status"])
        self.assertEqual(2, call.call_count)
        self.assertEqual(1, json.loads(result["narration_script"])["items"][0]["_id"])

    def test_repair_narration_script_returns_repaired_json(self):
        adapter = self._new_adapter()
        repaired = json.dumps({"items": []}, ensure_ascii=False)

        with mock.patch.object(adapter, "_run_async_safely", return_value=repaired):
            result = adapter.repair_narration_script(
                short_name="测试短剧",
                plot_analysis="",
                subtitle_content="# 视频 1: 1.mp4",
                invalid_script="{bad}",
                validation_errors="时间戳错误",
                narration_language="简体中文(中国)",
            )

        self.assertEqual("success", result["status"])
        self.assertEqual(repaired, result["narration_script"])
if __name__ == "__main__":
unittest.main()

View File

@ -12,6 +12,7 @@ from loguru import logger
from .manager import LLMServiceManager
from .validators import OutputValidator
from .exceptions import LLMServiceError
from app.services.prompts import PromptManager
# 提供商注册由 webui.py:main() 显式调用(见 LLM 提供商注册机制重构)
# 这样更可靠,错误也更容易调试
@ -107,6 +108,37 @@ class UnifiedLLMService:
except Exception as e:
logger.error(f"文本生成失败: {str(e)}")
raise LLMServiceError(f"文本生成失败: {str(e)}")
@staticmethod
async def generate_text_stream(
    prompt: str,
    system_prompt: Optional[str] = None,
    provider: Optional[str] = None,
    temperature: float = 1.0,
    max_tokens: Optional[int] = None,
    response_format: Optional[str] = None,
    on_chunk=None,
    **kwargs,
) -> str:
    """Generate text through the selected provider's streaming entry point.

    The provider is obtained from ``LLMServiceManager.get_text_provider`` and
    its ``generate_text_stream`` is awaited with the given arguments.

    Args:
        prompt: User prompt.
        system_prompt: Optional system prompt.
        provider: Provider name passed to ``LLMServiceManager.get_text_provider``.
        temperature: Sampling temperature.
        max_tokens: Optional output token limit.
        response_format: Optional response format hint.
        on_chunk: Optional callable receiving streaming events.
        **kwargs: Extra keyword arguments forwarded to the provider.

    Returns:
        The complete generated text.

    Raises:
        LLMServiceError: Any failure while resolving the provider or generating.
    """
    try:
        streaming_provider = LLMServiceManager.get_text_provider(provider)
        generated = await streaming_provider.generate_text_stream(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=temperature,
            max_tokens=max_tokens,
            response_format=response_format,
            on_chunk=on_chunk,
            **kwargs,
        )
        logger.info(f"流式文本生成完成,生成内容长度: {len(generated)} 字符")
        return generated
    except Exception as e:
        logger.error(f"流式文本生成失败: {str(e)}")
        raise LLMServiceError(f"流式文本生成失败: {str(e)}")
@staticmethod
async def generate_narration_script(prompt: str,
@ -162,6 +194,7 @@ class UnifiedLLMService:
async def analyze_subtitle(subtitle_content: str,
provider: Optional[str] = None,
temperature: float = 1.0,
prompt_category: str = "short_drama_narration",
validate_output: bool = True,
**kwargs) -> str:
"""
@ -181,12 +214,20 @@ class UnifiedLLMService:
LLMServiceError: 服务调用失败时抛出
"""
try:
# 构建分析提示词
system_prompt = "你是一位专业的剧本分析师和剧情概括助手。请仔细分析字幕内容,提取关键剧情信息。"
prompt = PromptManager.get_prompt(
category=prompt_category,
name="plot_analysis",
parameters={"subtitle_content": subtitle_content},
)
prompt_object = PromptManager.get_prompt_object(
category=prompt_category,
name="plot_analysis",
)
system_prompt = prompt_object.get_system_prompt()
# 生成分析结果
result = await UnifiedLLMService.generate_text(
prompt=subtitle_content,
prompt=prompt,
system_prompt=system_prompt,
provider=provider,
temperature=temperature,

View File

@ -113,6 +113,8 @@ class OutputValidator:
"required": ["_id", "timestamp", "picture", "narration"],
"properties": {
"_id": {"type": "number"},
"video_id": {"type": "number"},
"video_name": {"type": "string"},
"timestamp": {"type": "string"},
"picture": {"type": "string"},
"narration": {"type": "string"},
@ -161,6 +163,16 @@ class OutputValidator:
item_id = item.get("_id")
if not isinstance(item_id, (int, float)) or item_id <= 0:
raise ValidationError(f"{index+1}项ID必须为正整数: {item_id}", "invalid_id")
video_id = item.get("video_id")
if video_id not in (None, "") and (
not isinstance(video_id, (int, float)) or video_id <= 0
):
raise ValidationError(f"{index+1}项video_id必须为正整数: {video_id}", "invalid_video_id")
video_name = item.get("video_name")
if video_name not in (None, "") and not isinstance(video_name, str):
raise ValidationError(f"{index+1}项video_name必须为字符串: {video_name}", "invalid_video_name")
@staticmethod
def validate_subtitle_analysis(output: str) -> str:

View File

@ -9,6 +9,7 @@
'''
import os
import json
import shutil
import subprocess
from enum import Enum
@ -127,6 +128,188 @@ def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) ->
return concat_file_path
def _get_video_stream_signature(video_path: str) -> Optional[dict]:
    """Probe the first video stream and return the fields compared before a concat copy.

    Runs ``ffprobe`` on stream ``v:0`` with JSON output.

    Args:
        video_path: Path of the clip to probe.

    Returns:
        A dict with codec, profile, size, pixel format, frame rates, time
        base and sample aspect ratio (``"1:1"`` when ffprobe omits it), or
        ``None`` when probing fails or no video stream is reported. Both
        cases are logged as warnings.
    """
    try:
        probe = subprocess.run(
            [
                'ffprobe', '-v', 'error',
                '-select_streams', 'v:0',
                '-show_entries',
                'stream=codec_name,profile,width,height,pix_fmt,r_frame_rate,avg_frame_rate,time_base,sample_aspect_ratio',
                '-of', 'json',
                video_path,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        found_streams = json.loads(probe.stdout or "{}").get("streams", [])
        if not found_streams:
            logger.warning(f"视频没有可用的视频流,不能使用 copy 合并: {video_path}")
            return None

        first = found_streams[0]
        plain_fields = (
            "codec_name",
            "profile",
            "width",
            "height",
            "pix_fmt",
            "r_frame_rate",
            "avg_frame_rate",
            "time_base",
        )
        signature = {field: first.get(field) for field in plain_fields}
        signature["sample_aspect_ratio"] = first.get("sample_aspect_ratio", "1:1")
        return signature
    except Exception as e:
        logger.warning(f"探测视频流参数失败,不能使用 copy 合并: {video_path}, 错误: {str(e)}")
        return None
def _can_concat_video_copy(video_paths: List[str]) -> bool:
"""
判断所有片段的视频流参数是否一致避免 concat copy 造成时间轴或封装异常
"""
if not video_paths:
return False
signatures = []
for video_path in video_paths:
signature = _get_video_stream_signature(video_path)
if not signature:
return False
signatures.append(signature)
base_signature = signatures[0]
for video_path, signature in zip(video_paths[1:], signatures[1:]):
if signature != base_signature:
logger.warning(
"视频片段参数不一致,跳过 copy 合并并回退重编码: "
f"{video_path}, 基准={base_signature}, 当前={signature}"
)
return False
return True
def _get_media_duration(video_path: str) -> Optional[float]:
    """Return the container duration reported by ``ffprobe`` in seconds.

    Returns:
        The duration as a float, or ``None`` when ffprobe fails or its
        output cannot be parsed as a number; the failure is logged as a
        warning.
    """
    try:
        probe = subprocess.run(
            [
                'ffprobe', '-v', 'error',
                '-show_entries', 'format=duration',
                '-of', 'csv=p=0',
                video_path,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        return float(probe.stdout.strip())
    except Exception as e:
        logger.warning(f"探测视频时长失败: {video_path}, 错误: {str(e)}")
        return None
def _concat_duration_matches(video_paths: List[str], output_path: str) -> bool:
    """Check that the merged output lasts about as long as its inputs combined.

    The allowed deviation is ``max(0.5, 0.04 * len(video_paths))`` seconds.

    Returns:
        ``True`` when the deviation is within tolerance; ``False`` when it
        is not, or when any input or the output cannot be probed.
    """
    clip_durations = []
    for clip_path in video_paths:
        clip_duration = _get_media_duration(clip_path)
        if clip_duration is None:
            return False
        clip_durations.append(clip_duration)

    actual = _get_media_duration(output_path)
    if actual is None:
        return False

    expected = sum(clip_durations)
    drift = abs(expected - actual)
    allowed = max(0.5, len(video_paths) * 0.04)

    if not drift > allowed:
        logger.info(
            "视频流 copy 合并时长校验通过: "
            f"期望={expected:.3f}s, 实际={actual:.3f}s"
        )
        return True

    logger.warning(
        "视频流 copy 合并后的时长偏差过大,将回退重编码: "
        f"期望={expected:.3f}s, 实际={actual:.3f}s, 偏差={drift:.3f}s"
    )
    return False
def _build_concat_video_copy_cmd(concat_file: str, output_path: str) -> List[str]:
return [
'ffmpeg', '-y',
'-f', 'concat',
'-safe', '0',
'-i', concat_file,
'-c:v', 'copy',
'-an',
'-movflags', '+faststart',
'-avoid_negative_ts', 'make_zero',
output_path
]
def _build_concat_video_reencode_cmd(concat_file: str, output_path: str, threads: int) -> List[str]:
return [
'ffmpeg', '-y',
'-f', 'concat',
'-safe', '0',
'-i', concat_file,
'-c:v', 'libx264',
'-preset', 'medium',
'-profile:v', 'high',
'-an',
'-threads', str(threads),
output_path
]
def _concat_video_streams(
    video_paths: List[str],
    concat_file: str,
    output_path: str,
    threads: int
) -> None:
    """
    Concatenate the video streams, preferring a lossless stream copy.

    The copy path is attempted only when ``_can_concat_video_copy`` accepts
    the clips. Its result is kept only if ``_concat_duration_matches``
    confirms the output length; otherwise the output is removed and the
    clips are concatenated again with a re-encode.

    Args:
        video_paths: Paths of the clips listed in ``concat_file``.
        concat_file: Path of the ffmpeg concat list file.
        output_path: Path of the concatenated video to produce.
        threads: Thread count handed to the re-encode command.

    Raises:
        subprocess.CalledProcessError: If the re-encode fallback itself fails.
    """
    if _can_concat_video_copy(video_paths):
        copy_cmd = _build_concat_video_copy_cmd(concat_file, output_path)
        try:
            subprocess.run(copy_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if _concat_duration_matches(video_paths, output_path):
                logger.info("视频流 copy 合并完成")
                return

            # The copy result failed the duration check: discard it before re-encoding.
            if os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except OSError as e:
                    logger.warning(f"删除 copy 合并临时结果失败,将继续尝试重编码覆盖: {str(e)}")
        except subprocess.CalledProcessError as e:
            # stderr is raw bytes (no text=True above). Decode leniently so that
            # non-UTF-8 ffmpeg output cannot raise UnicodeDecodeError from this
            # handler and abort the re-encode fallback below.
            error_msg = e.stderr.decode(errors="replace") if e.stderr else str(e)
            logger.warning(f"视频流 copy 合并失败,将回退重编码合并: {error_msg}")
    else:
        logger.info("视频流不满足 copy 合并条件,将使用重编码合并")

    # Fallback (or only) path: re-encode; ffmpeg's -y overwrites any leftover output.
    reencode_cmd = _build_concat_video_reencode_cmd(concat_file, output_path, threads)
    subprocess.run(reencode_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    logger.info("视频流重编码合并完成")
def process_single_video(
input_path: str,
output_path: str,
@ -474,22 +657,7 @@ def combine_clip_videos(
concat_file = os.path.join(temp_dir, "concat_list.txt")
create_ffmpeg_concat_file(video_paths_only, concat_file)
# 合并所有视频流,但不包含音频
concat_cmd = [
'ffmpeg', '-y',
'-f', 'concat',
'-safe', '0',
'-i', concat_file,
'-c:v', 'libx264',
'-preset', 'medium',
'-profile:v', 'high',
'-an', # 不包含音频
'-threads', str(threads),
video_concat_path
]
subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
logger.info("视频流合并完成")
_concat_video_streams(video_paths_only, concat_file, video_concat_path, threads)
# 2. 提取并合并有音频的片段
audio_segments = [video for video in processed_videos if video["keep_audio"]]

View File

@ -56,11 +56,13 @@ __all__ = [
def initialize_prompts():
    """Initialize the prompt module and register all prompts."""
    from . import documentary
    from . import film_tv_narration
    from . import short_drama_editing
    from . import short_drama_narration
    # Register the prompts of each sub-module
    documentary.register_prompts()
    film_tv_narration.register_prompts()
    short_drama_editing.register_prompts()
    short_drama_narration.register_prompts()

View File

@ -0,0 +1,48 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: NarratoAI
@File : __init__.py
@Description: 影视解说提示词模块
"""
from .plot_analysis import PlotAnalysisPrompt
from .narration_copy import NarrationCopyPrompt
from .segment_planning import SegmentPlanningPrompt
from .script_generation import ScriptGenerationPrompt
from .script_matching import ScriptMatchingPrompt
from .script_repair import ScriptRepairPrompt
from ..manager import PromptManager
def register_prompts():
    """Register the film/TV narration prompts with PromptManager, each as a default."""
    prompt_classes = (
        PlotAnalysisPrompt,
        NarrationCopyPrompt,
        SegmentPlanningPrompt,
        ScriptGenerationPrompt,
        ScriptMatchingPrompt,
        ScriptRepairPrompt,
    )
    # Instantiate and register one prompt at a time, in the listed order.
    for prompt_cls in prompt_classes:
        PromptManager.register_prompt(prompt_cls(), is_default=True)
__all__ = [
"PlotAnalysisPrompt",
"NarrationCopyPrompt",
"SegmentPlanningPrompt",
"ScriptGenerationPrompt",
"ScriptMatchingPrompt",
"ScriptRepairPrompt",
"register_prompts",
]

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 影视解说-解说文案
@File : narration_copy.py
@Description: 生成可供用户审核修改的影视解说正文
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class NarrationCopyPrompt(ParameterizedPrompt):
"""影视解说正文生成提示词"""
def __init__(self):
    """Set up the prompt metadata, the required parameters and the system prompt."""
    prompt_tags = ["影视", "解说文案", "电影解说", "剧情承接", "用户审核"]
    accepted = ["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"]
    required = ["drama_name", "plot_analysis", "subtitle_content"]

    metadata = PromptMetadata(
        name="narration_copy",
        category="film_tv_narration",
        version="v1.0",
        description="基于剧情理解和字幕生成可审核修改的影视解说正文,不绑定时间戳",
        model_type=ModelType.TEXT,
        output_format=OutputFormat.TEXT,
        tags=prompt_tags,
        parameters=accepted,
    )
    super().__init__(metadata, required_parameters=required)

    system_prompt = "你是一位影视解说文案创作者。你只输出可供用户审核修改的解说正文,"
    system_prompt += "不要输出JSON、时间戳、编号、标题、解释或Markdown。"
    self._system_prompt = system_prompt
def get_template(self) -> str:
return """# 影视解说正文创作任务
## 目标
为影视作品${drama_name}创作一份可直接给用户审核修改的解说文案正文此阶段不做画面匹配不输出时间戳
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 原始字幕
<subtitles>
${subtitle_content}
</subtitles>
## 输出语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的影视类型
<drama_genre>
${drama_genre}
</drama_genre>
## 类型写作规则
必须按用户选择的影视类型调整表达重点不要自行改判类型
- 剧情/情感突出人物选择关系裂痕命运压力和情绪余波
- 悬疑/犯罪突出线索疑点动机误导和未揭开的真相
- 动作/冒险突出目标危险升级身体对抗和关键抉择
- 喜剧/轻松突出误会反差节奏包袱和人物可爱处
- 科幻/奇幻突出设定规则未知威胁世界观反差和代价
- 历史/战争突出时代处境阵营选择牺牲和局势变化
- 恐怖/惊悚突出异常细节压迫感未知危险和心理悬念
- 自定义类型严格服从用户填写的类型方向
## 开头钩子公式
开头必须使用人物困境 + 反常信息 + 悬念问题
1. 先点出主角或关键人物正在面对什么压力
2. 再抛出一个违背常识关系突变或危险升级的信息
3. 最后留下观众想继续看的问题他为什么这样做谁在撒谎这场选择会把所有人推向哪里
## 写作规则
1. 必须使用 ${narration_language}
2. 严格基于剧情理解和字幕事实不编造核心情节身份结局
3. 先写清楚人物动机和因果链再写情绪金句不要只堆形容词
4. 每句话只表达一个信息点适合后续按句匹配画面
5. 句子尽量短单句优先 15-35 信息复杂时拆成多句
6. 2-3 句要有明确承接让观众知道为什么从上一幕来到下一幕
7. 总长度控制在 350-750 短素材取下限长素材取上限
8. 不要使用编号项目符号章节标题或括号说明
## 输出要求
只输出解说正文不要输出 JSON时间戳代码块或任何解释"""

View File

@ -0,0 +1,99 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 影视解说-剧情分析
@File : plot_analysis.py
@Description: 影视剧情分析提示词
"""
from ..base import TextPrompt, PromptMetadata, ModelType, OutputFormat
class PlotAnalysisPrompt(TextPrompt):
"""影视剧情分析提示词"""
def __init__(self):
    """Set up the prompt metadata and the system prompt."""
    prompt_tags = ["影视", "电影", "电视剧", "剧情分析", "字幕解析", "解说脚本素材"]
    accepted = ["subtitle_content"]

    metadata = PromptMetadata(
        name="plot_analysis",
        category="film_tv_narration",
        version="v1.0",
        description="结合字幕和可选联网检索上下文,输出适合影视解说脚本生成的结构化剧情理解",
        model_type=ModelType.TEXT,
        output_format=OutputFormat.TEXT,
        tags=prompt_tags,
        parameters=accepted,
    )
    super().__init__(metadata)

    system_prompt = "你是一位专业的影视解说策划和剧作分析师。请输出克制、结构化、"
    system_prompt += "可直接供下游影视解说脚本生成使用的剧情理解材料。"
    self._system_prompt = system_prompt
def get_template(self) -> str:
return """# 角色
你是一位专业的影视解说策划和剧作分析师你的输出不是给观众看的成片文案而是给下游影视解说脚本生成器使用的结构化剧情理解材料
# 输入说明
下面的输入可能只包含一个视频的原始字幕也可能包含多个视频文件的字幕也可能同时包含联网检索结果和原始字幕
- 联网检索结果只能用于辅助识别作品名称人物关系时代背景公开剧情梗概
- 原始字幕是唯一可信的当前片段事实来源
- 如果联网检索结果与字幕冲突必须以字幕为准
- 如果联网检索结果包含当前字幕尚未出现的后续剧情只能放在字幕未覆盖/需谨慎信息不能写进当前剧情事实
- 多个视频字幕会以视频 1: 文件名视频 2: 文件名等标题分隔时间戳均为对应视频内部时间不是拼接后的累计时间
# 核心任务
请基于输入完成剧情理解目标是帮助后续生成高质量影视解说脚本
1. 识别作品名称当前字幕范围视频来源联网检索辅助信息和字幕事实边界
2. 统一人物称呼梳理人物关系动机和当前场景中的立场变化
3. 120-220 字概括当前字幕覆盖的剧情不提前剧透字幕未出现的内容
4. 按视频来源和字幕时间顺序拆分关键剧情段落并为每段标注准确 video_id / video_name / 时间戳
5. 提炼解说创作可用的开场钩子人物困境情绪转折信息反转名场面和建议保留原声片段
# 强制输出规则
1. 禁止输出寒暄解释身份或好的我将等聊天式开场
2. 禁止编造字幕中没有的具体事件对白关系进展或结局
3. 时间戳必须直接来自对应视频字幕无法确定时写字幕未明确不要猜测
4. 多视频场景下必须明确每段来自哪个视频文件禁止把不同视频的同名时间戳混在一起
5. 人名必须统一优先采用联网检索中的正式名称如果字幕写法不同在人物表中保留字幕称呼
6. 内容要简洁客观可复用避免散文化长段落
7. 必须严格按照下面的 Markdown 格式输出不要添加额外章节
# 输出格式
## 一、基础识别
- 作品名称[如输入可判断则填写否则写未知]
- 当前字幕范围[开始时间戳] --> [结束时间戳]无法确定则写字幕未明确
- 视频来源[列出视频编号文件名和各自字幕时间范围单视频也要写]
- 联网检索确认[仅写可辅助理解的公开信息没有联网结果则写未启用/未提供]
- 字幕内实际出现[列出当前字幕真实出现的关键事实2-5 ]
- 字幕未覆盖/需谨慎信息[列出联网结果提到但当前字幕未发生的内容没有则写]
## 二、人物与关系
| 统一称呼 | 字幕称呼 | 身份/关系 | 当前动机/立场 | 确定性 |
|---|---|---|---|---|
| [人物名] | [字幕原文称呼] | [身份或关系] | [在当前片段中的目标压力或转变] | 字幕明确/联网辅助/合理推断 |
## 三、整体剧情概括
[120-220 只概括当前字幕覆盖的剧情必须包含核心冲突人物动机场景推进和当前悬念]
## 四、分段剧情解析
| 视频 | 时间戳 | 段落主题 | 剧情事件 | 叙事功能 |
|---|---|---|---|---|
| [video_id + video_name] | [开始] --> [结束] | [简短主题] | [当前段落发生了什么] | [铺垫/冲突升级/人物塑造/反转/悬念/情绪爆发/名场面等] |
## 五、解说创作重点
- 开场钩子[用一句话指出最适合开场抓人的冲突谜题或人物困境]
- 核心冲突[当前片段最主要的矛盾]
- 情绪转折/信息反转[ 1-3 没有则写无明显]
- 名场面/高光对白[ 1-3 没有则写无明显]
- 悬念点[当前片段留下的疑问或后续期待]
- 建议保留原声片段
1. [video_id + video_name + 时间戳][保留理由如果没有合适原声无明显]
## 六、联网信息校验
- 可用于辅助理解的信息[联网结果中可帮助理解当前字幕的信息没有则写]
- 与字幕不一致或字幕未覆盖的信息[必须列出不要混入当前剧情事实没有则写]
# 输入内容
${subtitle_content}"""

View File

@ -0,0 +1,154 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 影视解说-文案画面匹配
@File : script_generation.py
@Description: 影视解说脚本生成提示词
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class ScriptGenerationPrompt(ParameterizedPrompt):
"""影视解说脚本生成提示词"""
def __init__(self):
    """Set up the prompt metadata, the required parameters and the system prompt."""
    prompt_tags = ["影视", "解说脚本", "文案生成", "原声片段", "悬念", "名场面"]
    accepted = [
        "drama_name",
        "drama_genre",
        "plot_analysis",
        "subtitle_content",
        "segment_plan",
        "narration_language",
    ]
    required = ["drama_name", "plot_analysis", "segment_plan"]

    metadata = PromptMetadata(
        name="script_generation",
        category="film_tv_narration",
        version="v1.0",
        description="基于已规划片段生成高质量影视解说脚本,重点补足人物动机、信息承接和剧情因果",
        model_type=ModelType.TEXT,
        output_format=OutputFormat.JSON,
        tags=prompt_tags,
        parameters=accepted,
    )
    super().__init__(metadata, required_parameters=required)

    system_prompt = "你是一位影视解说文案写手。你必须严格按照JSON格式输出"
    system_prompt += "只能补充picture和narration不能改动上游片段规划中的_id、video_id、video_name、timestamp和OST。"
    self._system_prompt = system_prompt
def get_template(self) -> str:
return """# 影视解说脚本文案生成任务
## 任务目标
为影视作品${drama_name}生成最终可剪辑解说脚本片段已经由上游规划完成你只能补充 picture narration不能改变片段来源和时间戳
## 输入材料
### 剧情概述
<plot>
${plot_analysis}
</plot>
### 已规划片段(必须逐项照抄结构字段)
<segment_plan>
${segment_plan}
</segment_plan>
### 原始字幕(含视频编号和精确时间戳)
<subtitles>
${subtitle_content}
</subtitles>
### 解说台词语言
<narration_language>
${narration_language}
</narration_language>
### 用户选择的影视类型
<drama_genre>
${drama_genre}
</drama_genre>
字幕可能来自多个视频文件每个字幕分段标题会以视频 1: 文件名视频 2: 文件名等形式标识来源
生成脚本时必须把每个片段绑定到对应视频来源时间戳表示该视频文件内部的局部时间不是把多个视频拼接后的全局时间
所有 OST=0 narration 字段必须使用上方指定的解说台词语言输出不要因为原始字幕是其他语言就切回字幕原语言
OST=1 的原声片段 narration 字段必须继续使用播放原片+序号格式不要翻译这个固定标记
## 绝对绑定规则
0. 最高优先级如果 segment_plan 中混入片头片尾演职员表版权声明平台水印展示下集预告花絮赞助口播商品露出贴片广告中插广告片中广告或任何与主线剧情无关的推广片段必须直接删除这些片段绝对不能输出到最终 items此规则高于下面所有照抄 segment_plan的绑定规则
1. 除被第 0 条删除的片头片尾和广告片段外输出 items 数量顺序和 _id 必须与 segment_plan 完全一致
2. 除被第 0 条删除的片头片尾和广告片段外每个 item _idvideo_idvideo_nametimestampOST 必须逐字复制 segment_plan不得新增合并拆分或改动
3. 你只能补充 picture narration 两个字段
4. OST=1 narration 必须写成播放原片+_id例如 _id 5 时写播放原片5
5. OST=0 narration 必须使用 ${narration_language}并严格基于剧情和字幕不虚构字幕外的具体事件
## 叙事连续性要求
- 你必须把每个 OST=0 当成观众理解剧情的桥不能只概括当前画面
- 每个 OST=0 narration 要尽量回答上一段发生了什么人物为什么这么做这一段带来什么新信息或新危机
- video_id 或跨时间大跳跃时OST=0 必须明确补出承接句例如真正危险的不是这场争吵而是他终于发现证据指向了身边人
- 原声片段前后的 OST=0 要解释原声的重要性避免观众只看到对白片段合集
- 如果 segment_plan 中有 story_roleintenttransition 字段必须利用它们组织 narration但不要把这些字段输出到最终 JSON
- 结尾 OST=0 要留下后续阻力真相疑问或人物选择如果结尾是 OST=1则前一个 OST=0 必须提前点出这段原声会把矛盾推向哪里
## 开头钩子要求
- 第一段必须是 OST=0 解说钩子不能直接播放原片
- 开头用人物困境 + 反常信息 + 悬念问题主角压力 + 异常线索/关系突变 + 后续疑问
- 写法示例方向他以为这只是一次普通问询可一句话之后所有证据都指向了他最信任的人
- 示例只用于理解公式必须基于当前字幕事实原创不要夸大到字幕没有的情节
## 解说密度与画面节奏
- OST=0 文案必须能被当前 timestamp 的画面承载解说字数 / 5 = 所需视频秒数估算
- 如果画面只有 6 就不要写 80 应压缩到约 30 或依赖 segment_plan 选择更长画面
- 优先短句单句只表达一个信息点不要把人物介绍前因反转和悬念全塞进一个短画面
- 长信息要拆成多段每段只承担一个叙事功能让画面节奏跟上解说
## 用户选择类型文案规则
影视类型由用户手动选择为 ${drama_genre}不得自行改判必须按对应方向写
- 剧情/情感突出人物选择关系裂痕命运压力和情绪余波
- 悬疑/犯罪突出线索疑点动机误导和未揭开的真相
- 动作/冒险突出目标危险升级身体对抗和关键抉择
- 喜剧/轻松突出误会反差节奏包袱和人物可爱处
- 科幻/奇幻突出设定规则未知威胁世界观反差和代价
- 历史/战争突出时代处境阵营选择牺牲和局势变化
- 恐怖/惊悚突出异常细节压迫感未知危险和心理悬念
- 自定义类型严格服从用户填写的类型方向
## 文案质量要求
- 开场片段要有强钩子直接点出冲突疑点或人物困境
- 最终剪辑脚本不得包含片头片尾或任何广告片段如果字幕内容明显属于非剧情推广不要把它包装成剧情解说
- 每段解说优先 25-90 具体长度必须服从画面时长短画面宁可少说不要密集灌信息
- 可以使用可真正的问题是而他还不知道这句话背后危险已经开始靠近等影视解说转折语但不要堆砌
- picture 要描述画面和人物状态便于后期识别素材
- 少用孤立信息句多用承接句不要让观众感觉剧情突然跳场
- 不要解释规则不要输出 Markdown不要输出代码块
## 输出格式
请严格按照以下JSON格式输出绝不添加任何其他文字说明或代码块标记
{
"items": [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:01,000-00:00:05,500",
"picture": "男主站在审讯室门口,神情紧张地看向桌上的证据袋",
"narration": "他以为这只是一次普通问询,可桌上的证据却把所有矛头指向了自己。",
"OST": 0
},
{
"_id": 2,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:05,500-00:00:08,000",
"picture": "警官低声质问,男主沉默不语",
"narration": "播放原片2",
"OST": 1
}
]
}
现在请基于以上要求为影视作品${drama_name}创作解说脚本"""

View File

@ -0,0 +1,133 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 影视解说-文案画面匹配
@File : script_matching.py
@Description: 将用户审核后的影视解说文案匹配到字幕时间戳并生成最终剪辑脚本
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class ScriptMatchingPrompt(ParameterizedPrompt):
"""影视解说文案画面匹配提示词"""
def __init__(self):
    """Set up the prompt metadata, the required parameters and the system prompt."""
    prompt_tags = ["影视", "画面匹配", "剪辑脚本", "时间戳", "用户文案"]
    accepted = [
        "drama_name",
        "drama_genre",
        "plot_analysis",
        "subtitle_content",
        "narration_copy",
        "narration_language",
        "original_sound_ratio",
    ]
    required = ["drama_name", "subtitle_content", "narration_copy"]

    metadata = PromptMetadata(
        name="script_matching",
        category="film_tv_narration",
        version="v1.0",
        description="将审核后的影视解说文案按叙事节奏拆分并匹配到字幕时间戳生成最终剪辑JSON",
        model_type=ModelType.TEXT,
        output_format=OutputFormat.JSON,
        tags=prompt_tags,
        parameters=accepted,
    )
    super().__init__(
        metadata,
        required_parameters=required,
    )

    system_prompt = "你是一位懂影视叙事节奏的剪辑师。你必须严格输出JSON"
    system_prompt += "核心任务是把用户审核后的解说文案逐句匹配到最合适的原视频字幕时间戳。"
    self._system_prompt = system_prompt
def get_template(self) -> str:
return """# 影视解说文案画面匹配任务
## 目标
用户已经审核并修改了解说文案请根据这份文案和原始字幕生成最终可剪辑 JSON 脚本
## 作品名
${drama_name}
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 用户审核后的解说文案
<narration_copy>
${narration_copy}
</narration_copy>
## 原始字幕(含视频编号和局部时间戳)
<subtitles>
${subtitle_content}
</subtitles>
## 输出语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的影视类型
<drama_genre>
${drama_genre}
</drama_genre>
## 用户选择的原片占比
<original_sound_ratio>
${original_sound_ratio}%
</original_sound_ratio>
## 匹配流程
1. 先按句号问号感叹号省略号切分解说文案得到候选解说句
2. 逗号只在明显分割两个动作场景观点或描述对象时切分不要切出没有独立意义的碎片
3. 不要求每个候选句都单独输出为 OST=0可以合并压缩相邻候选句作为剧情桥段但不能改变用户文案的核心意思
4. 严禁把解说文案匹配到片头片尾演职员表版权声明平台水印展示下集预告花絮赞助口播商品露出贴片广告中插广告片中广告或任何与主线剧情无关的推广片段这些内容绝对不能进入最终 items
5. 如果字幕或画面文字出现广告赞助推广片头片尾预告下集扫码购买会员关注等明显非剧情信号必须跳过对应时间段不得用作 OST=0 OST=1
6. 为每个解说片段寻找最匹配的原始字幕画面优先选择能表达该句核心含义人物状态或信息转折的画面
7. 使用公式估算所需画面时长所需秒数 = 解说字数 / 5匹配画面时长尽量接近误差优先控制在 ±0.5
8. 如果一句解说太长必须拆成多个 OST=0 片段分别匹配不同或连续画面
9. timestamp 必须使用对应 video_id 内部局部时间戳不得换算为多个视频拼接后的累计时间
10. 同一 video_id 内时间段不得交叉或重叠
11. 第一段必须是 OST=0 解说钩子不能直接播放原片
12. OST=1 原声片段的总时长占比要尽量接近用户选择的 ${original_sound_ratio}%这里按最终 items timestamp 总时长估算不按片段数量估算
13. 不要自行判断或改写影视类型画面匹配和 picture 描述要服务用户选择的 ${drama_genre} 叙事重点
## 原片占比规则
- ${original_sound_ratio}% = 0% 不要输出 OST=1全部使用解说承接
- ${original_sound_ratio}% 10%-30% 只保留关键对白信息反转情绪爆发或名场面原声
- ${original_sound_ratio}% 40%-60% 解说负责串联因果原片负责承载关键场面和对白
- ${original_sound_ratio}% 70%-90% 以原片对白和表演为主解说只做开场钩子转场桥和必要补充
- 如果原片占比与第一段必须 OST=0冲突优先保证第一段是 OST=0然后在后续片段提高 OST=1 时长占比
- 选择高原片占比时可以把用户文案合并成更少的 OST=0 桥段不要为了逐句使用文案而压低原片占比
## 字段规则
- _id 1 开始连续递增
- video_id来自字幕分段标题例如视频 2就填 2
- video_name对应视频文件名必须从字幕分段标题提取
- timestamp格式为 "HH:MM:SS,mmm-HH:MM:SS,mmm"
- picture描述匹配画面中人物动作情绪场景和关键道具
- narrationOST=0 时填写用户文案片段OST=1 时填写播放原片+_id
- OST解说片段填 0原声片段填 1
## 输出格式
只输出严格 JSON
{
"items": [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:01,000-00:00:06,000",
"picture": "主角站在走廊尽头,回头看向紧闭的房门",
"narration": "他以为自己终于逃出了那间房,可真正的危险,其实才刚刚醒来。",
"OST": 0
}
]
}
现在请基于用户审核后的解说文案生成最终剪辑脚本"""

View File

@ -0,0 +1,98 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 影视解说-脚本修复
@File : script_repair.py
@Description: 影视解说脚本校验失败后的JSON修复提示词
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class ScriptRepairPrompt(ParameterizedPrompt):
"""影视解说脚本修复提示词"""
def __init__(self):
    """Set up the prompt metadata, the required parameters and the system prompt."""
    prompt_tags = ["影视", "解说脚本", "JSON修复", "时间戳校验", "多视频"]
    accepted = [
        "drama_name",
        "drama_genre",
        "plot_analysis",
        "subtitle_content",
        "invalid_script",
        "validation_errors",
        "narration_language",
    ]
    required = ["drama_name", "subtitle_content", "invalid_script", "validation_errors"]

    metadata = PromptMetadata(
        name="script_repair",
        category="film_tv_narration",
        version="v1.0",
        description="根据确定性校验错误修复影视解说脚本JSON优先修正时间戳、视频来源和格式问题",
        model_type=ModelType.TEXT,
        output_format=OutputFormat.JSON,
        tags=prompt_tags,
        parameters=accepted,
    )
    super().__init__(
        metadata,
        required_parameters=required,
    )

    system_prompt = "你是一位影视解说脚本JSON修复器。你只能根据校验错误修复JSON"
    system_prompt += "必须输出严格JSON不能输出解释、Markdown或代码块。"
    self._system_prompt = system_prompt
def get_template(self) -> str:
return """# 影视解说脚本修复任务
## 修复目标
下面的影视作品${drama_name}解说脚本未通过剪辑校验请只根据校验错误和字幕内容修复它输出一个完整可剪辑的 JSON
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 校验错误
<validation_errors>
${validation_errors}
</validation_errors>
## 当前无效脚本
<invalid_script>
${invalid_script}
</invalid_script>
## 可用字幕窗口
<subtitles>
${subtitle_content}
</subtitles>
## 解说台词目标语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的影视类型
<drama_genre>
${drama_genre}
</drama_genre>
## 修复规则
1. 只输出 JSON不要任何解释标题Markdown 或代码块
2. 输出根对象必须是 {"items": [...]}
3. 每个 item 必须包含 _idvideo_idvideo_nametimestamppicturenarrationOST
4. 必须删除片头片尾演职员表版权声明平台水印展示下集预告花絮赞助口播商品露出贴片广告中插广告片中广告或任何与主线剧情无关的推广片段这些内容绝对不能出现在修复后的 items
5. 如果字幕或画面文字出现广告赞助推广片头片尾预告下集扫码购买会员关注等明显非剧情信号必须删除对应 item不得改写成解说片段
6. video_idvideo_name timestamp 必须来自对应字幕窗口不得把不同视频的同名时间戳混用
7. 同一 video_id 内片段不得交叉或重叠
8. OST=1 narration 必须是播放原片+序号OST=0 narration 必须使用 ${narration_language}
9. 禁止连续 3 个或更多 OST=1必须插入或改写 OST=0 解说片段承接剧情
10. video_id 切换前后不能都是 OST=1必须至少有一个 OST=0 片段解释场景和剧情为什么切换
11. OST=0 narration 要补足人物动机信息承接和因果转折不要只概括当前画面
12. 第一段必须是 OST=0 解说钩子人物困境 + 反常信息 + 悬念问题不要直接播放原片
13. OST=0 文案必须匹配画面时长解说字数 / 5 = 所需视频秒数估算过密时要缩短文案延长时间戳或拆成多个片段
14. 不要自行改判影视类型如需改写 narration必须按用户选择的 ${drama_genre} 保持表达重点
15. 尽量保留原脚本中没有错误的片段无法修复的片段可以删除但剩余片段必须重新按 1 开始编号
请输出修复后的完整 JSON"""

View File

@ -0,0 +1,105 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 影视解说-片段规划
@File : segment_planning.py
@Description: 影视解说脚本片段规划提示词
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class SegmentPlanningPrompt(ParameterizedPrompt):
"""影视解说片段规划提示词"""
def __init__(self):
    """Set up the prompt metadata, the required parameters and the system prompt."""
    prompt_tags = ["影视", "解说脚本", "片段规划", "时间戳", "多视频", "原声"]
    accepted = ["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"]
    required = ["drama_name", "plot_analysis", "subtitle_content"]

    metadata = PromptMetadata(
        name="segment_planning",
        category="film_tv_narration",
        version="v1.0",
        description="基于剧情理解和原始字幕规划可剪辑片段,优先保证影视叙事连续性和原声解说节奏",
        model_type=ModelType.TEXT,
        output_format=OutputFormat.JSON,
        tags=prompt_tags,
        parameters=accepted,
    )
    super().__init__(metadata, required_parameters=required)

    system_prompt = "你是一位影视解说剪辑规划师。你的任务是从字幕中选择可剪辑片段,"
    system_prompt += "必须严格输出JSON不能写解说文案不能输出Markdown或额外说明。"
    self._system_prompt = system_prompt
def get_template(self) -> str:
return """# 影视解说片段规划任务
## 目标
为影视作品${drama_name}规划一组可直接剪辑的视频片段你只负责选片段和标注用途不写最终解说台词
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 原始字幕(含视频编号和局部时间戳)
<subtitles>
${subtitle_content}
</subtitles>
## 解说台词目标语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的影视类型
<drama_genre>
${drama_genre}
</drama_genre>
## 叙事规划目标
你不是在挑精彩片段合集而是在规划一条观众能顺着看懂的影视解说故事线必须先想清楚人物处境 -> 事件触发 -> 关系或信息变化 -> 新危机 -> 悬念的因果链再选片段
## 开场钩子规则
第一段必须是 OST=0 解说开场不要直接播放原片开头参考人物困境 + 反常信息 + 悬念问题的公式
- 先给人物一个明确压力被误解被追捕被迫选择失去重要之人发现异常线索
- 再给一个反常信息熟人背叛证据失效规则被打破危险提前出现
- 最后抛出问题谁在说谎真相藏在哪里这次选择会付出什么代价
- 不要照抄示例要基于字幕事实改写成当前作品自己的钩子
## 规划规则
1. 只能使用原始字幕中真实存在的视频编号视频文件名和时间范围
2. timestamp 必须是对应 video_id 内部的局部时间戳禁止换算成多个视频拼接后的累计时间
3. 同一个 video_id 内的片段不得交叉或重叠尽量按故事顺序排列
4. 严禁选择片头片尾演职员表版权声明平台水印展示下集预告花絮赞助口播商品露出贴片广告中插广告片中广告或任何与主线剧情无关的推广片段这些内容绝对不能进入 segments
5. 如果字幕或画面文字出现广告赞助推广片头片尾预告下集扫码购买会员关注等明显非剧情信号必须整段跳过不得用作 OST=0 OST=1
6. 每个片段必须推动主线解释人物动机制造情绪转折承接原声或保留关键对白
7. OST=1 表示保留原声适合关键对白情绪爆发真相揭露名场面和反转OST=0 表示后续需要配解说
8. 原声片段单段优先控制在 3-10 解说片段可以更长但必须能从字幕范围中定位
9. 影视类型由用户手动选择为 ${drama_genre}不得自行改判选片段时优先服务该类型的主要看点
10. 禁止连续 3 个或更多 OST=1 1-2 个原声片段后必须安排 OST=0 解说片段承接剧情
11. video_id 切换前后必须至少有一个 OST=0 片段作为剧情桥段解释为什么从上一场转到下一场
12. 每个 OST=0 片段必须承担明确叙事功能开场钩子人物介绍因果过渡信息解释情绪转折冲突升级结尾悬念
13. 不要跳过关键因果关系变化线索发现危机升级必须有画面或解说桥段承接
14. 结尾优先选择能留下新问题新危险或人物选择的片段不要只停在原声对白堆叠上
15. 解说画面必须给足时长解说字数 / 5 = 所需视频秒数预估短画面不要承载长解说
## 输出格式
只输出严格 JSON
{
"segments": [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:01,000-00:00:05,500",
"OST": 0,
"story_role": "开场钩子",
"intent": "点出主角困境和反常线索,制造继续观看的疑问",
"transition": "从当前场景切入人物压力,引出下一段关键对白"
}
]
}
现在请规划影视作品${drama_name}的解说片段"""

View File

@ -10,7 +10,11 @@
"""
from .plot_analysis import PlotAnalysisPrompt
from .narration_copy import NarrationCopyPrompt
from .segment_planning import SegmentPlanningPrompt
from .script_generation import ScriptGenerationPrompt
from .script_matching import ScriptMatchingPrompt
from .script_repair import ScriptRepairPrompt
from ..manager import PromptManager
@ -20,14 +24,34 @@ def register_prompts():
# 注册剧情分析提示词
plot_analysis_prompt = PlotAnalysisPrompt()
PromptManager.register_prompt(plot_analysis_prompt, is_default=True)
# 注册可审核解说文案提示词
narration_copy_prompt = NarrationCopyPrompt()
PromptManager.register_prompt(narration_copy_prompt, is_default=True)
# 注册片段规划提示词
segment_planning_prompt = SegmentPlanningPrompt()
PromptManager.register_prompt(segment_planning_prompt, is_default=True)
# 注册解说脚本生成提示词
script_generation_prompt = ScriptGenerationPrompt()
PromptManager.register_prompt(script_generation_prompt, is_default=True)
# 注册文案画面匹配提示词
script_matching_prompt = ScriptMatchingPrompt()
PromptManager.register_prompt(script_matching_prompt, is_default=True)
# 注册解说脚本修复提示词
script_repair_prompt = ScriptRepairPrompt()
PromptManager.register_prompt(script_repair_prompt, is_default=True)
__all__ = [
"PlotAnalysisPrompt",
"NarrationCopyPrompt",
"SegmentPlanningPrompt",
"ScriptGenerationPrompt",
"ScriptMatchingPrompt",
"ScriptRepairPrompt",
"register_prompts"
]

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 短剧解说-解说文案
@File : narration_copy.py
@Description: 生成可供用户审核修改的短剧解说正文
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class NarrationCopyPrompt(ParameterizedPrompt):
"""短剧解说正文生成提示词"""
def __init__(self):
    """Set up the prompt metadata, the required parameters and the system prompt."""
    prompt_tags = ["短剧", "解说文案", "爆款开头", "叙事连续性", "用户审核"]
    accepted = ["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"]
    required = ["drama_name", "plot_analysis", "subtitle_content"]

    metadata = PromptMetadata(
        name="narration_copy",
        category="short_drama_narration",
        version="v1.0",
        description="基于剧情理解和字幕生成可审核修改的短剧解说正文,不绑定时间戳",
        model_type=ModelType.TEXT,
        output_format=OutputFormat.TEXT,
        tags=prompt_tags,
        parameters=accepted,
    )
    super().__init__(metadata, required_parameters=required)

    system_prompt = "你是一位短剧解说文案创作者。你只输出可供用户审核修改的解说正文,"
    system_prompt += "不要输出JSON、时间戳、编号、标题、解释或Markdown。"
    self._system_prompt = system_prompt
def get_template(self) -> str:
return """# 短剧解说正文创作任务
## 目标
为短剧${drama_name}创作一份可直接给用户审核修改的解说文案正文此阶段不做画面匹配不输出时间戳
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 原始字幕
<subtitles>
${subtitle_content}
</subtitles>
## 输出语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的短剧类型
<drama_genre>
${drama_genre}
</drama_genre>
## 类型写作规则
必须按用户选择的短剧类型调整表达重点不要自行改判类型
- 霸总/甜宠突出误会身份差暧昧拉扯守护感和情绪反差
- 逆袭/复仇突出羞辱反击打脸身份揭露和爽点升级
- 家庭伦理突出亲情撕扯秘密委屈选择和道德冲突
- 古装/权谋突出身份局势算计立场和反转
- 悬疑/犯罪突出线索危机动机和未揭开的疑问
- 都市情感突出关系裂痕现实压力误会和情绪拉扯
- 年代/乡村突出家庭处境人情压力生活困境和命运转折
- 自定义类型严格服从用户填写的类型方向
## 开头钩子公式
开头必须使用高能反转 + 情绪冲突 + 悬念钩子
1. 强身份或强处境兵王单亲妈妈被赶出家门的女人被全家看不起的人等
2. 致命反差刚立功就被迫退役刚回家就发现钱被输光刚结婚就遇到孩子/婆婆阻挠
3. 后续悬念真正的噩梦才开始他要讨回的不是钱这段关系真正难的不是相爱
## 写作规则
1. 必须使用 ${narration_language}
2. 严格基于剧情理解和字幕事实不编造核心情节身份结局
3. 先写完整故事线再写金句不要只堆爆点
4. 每句话只表达一个信息点适合后续按句匹配画面
5. 句子尽量短单句优先 15-35 信息复杂时拆成多句
6. 2-3 句要有明确因果承接让观众知道为什么从上一幕来到下一幕
7. 总长度控制在 300-650 短素材取下限长素材取上限
8. 不要使用编号项目符号章节标题或括号说明
## 输出要求
只输出解说正文不要输出 JSON时间戳代码块或任何解释"""

View File

@ -19,72 +19,79 @@ class PlotAnalysisPrompt(TextPrompt):
metadata = PromptMetadata(
name="plot_analysis",
category="short_drama_narration",
version="v1.0",
description="分析短剧字幕内容,提供详细的剧情分析和分段解析",
version="v1.1",
description="结合字幕和可选联网检索上下文,输出适合短剧解说脚本生成的结构化剧情理解",
model_type=ModelType.TEXT,
output_format=OutputFormat.TEXT,
tags=["短剧", "剧情分析", "字幕解析", "分段分析"],
tags=["短剧", "剧情分析", "字幕解析", "分段分析", "联网检索", "解说脚本素材"],
parameters=["subtitle_content"]
)
super().__init__(metadata)
self._system_prompt = "你是一位专业的剧本分析师和剧情概括助手"
self._system_prompt = "你是一位专业的短剧解说策划和剧本分析师。请输出克制、结构化、可直接供下游解说脚本生成使用的剧情理解材料"
def get_template(self) -> str:
return """# 角色
你是一位专业的剧本分析师和剧情概括助手
你是一位专业的短剧解说策划和剧本分析师你的输出不是给观众看的成片文案而是给下游短剧解说脚本生成器使用的结构化剧情理解材料
# 任务
我将为你提供一部短剧的完整字幕文本请你基于这些字幕完成以下任务
1. **整体剧情分析**简要概括整个短剧的核心剧情脉络主要冲突和结局如果有的话
2. **分段剧情解析与时间戳定位**
* 将整个短剧划分为若干个关键的剧情段落例如开端发展转折高潮结局或根据具体情节自然划分
* 段落数应该与字幕长度成正比
* 对于每一个剧情段落
* **概括该段落的主要内容**用简洁的语言描述这段剧情发生了什么
* **标注对应的时间戳范围**明确指出该剧情段落对应的开始字幕时间戳和结束字幕时间戳请直接从字幕中提取时间信息
# 输入说明
下面的输入可能只包含一个视频的原始字幕也可能包含多个视频文件的字幕也可能同时包含 Tavily 联网检索结果和原始字幕
- 联网检索结果只能用于辅助识别短剧名称人物关系时代背景公开剧情梗概
- 原始字幕是唯一可信的当前片段事实来源
- 如果联网检索结果与字幕冲突必须以字幕为准
- 如果联网检索结果包含当前字幕尚未出现的后续剧情只能放在字幕未覆盖/需谨慎信息不能写进当前剧情事实
- 多个视频字幕会以视频 1: 文件名视频 2: 文件名等标题分隔时间戳均为对应视频内部时间不是拼接后的累计时间
# 输入格式
字幕内容通常包含时间戳和对话例如
```
00:00:05,000 --> 00:00:10,000
[角色A]: 你好吗
00:00:10,500 --> 00:00:15,000
[角色B]: 我很好谢谢发生了一些有趣的事情
... (更多字幕内容) ...
```
我将把实际字幕粘贴在下方
# 核心任务
请基于输入完成剧情理解目标是帮助后续生成高质量短剧解说脚本
1. 识别短剧名称当前字幕范围视频来源联网检索辅助信息和字幕事实边界
2. 统一人物称呼避免同一人物出现多个名字写法
3. 100-180 字概括当前字幕覆盖的剧情不提前剧透字幕未出现的内容
4. 按视频来源和字幕时间顺序拆分关键剧情段落并为每段标注准确 video_id / video_name / 时间戳
5. 提炼解说创作可用的钩子冲突爽点/泪点/悬念点和建议保留原声片段
# 输出格式要求
请按照以下格式清晰地呈现分析结果
# 强制输出规则
1. 禁止输出寒暄解释身份或好的我将等聊天式开场
2. 禁止编造字幕中没有的具体事件对白关系进展或结局
3. 时间戳必须直接来自对应视频字幕无法确定时写字幕未明确不要猜测
4. 多视频场景下必须明确每段来自哪个视频文件禁止把不同视频的同名时间戳混在一起
5. 人名必须统一优先采用联网检索中的正式名称如果字幕写法不同在人物表中保留字幕称呼
6. 内容要简洁客观可复用避免散文化长段落
7. 必须严格按照下面的 Markdown 格式输出不要添加额外章节
**整体剧情概括**
[此处填写对整个短剧剧情的概括]
# 输出格式
## 一、基础识别
- 短剧名称[如输入可判断则填写否则写未知]
- 当前字幕范围[开始时间戳] --> [结束时间戳]无法确定则写字幕未明确
- 视频来源[列出视频编号文件名和各自字幕时间范围单视频也要写]
- 联网检索确认[仅写可辅助理解的公开信息没有联网结果则写未启用/未提供]
- 字幕内实际出现[列出当前字幕真实出现的关键事实2-4 ]
- 字幕未覆盖/需谨慎信息[列出联网结果提到但当前字幕未发生的内容没有则写]
**分段剧情解析**
## 二、人物与关系
| 统一称呼 | 字幕称呼 | 身份/关系 | 当前剧情作用 | 确定性 |
|---|---|---|---|---|
| [人物名] | [字幕原文称呼] | [身份或关系] | [在当前片段中的作用] | 字幕明确/联网辅助/合理推断 |
**剧情段落 1[段落主题/概括例如主角登场与背景介绍]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
## 三、整体剧情概括
[100-180 只概括当前字幕覆盖的剧情必须包含核心冲突人物动机和当前悬念]
**剧情段落 2[段落主题/概括例如第一个冲突出现]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
## 四、分段剧情解析
| 视频 | 时间戳 | 段落主题 | 剧情事件 | 情绪/冲突功能 |
|---|---|---|---|---|
| [video_id + video_name] | [开始] --> [结束] | [简短主题] | [当前段落发生了什么] | [铺垫/冲突升级/人物塑造/反转/悬念/情绪爆发等] |
... (根据实际剧情段落数量继续) ...
## 五、解说创作重点
- 开场钩子[用一句话指出最适合开场抓人的冲突或疑问]
- 核心冲突[当前片段最主要的矛盾]
- 爽点/泪点/情绪点[ 1-3 没有则写无明显]
- 悬念点[当前片段留下的疑问或后续期待]
- 建议保留原声片段
1. [video_id + video_name + 时间戳][保留理由如果没有合适原声无明显]
**剧情段落 N[段落主题/概括例如结局与反思]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
## 六、联网信息校验
- 可用于辅助理解的信息[联网结果中可帮助理解当前字幕的信息没有则写]
- 与字幕不一致或字幕未覆盖的信息[必须列出不要混入当前剧情事实没有则写]
# 注意事项
* 请确保时间戳的准确性直接引用字幕中的时间
* 剧情段落的划分应合乎逻辑能够反映剧情的起承转合
* 语言表达应简洁准确客观
# 限制
1. 严禁输出与分析结果无关的内容
2. 时间戳必须严格按照字幕中的实际时间
# 请处理以下字幕:
# 输入内容
${subtitle_content}"""

View File

@ -19,222 +19,112 @@ class ScriptGenerationPrompt(ParameterizedPrompt):
metadata = PromptMetadata(
name="script_generation",
category="short_drama_narration",
version="v2.0",
description="基于短剧解说创作核心要素,生成高质量解说脚本,包含黄金开场、爽点放大、个性吐槽等专业技巧",
version="v2.1",
description="基于已规划片段生成高质量短剧解说脚本,重点补足剧情承接、因果解释和观众理解路径",
model_type=ModelType.TEXT,
output_format=OutputFormat.JSON,
tags=["短剧", "解说脚本", "文案生成", "原声片段", "黄金开场", "爽点放大", "个性吐槽", "悬念预埋"],
parameters=["drama_name", "plot_analysis", "subtitle_content"]
parameters=[
"drama_name",
"drama_genre",
"plot_analysis",
"subtitle_content",
"segment_plan",
"narration_language",
]
)
super().__init__(metadata, required_parameters=["drama_name", "plot_analysis"])
super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "segment_plan"])
self._system_prompt = "你是一位顶级的短剧解说up主精通短视频创作的所有核心技巧。你必须严格按照JSON格式输出绝不能包含任何其他文字、说明或代码块标记。"
self._system_prompt = (
"你是一位短剧解说文案写手。你必须严格按照JSON格式输出"
"只能补充picture和narration不能改动上游片段规划中的_id、video_id、video_name、timestamp和OST。"
)
def get_template(self) -> str:
return """# 短剧解说脚本创作任务
return """# 短剧解说脚本文案生成任务
## 任务目标
我是一位专业的短剧解说up主需要为短剧${drama_name}创作一份高质量的解说脚本目标是让观众在短时间内了解剧情精华并产生强烈的继续观看欲望
为短剧${drama_name}生成最终可剪辑解说脚本片段已经由上游规划完成你只能补充 picture narration不能改变片段来源和时间戳
## 素材信息
## 输入材料
### 剧情概述
<plot>
${plot_analysis}
</plot>
### 原始字幕(含精确时间戳)
### 已规划片段(必须逐项照抄结构字段)
<segment_plan>
${segment_plan}
</segment_plan>
### 原始字幕(含视频编号和精确时间戳)
<subtitles>
${subtitle_content}
</subtitles>
## 短剧解说创作核心要素
### 解说台词语言
<narration_language>
${narration_language}
</narration_language>
### 1. 黄金开场3秒法则
**开头3秒内必须制造强烈钩子激发"想知道后续发展"的强烈好奇心**
- **悬念设置**直接抛出最核心的冲突或疑问
* 示例"身为一个名声恶臭的政客,他知道自己早晚会被暗杀"
* 技巧直接定性角色身份和处境制造紧张感
- **冲突展示**展现最激烈的对立关系
* 示例"而这一天,就在他刚露头的时候..."
* 技巧用时间节点强调关键时刻的到来
- **情感共鸣**触及观众内心的普遍情感
- **反转预告**暗示即将发生的惊人转折
* 技巧使用"没想到""原来""竟然"等词汇预告反转
### 用户选择的短剧类型
<drama_genre>
${drama_genre}
</drama_genre>
### 2. 主线提炼(去繁就简)
**快节奏解说速度超越原剧专注核心主线**
- 舍弃次要情节和配角只保留推动主线的关键人物
- 突出核心矛盾冲突每个片段都要推进主要故事线
- 快速跳过铺垫直击剧情要害
- 确保每个解说片段都有明确的剧情推进作用
- **转折技巧**大量使用"而这时""就在这时""没多久"等时间转折词
字幕可能来自多个视频文件每个字幕分段标题会以视频 1: 文件名视频 2: 文件名等形式标识来源
生成脚本时必须把每个片段绑定到对应视频来源时间戳表示该视频文件内部的局部时间不是把多个视频拼接后的全局时间
所有 OST=0 narration 字段必须使用上方指定的解说台词语言输出不要因为原始字幕是其他语言就切回字幕原语言
OST=1 的原声片段 narration 字段必须继续使用播放原片+序号格式不要翻译这个固定标记
### 3. 爽点放大(情绪引爆)
**精准识别剧中"爽点"并用富有感染力的语言放大**
- **主角逆袭**突出弱者变强反败为胜的瞬间
- **反派被打脸**强调恶人得到报应的痛快感
- **智商在线**赞美角色的机智和策略
* 示例"豺狼已经提前数日跟踪这名清洁工,并在他身上放了窃听器"
* 技巧展现角色的深谋远虑和专业能力
- **情感爆发**放大感人愤怒震撼等强烈情绪
- 使用激昂语气和富有感染力的词汇调动观众情绪
## 绝对绑定规则
1. 输出 items 数量顺序和 _id 必须与 segment_plan 完全一致
2. 每个 item _idvideo_idvideo_nametimestampOST 必须逐字复制 segment_plan不得新增删除合并拆分或改动
3. 你只能补充 picture narration 两个字段
4. OST=1 narration 必须写成播放原片+_id例如 _id 5 时写播放原片5
5. OST=0 narration 必须使用 ${narration_language}并严格基于剧情和字幕不虚构字幕外的具体事件
### 4. 个性吐槽(增加趣味)
**以观众视角进行犀利点评体现解说员独特人设**
- 避免单纯复述剧情要有自己的观点和态度
- **"上帝视角"分析技巧**
* 揭示角色内心"他莫名地笑了一下"
* 分析动机"豺狼的这几步都是事先算好的"
* 预判后果"这又会有何代价呢"
- 适当吐槽剧情的套路或角色的愚蠢行为
- 用幽默犀利的语言增加观看趣味
- 站在观众立场说出观众想说的话
- **心理活动描述**深入角色内心增强代入感
## 叙事连续性要求
- 你必须把每个 OST=0 当成观众理解剧情的桥不能只概括当前画面
- 每个 OST=0 narration 要尽量回答上一段发生了什么为什么会发展到这一段这一段带来什么新矛盾
- video_id 或跨时间大跳跃时OST=0 必须明确补出承接句例如可这段婚姻真正难的不是相爱而是两个孩子和婆婆都还没接纳她
- 原声片段前后的 OST=0 要解释原声的重要性避免观众只看到对白片段合集
- 如果 segment_plan 中有 story_roleintenttransition 字段必须利用它们组织 narration但不要把这些字段输出到最终 JSON
- 结尾 OST=0 要留下后续阻力或悬念如果结尾是 OST=1则前一个 OST=0 必须提前点出这段原声会把矛盾推向哪里
### 5. 悬念预埋(引导互动)
**在关键节点和结尾处"卖关子"激发互动欲望**
- 在剧情高潮前停止留下"接下来会发生什么"的疑问
- **悬念设置技巧**
* 问题抛出"那么UDC究竟是谁呢"
* 反转预告"而从这句话开始,所有的专业、体面和虚伪的平静都将分崩瓦解"
* 时间悬念"几分钟后...""不久之后..."
- 提出引导性问题"你们觉得他会怎么做?"
- 预告后续精彩"更劲爆的还在后面"
- 为后续内容预热激发评论点赞关注
## 开头钩子要求
- 第一段必须是 OST=0 解说钩子不能直接播放原片
- 开头用高能反转 + 情绪冲突 + 悬念钩子强身份/强处境 + 致命反差 + 后续悬念
- 写法示例方向一个刚立功的兵王下一秒却被迫脱下军装他回家的第一天家里的钱和尊严都被赌桌吞了
- 示例只用于理解公式必须基于当前字幕事实原创不要夸大到字幕没有的情节
### 6. 卡点配合(视听协调)
**考虑文案与画面音乐的完美结合**
- 在情感高潮处预设BGM卡点
- 解说节奏要配合画面节奏
- 重要台词处保留原声解说适时停顿
- 追求文案+画面+音乐的协同效应
## 解说密度与画面节奏
- OST=0 文案必须能被当前 timestamp 的画面承载解说字数 / 5 = 所需视频秒数估算
- 如果画面只有 6 就不要写 80 应压缩到约 30 或依赖 segment_plan 选择更长画面
- 优先短句单句只表达一个信息点不要把人物介绍前因反转和悬念全塞进一个短画面
- 长信息要拆成多段每段只承担一个叙事功能让画面节奏跟上解说
## 专业解说语言技巧
## 用户选择类型文案规则
短剧类型由用户手动选择为 ${drama_genre}不得自行改判必须按对应方向写
- 霸总/甜宠突出误会身份差暧昧拉扯守护感和情绪反差
- 逆袭/复仇突出羞辱反击打脸身份揭露和爽点升级
- 家庭伦理突出亲情撕扯秘密委屈选择和道德冲突
- 古装/权谋突出身份局势算计立场和反转
- 悬疑/犯罪突出线索危机动机和未揭开的疑问
- 都市情感突出关系裂痕现实压力误会和情绪拉扯
- 年代/乡村突出家庭处境人情压力生活困境和命运转折
- 自定义类型严格服从用户填写的类型方向
### 1. 氛围营造技巧
**通过环境和细节描述增强画面感和代入感**
- **环境描述**"在这个距离,枪声都无法传到那边"
- **细节刻画**"他的床头有酒,身边的纸碟堆满烟头"
- **氛围渲染**"黑暗树林里有一间仓房"
- **情绪描述**"孤独又无助的豺狼,竟在这时露出了反常的一面"
## 文案质量要求
- 开场片段要有强钩子直接点出冲突悬念或情绪爆点
- 每段解说优先 25-90 具体长度必须服从画面时长短画面宁可少说不要密集灌信息
- 可以使用没想到可下一秒而这时真正的问题来了等短剧转折语但不要堆砌
- picture 要描述画面和人物状态便于后期识别素材
- 少用孤立信息句多用承接句不要让观众感觉剧情突然跳场
- 不要解释规则不要输出 Markdown不要输出代码块
### 2. 情感词汇运用
**使用富有感染力的词汇调动观众情绪**
- **紧张感**"名声恶臭""早晚会被暗杀""动用军警资源"
- **神秘感**"尘封的传奇""高度机密""暗藏玄机"
- **震撼感**"空前绝后的一枪""天衣无缝""神不知鬼不觉"
- **悲伤感**"目光非常悲伤""注定永远无法哀悼"
### 3. 节奏控制技巧
**通过语言节奏控制观众注意力**
- **快节奏推进**使用短句密集信息
- **慢节奏渲染**使用长句详细描述
- **停顿技巧**在关键信息前适当停顿
- **重复强调**重要信息适当重复
## 严格技术要求
### 时间戳管理(绝对不能违反)
- **时间戳绝对不能重叠**确保剪辑后无重复画面
- **时间段必须连续且不交叉**严格按时间顺序排列
- **每个时间戳都必须在原始字幕中找到对应范围**
- 可以拆分原时间片段但必须保持时间连续性
- 时间戳的格式必须与原始字幕中的格式完全一致
### 时长控制1/3原则
- **解说视频总长度 = 原视频长度的 1/3**
- 精确控制节奏和密度既不能过短也不能过长
- 合理分配解说和原声的时间比例
### 剧情连贯性
- **保持故事逻辑完整**确保情节发展自然流畅
- **严格按照时间顺序**禁止跳跃式叙述
- **符合因果逻辑**先发生A再发生BA导致B
## 原声片段使用规范
### 原声片段格式要求
原声片段必须严格按照以下JSON格式
```json
{
"_id": 序号,
"timestamp": "开始时间-结束时间",
"picture": "画面内容描述",
"narration": "播放原片+序号",
"OST": 1
}
```
### 原声片段插入策略
#### 1. 关键情绪爆发点
**在角色强烈情绪表达时必须保留原声增强观众代入感**
- **愤怒爆发**角色愤怒咆哮情绪失控的瞬间
* 参考"Come on, you bastard. Reaching."愤怒对峙
- **感动落泪**角色感动哭泣情感宣泄的时刻
- **震惊反应**角色震惊不敢置信的表情和台词
* 参考"Are you sure about that?"质疑震惊
- **绝望崩溃**角色绝望崩溃的情感表达
* 参考"Charles you're scaring me, what's wrong"恐惧绝望
- **狂欢庆祝**角色兴奋狂欢的情绪高潮
#### 2. 重要对白时刻
**保留推动剧情发展的关键台词和对话**
- **身份揭露**揭示角色真实身份的重要台词
- **真相大白**揭晓谜底真相的关键对话
- **情感告白**爱情告白情感表达的重要台词
* 参考"i'm really not good"情感表达
- **威胁警告**反派威胁警告的重要对白
* 参考"You do not want to make enemies of these people"威胁警告
- **决定宣布**角色做出重要决定的宣告
#### 3. 爽点瞬间
**"爽点"时刻保留原声增强痛快感**
- **主角逆袭**弱者反击逆转局面的台词
- **反派被打脸**恶人得到报应被揭穿的瞬间
- **智商碾压**主角展现智慧碾压对手的台词
* 参考"That is a fucking work of art guys"技能展示
- **正义伸张**正义得到伸张恶有恶报的时刻
- **实力展现**主角展现真实实力震撼全场
#### 4. 悬念节点
**在制造悬念或揭晓答案的关键时刻保留原声**
- **悬念制造**制造悬念留下疑问的台词
- **答案揭晓**揭晓答案解开谜团的对话
- **转折预告**暗示即将发生转折的重要台词
- **危机降临**危机来临紧张时刻的对白
#### 5. 经典台词时刻
**保留具有强烈感染力和记忆点的经典台词**
- **哲理感悟**角色的人生感悟和哲理思考
- **幽默调侃**轻松幽默的对话增加趣味性
- **专业术语**体现角色专业性的术语和对话
* 参考"The scanner will pick up the metal components"专业解释
- **情感共鸣**能引起观众共鸣的经典表达
### 原声片段技术规范
#### 格式规范
- **OST字段**设置为1表示保留原声解说片段设置为0
- **narration格式**严格使用"播放原片+序号""播放原片26"
- **picture字段**详细描述画面内容便于后期剪辑参考
- **时间戳精度**必须与字幕中的重要对白时间精确匹配
#### 比例控制
- **原声与解说比例**7:3原声70%解说30%
- **分布均匀**原声片段要在整个视频中均匀分布
- **长度适中**单个原声片段时长控制在3-8
- **衔接自然**原声片段与解说片段之间衔接自然流畅
#### 选择原则
- **情感优先**优先选择情感强烈的台词和对话
- **剧情关键**必须是推动剧情发展的重要内容
- **观众共鸣**选择能引起观众共鸣的经典台词
- **视听效果**考虑台词的声音效果和表演张力
- **代入感强**选择能让观众产生强烈代入感的对话
## 输出格式要求
## 输出格式
请严格按照以下JSON格式输出绝不添加任何其他文字说明或代码块标记
@ -242,6 +132,8 @@ ${subtitle_content}
"items": [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:01,000-00:00:05,500",
"picture": "女主角林小雨慌张地道歉,男主角沈墨轩冷漠地看着她",
"narration": "一个普通女孩的命运即将因为一杯咖啡彻底改变!她撞到的这个男人,竟然是...",
@ -249,6 +141,8 @@ ${subtitle_content}
},
{
"_id": 2,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:05,500-00:00:08,000",
"picture": "沈墨轩质问林小雨,语气冷厉威严",
"narration": "播放原片2",
@ -256,6 +150,8 @@ ${subtitle_content}
},
{
"_id": 3,
"video_id": 2,
"video_name": "2.mp4",
"timestamp": "00:00:08,000-00:00:12,000",
"picture": "林小雨惊慌失措,沈墨轩眼中闪过一丝兴趣",
"narration": "霸道总裁的经典开场!一杯咖啡引发的爱情故事就这样开始了...",
@ -264,44 +160,4 @@ ${subtitle_content}
]
}
## 质量标准
### 解说文案要求:
- **字数控制**每段解说文案80-150
- **语言风格**生动有趣富有感染力符合短视频观众喜好
* 参考风格"身为一个名声恶臭的政客,他知道自己早晚会被暗杀"
* 直接定性制造紧张感和代入感
- **情感调动**能够有效调动观众情绪产生代入感
* 使用"而这时""没想到""原来"等转折词增强戏剧性
- **节奏把控**快节奏但不失条理紧凑但不混乱
* 短句推进剧情长句渲染氛围
### 技术规范:
- **解说与原片比例**3:7解说30%原片70%
- **原声片段标识**OST=1表示原声OST=0表示解说
- **原声格式规范**narration字段必须使用"播放原片+序号"格式
- **关键情绪点**必须保留原片原声增强观众代入感
- **时间戳精度**精确到毫秒级别确保与字幕完美匹配
- **逻辑连贯性**严格遵循剧情发展顺序
### 创作原则:
1. **只输出JSON内容**不要任何说明性文字
2. **严格基于提供的剧情和字幕**不虚构内容
3. **突出核心冲突**舍弃无关细节
4. **强化观众体验**始终考虑观看感受
5. **保持专业水准**体现解说up主的专业素养
6. **融入经典解说技巧**
- 大量使用"上帝视角"分析
- 适时插入心理活动描述
- 运用悬念设置和反转技巧
- 保持强烈的画面感和代入感
### 参考解说风格示例:
- **开场悬念**"身为一个名声恶臭的政客,他知道自己早晚会被暗杀"
- **转折技巧**"而这一天,就在他刚露头的时候..."
- **上帝视角**"豺狼已经提前数日跟踪这名清洁工"
- **情感渲染**"孤独又无助的豺狼,竟在这时露出了反常的一面"
- **悬念设置**"那么UDC究竟是谁呢"
- **反转预告**"而从这句话开始,所有的专业、体面和虚伪的平静都将分崩瓦解"
现在请基于以上要求为短剧${drama_name}创作解说脚本"""

View File

@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 短剧解说-文案画面匹配
@File : script_matching.py
@Description: 将用户审核后的解说文案匹配到字幕时间戳并生成最终剪辑脚本
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class ScriptMatchingPrompt(ParameterizedPrompt):
    """Prompt that matches user-reviewed narration copy to subtitle timestamps.

    The template asks the model to split the reviewed narration copy, pick a
    matching subtitle time range for each piece, and answer with a JSON object
    whose ``items`` array is the final clip script.
    """

    def __init__(self):
        # ``parameters`` lists every ``${...}`` placeholder that appears in the
        # template returned by ``get_template``.
        metadata = PromptMetadata(
            name="script_matching",
            category="short_drama_narration",
            version="v1.0",
            description="将审核后的解说文案按叙事节奏拆分并匹配到字幕时间戳生成最终剪辑JSON",
            model_type=ModelType.TEXT,
            output_format=OutputFormat.JSON,
            tags=["短剧", "画面匹配", "剪辑脚本", "时间戳", "用户文案"],
            parameters=[
                "drama_name",
                "drama_genre",
                "plot_analysis",
                "subtitle_content",
                "narration_copy",
                "narration_language",
                "original_sound_ratio",
            ],
        )
        # Only three placeholders are declared required. How the base class
        # fills or rejects the remaining optional ones is not visible here
        # -- TODO confirm against ParameterizedPrompt.
        super().__init__(
            metadata,
            required_parameters=["drama_name", "subtitle_content", "narration_copy"],
        )
        # NOTE(review): presumably consumed by the base class as the system
        # message; only the assignment is visible in this file.
        self._system_prompt = (
            "你是一位懂叙事节奏的短剧剪辑师。你必须严格输出JSON"
            "核心任务是把用户审核后的解说文案逐句匹配到最合适的原视频字幕时间戳。"
        )

    def get_template(self) -> str:
        """Return the raw prompt template containing ``${name}`` placeholders."""
        return """# 短剧解说文案画面匹配任务
## 目标
用户已经审核并修改了解说文案请根据这份文案和原始字幕生成最终可剪辑 JSON 脚本
## 剧名
${drama_name}
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 用户审核后的解说文案
<narration_copy>
${narration_copy}
</narration_copy>
## 原始字幕(含视频编号和局部时间戳)
<subtitles>
${subtitle_content}
</subtitles>
## 输出语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的短剧类型
<drama_genre>
${drama_genre}
</drama_genre>
## 用户选择的原片占比
<original_sound_ratio>
${original_sound_ratio}%
</original_sound_ratio>
## 匹配流程
1. 先按句号问号感叹号省略号切分解说文案得到候选解说句
2. 逗号只在明显分割两个动作场景观点或描述对象时切分不要切出没有独立意义的碎片
3. 不要求每个候选句都单独输出为 OST=0可以合并压缩相邻候选句作为剧情桥段但不能改变用户文案的核心意思
4. 为每个解说片段寻找最匹配的原始字幕画面优先选择能表达该句核心含义的画面
5. 使用公式估算所需画面时长所需秒数 = 解说字数 / 5匹配画面时长尽量接近误差优先控制在 ±0.5
6. 如果一句解说太长必须拆成多个 OST=0 片段分别匹配不同或连续画面
7. timestamp 必须使用对应 video_id 内部局部时间戳不得换算为多个视频拼接后的累计时间
8. 同一 video_id 内时间段不得交叉或重叠
9. 第一段必须是 OST=0 解说钩子不能直接播放原片
10. OST=1 原声片段的总时长占比要尽量接近用户选择的 ${original_sound_ratio}%这里按最终 items timestamp 总时长估算不按片段数量估算
11. 不要自行判断或改写短剧类型画面匹配和 picture 描述要服务用户选择的 ${drama_genre} 叙事重点
## 原片占比规则
- ${original_sound_ratio}% = 0% 不要输出 OST=1全部使用解说承接
- ${original_sound_ratio}% 10%-30% 只保留关键对白反转情绪爆发或爽点原声
- ${original_sound_ratio}% 40%-60% 解说负责串联因果原片负责承载关键场面和对白
- ${original_sound_ratio}% 70%-90% 以原片对白和表演为主解说只做开场钩子转场桥和必要补充
- 如果原片占比与第一段必须 OST=0冲突优先保证第一段是 OST=0然后在后续片段提高 OST=1 时长占比
- 选择高原片占比时可以把用户文案合并成更少的 OST=0 桥段不要为了逐句使用文案而压低原片占比
## 字段规则
- _id 1 开始连续递增
- video_id来自字幕分段标题例如视频 2就填 2
- video_name对应视频文件名必须从字幕分段标题提取
- timestamp格式为 "HH:MM:SS,mmm-HH:MM:SS,mmm"
- picture描述匹配画面中人物动作情绪和场景
- narrationOST=0 时填写用户文案片段OST=1 时填写播放原片+_id
- OST解说片段填 0原声片段填 1
## 输出格式
只输出严格 JSON
{
"items": [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:01,000-00:00:06,000",
"picture": "主角站在门口,震惊地看着屋内混乱的场面",
"narration": "一个刚立功的兵王,回家的第一天就发现家里四百万被亲爹输光。",
"OST": 0
}
]
}
现在请基于用户审核后的解说文案生成最终剪辑脚本"""

View File

@ -0,0 +1,96 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 短剧解说-脚本修复
@File : script_repair.py
@Description: 短剧解说脚本校验失败后的JSON修复提示词
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class ScriptRepairPrompt(ParameterizedPrompt):
    """Prompt that asks the model to repair a narration script that failed validation.

    The template feeds the model the validation errors, the invalid script and
    the subtitle window, and demands a JSON object of the form
    ``{"items": [...]}`` as the only output.
    """

    def __init__(self):
        # ``parameters`` lists every ``${...}`` placeholder that appears in the
        # template returned by ``get_template``.
        metadata = PromptMetadata(
            name="script_repair",
            category="short_drama_narration",
            version="v1.0",
            description="根据确定性校验错误修复短剧解说脚本JSON优先修正时间戳、视频来源和格式问题",
            model_type=ModelType.TEXT,
            output_format=OutputFormat.JSON,
            tags=["短剧", "解说脚本", "JSON修复", "时间戳校验", "多视频"],
            parameters=[
                "drama_name",
                "drama_genre",
                "plot_analysis",
                "subtitle_content",
                "invalid_script",
                "validation_errors",
                "narration_language",
            ],
        )
        # Four placeholders are declared required. How the base class treats
        # the optional ones when they are missing is not visible here
        # -- TODO confirm against ParameterizedPrompt.
        super().__init__(
            metadata,
            required_parameters=["drama_name", "subtitle_content", "invalid_script", "validation_errors"],
        )
        # NOTE(review): presumably consumed by the base class as the system
        # message; only the assignment is visible in this file.
        self._system_prompt = (
            "你是一位短剧解说脚本JSON修复器。你只能根据校验错误修复JSON"
            "必须输出严格JSON不能输出解释、Markdown或代码块。"
        )

    def get_template(self) -> str:
        """Return the raw repair prompt template containing ``${name}`` placeholders."""
        return """# 短剧解说脚本修复任务
## 修复目标
下面的短剧${drama_name}解说脚本未通过剪辑校验请只根据校验错误和字幕内容修复它输出一个完整可剪辑的 JSON
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 校验错误
<validation_errors>
${validation_errors}
</validation_errors>
## 当前无效脚本
<invalid_script>
${invalid_script}
</invalid_script>
## 可用字幕窗口
<subtitles>
${subtitle_content}
</subtitles>
## 解说台词目标语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的短剧类型
<drama_genre>
${drama_genre}
</drama_genre>
## 修复规则
1. 只输出 JSON不要任何解释标题Markdown 或代码块
2. 输出根对象必须是 {"items": [...]}
3. 每个 item 必须包含 _idvideo_idvideo_nametimestamppicturenarrationOST
4. video_idvideo_name timestamp 必须来自对应字幕窗口不得把不同视频的同名时间戳混用
5. 同一 video_id 内片段不得交叉或重叠
6. OST=1 narration 必须是播放原片+序号OST=0 narration 必须使用 ${narration_language}
7. 禁止连续 3 个或更多 OST=1必须插入或改写 OST=0 解说片段承接剧情
8. video_id 切换前后不能都是 OST=1必须至少有一个 OST=0 片段解释场景和剧情为什么切换
9. OST=0 narration 要补足因果承接不要只概括当前画面
10. 第一段必须是 OST=0 解说钩子高能反转 + 情绪冲突 + 悬念钩子不要直接播放原片
11. OST=0 文案必须匹配画面时长解说字数 / 5 = 所需视频秒数估算过密时要缩短文案延长时间戳或拆成多个片段
12. 不要自行改判短剧类型如需改写 narration必须按用户选择的 ${drama_genre} 保持表达重点
13. 尽量保留原脚本中没有错误的片段无法修复的片段可以删除但剩余片段必须重新按 1 开始编号
请输出修复后的完整 JSON"""

View File

@ -0,0 +1,104 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project: 短剧解说-片段规划
@File : segment_planning.py
@Description: 短剧解说脚本片段规划提示词
"""
from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat
class SegmentPlanningPrompt(ParameterizedPrompt):
    """Prompt that asks the model to plan clip segments for a short-drama narration.

    The template tells the model to choose segments from the subtitles and
    label their purpose without writing narration lines, and to answer with a
    JSON object whose ``segments`` array holds the plan.
    """

    def __init__(self):
        # ``parameters`` lists every ``${...}`` placeholder that appears in the
        # template returned by ``get_template``.
        metadata = PromptMetadata(
            name="segment_planning",
            category="short_drama_narration",
            version="v1.1",
            description="基于剧情理解和原始字幕规划可剪辑片段,优先保证叙事连续性、跨视频承接和原声解说节奏",
            model_type=ModelType.TEXT,
            output_format=OutputFormat.JSON,
            tags=["短剧", "解说脚本", "片段规划", "时间戳", "多视频", "原声"],
            parameters=["drama_name", "drama_genre", "plot_analysis", "subtitle_content", "narration_language"],
        )
        # Three placeholders are declared required; handling of the optional
        # ones is up to the base class, which is not visible here -- TODO confirm.
        super().__init__(metadata, required_parameters=["drama_name", "plot_analysis", "subtitle_content"])
        # NOTE(review): presumably consumed by the base class as the system
        # message; only the assignment is visible in this file.
        self._system_prompt = (
            "你是一位短剧解说剪辑规划师。你的任务是从字幕中选择可剪辑片段,"
            "必须严格输出JSON不能写解说文案不能输出Markdown或额外说明。"
        )

    def get_template(self) -> str:
        """Return the raw planning prompt template containing ``${name}`` placeholders."""
        return """# 短剧解说片段规划任务
## 目标
为短剧${drama_name}规划一组可直接剪辑的视频片段你只负责选片段和标注用途不写最终解说台词
## 剧情理解材料
<plot>
${plot_analysis}
</plot>
## 原始字幕(含视频编号和局部时间戳)
<subtitles>
${subtitle_content}
</subtitles>
## 解说台词目标语言
<narration_language>
${narration_language}
</narration_language>
## 用户选择的短剧类型
<drama_genre>
${drama_genre}
</drama_genre>
## 叙事规划目标
你不是在挑精彩片段合集而是在规划一条观众能顺着看懂的短剧解说故事线必须先想清楚人物困境 -> 冲突触发 -> 关系变化 -> 新阻力 -> 悬念的因果链再选片段
## 爆款开头钩子规则
第一段必须是 OST=0 解说开场不要直接播放原片开头参考高能反转 + 情绪冲突 + 悬念钩子的公式
- 先给人物一个强身份或强处境兵王单亲妈妈被赶出家门的女人被全家看不起的赘婿
- 再给一个反差冲突刚立功就被迫退役刚回家就发现钱被输光刚结婚就遇到孩子/婆婆阻挠
- 最后抛出悬念真正的噩梦才开始他要讨回的不是钱这场婚姻真正难的不是相爱
- 不要照抄示例要基于字幕事实改写成当前剧情自己的钩子
## 规划规则
1. 只能使用原始字幕中真实存在的视频编号视频文件名和时间范围
2. timestamp 必须是对应 video_id 内部的局部时间戳禁止换算成多个视频拼接后的累计时间
3. 同一个 video_id 内的片段不得交叉或重叠尽量按故事顺序排列
4. 每个片段必须推动主线制造情绪点承接原声或保留关键对白
5. OST=1 表示保留原声适合关键对白情绪爆发身份揭露反转和爽点OST=0 表示后续需要配解说
6. 原声片段单段优先控制在 3-8 解说片段可以更长但必须能从字幕范围中定位
7. 短剧类型由用户手动选择为 ${drama_genre}不得自行改判选片段时优先服务该类型的主要看点
8. 禁止连续 3 个或更多 OST=1 1-2 个原声片段后必须安排 OST=0 解说片段承接剧情
9. video_id 切换前后必须至少有一个 OST=0 片段作为剧情桥段解释为什么从上一场转到下一场
10. 每个 OST=0 片段必须承担明确叙事功能开场钩子人物介绍因果过渡冲突升级关系转折阻力解释结尾悬念
11. 不要跳过关键因果例如从求婚直接跳到孩子/婆婆阻挠中间必须用 OST=0 解释婚姻真正的难题变成家庭接纳
12. 结尾优先选择能留下后续阻力或新矛盾的片段不要只停在原声对白堆叠上
13. 解说画面必须给足时长解说字数 / 5 = 所需视频秒数预估短画面不要承载长解说
14. OST=0 片段如果需要讲清多层信息应选择更长的连续画面或拆成多个 OST=0 片段分别承接
## 输出格式
只输出严格 JSON
{
"segments": [
{
"_id": 1,
"video_id": 1,
"video_name": "1.mp4",
"timestamp": "00:00:01,000-00:00:05,500",
"OST": 0,
"story_role": "开场钩子",
"intent": "女主被羞辱,制造逆袭期待",
"transition": "从灾后恢复现场切入女主处境,引出她为什么敢和领导硬刚"
}
]
}
现在请规划短剧${drama_name}的解说片段"""

View File

@ -0,0 +1,421 @@
import os
import re
import unicodedata
from typing import Iterable, List, Optional, Sequence, Tuple
from loguru import logger
from app.services.short_drama_narration_validation import build_subtitle_index
from app.services.subtitle_text import read_subtitle_text
from app.utils import utils
# OST values whose ``narration`` text is turned into subtitles when the caller
# of create_script_subtitle_file does not pass ``include_ost``.
DEFAULT_SUBTITLE_OST_TYPES = (0, 2)
# OST values whose subtitles are taken from the original subtitle files when
# the caller does not pass ``include_original_ost``.
DEFAULT_ORIGINAL_SUBTITLE_OST_TYPES = (1,)
# Default upper bound on characters per chunk produced by split_narration.
DEFAULT_MAX_CHARS_PER_SUBTITLE = 12
# One run of characters that are neither delimiters nor newlines, followed by
# at most one trailing delimiter.
SENTENCE_PART_RE = re.compile(r"[^。!?!?;,、\n]+[。!?!?;,、]?")
# (start_seconds, end_seconds, text) on the output timeline.
SubtitleEntry = Tuple[float, float, str]
def _normalize_text(text: str) -> str:
return re.sub(r"\s+", " ", str(text or "")).strip()
def _remove_punctuation(text: str) -> str:
return "".join(
char for char in str(text or "")
if not unicodedata.category(char).startswith("P")
)
def clean_subtitle_text(text: str) -> str:
    """Prepare subtitle text for burn-in: drop punctuation, then tidy whitespace."""
    without_punctuation = _remove_punctuation(text)
    return _normalize_text(without_punctuation)
def split_narration(text: str, max_chars: int = DEFAULT_MAX_CHARS_PER_SUBTITLE) -> List[str]:
    """Break narration text into subtitle-sized chunks.

    The text is whitespace-normalized and cut into parts with
    ``SENTENCE_PART_RE``. Consecutive parts are packed together while the
    packed length stays within ``max_chars``; a single part longer than the
    limit is sliced into ``max_chars``-sized pieces. Each chunk is finally
    passed through ``clean_subtitle_text`` and chunks that end up empty are
    dropped.

    Args:
        text: Narration text; falsy values produce an empty list.
        max_chars: Maximum characters per chunk before cleaning. Falsy values
            fall back to ``DEFAULT_MAX_CHARS_PER_SUBTITLE``; the effective
            limit is never below 1.

    Returns:
        The cleaned chunks in reading order.
    """
    normalized = _normalize_text(text)
    if not normalized:
        return []

    limit = max(1, int(max_chars or DEFAULT_MAX_CHARS_PER_SUBTITLE))
    pieces = [found.group(0).strip() for found in SENTENCE_PART_RE.finditer(normalized)] or [normalized]

    raw_chunks: List[str] = []
    pending = ""
    for piece in pieces:
        if not piece:
            continue

        if len(piece) > limit:
            # An oversized part never shares a chunk with what came before it.
            if pending:
                raw_chunks.append(pending.strip())
            while len(piece) > limit:
                raw_chunks.append(piece[:limit].strip())
                piece = piece[limit:].strip()
            # The short remainder may still be merged with the next part.
            pending = piece
            continue

        merged = pending + piece
        if len(merged) <= limit:
            pending = merged
            continue

        if pending:
            raw_chunks.append(pending.strip())
        pending = piece

    if pending:
        raw_chunks.append(pending.strip())

    display_chunks = []
    for raw_chunk in raw_chunks:
        cleaned = clean_subtitle_text(raw_chunk)
        if cleaned:
            display_chunks.append(cleaned)
    return display_chunks
def parse_srt_like_time(time_text: str) -> float:
    """Convert an ``HH:MM:SS,mmm`` (or ``HH:MM:SS.mmm``) string to seconds.

    Raises:
        ValueError: If the text does not have exactly three ``:``-separated
            fields, or a field is not numeric.
    """
    cleaned = str(time_text or "").strip().replace(",", ".")
    fields = cleaned.split(":")
    if len(fields) != 3:
        raise ValueError(f"不支持的时间格式: {cleaned}")
    hour_text, minute_text, second_text = fields
    return int(hour_text) * 3600 + int(minute_text) * 60 + float(second_text)
def parse_time_range(time_range: str) -> Tuple[float, float]:
    """Parse ``"<start>-<end>"`` into a ``(start, end)`` pair of seconds.

    The text is split on the first ``-`` and both sides are parsed with
    ``parse_srt_like_time``.

    Raises:
        ValueError: If the value is empty, has no ``-``, either side cannot be
            parsed, or the end is not strictly after the start.
    """
    if not time_range:
        raise ValueError(f"不支持的时间范围: {time_range}")
    start_text, separator, end_text = str(time_range).partition("-")
    if not separator:
        raise ValueError(f"不支持的时间范围: {time_range}")

    start = parse_srt_like_time(start_text)
    end = parse_srt_like_time(end_text)
    if end <= start:
        raise ValueError(f"结束时间必须晚于开始时间: {time_range}")
    return start, end
def format_srt_time(seconds: float) -> str:
    """Format seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond and negative values are
    clamped to zero.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_secs, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def _safe_ost_value(value) -> Optional[int]:
try:
return int(value)
except (TypeError, ValueError):
return None
def _coerce_positive_int(value) -> Optional[int]:
try:
number = int(value)
except (TypeError, ValueError):
return None
return number if number > 0 else None
def _normalize_paths(paths) -> List[str]:
if isinstance(paths, str):
paths = [paths]
if not paths:
return []
normalized_paths = []
seen = set()
for item in paths:
if not isinstance(item, str):
continue
item = item.strip()
if not item or item in seen:
continue
normalized_paths.append(item)
seen.add(item)
return normalized_paths
def _resolve_script_video_id(item: dict, video_origin_paths: Sequence[str]) -> int:
    """Work out which source video (1-based) a script item refers to.

    An explicit positive ``video_id`` / ``video_index`` wins. Otherwise the
    basename of ``video_name`` / ``source_video`` is compared with the basenames
    of ``video_origin_paths``. When nothing matches the first video is assumed.
    """
    explicit_id = _coerce_positive_int(item.get("video_id") or item.get("video_index"))
    if explicit_id is not None:
        return explicit_id

    raw_name = item.get("video_name") or item.get("source_video") or ""
    wanted_name = os.path.basename(str(raw_name).strip())
    if not wanted_name:
        return 1

    for position, candidate_path in enumerate(video_origin_paths, start=1):
        if os.path.basename(candidate_path) == wanted_name:
            return position
    return 1
def _read_subtitle_file(subtitle_path: str) -> str:
    """Return the text of a subtitle file, or ``""`` when it cannot be read.

    Any exception from ``read_subtitle_text`` is logged as a warning instead of
    being propagated.
    """
    try:
        subtitle = read_subtitle_text(subtitle_path)
        content = subtitle.text
    except Exception as e:
        logger.warning(f"读取原片字幕失败: {subtitle_path}, {e}")
        content = ""
    return content
def _build_combined_original_subtitle_content(
    original_subtitle_paths,
    video_origin_paths=None,
) -> str:
    """Concatenate the original subtitle files into one text with a header per video.

    Each readable subtitle file becomes a section that starts with
    ``# 视频 <n>`` (plus the video file name when a video path exists at the
    same position) and a line naming the subtitle file. Missing or empty files
    are skipped with a warning, but they still consume their position number.

    Returns:
        The sections joined by blank lines, or ``""`` when nothing was usable.
    """
    subtitle_files = _normalize_paths(original_subtitle_paths)
    video_files = _normalize_paths(video_origin_paths)

    sections = []
    for position, subtitle_file in enumerate(subtitle_files, start=1):
        if not os.path.exists(subtitle_file):
            logger.warning(f"原片字幕文件不存在,跳过: {subtitle_file}")
            continue

        text = _read_subtitle_file(subtitle_file)
        if not text:
            logger.warning(f"原片字幕文件为空,跳过: {subtitle_file}")
            continue

        paired_video = video_files[position - 1] if position <= len(video_files) else ""
        if paired_video:
            title = f"# 视频 {position}: {os.path.basename(paired_video)}"
        else:
            title = f"# 视频 {position}"
        header = f"{title}\n字幕文件: {os.path.basename(subtitle_file)}"
        sections.append(f"{header}\n{text}".strip())

    return "\n\n".join(sections)
def _resolve_item_time_range(item: dict, current_time: float) -> Tuple[Optional[Tuple[float, float]], float]:
    """Place one script item on the output timeline.

    A positive ``duration`` takes precedence: the item then occupies
    ``[current_time, current_time + duration]``. Otherwise ``editedTimeRange``
    is parsed and used as the range. When neither is usable the item has no
    range and the timeline cursor is left unchanged.

    Args:
        item: Script item; ``duration`` and ``editedTimeRange`` are read.
        current_time: Timeline cursor (seconds) before this item.

    Returns:
        ``((start, end), new_cursor)``, or ``(None, current_time)`` when no
        range could be determined.
    """
    try:
        duration = float(item.get("duration", 0.0) or 0.0)
    except (TypeError, ValueError):
        # A malformed duration must not abort subtitle generation for the whole
        # script; treat it like a missing one and fall back to editedTimeRange.
        logger.warning(f"片段 duration 无效,将尝试使用 editedTimeRange: {item.get('duration')!r}")
        duration = 0.0

    if duration > 0:
        start = current_time
        end = current_time + duration
        return (start, end), end

    edited_time_range = item.get("editedTimeRange")
    if edited_time_range:
        try:
            start, end = parse_time_range(edited_time_range)
        except ValueError as e:
            # duration was already found unusable above, so nothing is left to try.
            logger.warning(f"解析 editedTimeRange 失败,且没有可用的 duration: {e}")
        else:
            return (start, end), end

    return None, current_time
def _build_narration_subtitle_entries(
    list_script: Sequence[dict],
    include_ost: Iterable[int],
    max_chars: int,
) -> List[SubtitleEntry]:
    """Create timed subtitle entries from the narration text of script items.

    Every item advances the timeline cursor, but only items whose ``OST`` value
    is in ``include_ost`` produce entries. An item's narration is split into
    chunks and the item's time range is divided evenly among them.

    Returns:
        ``(start, end, text)`` tuples in script order.
    """
    wanted_ost = {int(value) for value in include_ost}
    entries: List[SubtitleEntry] = []
    timeline_cursor = 0.0

    for script_item in list_script:
        # Resolve first so the cursor moves even for items that are skipped below.
        resolved, timeline_cursor = _resolve_item_time_range(script_item, timeline_cursor)
        if not resolved:
            continue
        if _safe_ost_value(script_item.get("OST")) not in wanted_ost:
            continue

        pieces = split_narration(script_item.get("narration", ""), max_chars=max_chars)
        if not pieces:
            continue

        seg_start, seg_end = resolved
        span = seg_end - seg_start
        if span <= 0:
            continue

        step = span / len(pieces)
        last_position = len(pieces) - 1
        for position, piece in enumerate(pieces):
            piece_start = seg_start + step * position
            # The final chunk ends exactly at the segment end.
            piece_end = seg_end if position == last_position else seg_start + step * (position + 1)
            entries.append((piece_start, piece_end, piece))

    return entries
def _build_original_subtitle_entries(
    list_script: Sequence[dict],
    original_subtitle_paths=None,
    video_origin_paths=None,
    include_ost: Iterable[int] = DEFAULT_ORIGINAL_SUBTITLE_OST_TYPES,
) -> List[SubtitleEntry]:
    """Create timed subtitle entries for original-audio items from the source subtitles.

    For each item whose ``OST`` value is in ``include_ost``, the cues of its
    source video that intersect the item's source range (``sourceTimeRange`` or
    ``timestamp``) are shifted onto the item's position in the output timeline
    and clamped to that position.

    Returns:
        ``(start, end, text)`` tuples; empty when no original subtitles are
        available or nothing could be indexed.
    """
    combined_text = _build_combined_original_subtitle_content(
        original_subtitle_paths,
        video_origin_paths,
    )
    if not combined_text:
        return []

    video_paths = _normalize_paths(video_origin_paths)
    all_cues = build_subtitle_index(combined_text, video_paths)
    if not all_cues:
        logger.warning("原片字幕索引为空,无法为原声片段生成字幕")
        return []

    cues_by_video = {}
    for cue in all_cues:
        cues_by_video.setdefault(cue.video_id, []).append(cue)

    wanted_ost = {int(value) for value in include_ost}
    entries: List[SubtitleEntry] = []
    timeline_cursor = 0.0

    for script_item in list_script:
        # Resolve first so the cursor moves even for items that are skipped below.
        placed, timeline_cursor = _resolve_item_time_range(script_item, timeline_cursor)
        if not placed:
            continue
        if _safe_ost_value(script_item.get("OST")) not in wanted_ost:
            continue

        raw_source_range = script_item.get("sourceTimeRange") or script_item.get("timestamp")
        try:
            src_start, src_end = parse_time_range(raw_source_range)
        except ValueError as e:
            logger.warning(f"解析原声片段源时间失败,跳过原片字幕: {e}")
            continue

        dst_start, dst_end = placed
        if src_end - src_start <= 0 or dst_end - dst_start <= 0:
            continue

        video_id = _resolve_script_video_id(script_item, video_paths)
        candidate_cues = cues_by_video.get(video_id, [])
        if not candidate_cues:
            logger.warning(f"视频 {video_id} 未找到可用原片字幕,片段 {script_item.get('_id')} 跳过")
            continue

        for cue in candidate_cues:
            # Part of the cue that lies inside the clipped source range.
            clip_start = max(src_start, cue.start_ms / 1000)
            clip_end = min(src_end, cue.end_ms / 1000)
            if clip_end <= clip_start:
                continue

            caption = clean_subtitle_text(cue.text)
            if not caption:
                continue

            # Shift by the offset inside the source range, then clamp to the
            # item's slot on the output timeline.
            shifted_start = dst_start + (clip_start - src_start)
            shifted_end = dst_start + (clip_end - src_start)
            shown_from = max(dst_start, min(shifted_start, dst_end))
            shown_to = max(dst_start, min(shifted_end, dst_end))
            if shown_to <= shown_from:
                continue
            entries.append((shown_from, shown_to, caption))

    return entries
def _subtitle_entries_to_blocks(entries: Sequence[SubtitleEntry]) -> List[str]:
    """Render entries as numbered SRT blocks, ordered by start, end, then text."""
    ordered = sorted(entries, key=lambda row: (row[0], row[1], row[2]))
    return [
        "\n".join(
            (
                str(number),
                f"{format_srt_time(begin)} --> {format_srt_time(finish)}",
                caption,
            )
        )
        for number, (begin, finish, caption) in enumerate(ordered, start=1)
    ]
def _build_srt_blocks(
    list_script: Sequence[dict],
    include_ost: Iterable[int],
    max_chars: int,
) -> List[str]:
    """Build SRT blocks from narration text only (no original-audio subtitles)."""
    return _subtitle_entries_to_blocks(
        _build_narration_subtitle_entries(
            list_script,
            include_ost=include_ost,
            max_chars=max_chars,
        )
    )
def create_script_subtitle_file(
    task_id: str,
    list_script: Sequence[dict],
    output_file: Optional[str] = None,
    include_ost: Optional[Iterable[int]] = None,
    max_chars: int = DEFAULT_MAX_CHARS_PER_SUBTITLE,
    original_subtitle_paths=None,
    video_origin_paths=None,
    include_original_ost: Optional[Iterable[int]] = None,
) -> str:
    """Write an SRT file combining narration subtitles and original-audio subtitles.

    Args:
        task_id: Used to locate the task directory when ``output_file`` is not given.
        list_script: Script items in playback order.
        output_file: Destination path; defaults to ``script_subtitles.srt``
            inside the task directory.
        include_ost: OST values whose narration becomes subtitles; defaults to
            ``DEFAULT_SUBTITLE_OST_TYPES``.
        max_chars: Maximum characters per narration subtitle chunk.
        original_subtitle_paths: Subtitle files of the source videos.
        video_origin_paths: Source video paths, in the same order.
        include_original_ost: OST values that take subtitles from the original
            files; defaults to ``DEFAULT_ORIGINAL_SUBTITLE_OST_TYPES``.

    Returns:
        The path written, or ``""`` when the script is empty or no subtitle
        block was produced.
    """
    if not list_script:
        return ""

    narration_ost = DEFAULT_SUBTITLE_OST_TYPES if include_ost is None else include_ost
    original_ost = (
        DEFAULT_ORIGINAL_SUBTITLE_OST_TYPES if include_original_ost is None else include_original_ost
    )

    narration_entries = _build_narration_subtitle_entries(
        list_script,
        include_ost=narration_ost,
        max_chars=max_chars,
    )
    original_entries = _build_original_subtitle_entries(
        list_script,
        original_subtitle_paths=original_subtitle_paths,
        video_origin_paths=video_origin_paths,
        include_ost=original_ost,
    )

    blocks = _subtitle_entries_to_blocks(narration_entries + original_entries)
    if not blocks:
        logger.warning("程序化字幕未生成内容")
        return ""

    if output_file is None:
        output_file = os.path.join(utils.task_dir(task_id), "script_subtitles.srt")
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(blocks) + "\n")

    logger.info(f"程序化字幕生成成功: {output_file}, 共 {len(blocks)}")
    return output_file

View File

@ -0,0 +1,435 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Validation helpers for short drama narration scripts."""
from __future__ import annotations
import os
import re
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
# A single timestamp "HH:MM:SS,mmm" (comma before the milliseconds), whole string.
TIMESTAMP_RE = re.compile(r"^\d{2}:\d{2}:\d{2},\d{3}$")
# A script item range: two timestamps joined by "-"; "," or "." is accepted
# before the milliseconds. Named groups: start, end.
SCRIPT_RANGE_RE = re.compile(
    r"^(?P<start>\d{2}:\d{2}:\d{2}[,.]\d{3})-(?P<end>\d{2}:\d{2}:\d{2}[,.]\d{3})$"
)
# An SRT cue timing "start --> end". Unanchored, so it can be searched for
# anywhere inside a line.
SRT_RANGE_RE = re.compile(
    r"(?P<start>\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*"
    r"(?P<end>\d{2}:\d{2}:\d{2}[,.]\d{3})"
)
# A section header line "# 视频 <number>", optionally followed by ":" and a
# video name. Named groups: video_id, video_name.
VIDEO_HEADER_RE = re.compile(r"^#\s*视频\s*(?P<video_id>\d+)(?:\s*[:]\s*(?P<video_name>.+?))?\s*$")
# Characters of narration assumed to fit into one second of video.
NARRATION_CHARS_PER_SECOND = 5.0
# Extra seconds granted on top of the clip duration when computing the
# narration character budget.
NARRATION_DURATION_TOLERANCE_SECONDS = 0.5
@dataclass(frozen=True)
class SubtitleCue:
    """One subtitle cue parsed from the combined subtitle text, tagged with its video."""

    # Number from the "# 视频 N" section header; 1 for text before any header.
    video_id: int
    # Name from the header, else the basename of the matching video path, else "".
    video_name: str
    # Cue start and end in milliseconds, local to the video.
    start_ms: int
    end_ms: int
    # Cue text lines joined with single spaces.
    text: str
    # Normalized "HH:MM:SS,mmm-HH:MM:SS,mmm" form of the cue range.
    timestamp: str
@dataclass(frozen=True)
class ScriptValidationResult:
    """Outcome of validating a narration script."""

    # True when ``errors`` is empty.
    valid: bool
    # Human-readable problems, in the order they were detected.
    errors: List[str]
    # The script items after video-source normalization (empty when the input
    # was not a non-empty list).
    items: List[Dict[str, Any]]
class NarrationScriptValidationError(ValueError):
    """Raised when a narration script cannot be made safe for clipping.

    ``require_valid_narration_script_items`` raises it with all validation
    errors joined by newlines as the message.
    """
def timestamp_to_ms(timestamp: str) -> int:
    """Convert ``HH:MM:SS,mmm`` (``.`` is also accepted) to milliseconds.

    Raises:
        ValueError: If the text does not match ``TIMESTAMP_RE`` after
            normalizing ``.`` to ``,``.
    """
    normalized = str(timestamp or "").strip().replace(".", ",")
    if TIMESTAMP_RE.match(normalized) is None:
        raise ValueError(f"时间戳格式错误: {timestamp}")

    clock_text, millis_text = normalized.split(",")
    hours, minutes, seconds = (int(field) for field in clock_text.split(":"))
    return (hours * 3600 + minutes * 60 + seconds) * 1000 + int(millis_text)
def ms_to_timestamp(ms: int) -> str:
    """Format a non-negative millisecond count as ``HH:MM:SS,mmm``.

    Raises:
        ValueError: If ``ms`` is negative.
    """
    if ms < 0:
        raise ValueError("毫秒时间不能为负数")

    millis = ms % 1000
    whole_seconds = ms // 1000
    seconds = whole_seconds % 60
    minutes = (whole_seconds // 60) % 60
    hours = whole_seconds // 3600
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
def parse_script_timestamp_range(timestamp_range: str) -> Tuple[int, int, str]:
    """Parse a script range ``HH:MM:SS,mmm-HH:MM:SS,mmm``.

    Returns:
        ``(start_ms, end_ms, normalized_range)``, where the normalized range
        always uses ``,`` before the milliseconds.

    Raises:
        ValueError: If the text does not match ``SCRIPT_RANGE_RE``.
    """
    candidate = str(timestamp_range or "").strip().replace(".", ",")
    found = SCRIPT_RANGE_RE.match(candidate)
    if found is None:
        raise ValueError("时间戳格式应为 'HH:MM:SS,mmm-HH:MM:SS,mmm'")

    start_ms, end_ms = (timestamp_to_ms(found.group(name)) for name in ("start", "end"))
    normalized = "-".join(ms_to_timestamp(value) for value in (start_ms, end_ms))
    return start_ms, end_ms, normalized
def _normalize_paths(paths: Optional[Iterable[str]]) -> List[str]:
if isinstance(paths, str):
paths = [paths]
if not paths:
return []
normalized = []
for path in paths:
if not isinstance(path, str):
continue
path = path.strip()
if path:
normalized.append(path)
return normalized
def _default_video_name(video_id: int, video_paths: Sequence[str]) -> str:
if 1 <= video_id <= len(video_paths):
return os.path.basename(video_paths[video_id - 1])
return ""
def _split_subtitle_sections(
    subtitle_content: str,
    video_paths: Sequence[str],
) -> List[Tuple[int, str, str]]:
    """Split combined subtitle text into per-video sections.

    Lines matching ``VIDEO_HEADER_RE`` start a new section. Text before the
    first header belongs to video 1. A section's name comes from its header
    or, failing that, from ``video_paths``.

    Returns:
        ``(video_id, video_name, section_text)`` tuples in document order;
        always at least one tuple.
    """
    sections: List[Tuple[int, str, str]] = []
    active_id = 1
    active_name = _default_video_name(1, video_paths)
    buffered: List[str] = []
    header_seen = False

    for raw_line in str(subtitle_content or "").splitlines():
        header = VIDEO_HEADER_RE.match(raw_line.strip())
        if header is None:
            buffered.append(raw_line)
            continue

        # Close the section that was open before this header.
        if buffered or header_seen:
            sections.append((active_id, active_name, "\n".join(buffered)))
        buffered = []
        header_seen = True
        active_id = int(header.group("video_id"))
        named_in_header = str(header.group("video_name") or "").strip()
        active_name = named_in_header or _default_video_name(active_id, video_paths)

    if buffered or not sections:
        sections.append((active_id, active_name, "\n".join(buffered)))
    return sections
def _extract_cues_from_section(video_id: int, video_name: str, section_text: str) -> List[SubtitleCue]:
    """Parse the SRT cues of one video section.

    A cue starts at a line containing an SRT timing (``SRT_RANGE_RE``); the
    following lines up to the next blank line are its text. Lines outside a
    cue that carry no timing are ignored.
    """
    cues: List[SubtitleCue] = []
    remaining_lines = iter(str(section_text or "").splitlines())

    for line in remaining_lines:
        timing = SRT_RANGE_RE.search(line)
        if timing is None:
            continue

        start_ms = timestamp_to_ms(timing.group("start"))
        end_ms = timestamp_to_ms(timing.group("end"))

        # Consume text lines from the shared iterator; the blank line that ends
        # the cue is consumed as well.
        text_parts: List[str] = []
        for following in remaining_lines:
            stripped = following.strip()
            if not stripped:
                break
            text_parts.append(stripped)

        cues.append(
            SubtitleCue(
                video_id=video_id,
                video_name=video_name,
                start_ms=start_ms,
                end_ms=end_ms,
                text=" ".join(text_parts).strip(),
                timestamp=f"{ms_to_timestamp(start_ms)}-{ms_to_timestamp(end_ms)}",
            )
        )

    return cues
def build_subtitle_index(subtitle_content: str, video_paths: Optional[Iterable[str]] = None) -> List[SubtitleCue]:
    """Parse combined SRT text into a flat list of cues tagged with their video.

    Args:
        subtitle_content: Subtitle text, optionally divided by ``# 视频 N`` headers.
        video_paths: Source video paths, used for default video names.

    Returns:
        All cues, section by section, in document order.
    """
    paths = _normalize_paths(video_paths)
    return [
        cue
        for video_id, video_name, section_text in _split_subtitle_sections(subtitle_content, paths)
        for cue in _extract_cues_from_section(video_id, video_name, section_text)
    ]
def _coerce_positive_int(value: Any) -> Optional[int]:
try:
number = int(value)
except (TypeError, ValueError):
return None
return number if number > 0 else None
def _video_id_by_name(video_name: Any, video_paths: Sequence[str]) -> Optional[int]:
normalized_name = os.path.basename(str(video_name or "").strip())
if not normalized_name:
return None
for index, path in enumerate(video_paths, start=1):
if os.path.basename(path) == normalized_name:
return index
return None
def normalize_script_video_sources(
    items: Sequence[Dict[str, Any]],
    video_paths: Optional[Iterable[str]] = None,
) -> List[Dict[str, Any]]:
    """Normalize the video source fields of script items without inventing a video_id.

    Each dict item is shallow-copied. Its ``video_id`` comes from ``video_id``
    / ``video_index``, unless ``video_name`` / ``source_video`` matches one of
    ``video_paths`` by file name, in which case the matched position wins.
    When the resulting id points at a known path, ``video_name`` is rewritten
    to that path's file name. Items with no usable source keep no ``video_id``.

    Entries that are not dicts are returned unchanged, so the caller can
    report them as validation errors instead of this function raising while
    trying to copy them.

    Args:
        items: Script items, typically decoded from model output.
        video_paths: Source video paths in selection order.

    Returns:
        A new list with one entry per input entry, in the same order.
    """
    normalized_video_paths = _normalize_paths(video_paths)
    normalized_items: List[Dict[str, Any]] = []

    for raw_item in items:
        if not isinstance(raw_item, dict):
            # dict("text") or dict(5) would raise here, before validation gets
            # the chance to flag the malformed entry.
            normalized_items.append(raw_item)
            continue

        item = dict(raw_item)
        video_id = _coerce_positive_int(item.get("video_id") or item.get("video_index"))
        matched_video_id = _video_id_by_name(
            item.get("video_name") or item.get("source_video"),
            normalized_video_paths,
        )
        # A file-name match is trusted over the numeric id.
        if matched_video_id is not None:
            video_id = matched_video_id

        if video_id is not None:
            item["video_id"] = video_id
            if 1 <= video_id <= len(normalized_video_paths):
                item["video_name"] = os.path.basename(normalized_video_paths[video_id - 1])

        normalized_items.append(item)

    return normalized_items
def _cues_for_video(cues: Sequence[SubtitleCue], video_id: int) -> List[SubtitleCue]:
return [cue for cue in cues if cue.video_id == video_id]
def _range_overlaps_subtitle(cues: Sequence[SubtitleCue], start_ms: int, end_ms: int) -> bool:
return any(start_ms < cue.end_ms and end_ms > cue.start_ms for cue in cues)
def _range_within_subtitle_bounds(cues: Sequence[SubtitleCue], start_ms: int, end_ms: int) -> bool:
if not cues:
return False
return min(cue.start_ms for cue in cues) <= start_ms and end_ms <= max(cue.end_ms for cue in cues)
def _item_ost(item: Dict[str, Any]) -> Optional[int]:
try:
return int(item.get("OST"))
except (TypeError, ValueError):
return None
def _item_video_id(item: Dict[str, Any]) -> Optional[int]:
    """Return the item's ``video_id`` as a positive int, or ``None`` when it is not one."""
    raw_video_id = item.get("video_id")
    return _coerce_positive_int(raw_video_id)
def count_narration_chars(text: str) -> int:
    """Count the characters of *text* that are not whitespace.

    Used as a rough measure when matching narration length against clip
    duration. Falsy input counts as zero characters.
    """
    visible = re.sub(r"\s+", "", str(text or ""))
    return len(visible)
def max_narration_chars_for_duration(start_ms: int, end_ms: int) -> int:
    """Return the narration character budget for a clip.

    The clip duration (never negative) plus
    ``NARRATION_DURATION_TOLERANCE_SECONDS`` is multiplied by
    ``NARRATION_CHARS_PER_SECOND`` and truncated; the budget is never below 8.
    """
    seconds = max(0.0, (end_ms - start_ms) / 1000)
    budget = int((seconds + NARRATION_DURATION_TOLERANCE_SECONDS) * NARRATION_CHARS_PER_SECOND)
    return budget if budget > 8 else 8
def _validate_story_continuity(items: Sequence[Dict[str, Any]]) -> List[str]:
    """Check the structural rules that keep the narration understandable.

    Rules enforced:
      * the first item must have ``OST == 0``;
      * no more than two consecutive ``OST == 1`` items;
      * two adjacent ``OST == 1`` items must not come from different videos.

    Entries that are not dicts reset the running state and are otherwise ignored.

    Returns:
        Error messages in the order the problems occur.
    """
    errors: List[str] = []
    ost_streak = 0
    prior: Optional[Dict[str, Any]] = None

    for position, current in enumerate(items):
        if not isinstance(current, dict):
            ost_streak = 0
            prior = None
            continue

        current_id = current.get("_id", position + 1)
        current_ost = _item_ost(current)

        if position == 0 and current_ost != 0:
            errors.append(f"片段 {current_id} 必须是 OST=0 解说开场钩子,不能直接播放原片")

        ost_streak = ost_streak + 1 if current_ost == 1 else 0
        if ost_streak > 2:
            errors.append(f"片段 {current_id} 连续原声过多,必须插入 OST=0 解说承接剧情")

        if prior is not None and current_ost == 1 and _item_ost(prior) == 1:
            prior_video = _item_video_id(prior)
            current_video = _item_video_id(current)
            if prior_video is not None and current_video is not None and prior_video != current_video:
                errors.append(
                    f"片段 {prior.get('_id')} 到片段 {current_id} 跨视频切换缺少 OST=0 解说桥段"
                )

        prior = current

    return errors
def validate_narration_script_items(
    items: Any,
    subtitle_index: Sequence[SubtitleCue],
    video_paths: Optional[Iterable[str]] = None,
) -> ScriptValidationResult:
    """Validate final narration items against subtitle/video source constraints.

    Problems are collected rather than raised, so one call reports every issue
    found. Checks cover item shape and ids, video source fields, timestamp
    format and subtitle coverage, non-empty text fields, OST values, narration
    density for OST=0 items, overlapping ranges within one video, and the
    story continuity rules.

    Args:
        items: Candidate script items; anything other than a non-empty list is
            rejected immediately.
        subtitle_index: Cues of all source videos.
        video_paths: Source video paths in selection order.

    Returns:
        A result whose ``items`` are the source-normalized copies of the input
        items, with valid ``timestamp`` values rewritten to normalized form.
    """
    errors: List[str] = []
    if not isinstance(items, list) or not items:
        return ScriptValidationResult(False, ["解说脚本 items 必须是非空数组"], [])
    normalized_video_paths = _normalize_paths(video_paths)
    normalized_items = normalize_script_video_sources(items, normalized_video_paths)
    # A video id is acceptable if it has cues or corresponds to a selected path.
    available_video_ids = {cue.video_id for cue in subtitle_index}
    if normalized_video_paths:
        available_video_ids.update(range(1, len(normalized_video_paths) + 1))
    # video_id -> (start_ms, end_ms, item id) for the overlap check after the loop.
    ranges_by_video: Dict[int, List[Tuple[int, int, int]]] = {}
    seen_ids = set()
    required_fields = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"]
    for index, item in enumerate(normalized_items):
        if not isinstance(item, dict):
            errors.append(f"{index + 1} 个片段必须是对象")
            continue
        item_id = item.get("_id", index + 1)
        coerced_item_id = _coerce_positive_int(item_id)
        if coerced_item_id is None:
            errors.append(f"{index + 1} 个片段缺少有效 _id")
            # Fall back to the position so later bookkeeping still has an id.
            coerced_item_id = index + 1
        elif coerced_item_id in seen_ids:
            errors.append(f"片段 _id={coerced_item_id} 重复")
        seen_ids.add(coerced_item_id)
        for field in required_fields:
            if field not in item:
                errors.append(f"片段 {item_id} 缺少字段 {field}")
        video_id = _coerce_positive_int(item.get("video_id"))
        if video_id is None:
            errors.append(f"片段 {item_id} 缺少有效 video_id")
            # Without a video the remaining per-item checks cannot be evaluated.
            continue
        if available_video_ids and video_id not in available_video_ids:
            errors.append(f"片段 {item_id} 的 video_id={video_id} 不在已选视频范围内")
        expected_video_name = _default_video_name(video_id, normalized_video_paths)
        if expected_video_name and os.path.basename(str(item.get("video_name") or "")) != expected_video_name:
            errors.append(f"片段 {item_id} 的 video_name 必须是 {expected_video_name}")
        try:
            start_ms, end_ms, normalized_timestamp = parse_script_timestamp_range(item.get("timestamp", ""))
            item["timestamp"] = normalized_timestamp
        except ValueError as exc:
            errors.append(f"片段 {item_id}: {exc}")
            continue
        if start_ms >= end_ms:
            errors.append(f"片段 {item_id} 的开始时间必须早于结束时间")
            continue
        video_cues = _cues_for_video(subtitle_index, video_id)
        if not _range_within_subtitle_bounds(video_cues, start_ms, end_ms):
            errors.append(f"片段 {item_id} 的时间戳不在视频 {video_id} 的字幕范围内")
        elif not _range_overlaps_subtitle(video_cues, start_ms, end_ms):
            errors.append(f"片段 {item_id} 的时间戳没有命中视频 {video_id} 的字幕内容")
        for text_field in ["picture", "narration"]:
            if not isinstance(item.get(text_field), str) or not item[text_field].strip():
                errors.append(f"片段 {item_id}{text_field} 不能为空")
        ost = _item_ost(item)
        # The raw value is checked here, while ``ost`` above is the int-coerced value.
        if item.get("OST") not in [0, 1, 2]:
            errors.append(f"片段 {item_id} 的 OST 必须是 0、1 或 2")
        if ost == 1 and not str(item.get("narration", "")).startswith("播放原片"):
            errors.append(f"片段 {item_id} 是原声片段narration 必须使用“播放原片+序号”")
        if ost == 0:
            # Narration must fit the clip at the assumed speaking rate.
            narration_chars = count_narration_chars(item.get("narration", ""))
            max_chars = max_narration_chars_for_duration(start_ms, end_ms)
            if narration_chars > max_chars:
                duration_seconds = (end_ms - start_ms) / 1000
                errors.append(
                    f"片段 {item_id} 解说过密:{narration_chars} 字需要约 {narration_chars / NARRATION_CHARS_PER_SECOND:.1f} 秒,"
                    f"但画面只有 {duration_seconds:.1f} 秒,建议不超过 {max_chars} 字或延长画面"
                )
        ranges_by_video.setdefault(video_id, []).append((start_ms, end_ms, coerced_item_id))
    for video_id, ranges in ranges_by_video.items():
        sorted_ranges = sorted(ranges, key=lambda item: (item[0], item[1], item[2]))
        previous_start, previous_end, previous_id = sorted_ranges[0]
        for start_ms, end_ms, item_id in sorted_ranges[1:]:
            if start_ms < previous_end:
                errors.append(f"视频 {video_id} 的片段 {item_id} 与片段 {previous_id} 时间戳重叠")
            # Keep comparing against the range that reaches furthest.
            if end_ms > previous_end:
                previous_start, previous_end, previous_id = start_ms, end_ms, item_id
    errors.extend(_validate_story_continuity(normalized_items))
    return ScriptValidationResult(not errors, errors, normalized_items)
def require_valid_narration_script_items(
    items: Any,
    subtitle_index: Sequence[SubtitleCue],
    video_paths: Optional[Iterable[str]] = None,
) -> List[Dict[str, Any]]:
    """Validate the script and return its normalized items.

    Raises:
        NarrationScriptValidationError: If validation reports any error; the
            message contains all errors, one per line.
    """
    outcome = validate_narration_script_items(items, subtitle_index, video_paths)
    if outcome.valid:
        return outcome.items
    raise NarrationScriptValidationError("\n".join(outcome.errors))
def summarize_subtitle_window(
    subtitle_index: Sequence[SubtitleCue],
    max_cues_per_video: int = 80,
) -> str:
    """Return compact subtitle context for a repair prompt.

    Cues are grouped by ``video_id`` (groups emitted in ascending id order,
    cues kept in input order). Each group starts with a header line, followed
    by at most ``max_cues_per_video`` lines of ``"<timestamp> <text>"`` and,
    when cues were cut, a trailing line stating how many were omitted.
    """
    grouped: Dict[int, List[SubtitleCue]] = {}
    for cue in subtitle_index:
        if cue.video_id not in grouped:
            grouped[cue.video_id] = []
        grouped[cue.video_id].append(cue)

    output: List[str] = []
    for video_id in sorted(grouped):
        all_cues = grouped[video_id]
        shown = all_cues[:max_cues_per_video]
        # The display name is taken from the first cue that is actually shown.
        video_name = shown[0].video_name if shown else ""
        header = f"# 视频 {video_id}"
        if video_name:
            header = f"{header}: {video_name}"
        output.append(header)
        output.extend(
            f"{cue.timestamp} {cue.text.replace(chr(10), ' ').strip()}"
            for cue in shown
        )
        omitted = len(all_cues) - max_cues_per_video
        if omitted > 0:
            output.append(f"... 已省略 {omitted} 条字幕")
    return "\n".join(output)

View File

@ -0,0 +1,231 @@
"""LLM-powered SRT subtitle correction."""
from __future__ import annotations
import json
import os
import re
from dataclasses import dataclass
from typing import Any
from loguru import logger
from app.services.llm.manager import LLMServiceManager
from app.services.llm.migration_adapter import _run_async_safely
from app.services.llm.unified_service import UnifiedLLMService
from app.services.subtitle_text import has_timecodes, normalize_subtitle_text, read_subtitle_text
from app.utils import utils
class SubtitleCorrectionError(RuntimeError):
    """Signal that subtitle correction could not yield a valid SRT.

    Raised by the parsing, LLM-result validation and file helpers of this
    module; callers can catch it as a ``RuntimeError``.
    """
# Matches one SRT timing line: "HH:MM:SS,mmm --> HH:MM:SS,mmm". Either ',' or
# '.' is accepted before the milliseconds, and anything after the end time is
# tolerated as long as it is separated by whitespace.
_TIME_LINE_RE = re.compile(
r"^\s*\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3}(?:\s+.*)?$"
)
# Captures the body of the first Markdown code fence (optionally tagged
# "json", case-insensitive); DOTALL lets the body span multiple lines.
_JSON_BLOCK_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)
@dataclass(frozen=True)
class SubtitleBlock:
    """Immutable record for one parsed SRT cue."""

    # 1-based position of the cue in the parsed file; used as the correction id.
    order: int
    # Index line as written in the source (or a generated number when absent).
    index_line: str
    # The "start --> end" timing line, stripped of surrounding whitespace.
    time_line: str
    # Cue text; multiple lines are joined with "\n".
    text: str
def _ensure_llm_providers_registered() -> None:
    """Register all LLM providers unless the manager reports they already are."""
    if not LLMServiceManager.is_registered():
        # Imported lazily, only when registration is actually needed.
        from app.services.llm.providers import register_all_providers

        register_all_providers()
def parse_srt_blocks(srt_content: str) -> list[SubtitleBlock]:
    """Split SRT text into ``SubtitleBlock`` records.

    The content is normalized first, then split on blank lines. Each chunk
    must be either ``index / time line / text...`` or, when the index line is
    missing, ``time line / text...`` (a sequential index is generated).

    Raises:
        SubtitleCorrectionError: If the content is empty, has no timecodes,
            contains a chunk without a recognizable time line, or yields no
            blocks at all.
    """
    normalized = normalize_subtitle_text(srt_content)
    if not normalized or not has_timecodes(normalized):
        raise SubtitleCorrectionError("字幕内容为空或未检测到有效 SRT 时间轴")

    parsed: list[SubtitleBlock] = []
    for raw_block in re.split(r"\n\s*\n", normalized):
        rows = [row.rstrip() for row in raw_block.splitlines() if row.strip()]
        if not rows:
            continue

        position = len(parsed) + 1
        if len(rows) >= 2 and _TIME_LINE_RE.match(rows[1]):
            # Regular cue: explicit index line followed by the time line.
            index_line, time_line, body_rows = rows[0].strip(), rows[1].strip(), rows[2:]
        elif _TIME_LINE_RE.match(rows[0]):
            # Index line missing: number the cue by its position.
            index_line, time_line, body_rows = str(position), rows[0].strip(), rows[1:]
        else:
            raise SubtitleCorrectionError(f"无法解析字幕块: {raw_block[:80]}")

        parsed.append(
            SubtitleBlock(
                order=position,
                index_line=index_line,
                time_line=time_line,
                text="\n".join(body_rows).strip(),
            )
        )

    if not parsed:
        raise SubtitleCorrectionError("字幕内容为空或未检测到有效字幕块")
    return parsed
def _build_correction_prompt(blocks: list[SubtitleBlock]) -> str:
    """Build the user prompt asking the LLM to fix ASR errors in the cues.

    Every block is serialized as ``{"id", "time", "text"}`` (id is the block's
    ``order``) and appended as pretty-printed, non-ASCII-escaped JSON after the
    fixed instruction text.
    """
    payload = []
    for block in blocks:
        payload.append({"id": block.order, "time": block.time_line, "text": block.text})
    payload_json = json.dumps(payload, ensure_ascii=False, indent=2)
    prompt = f"""
请校准以下 SRT 字幕文本中的明显语音识别错误字幕可能是中文英文日文韩文或其他语言也可能包含多语言混合内容
校准要求
1. 先结合全部字幕内容识别原语言和语境保持原语言输出多语言混合内容也要保持原有语言混合方式
2. 只纠正明显的 ASR 错字拼写错误同音或近音误识别词形误识别专有名词前后不一致
3. 不要润色扩写改写句意不要翻译不要增删剧情信息
4. 不要修改时间轴序号条目数量或条目顺序
5. 不确定的内容保持原样
6. 保留必要的说话人标记标点和换行
只输出严格 JSON不要输出 Markdown 或解释文字格式必须为
{{"items":[{{"id":1,"text":"校准后的字幕文本"}}]}}
待校准字幕条目
{payload_json}
"""
    return prompt.strip()
def _extract_json_text(raw_output: str) -> str:
    """Pull the JSON payload out of a raw LLM reply.

    Resolution order:
      1. The body of the first Markdown code fence, if any.
      2. The span from the first ``{``/``[`` to the last ``}``/``]``.
      3. The stripped reply unchanged when no such span exists.

    Step 2 is applied even when the reply already starts with a bracket, so a
    valid JSON document followed by explanatory prose is trimmed instead of
    being handed to ``json.loads`` with trailing garbage. For a reply that is
    pure JSON the span covers the whole text, so the result is unchanged.
    """
    text = str(raw_output or "").strip()
    fenced = _JSON_BLOCK_RE.search(text)
    if fenced:
        return fenced.group(1).strip()

    openings = [pos for pos in (text.find("{"), text.find("[")) if pos >= 0]
    if openings:
        start = min(openings)
        end = max(text.rfind("}"), text.rfind("]"))
        if end > start:
            return text[start:end + 1]
    return text
def _parse_corrections(raw_output: str, expected_ids: set[int]) -> dict[int, str]:
    """Decode the LLM reply into an ``{id: corrected_text}`` mapping.

    Accepted JSON shapes: ``{"items": [...]}``, a bare list of items, or a
    plain ``{id: text}`` object. Entries that are not dicts, have a
    non-integer id, or an id outside ``expected_ids`` are ignored.

    Raises:
        SubtitleCorrectionError: If the reply is not valid JSON, has an
            unsupported shape, ``items`` is not a list, or any expected id is
            missing (the message lists at most the first ten).
    """
    try:
        data = json.loads(_extract_json_text(raw_output))
    except json.JSONDecodeError as exc:
        raise SubtitleCorrectionError("LLM 未返回有效 JSON 字幕校准结果") from exc

    if isinstance(data, dict):
        if "items" in data:
            items = data["items"]
        else:
            # Plain mapping form: keys are ids, values are texts.
            items = [{"id": key, "text": value} for key, value in data.items()]
    elif isinstance(data, list):
        items = data
    else:
        raise SubtitleCorrectionError("LLM 字幕校准结果格式无效")

    if not isinstance(items, list):
        raise SubtitleCorrectionError("LLM 字幕校准结果缺少 items 列表")

    corrections: dict[int, str] = {}
    for entry in items:
        if not isinstance(entry, dict):
            continue
        try:
            entry_id = int(entry.get("id"))
        except (TypeError, ValueError):
            continue
        if entry_id not in expected_ids:
            continue
        corrections[entry_id] = str(entry.get("text") or "").strip()

    missing_ids = sorted(expected_ids.difference(corrections))
    if missing_ids:
        raise SubtitleCorrectionError(f"LLM 字幕校准结果缺少字幕条目: {missing_ids[:10]}")
    return corrections
def _render_srt(blocks: list[SubtitleBlock], corrections: dict[int, str]) -> str:
    """Render blocks back to SRT text, applying corrected texts by block order.

    A block keeps its original text when its correction is missing or blank.
    Blank lines inside a corrected text are dropped: in SRT a blank line ends
    the cue, so writing them verbatim would split the cue and corrupt the file.

    Returns:
        Cues separated by one blank line, terminated by a single newline.
    """
    rendered_blocks = []
    for block in blocks:
        candidate = corrections.get(block.order, "").strip()
        # Keep only non-blank lines so the cue cannot be split in two.
        corrected_text = "\n".join(
            line for line in candidate.splitlines() if line.strip()
        )
        text = corrected_text or block.text
        rendered_blocks.append(f"{block.index_line}\n{block.time_line}\n{text}")
    return "\n\n".join(rendered_blocks).rstrip() + "\n"
def correct_srt_content(
    srt_content: str,
    *,
    provider: str = "",
    api_key: str = "",
    base_url: str = "",
    temperature: float = 0.1,
) -> str:
    """Correct ASR mistakes in SRT text with an LLM and return the new SRT.

    The content is parsed into blocks, sent to the LLM in a single request
    (JSON response format), and the returned texts are merged back into the
    original index and time lines.

    Args:
        srt_content: Raw SRT text.
        provider: LLM provider name forwarded to the LLM service.
        api_key: API key forwarded to the LLM service.
        base_url: Forwarded to the LLM service as ``api_base``.
        temperature: Sampling temperature forwarded to the LLM service.

    Raises:
        SubtitleCorrectionError: Propagated from parsing the SRT or from
            validating the LLM reply.
    """
    parsed_blocks = parse_srt_blocks(srt_content)
    _ensure_llm_providers_registered()
    logger.info(f"开始校准字幕,共 {len(parsed_blocks)}")

    request_kwargs = {
        "prompt": _build_correction_prompt(parsed_blocks),
        "system_prompt": "你是一位专业的多语言字幕校对员,擅长修正 ASR 语音识别造成的明显错字、拼写错误、同音或近音误识别,同时严格保留字幕结构和原语言。",
        "provider": provider,
        "temperature": temperature,
        "response_format": "json",
        "api_key": api_key,
        "api_base": base_url,
    }
    llm_reply = _run_async_safely(UnifiedLLMService.generate_text, **request_kwargs)

    expected_ids = {item.order for item in parsed_blocks}
    result = _render_srt(parsed_blocks, _parse_corrections(llm_reply, expected_ids))
    logger.info("字幕校准完成")
    return result
def write_srt_file(srt_content: str, subtitle_file: str = "") -> str:
    """Write SRT text to disk as UTF-8 and return the path written.

    When ``subtitle_file`` is empty the file goes to
    ``<subtitle_dir>/subtitle_corrected.srt``. The parent directory is created
    if the path has one.
    """
    target = subtitle_file or os.path.join(utils.subtitle_dir(), "subtitle_corrected.srt")
    directory = os.path.dirname(target)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(srt_content)
    return target
def correct_subtitle_file(
    subtitle_file: str,
    output_file: str = "",
    *,
    provider: str = "",
    api_key: str = "",
    base_url: str = "",
    temperature: float = 0.1,
) -> str:
    """Correct an SRT file with the LLM and write the result to disk.

    Args:
        subtitle_file: Path of the existing SRT file to correct.
        output_file: Destination path; when empty, ``write_srt_file`` picks
            its default location.
        provider, api_key, base_url, temperature: Forwarded unchanged to
            ``correct_srt_content``.

    Returns:
        The path of the written file.

    Raises:
        SubtitleCorrectionError: If ``subtitle_file`` is empty or not a file,
            or if correction itself fails.
    """
    source_exists = bool(subtitle_file) and os.path.isfile(subtitle_file)
    if not source_exists:
        raise SubtitleCorrectionError(f"字幕文件不存在: {subtitle_file}")

    source_text = read_subtitle_text(subtitle_file).text
    llm_options = {
        "provider": provider,
        "api_key": api_key,
        "base_url": base_url,
        "temperature": temperature,
    }
    return write_srt_file(correct_srt_content(source_text, **llm_options), output_file)

View File

@ -10,11 +10,301 @@ from app.config import config
from app.config.audio_config import AudioConfig, get_recommended_volumes_for_content
from app.models import const
from app.models.schema import VideoClipParams
from app.services import (voice, audio_merger, subtitle_merger, clip_video, merger_video, update_script, generate_video)
from app.services import (
voice,
audio_merger,
subtitle_merger,
clip_video,
merger_video,
update_script,
generate_video,
script_subtitle,
)
from app.services import state as sm
from app.utils import utils
# Number of pipeline steps reported as "step_total" in every task progress update.
VIDEO_GENERATION_TOTAL_STEPS = 6
def _update_video_generation_task(
    task_id: str,
    progress: int,
    message: str,
    step_current: int = 0,
    ffmpeg_progress: float | None = None,
    state: int = const.TASK_STATE_PROCESSING,
    **kwargs,
) -> None:
    """Push a progress update for a video generation task to the state manager.

    Args:
        task_id: Task being updated.
        progress: Overall progress value stored on the task.
        message: Status message stored on the task.
        step_current: Index of the current pipeline step.
        ffmpeg_progress: Optional encoder progress; when given it is clamped
            to 0..100 and rounded to one decimal.
        state: Task state constant; defaults to "processing".
        **kwargs: Extra task fields; they override the standard fields of the
            same name.
    """
    fields = {
        "message": message,
        "step_current": step_current,
        "step_total": VIDEO_GENERATION_TOTAL_STEPS,
    }
    fields.update(kwargs)

    if ffmpeg_progress is not None:
        clamped = max(0.0, min(100.0, float(ffmpeg_progress)))
        fields["ffmpeg_progress"] = round(clamped, 1)

    sm.state.update_task(task_id, state=state, progress=progress, **fields)
def _is_auto_transcription_enabled(params: VideoClipParams) -> bool:
    """Tell whether subtitles are on and auto-transcription was requested.

    A missing ``subtitle_enabled`` counts as enabled; a missing
    ``subtitle_auto_transcribe_enabled`` counts as disabled.
    """
    if not getattr(params, "subtitle_enabled", True):
        return False
    return bool(getattr(params, "subtitle_auto_transcribe_enabled", False))
def _get_auto_transcription_backend(params: VideoClipParams) -> str:
    """Return the normalized transcription backend name.

    The configured value is stripped and lower-cased; anything other than
    ``"local"``, ``"firered"`` or ``"bailian"`` falls back to ``"local"``.
    """
    configured = getattr(params, "subtitle_auto_transcribe_backend", "") or ""
    backend = str(configured).strip().lower()
    return backend if backend in {"local", "firered", "bailian"} else "local"
def _get_original_subtitle_paths(params: VideoClipParams) -> list[str]:
    """Collect the source-subtitle paths configured on ``params``.

    ``original_subtitle_paths`` (a list, or a single string) is stripped,
    filtered to non-empty strings and de-duplicated in order. The singular
    ``original_subtitle_path`` is put first when it is not already present.
    When nothing is configured, paths are looked up from the source video
    file names instead.
    """
    raw_paths = getattr(params, "original_subtitle_paths", []) or []
    if isinstance(raw_paths, str):
        raw_paths = [raw_paths]

    # dict.fromkeys keeps first-seen order while dropping duplicates.
    stripped = dict.fromkeys(item.strip() for item in raw_paths if isinstance(item, str))
    collected = [item for item in stripped if item]

    primary = str(getattr(params, "original_subtitle_path", "") or "").strip()
    if primary and primary not in collected:
        collected.insert(0, primary)

    if collected:
        return collected
    return _find_original_subtitle_paths_for_videos(_get_video_origin_paths(params))
def _get_video_origin_paths(params: VideoClipParams) -> list[str]:
    """Collect the source-video paths configured on ``params``.

    ``video_origin_paths`` (a list, or a single string) is stripped, filtered
    to non-empty strings and de-duplicated in order. The singular
    ``video_origin_path`` is put first when it is not already present.
    """
    raw_paths = getattr(params, "video_origin_paths", []) or []
    if isinstance(raw_paths, str):
        raw_paths = [raw_paths]

    # dict.fromkeys keeps first-seen order while dropping duplicates.
    stripped = dict.fromkeys(item.strip() for item in raw_paths if isinstance(item, str))
    collected = [item for item in stripped if item]

    primary = str(getattr(params, "video_origin_path", "") or "").strip()
    if primary and primary not in collected:
        collected.insert(0, primary)
    return collected
def _video_stem_candidates(video_path: str) -> list[str]:
    """Return file-name stems to try when matching subtitles to a video.

    The first candidate is the file name without its extension. If that stem
    ends in ``_`` followed by exactly 14 digits, the stem with that suffix
    removed is added as a second candidate. An empty stem yields ``[]``.
    """
    filename = path.basename(str(video_path or "").strip())
    stem, _extension = path.splitext(filename)
    if not stem:
        return []

    without_suffix = re.sub(r"_[0-9]{14}$", "", stem)
    if without_suffix and without_suffix != stem:
        return [stem, without_suffix]
    return [stem]
def _find_original_subtitle_paths_for_videos(video_paths: list[str]) -> list[str]:
    """Match ``.srt`` files in the subtitle directory to videos by file name.

    A subtitle matches a video when its stem equals one of the video's stem
    candidates or starts with ``"<candidate>_"``. For each video the most
    recently modified match is chosen; the result keeps video order and
    contains no duplicates. Returns ``[]`` when the directory is missing or
    has no ``.srt`` files.
    """
    subtitle_dir = utils.subtitle_dir()
    if not path.isdir(subtitle_dir):
        return []

    # (stem, full path) for every .srt file, computed once up front.
    srt_files = [
        (path.splitext(name)[0], path.join(subtitle_dir, name))
        for name in os.listdir(subtitle_dir)
        if name.lower().endswith(".srt")
    ]
    if not srt_files:
        return []

    resolved_paths: list[str] = []
    for video_path in video_paths:
        candidates = _video_stem_candidates(video_path)
        if not candidates:
            continue

        matches = [
            srt_path
            for stem, srt_path in srt_files
            if any(stem == name or stem.startswith(f"{name}_") for name in candidates)
        ]
        if not matches:
            continue

        # Newest file wins; on equal mtimes the earliest listed match is kept.
        newest = max(matches, key=path.getmtime)
        if newest not in resolved_paths:
            resolved_paths.append(newest)

    if resolved_paths:
        logger.info(f"未从参数获取原片字幕,已按视频文件名自动匹配: {resolved_paths}")
    return resolved_paths
def _create_programmatic_subtitle_file(
    task_id: str,
    list_script: list[dict],
    params: VideoClipParams,
) -> str:
    """Build the subtitle file for a task from the script items.

    Returns an empty string when subtitles are disabled on ``params``;
    otherwise delegates to ``script_subtitle.create_script_subtitle_file``
    with the resolved source-subtitle and source-video paths and returns its
    result.
    """
    subtitles_on = getattr(params, "subtitle_enabled", True)
    if not subtitles_on:
        return ""

    source_subtitles = _get_original_subtitle_paths(params)
    logger.info(f"程序化字幕使用原片字幕路径: {source_subtitles or '未提供'}")
    source_videos = _get_video_origin_paths(params)
    return script_subtitle.create_script_subtitle_file(
        task_id=task_id,
        list_script=list_script,
        original_subtitle_paths=source_subtitles,
        video_origin_paths=source_videos,
    )
def _build_subtitle_mask_options(params: VideoClipParams, enabled=None) -> dict:
    """Assemble subtitle mask and position options for the video merge step.

    Args:
        params: Source of the mask settings; attributes that are missing fall
            back to the defaults listed below.
        enabled: Optional extra switch. ``None`` means "use the configured
            value"; otherwise the mask is on only if it is both configured
            (subtitles enabled and mask enabled) and ``enabled`` is truthy.

    Returns:
        A dict with ``subtitle_mask_enabled`` followed by the geometry, blur,
        opacity and subtitle-position values, keyed by attribute name.
    """
    # Option name -> default used when params lacks the attribute.
    defaults = (
        ("subtitle_mask_landscape_x_percent", 10.0),
        ("subtitle_mask_landscape_y_percent", 78.0),
        ("subtitle_mask_landscape_width_percent", 80.0),
        ("subtitle_mask_landscape_height_percent", 14.0),
        ("subtitle_mask_landscape_blur_radius", 18),
        ("subtitle_mask_landscape_opacity_percent", 82),
        ("subtitle_mask_portrait_x_percent", 8.0),
        ("subtitle_mask_portrait_y_percent", 79.0),
        ("subtitle_mask_portrait_width_percent", 84.0),
        ("subtitle_mask_portrait_height_percent", 16.0),
        ("subtitle_mask_portrait_blur_radius", 26),
        ("subtitle_mask_portrait_opacity_percent", 84),
        ("subtitle_position_landscape_y_percent", 85.0),
        ("subtitle_position_portrait_y_percent", 82.0),
    )

    mask_configured = bool(
        getattr(params, "subtitle_enabled", True)
        and getattr(params, "subtitle_mask_enabled", False)
    )
    if enabled is None:
        mask_enabled = mask_configured
    else:
        mask_enabled = mask_configured and enabled

    options = {"subtitle_mask_enabled": mask_enabled}
    for name, fallback in defaults:
        options[name] = getattr(params, name, fallback)
    return options
def _transcribe_final_video(task_id: str, video_path: str, params: VideoClipParams) -> str:
    """Transcribe the fully merged video into an SRT file.

    The backend is chosen from ``params`` (``local``, ``firered``, otherwise
    the Bailian API-key path). Each backend reads its endpoint or key from
    ``params`` first and falls back to ``config.fun_asr``. The target file is
    ``auto_transcribed_final.srt`` in the task directory.

    Returns:
        The path reported by the transcription backend.

    Raises:
        FileNotFoundError: If ``video_path`` is empty or does not exist.
        ValueError: If the selected backend has no API URL / API key.
        RuntimeError: If the backend returned no path or the file is missing.
    """
    from app.services import fun_asr_subtitle

    if not (video_path and path.exists(video_path)):
        raise FileNotFoundError(f"自动转录视频不存在: {video_path}")

    backend = _get_auto_transcription_backend(params)
    subtitle_file = path.join(utils.task_dir(task_id), "auto_transcribed_final.srt")
    logger.info(f"开始自动转录最终视频: {video_path}, backend={backend}")

    if backend == "local":
        configured_url = getattr(params, "subtitle_auto_transcribe_api_url", "") or config.fun_asr.get(
            "api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL
        )
        api_url = str(configured_url).strip()
        if not api_url:
            raise ValueError("请先输入本地 FunASR-Pack API 地址")
        hotword = str(getattr(params, "subtitle_auto_transcribe_hotword", "") or "").strip()
        enable_spk = bool(getattr(params, "subtitle_auto_transcribe_enable_spk", False))
        generated_path = fun_asr_subtitle.create_with_local_fun_asr(
            local_file=video_path,
            subtitle_file=subtitle_file,
            api_url=api_url,
            hotword=hotword,
            enable_spk=enable_spk,
        )
    elif backend == "firered":
        configured_url = getattr(params, "subtitle_auto_transcribe_firered_api_url", "") or config.fun_asr.get(
            "firered_api_url", fun_asr_subtitle.LOCAL_FIRERED_ASR_API_URL
        )
        api_url = str(configured_url).strip()
        if not api_url:
            raise ValueError("请先输入本地ASR API 地址")
        generated_path = fun_asr_subtitle.create_with_local_firered_asr(
            local_file=video_path,
            subtitle_file=subtitle_file,
            api_url=api_url,
        )
    else:
        configured_key = getattr(params, "subtitle_auto_transcribe_api_key", "") or config.fun_asr.get(
            "api_key", ""
        )
        api_key = str(configured_key).strip()
        if not api_key:
            raise ValueError("请先输入阿里百炼 API Key")
        generated_path = fun_asr_subtitle.create_with_fun_asr(
            local_file=video_path,
            subtitle_file=subtitle_file,
            api_key=api_key,
        )

    produced = bool(generated_path) and path.exists(generated_path)
    if not produced:
        raise RuntimeError("自动转录失败:未生成字幕文件")

    logger.info(f"自动转录字幕生成成功: {generated_path}")
    return generated_path
def _merge_auto_transcribed_subtitles(
    source_video_path: str,
    output_video_path: str,
    subtitle_path: str,
    params: VideoClipParams,
) -> str:
    """Run the merge step again to add the transcribed subtitles to a video.

    ``generate_video.merge_materials`` is called with empty audio and BGM
    paths, the original audio kept at full volume, subtitles enabled with the
    font/colour/position settings from ``params``, and the subtitle mask
    options built with ``enabled=True``.

    Returns:
        Whatever ``generate_video.merge_materials`` returns.
    """
    merge_options = {
        'voice_volume': 1.0,
        'bgm_volume': 0.0,
        'original_audio_volume': 1.0,
        'keep_original_audio': True,
        'subtitle_enabled': True,
        'subtitle_font': params.font_name,
        'subtitle_font_size': params.font_size,
        'subtitle_color': params.text_fore_color,
        'subtitle_bg_color': None,
        'subtitle_position': params.subtitle_position,
        'custom_position': params.custom_position,
        'threads': params.n_threads,
    }
    merge_options.update(_build_subtitle_mask_options(params, enabled=True))

    merged = generate_video.merge_materials(
        video_path=source_video_path,
        audio_path="",
        subtitle_path=subtitle_path,
        bgm_path="",
        output_path=output_video_path,
        options=merge_options,
    )
    return merged
def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict = None):
"""
后台任务统一视频裁剪处理- 优化版本
@ -108,6 +398,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
# 使用新的统一裁剪策略
video_clip_result = clip_video.clip_video_unified(
video_origin_path=params.video_origin_path,
video_origin_paths=getattr(params, "video_origin_paths", []),
script_list=list_script,
tts_results=tts_results
)
@ -139,7 +430,19 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.info(f"音频文件合并成功->{merged_audio_path}")
# 合并字幕文件
merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
merged_subtitle_path = ""
if getattr(params, "subtitle_enabled", True):
try:
merged_subtitle_path = _create_programmatic_subtitle_file(
task_id,
new_script_list,
params,
)
except Exception as e:
logger.warning(f"程序化字幕生成失败将尝试合并TTS字幕: {e}")
if not merged_subtitle_path and getattr(params, "subtitle_enabled", True):
merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
if merged_subtitle_path:
logger.info(f"字幕文件合并成功->{merged_subtitle_path}")
else:
@ -156,6 +459,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.warning("没有需要合并的音频/字幕")
merged_audio_path = ""
merged_subtitle_path = ""
if getattr(params, "subtitle_enabled", True):
try:
merged_subtitle_path = _create_programmatic_subtitle_file(
task_id,
new_script_list,
params,
)
except Exception as e:
logger.warning(f"程序化字幕生成失败: {e}")
"""
5. 合并视频
@ -200,10 +512,19 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
6. 合并字幕/BGM/配音/视频
"""
output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}")
auto_transcription_enabled = _is_auto_transcription_enabled(params)
merge_output_video_path = (
path.join(utils.task_dir(task_id), "combined_without_auto_subtitles.mp4")
if auto_transcription_enabled
else output_video_path
)
logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {merge_output_video_path}")
# bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
bgm_path = utils.get_bgm_file()
bgm_path = utils.get_bgm_file(
bgm_type=getattr(params, "bgm_type", "random"),
bgm_file=getattr(params, "bgm_file", ""),
)
# 获取优化的音量配置
optimized_volumes = get_recommended_volumes_for_content('mixed')
@ -232,24 +553,39 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
'bgm_volume': final_bgm_volume, # 背景音乐音量(优化后)
'original_audio_volume': final_original_volume, # 视频原声音量(优化后)
'keep_original_audio': True, # 是否保留原声
'subtitle_enabled': params.subtitle_enabled, # 是否启用字幕 - 修复字幕开关bug
'subtitle_enabled': params.subtitle_enabled and not auto_transcription_enabled,
'subtitle_font': params.font_name, # 这里使用相对字体路径,会自动在 font_dir() 目录下查找
'subtitle_font_size': params.font_size,
'subtitle_color': params.text_fore_color,
'subtitle_bg_color': None, # 直接使用None表示透明背景
'subtitle_position': params.subtitle_position,
'custom_position': params.custom_position,
'threads': params.n_threads
'threads': params.n_threads,
**_build_subtitle_mask_options(params, enabled=not auto_transcription_enabled),
}
generate_video.merge_materials(
video_path=combined_video_path,
audio_path=merged_audio_path,
subtitle_path=merged_subtitle_path,
bgm_path=bgm_path,
output_path=output_video_path,
output_path=merge_output_video_path,
options=options
)
auto_subtitle_path = ""
if auto_transcription_enabled:
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=90)
logger.info("\n\n## 7. 自动转录最终视频字幕")
auto_subtitle_path = _transcribe_final_video(task_id, merge_output_video_path, params)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=95)
logger.info(f"\n\n## 8. 压入自动转录字幕 -> {output_video_path}")
_merge_auto_transcribed_subtitles(
source_video_path=merge_output_video_path,
output_video_path=output_video_path,
subtitle_path=auto_subtitle_path,
params=params,
)
final_video_paths.append(output_video_path)
combined_video_paths.append(combined_video_path)
@ -259,6 +595,8 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
"videos": final_video_paths,
"combined_videos": combined_video_paths
}
if auto_subtitle_path:
kwargs["subtitles"] = [auto_subtitle_path]
sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs)
return kwargs
@ -277,12 +615,23 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
global merged_audio_path, merged_subtitle_path
logger.info(f"\n\n## 开始统一视频处理任务: {task_id}")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0)
_update_video_generation_task(
task_id,
progress=0,
message="正在初始化视频生成任务",
step_current=0,
)
"""
1. 加载剪辑脚本
"""
logger.info("\n\n## 1. 加载视频脚本")
_update_video_generation_task(
task_id,
progress=5,
message="正在加载剪辑脚本",
step_current=1,
)
video_script_path = path.join(params.video_clip_json_path)
if path.exists(video_script_path):
@ -308,6 +657,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
2. 使用 TTS 生成音频素材
"""
logger.info("\n\n## 2. 根据OST设置生成音频列表")
_update_video_generation_task(
task_id,
progress=10,
message="正在生成 TTS 配音",
step_current=2,
)
# 只为OST=0 or 2的判断生成音频 OST=0 仅保留解说 OST=2 保留解说和原声
tts_segments = [
segment for segment in list_script
@ -324,16 +679,28 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
voice_pitch=params.voice_pitch,
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
_update_video_generation_task(
task_id,
progress=20,
message="TTS 配音生成完成",
step_current=2,
)
"""
3. 统一视频裁剪 - 基于OST类型的差异化裁剪策略
"""
logger.info("\n\n## 3. 统一视频裁剪基于OST类型")
_update_video_generation_task(
task_id,
progress=30,
message="正在按脚本裁剪视频片段",
step_current=3,
)
# 使用新的统一裁剪策略
video_clip_result = clip_video.clip_video_unified(
video_origin_path=params.video_origin_path,
video_origin_paths=getattr(params, "video_origin_paths", []),
script_list=list_script,
tts_results=tts_results
)
@ -347,12 +714,23 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
logger.info(f"统一裁剪完成,处理了 {len(video_clip_result)} 个视频片段")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)
_update_video_generation_task(
task_id,
progress=60,
message="视频片段裁剪完成",
step_current=3,
)
"""
4. 合并音频和字幕
"""
logger.info("\n\n## 4. 合并音频和字幕")
_update_video_generation_task(
task_id,
progress=65,
message="正在合并配音和字幕",
step_current=4,
)
total_duration = sum([script["duration"] for script in new_script_list])
if tts_segments:
try:
@ -364,8 +742,21 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
)
logger.info(f"音频文件合并成功->{merged_audio_path}")
# 合并字幕文件
merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
# 优先基于脚本文案和成片时间线生成字幕失败时回退到TTS字幕合并
merged_subtitle_path = ""
if getattr(params, "subtitle_enabled", True):
try:
merged_subtitle_path = _create_programmatic_subtitle_file(
task_id,
new_script_list,
params,
)
except Exception as e:
logger.warning(f"程序化字幕生成失败将尝试合并TTS字幕: {e}")
if not merged_subtitle_path and getattr(params, "subtitle_enabled", True):
merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
if merged_subtitle_path:
logger.info(f"字幕文件合并成功->{merged_subtitle_path}")
else:
@ -382,6 +773,21 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
logger.warning("没有需要合并的音频/字幕")
merged_audio_path = ""
merged_subtitle_path = ""
if getattr(params, "subtitle_enabled", True):
try:
merged_subtitle_path = _create_programmatic_subtitle_file(
task_id,
new_script_list,
params,
)
except Exception as e:
logger.warning(f"程序化字幕生成失败: {e}")
_update_video_generation_task(
task_id,
progress=70,
message="配音和字幕合并完成",
step_current=4,
)
"""
5. 合并视频
@ -391,6 +797,12 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4")
logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}")
_update_video_generation_task(
task_id,
progress=75,
message="正在合并视频片段",
step_current=5,
)
# 使用统一裁剪后的视频片段
video_clips = []
@ -410,15 +822,38 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
video_aspect=params.video_aspect,
threads=params.n_threads
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80)
_update_video_generation_task(
task_id,
progress=80,
message="视频片段合并完成",
step_current=5,
)
"""
6. 合并字幕/BGM/配音/视频
"""
output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}")
auto_transcription_enabled = _is_auto_transcription_enabled(params) and not bool(merged_subtitle_path)
if _is_auto_transcription_enabled(params) and merged_subtitle_path:
logger.info("已生成字幕文件,跳过最终视频自动转录")
merge_output_video_path = (
path.join(utils.task_dir(task_id), "combined_without_auto_subtitles.mp4")
if auto_transcription_enabled
else output_video_path
)
logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {merge_output_video_path}")
_update_video_generation_task(
task_id,
progress=85,
message="正在合成最终视频",
step_current=6,
ffmpeg_progress=0,
)
bgm_path = utils.get_bgm_file()
bgm_path = utils.get_bgm_file(
bgm_type=getattr(params, "bgm_type", "random"),
bgm_file=getattr(params, "bgm_file", ""),
)
# 获取优化的音量配置
optimized_volumes = get_recommended_volumes_for_content('mixed')
@ -446,24 +881,66 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
'bgm_volume': final_bgm_volume,
'original_audio_volume': final_original_volume,
'keep_original_audio': True,
'subtitle_enabled': params.subtitle_enabled,
'subtitle_enabled': params.subtitle_enabled and not auto_transcription_enabled,
'subtitle_font': params.font_name,
'subtitle_font_size': params.font_size,
'subtitle_color': params.text_fore_color,
'subtitle_bg_color': None,
'subtitle_position': params.subtitle_position,
'custom_position': params.custom_position,
'threads': params.n_threads
'threads': params.n_threads,
**_build_subtitle_mask_options(params, enabled=not auto_transcription_enabled),
}
final_merge_progress_start = 85
final_merge_progress_end = 89 if auto_transcription_enabled else 99
def update_final_merge_progress(ffmpeg_progress: float):
progress_span = final_merge_progress_end - final_merge_progress_start
overall_progress = final_merge_progress_start + int(
round((max(0.0, min(100.0, float(ffmpeg_progress))) / 100) * progress_span)
)
_update_video_generation_task(
task_id,
progress=overall_progress,
message="正在合成最终视频",
step_current=6,
ffmpeg_progress=ffmpeg_progress,
)
generate_video.merge_materials(
video_path=combined_video_path,
audio_path=merged_audio_path,
subtitle_path=merged_subtitle_path,
bgm_path=bgm_path,
output_path=output_video_path,
options=options
output_path=merge_output_video_path,
options=options,
progress_callback=update_final_merge_progress,
)
auto_subtitle_path = ""
if auto_transcription_enabled:
_update_video_generation_task(
task_id,
progress=90,
message="正在自动转录最终视频",
step_current=6,
)
logger.info("\n\n## 7. 自动转录最终视频字幕")
auto_subtitle_path = _transcribe_final_video(task_id, merge_output_video_path, params)
_update_video_generation_task(
task_id,
progress=95,
message="正在压入自动转录字幕",
step_current=6,
)
logger.info(f"\n\n## 8. 压入自动转录字幕 -> {output_video_path}")
_merge_auto_transcribed_subtitles(
source_video_path=merge_output_video_path,
output_video_path=output_video_path,
subtitle_path=auto_subtitle_path,
params=params,
)
final_video_paths.append(output_video_path)
combined_video_paths.append(combined_video_path)
@ -473,7 +950,16 @@ def start_subclip_unified(task_id: str, params: VideoClipParams):
"videos": final_video_paths,
"combined_videos": combined_video_paths
}
sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs)
if auto_subtitle_path:
kwargs["subtitles"] = [auto_subtitle_path]
_update_video_generation_task(
task_id,
progress=100,
message="视频生成完成",
step_current=VIDEO_GENERATION_TOTAL_STEPS,
state=const.TASK_STATE_COMPLETE,
**kwargs
)
return kwargs

View File

@ -0,0 +1,137 @@
"""Tavily-powered web search helpers for plot analysis."""
from __future__ import annotations
import os
from typing import Any
import requests
from loguru import logger
# Base URL of the Tavily HTTP API; "/search" is appended for search requests.
TAVILY_API_BASE_URL = "https://api.tavily.com"
# Default value sent as the "search_depth" field of the request payload.
DEFAULT_SEARCH_DEPTH = "basic"
# Default number of results requested (the request clamps it to 1..10).
DEFAULT_MAX_RESULTS = 5
# Default value passed as the `timeout` argument of the HTTP request.
DEFAULT_TIMEOUT = 20
class TavilySearchError(RuntimeError):
    """Signal that a Tavily search could not be completed.

    Covers invalid input (empty title, missing API key), transport failures,
    HTTP error statuses and undecodable responses.
    """
def _trim_text(value: Any, max_chars: int) -> str:
    """Stringify ``value``, strip it, and cut it to ``max_chars`` characters.

    Falsy values become ``""``. When the text is cut, trailing whitespace of
    the kept part is removed and ``"..."`` is appended (so the result may be
    up to three characters longer than ``max_chars``).
    """
    cleaned = str(value or "").strip()
    if len(cleaned) > max_chars:
        return cleaned[:max_chars].rstrip() + "..."
    return cleaned
def search_short_drama(
    short_name: str,
    api_key: str | None = None,
    *,
    search_depth: str = DEFAULT_SEARCH_DEPTH,
    max_results: int = DEFAULT_MAX_RESULTS,
    timeout: int = DEFAULT_TIMEOUT,
) -> dict[str, Any]:
    """Search web context for a short drama name with Tavily.

    Delegates to ``search_story_context`` with short-drama specific search
    keywords and empty-name error message; all other arguments are forwarded
    unchanged.
    """
    drama_options = {
        "search_keywords": "短剧 剧情 介绍 人物 结局",
        "empty_name_message": "短剧名称不能为空",
        "search_depth": search_depth,
        "max_results": max_results,
        "timeout": timeout,
    }
    return search_story_context(short_name, api_key, **drama_options)
def search_story_context(
    title: str,
    api_key: str | None = None,
    *,
    search_keywords: str = "剧情 介绍 人物 结局",
    empty_name_message: str = "作品名称不能为空",
    search_depth: str = DEFAULT_SEARCH_DEPTH,
    max_results: int = DEFAULT_MAX_RESULTS,
    timeout: int = DEFAULT_TIMEOUT,
) -> dict[str, Any]:
    """Search web context for a story title with Tavily.

    Args:
        title: Work title; combined with ``search_keywords`` to form the query.
        api_key: Tavily API key; falls back to the ``TAVILY_API_KEY``
            environment variable when empty.
        search_keywords: Extra keywords appended to the title.
        empty_name_message: Error message used when ``title`` is blank.
        search_depth: Sent as the ``search_depth`` field (default when empty).
        max_results: Requested result count, clamped to 1..10.
        timeout: Passed as the HTTP request timeout.

    Returns:
        The decoded JSON object returned by the Tavily ``/search`` endpoint.

    Raises:
        TavilySearchError: If the title or API key is missing, the request
            fails, the server answers with status >= 400, or the body is not
            a JSON object.
    """
    title = str(title or "").strip()
    if not title:
        raise TavilySearchError(empty_name_message)

    api_key = (api_key or os.getenv("TAVILY_API_KEY") or "").strip()
    if not api_key:
        raise TavilySearchError("Tavily API Key 未配置")

    query = f"{title} {search_keywords}".strip()
    payload = {
        "query": query,
        "search_depth": search_depth or DEFAULT_SEARCH_DEPTH,
        "topic": "general",
        "max_results": max(1, min(int(max_results or DEFAULT_MAX_RESULTS), 10)),
        "include_answer": True,
        "include_raw_content": False,
        "include_images": False,
    }

    try:
        response = requests.post(
            f"{TAVILY_API_BASE_URL}/search",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        raise TavilySearchError(f"Tavily 请求失败: {exc}") from exc

    if response.status_code >= 400:
        message = _trim_text(response.text, 500)
        raise TavilySearchError(f"Tavily 请求失败: HTTP {response.status_code} {message}")

    try:
        data = response.json()
    except ValueError as exc:
        raise TavilySearchError("Tavily 返回内容不是有效 JSON") from exc

    # A JSON list/string/null would otherwise crash on data.get() below with
    # an AttributeError that callers catching TavilySearchError never see.
    if not isinstance(data, dict):
        raise TavilySearchError("Tavily 返回内容不是有效 JSON 对象")

    logger.info(
        "Tavily 剧情检索完成: query={}, results={}",
        query,
        len(data.get("results") or []),
    )
    return data
def format_search_context(search_data: dict[str, Any], *, max_chars: int = 6000) -> str:
    """Format Tavily response into compact LLM context.

    The output is a small Markdown-like document: a title line, the query, an
    optional answer section (cut to 1200 characters) and a numbered list of
    sources with title (120), URL (240) and summary (700, taken from
    ``content`` or else ``raw_content``). The whole text is finally cut to
    ``max_chars``. Empty ``search_data`` yields ``""``.
    """
    if not search_data:
        return ""

    sections = ["# Tavily 联网检索结果"]
    sections.append(f"检索 query: {search_data.get('query', '')}")

    answer = _trim_text(search_data.get("answer"), 1200)
    if answer:
        sections += ["", "## 综合回答", answer]

    sources = search_data.get("results") or []
    if sources:
        sections += ["", "## 搜索来源"]
    for number, source in enumerate(sources, start=1):
        title = _trim_text(source.get("title"), 120)
        url = _trim_text(source.get("url"), 240)
        summary = _trim_text(source.get("content") or source.get("raw_content"), 700)
        sections.append(f"{number}. 标题: {title}")
        sections.append(f" 来源: {url}")
        sections.append(f" 摘要: {summary}")

    return _trim_text("\n".join(sections).strip(), max_chars)

View File

@ -12,9 +12,11 @@ from app.services import fun_asr_subtitle as fasr
class FakeResponse:
def __init__(self, payload=None, status_code=200):
def __init__(self, payload=None, status_code=200, text=None):
self.payload = payload or {}
self.status_code = status_code
self.text = text
self.content = text.encode("utf-8") if isinstance(text, str) else b""
def json(self):
return self.payload
@ -375,6 +377,195 @@ class FunAsrServiceTests(unittest.TestCase):
fasr.download_transcription_result("https://result.example/bad.json", session=MalformedDownloadSession({}))
class LocalFunAsrServiceTests(unittest.TestCase):
    """Exercise the local Fun-ASR helpers in ``fasr`` against in-memory session doubles."""

    def test_request_local_fun_asr_posts_file_and_options(self):
        """A bare ``host:port`` URL becomes ``http://.../asr`` and options travel as form data."""

        class LocalSession:
            """Records every POST and answers with a fixed transcription payload."""

            def __init__(self):
                self.calls = []

            def post(self, url, **kwargs):
                self.calls.append(("POST", url, kwargs))
                return FakeResponse({"text": "你好", "srt_file": "/tmp/out.srt"})

        with tempfile.TemporaryDirectory() as tmp_dir:
            audio_path = Path(tmp_dir) / "audio.wav"
            audio_path.write_bytes(b"audio")
            recorder = LocalSession()

            response = fasr.request_local_fun_asr(
                str(audio_path),
                api_url="127.0.0.1:7860",
                hotword="NarratoAI",
                enable_spk=True,
                timeout=123,
                session=recorder,
            )

            self.assertEqual("你好", response["text"])
            verb, target_url, request_kwargs = recorder.calls[0]
            self.assertEqual("POST", verb)
            self.assertEqual("http://127.0.0.1:7860/asr", target_url)
            self.assertEqual(
                {"hotword": "NarratoAI", "enable_spk": "true"},
                request_kwargs["data"],
            )
            self.assertEqual(123, request_kwargs["timeout"])
            self.assertIn("file", request_kwargs["files"])

    def test_create_with_local_fun_asr_copies_pack_srt_file(self):
        """When the response names an ``srt_file``, its text ends up in the requested subtitle file."""

        class LocalSession:
            """Answers each POST with a payload pointing at a pre-written SRT file."""

            def __init__(self, srt_file):
                self.srt_file = srt_file
                self.calls = []

            def post(self, url, **kwargs):
                self.calls.append(("POST", url, kwargs))
                return FakeResponse({"text": "你好", "srt_file": str(self.srt_file)})

        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            audio_path = workdir / "audio.wav"
            audio_path.write_bytes(b"audio")
            packed_srt = workdir / "pack.srt"
            packed_srt.write_text("1\n00:00:00,000 --> 00:00:01,000\n你好\n", encoding="utf-8")
            target_srt = workdir / "out.srt"

            returned_path = fasr.create_with_local_fun_asr(
                str(audio_path),
                subtitle_file=str(target_srt),
                api_url="http://127.0.0.1:7860",
                session=LocalSession(packed_srt),
            )

            self.assertEqual(str(target_srt), returned_path)
            self.assertEqual(
                packed_srt.read_text(encoding="utf-8"),
                target_srt.read_text(encoding="utf-8"),
            )

    def test_create_with_local_fun_asr_downloads_relative_srt(self):
        """A relative ``downloads.srt`` path is fetched from the API host via GET."""

        class LocalSession:
            """Records POST and GET calls; the GET returns SRT text."""

            def __init__(self):
                self.calls = []

            def post(self, url, **kwargs):
                self.calls.append(("POST", url, kwargs))
                return FakeResponse({"text": "你好", "downloads": {"srt": "/download/result.srt"}})

            def get(self, url, **kwargs):
                self.calls.append(("GET", url, kwargs))
                return FakeResponse(text="1\n00:00:00,000 --> 00:00:01,000\n你好\n")

        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            audio_path = workdir / "audio.wav"
            audio_path.write_bytes(b"audio")
            target_srt = workdir / "out.srt"
            recorder = LocalSession()

            returned_path = fasr.create_with_local_fun_asr(
                str(audio_path),
                subtitle_file=str(target_srt),
                api_url="http://127.0.0.1:7860/asr",
                session=recorder,
            )

            self.assertEqual(str(target_srt), returned_path)
            # The second recorded call is the download request.
            download_url = recorder.calls[1][1]
            self.assertEqual("http://127.0.0.1:7860/download/result.srt", download_url)
            self.assertIn("你好", target_srt.read_text(encoding="utf-8"))

    def test_local_fun_asr_result_to_srt_uses_raw_timestamps(self):
        """Cue times are taken from the ``raw`` entries' ``timestamp`` pairs."""
        raw_entry = {
            "text": "你好,世界。",
            "timestamp": [[0, 300], [300, 600], [600, 900], [900, 1200]],
        }

        rendered = fasr.local_fun_asr_result_to_srt({"raw": [raw_entry]}, max_chars=20)

        self.assertIn("00:00:00,000 --> 00:00:00,600\n你好,", rendered)
        self.assertIn("世界。", rendered)
class LocalFireRedAsrServiceTests(unittest.TestCase):
    """Exercise the local FireRed-ASR helpers in ``fasr`` against in-memory session doubles."""

    def test_request_local_firered_asr_posts_file_and_options(self):
        """Boolean options are sent as ``"true"`` / ``"false"`` form fields to ``/asr``."""

        class LocalSession:
            """Records every POST and answers with a fixed transcription payload."""

            def __init__(self):
                self.calls = []

            def post(self, url, **kwargs):
                self.calls.append(("POST", url, kwargs))
                return FakeResponse({"text": "你好", "srt_url": "/outputs/out.srt"})

        with tempfile.TemporaryDirectory() as tmp_dir:
            audio_path = Path(tmp_dir) / "audio.wav"
            audio_path.write_bytes(b"audio")
            recorder = LocalSession()

            response = fasr.request_local_firered_asr(
                str(audio_path),
                api_url="127.0.0.1:7867",
                enable_vad=True,
                enable_lid=False,
                enable_punc=True,
                return_timestamp=True,
                timeout=456,
                session=recorder,
            )

            self.assertEqual("你好", response["text"])
            verb, target_url, request_kwargs = recorder.calls[0]
            self.assertEqual("POST", verb)
            self.assertEqual("http://127.0.0.1:7867/asr", target_url)
            expected_form = {
                "enable_vad": "true",
                "enable_lid": "false",
                "enable_punc": "true",
                "return_timestamp": "true",
            }
            self.assertEqual(expected_form, request_kwargs["data"])
            self.assertEqual(456, request_kwargs["timeout"])
            self.assertIn("file", request_kwargs["files"])

    def test_create_with_local_firered_asr_downloads_srt_url(self):
        """A relative ``srt_url`` is fetched from the API host and written to the subtitle file."""

        class LocalSession:
            """Records POST and GET calls; the GET returns SRT text."""

            def __init__(self):
                self.calls = []

            def post(self, url, **kwargs):
                self.calls.append(("POST", url, kwargs))
                return FakeResponse({"text": "你好", "srt_url": "/outputs/result.srt"})

            def get(self, url, **kwargs):
                self.calls.append(("GET", url, kwargs))
                return FakeResponse(text="1\n00:00:00,000 --> 00:00:01,000\n你好\n")

        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            audio_path = workdir / "audio.wav"
            audio_path.write_bytes(b"audio")
            target_srt = workdir / "out.srt"
            recorder = LocalSession()

            returned_path = fasr.create_with_local_firered_asr(
                str(audio_path),
                subtitle_file=str(target_srt),
                api_url="http://127.0.0.1:7867",
                session=recorder,
            )

            self.assertEqual(str(target_srt), returned_path)
            # The second recorded call is the download request.
            download_url = recorder.calls[1][1]
            self.assertEqual("http://127.0.0.1:7867/outputs/result.srt", download_url)
            self.assertIn("你好", target_srt.read_text(encoding="utf-8"))

    def test_firered_asr_result_to_srt_uses_sentence_timestamps(self):
        """Each entry of ``sentences`` becomes one numbered cue using its ``start_ms`` / ``end_ms``."""
        sentences = [
            {"text": "你好。", "start_ms": 40, "end_ms": 900},
            {"text": "欢迎观看。", "start_ms": 900, "end_ms": 2100},
        ]

        rendered = fasr.firered_asr_result_to_srt({"sentences": sentences})

        self.assertIn("1\n00:00:00,040 --> 00:00:00,900\n你好。", rendered)
        self.assertIn("2\n00:00:00,900 --> 00:00:02,100\n欢迎观看。", rendered)
class FunAsrConfigTests(unittest.TestCase):
def test_save_config_persists_fun_asr_section(self):
original_config_file = cfg.config_file
@ -395,6 +586,9 @@ class FunAsrConfigTests(unittest.TestCase):
def test_config_example_fun_asr_section_parses(self):
    """``config.example.toml`` parses and its ``fun_asr`` table holds the expected values."""
    example_text = Path("config.example.toml").read_text(encoding="utf-8")
    config_data = tomllib.loads(example_text)

    expected_values = {
        "backend": "local",
        "api_url": "http://127.0.0.1:7860",
        "firered_api_url": "http://127.0.0.1:7867",
        "model": "fun-asr",
    }
    for option, expected in expected_values.items():
        self.assertEqual(expected, config_data["fun_asr"][option])
    # Only the presence of ``api_key`` is checked, not its value.
    self.assertIn("api_key", config_data["fun_asr"])

View File

@ -0,0 +1,426 @@
import json
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from app.models.schema import VideoClipParams
from app.services import jianying_draft_builder, jianying_task
# Prefix the tests below expect at the start of every material path stored in
# draft_info.json (e.g. "<placeholder>/assets/video/clip_01.mp4").
DraftPathPlaceholder = "##_draftpath_placeholder_0E685133-18CE-45ED-8CB8-2904A212EC80_##"
class JianyingTaskTests(unittest.TestCase):
    """Tests for ``jianying_task`` helpers and ``jianying_draft_builder.write_plaintext_jianying_draft``."""

    # --- reference-audio normalisation -------------------------------------

    def test_normalize_indextts_uses_valid_param_reference(self):
        """An existing file passed as ``voice_name`` gains the ``indextts:`` prefix."""
        with tempfile.NamedTemporaryFile(suffix=".wav") as reference:
            clip_params = VideoClipParams(tts_engine="indextts", voice_name=reference.name)

            jianying_task._normalize_indextts_reference_audio(clip_params)

            self.assertEqual(f"indextts:{reference.name}", clip_params.voice_name)

    def test_normalize_indextts_uses_config_reference_when_param_is_stale(self):
        """A non-file ``voice_name`` is replaced by the configured reference audio."""
        with tempfile.TemporaryDirectory() as temp_dir:
            reference_path = Path(temp_dir) / "reference.wav"
            reference_path.write_bytes(b"fake wav")
            clip_params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural")
            overrides = {"reference_audio": str(reference_path)}

            with patch.dict(jianying_task.config.indextts, overrides, clear=False):
                jianying_task._normalize_indextts_reference_audio(clip_params)

            self.assertEqual(f"indextts:{reference_path}", clip_params.voice_name)

    def test_normalize_indextts2_uses_valid_param_reference(self):
        """An ``indextts2:``-prefixed existing file is kept and the engine stays ``indextts2``."""
        with tempfile.NamedTemporaryFile(suffix=".wav") as reference:
            prefixed_voice = f"indextts2:{reference.name}"
            clip_params = VideoClipParams(tts_engine="indextts2", voice_name=prefixed_voice)

            jianying_task._normalize_indextts_reference_audio(clip_params)

            self.assertEqual("indextts2", clip_params.tts_engine)
            self.assertEqual(f"indextts2:{reference.name}", clip_params.voice_name)

    def test_normalize_indextts2_uses_config_reference_when_param_is_stale(self):
        """For ``indextts2`` a non-file ``voice_name`` falls back to the configured reference."""
        with tempfile.TemporaryDirectory() as temp_dir:
            reference_path = Path(temp_dir) / "reference.wav"
            reference_path.write_bytes(b"fake wav")
            clip_params = VideoClipParams(tts_engine="indextts2", voice_name="zh-CN-YunjianNeural")
            overrides = {"reference_audio": str(reference_path)}

            with patch.dict(jianying_task.config.indextts2, overrides, clear=False):
                jianying_task._normalize_indextts_reference_audio(clip_params)

            self.assertEqual(f"indextts2:{reference_path}", clip_params.voice_name)

    def test_normalize_omnivoice_clone_uses_valid_param_reference(self):
        """In ``voice_clone`` mode an ``omnivoice:``-prefixed existing file is left unchanged."""
        with tempfile.NamedTemporaryFile(suffix=".wav") as reference:
            prefixed_voice = f"omnivoice:{reference.name}"
            clip_params = VideoClipParams(tts_engine="omnivoice", voice_name=prefixed_voice)

            with patch.dict(jianying_task.config.omnivoice, {"mode": "voice_clone"}, clear=False):
                jianying_task._normalize_indextts_reference_audio(clip_params)

            self.assertEqual(f"omnivoice:{reference.name}", clip_params.voice_name)

    def test_normalize_omnivoice_auto_does_not_require_reference(self):
        """In ``auto`` mode with no configured reference, ``omnivoice:auto`` passes through."""
        clip_params = VideoClipParams(tts_engine="omnivoice", voice_name="omnivoice:auto")
        overrides = {"mode": "auto", "reference_audio": ""}

        with patch.dict(jianying_task.config.omnivoice, overrides, clear=False):
            jianying_task._normalize_indextts_reference_audio(clip_params)

        self.assertEqual("omnivoice:auto", clip_params.voice_name)

    def test_normalize_indextts_requires_existing_reference_audio(self):
        """With neither a valid parameter nor a configured reference, ``ValueError`` is raised."""
        clip_params = VideoClipParams(tts_engine="indextts", voice_name="zh-CN-YunjianNeural")

        with patch.dict(jianying_task.config.indextts, {"reference_audio": ""}, clear=False):
            with self.assertRaisesRegex(ValueError, "IndexTTS-1.5 参考音频不存在"):
                jianying_task._normalize_indextts_reference_audio(clip_params)

    # --- duration helpers ----------------------------------------------------

    def test_floor_duration_to_milliseconds(self):
        """Durations are truncated, not rounded, to millisecond precision."""
        floor_ms = jianying_task._floor_duration_to_milliseconds
        self.assertAlmostEqual(6.997, floor_ms(6.997333))
        self.assertAlmostEqual(7.0, floor_ms(7.000999))

    def test_clamp_duration_to_media_uses_actual_media_duration(self):
        """A request longer than the probed media length is cut down to that length."""
        probe_cache = {}

        with patch.object(jianying_task, "get_media_duration_ffprobe", return_value=4.2809):
            clamped = jianying_task._clamp_duration_to_media(
                requested_duration=4.31,
                media_file="/tmp/clip.mp4",
                duration_cache=probe_cache,
                media_label="视频素材",
            )

        self.assertAlmostEqual(4.28, clamped)

    def test_clamp_duration_to_media_respects_source_start_time(self):
        """Only the media remaining after ``source_start_time`` is available."""
        probe_cache = {}

        with patch.object(jianying_task, "get_media_duration_ffprobe", return_value=10.0):
            clamped = jianying_task._clamp_duration_to_media(
                requested_duration=4.0,
                media_file="/tmp/original.mp4",
                duration_cache=probe_cache,
                media_label="原始视频素材",
                source_start_time=8.5,
            )

        self.assertAlmostEqual(1.5, clamped)

    def test_format_seconds_for_trange_uses_millisecond_precision(self):
        """Seconds are rendered with three decimals and an ``s`` suffix."""
        formatted = jianying_task._format_seconds_for_trange(4.28)
        self.assertEqual("4.280s", formatted)

    # --- plaintext draft builder ---------------------------------------------

    def test_write_plaintext_jianying_draft_creates_root_package(self):
        """The builder writes the draft folder, copies assets and updates ``root_meta_info.json``."""
        with tempfile.TemporaryDirectory() as temp_dir:
            sandbox = Path(temp_dir)
            root_path = sandbox / "drafts"
            output_dir = sandbox / "task"
            root_path.mkdir()
            output_dir.mkdir()

            # The colon in the source name is expected to come out as "_" in the draft assets.
            video_path = output_dir / "clip:01.mp4"
            audio_path = output_dir / "audio_00_00_00,000-00_00_04,310.mp3"
            video_path.write_bytes(b"fake video")
            audio_path.write_bytes(b"fake audio")

            params = VideoClipParams(
                video_origin_path=str(video_path),
                original_volume=0.4,
                tts_volume=0.9,
            )
            script = [
                {
                    "OST": 0,
                    "start_time": 0.0,
                    "duration": 4.31,
                    "timestamp": "00:00:00,000-00:00:04,310",
                    "video": str(video_path),
                    "audio": str(audio_path),
                }
            ]

            def fake_duration(file_path):
                if file_path == str(video_path):
                    return 4.2809
                return 5.0

            with (
                patch.object(
                    jianying_draft_builder,
                    "_get_media_duration_ffprobe",
                    side_effect=fake_duration,
                ),
                patch.object(
                    jianying_draft_builder,
                    "_get_video_metadata_ffprobe",
                    return_value=(4_280_000, 720, 1280),
                ),
            ):
                draft_path, draft_name = jianying_draft_builder.write_plaintext_jianying_draft(
                    str(root_path),
                    "NarratoAI_test",
                    script,
                    params,
                    str(output_dir),
                )

            draft_dir = Path(draft_path)
            self.assertEqual("NarratoAI_test", draft_name)

            for expected_file in ("draft_info.json", "template-2.tmp", "template.tmp", "draft_cover.jpg"):
                self.assertTrue((draft_dir / expected_file).exists())
            self.assertFalse((draft_dir / "draft_content_legacy.json").exists())
            self.assertFalse((draft_dir / "Timelines" / "project.json").exists())
            self.assertTrue((draft_dir / "assets" / "video" / "clip_01.mp4").exists())
            self.assertTrue((draft_dir / "assets" / "audio" / audio_path.name).exists())

            draft_info = json.loads((draft_dir / "draft_info.json").read_text(encoding="utf-8"))
            materials = draft_info["materials"]
            self.assertEqual("169.0.0", draft_info["new_version"])
            self.assertEqual("NarratoAI_test", draft_info["name"])
            self.assertEqual(54, len(materials))
            self.assertEqual(
                f"{DraftPathPlaceholder}/assets/video/clip_01.mp4",
                materials["videos"][0]["path"],
            )
            self.assertEqual(
                f"{DraftPathPlaceholder}/assets/audio/{audio_path.name}",
                materials["audios"][0]["path"],
            )
            # Both tracks are expected to carry the clamped 4.28 s duration (in microseconds).
            for track_index in (0, 1):
                first_segment = draft_info["tracks"][track_index]["segments"][0]
                self.assertEqual(4_280_000, first_segment["source_timerange"]["duration"])

            attachment_editing = json.loads(
                (draft_dir / "attachment_editing.json").read_text(encoding="utf-8")
            )
            editing_draft = attachment_editing["editing_draft"]
            self.assertEqual("1.0.0", editing_draft["version"])
            self.assertFalse(editing_draft["is_use_audio_separation"])

            empty_template = json.loads((draft_dir / "template.tmp").read_text(encoding="utf-8"))
            self.assertEqual("75.0.0", empty_template["new_version"])
            self.assertEqual([], empty_template["tracks"])

            root_meta = json.loads((root_path / "root_meta_info.json").read_text(encoding="utf-8"))
            stored_draft = root_meta["all_draft_store"][0]
            self.assertEqual("NarratoAI_test", stored_draft["draft_name"])
            self.assertEqual(str(draft_dir / "draft_info.json"), stored_draft["draft_json_file"])

    def test_write_plaintext_jianying_draft_uses_source_timerange_and_writes_subtitles(self):
        """With ``use_source_timerange`` the segment points into the source video and subtitles become text tracks."""
        with tempfile.TemporaryDirectory() as temp_dir:
            sandbox = Path(temp_dir)
            root_path = sandbox / "drafts"
            output_dir = sandbox / "task"
            root_path.mkdir()
            output_dir.mkdir()

            video_path = output_dir / "source.mp4"
            audio_path = output_dir / "audio_00_00_02,000-00_00_04,000.mp3"
            subtitle_path = output_dir / "script_subtitles.srt"
            video_path.write_bytes(b"fake source video")
            audio_path.write_bytes(b"fake audio")
            subtitle_path.write_text(
                "1\n00:00:00,000 --> 00:00:01,500\n测试字幕\n",
                encoding="utf-8",
            )

            params = VideoClipParams(
                video_origin_path=str(video_path),
                original_volume=0.4,
                tts_volume=0.9,
                subtitle_enabled=True,
                font_size=60,
                text_fore_color="#FFFFFF",
            )
            script = [
                {
                    "OST": 0,
                    "start_time": 2.0,
                    "source_start_time": 2.0,
                    "duration": 3.0,
                    "timestamp": "00:00:02,000-00:00:05,000",
                    "video": str(video_path),
                    "audio": str(audio_path),
                    "use_source_timerange": True,
                }
            ]

            def fake_duration(file_path):
                if file_path == str(video_path):
                    return 10.0
                return 3.0

            with (
                patch.object(
                    jianying_draft_builder,
                    "_get_media_duration_ffprobe",
                    side_effect=fake_duration,
                ),
                patch.object(
                    jianying_draft_builder,
                    "_get_video_metadata_ffprobe",
                    return_value=(10_000_000, 1920, 1080),
                ),
            ):
                draft_path, _ = jianying_draft_builder.write_plaintext_jianying_draft(
                    str(root_path),
                    "NarratoAI_source",
                    script,
                    params,
                    str(output_dir),
                    subtitle_path=str(subtitle_path),
                )

            draft_info = json.loads((Path(draft_path) / "draft_info.json").read_text(encoding="utf-8"))
            materials = draft_info["materials"]
            self.assertEqual(1, len(materials["videos"]))
            self.assertEqual(1, len(materials["texts"]))
            self.assertIn("测试字幕", materials["texts"][0]["content"])

            video_segment = draft_info["tracks"][0]["segments"][0]
            self.assertEqual(2_000_000, video_segment["source_timerange"]["start"])
            self.assertEqual(3_000_000, video_segment["source_timerange"]["duration"])
            self.assertEqual(0.0, video_segment["volume"])

            text_tracks = [track for track in draft_info["tracks"] if track["type"] == "text"]
            self.assertEqual(1, len(text_tracks))
            text_segments = text_tracks[0]["segments"]
            self.assertEqual(1, len(text_segments))
            self.assertEqual(1_500_000, text_segments[0]["target_timerange"]["duration"])

    # --- draft script / subtitle assembly ------------------------------------

    def test_build_jianying_draft_script_references_original_video(self):
        """``video_id`` selects the matching original video and the TTS result supplies audio and duration."""
        with tempfile.TemporaryDirectory() as temp_dir:
            sandbox = Path(temp_dir)
            video_one = sandbox / "one.mp4"
            video_two = sandbox / "two.mp4"
            audio_path = sandbox / "audio.mp3"
            video_one.write_bytes(b"one")
            video_two.write_bytes(b"two")
            audio_path.write_bytes(b"audio")

            params = VideoClipParams(
                video_origin_path=str(video_one),
                video_origin_paths=[str(video_one), str(video_two)],
            )
            script = [
                {
                    "_id": 9,
                    "video_id": 2,
                    "timestamp": "00:00:05,000-00:00:07,000",
                    "narration": "解说",
                    "OST": 0,
                }
            ]
            tts_results = [
                {
                    "_id": 9,
                    "timestamp": "00:00:05,000-00:00:07,000",
                    "audio_file": str(audio_path),
                    "subtitle_file": "",
                    "duration": 1.25,
                }
            ]

            draft_script = jianying_task._build_jianying_draft_script(script, params, tts_results)

            first_item = draft_script[0]
            self.assertEqual(str(video_two), first_item["video"])
            self.assertEqual(str(audio_path), first_item["audio"])
            self.assertEqual(5.0, first_item["source_start_time"])
            self.assertEqual(1.25, first_item["duration"])
            self.assertTrue(first_item["use_source_timerange"])

    def test_get_original_subtitle_paths_falls_back_to_matching_video_name(self):
        """Of two ``episode_fun_asr_*`` subtitles in the subtitle dir, the later-stamped one is returned."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            video_path = temp_path / "episode_20260608010240.mp4"
            older_subtitle = temp_path / "episode_fun_asr_20260608000100.srt"
            newer_subtitle = temp_path / "episode_fun_asr_20260608010100.srt"
            video_path.write_bytes(b"video")
            older_subtitle.write_text("old", encoding="utf-8")
            newer_subtitle.write_text("new", encoding="utf-8")
            params = VideoClipParams(video_origin_path=str(video_path))

            with patch.object(jianying_task.utils, "subtitle_dir", return_value=str(temp_path)):
                subtitle_paths = jianying_task._get_original_subtitle_paths(params)

            self.assertEqual([str(newer_subtitle)], subtitle_paths)

    def test_create_jianying_subtitle_file_includes_original_audio_subtitles(self):
        """For an ``OST`` = 1 segment the source video's own subtitle text is written to the output."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            task_dir = temp_path / "task"
            task_dir.mkdir()
            video_path = temp_path / "episode.mp4"
            subtitle_path = temp_path / "episode.srt"
            video_path.write_bytes(b"video")
            subtitle_path.write_text(
                "1\n00:00:05,000 --> 00:00:06,500\n原片对白\n",
                encoding="utf-8",
            )

            params = VideoClipParams(video_origin_path=str(video_path), subtitle_enabled=True)
            original_audio_item = {
                "_id": 1,
                "timestamp": "00:00:05,000-00:00:07,000",
                "narration": "播放原片1",
                "OST": 1,
            }
            draft_script = jianying_task._build_jianying_draft_script(
                [original_audio_item],
                params,
                [],
            )

            with (
                patch.object(jianying_task.utils, "subtitle_dir", return_value=str(temp_path)),
                patch.object(jianying_task.utils, "task_dir", return_value=str(task_dir)),
            ):
                output_path = jianying_task._create_jianying_subtitle_file(
                    "task-id",
                    draft_script,
                    params,
                )

            self.assertTrue(output_path)
            self.assertIn("原片对白", Path(output_path).read_text(encoding="utf-8"))

    # --- end-to-end export ---------------------------------------------------

    def test_start_export_jianying_draft_does_not_clip_video(self):
        """Export hands source-timerange items to the draft writer and never calls ``clip_video_unified``."""
        with tempfile.TemporaryDirectory() as temp_dir:
            sandbox = Path(temp_dir)
            root_path = sandbox / "drafts"
            task_dir = sandbox / "task"
            root_path.mkdir()
            task_dir.mkdir()

            video_path = sandbox / "source.mp4"
            audio_path = task_dir / "audio.mp3"
            script_path = sandbox / "script.json"
            subtitle_path = task_dir / "script_subtitles.srt"
            video_path.write_bytes(b"video")
            audio_path.write_bytes(b"audio")

            script_items = [
                {
                    "_id": 1,
                    "timestamp": "00:00:01,000-00:00:03,000",
                    "narration": "测试解说",
                    "OST": 0,
                }
            ]
            script_path.write_text(
                json.dumps(script_items, ensure_ascii=False),
                encoding="utf-8",
            )

            params = VideoClipParams(
                video_clip_json_path=str(script_path),
                video_origin_path=str(video_path),
                tts_engine="edge_tts",
                voice_name="zh-CN-YunjianNeural",
                subtitle_enabled=True,
                draft_name="NarratoAI_no_clip",
            )
            tts_results = [
                {
                    "_id": 1,
                    "timestamp": "00:00:01,000-00:00:03,000",
                    "audio_file": str(audio_path),
                    "subtitle_file": "",
                    "duration": 1.5,
                }
            ]

            with (
                patch.dict(jianying_task.config.ui, {"jianying_draft_path": str(root_path)}, clear=False),
                patch.object(jianying_task.utils, "task_dir", return_value=str(task_dir)),
                patch.object(jianying_task.voice, "tts_multiple", return_value=tts_results),
                patch.object(
                    jianying_task,
                    "_create_jianying_subtitle_file",
                    return_value=str(subtitle_path),
                ),
                patch.object(
                    jianying_task,
                    "write_plaintext_jianying_draft",
                    return_value=(str(root_path / "draft"), "NarratoAI_no_clip"),
                ) as write_draft,
                patch.object(jianying_task.clip_video, "clip_video_unified") as clip_video_unified,
            ):
                result = jianying_task.start_export_jianying_draft("task-id", params)

            clip_video_unified.assert_not_called()
            write_kwargs = write_draft.call_args.kwargs
            first_item = write_kwargs["new_script_list"][0]
            self.assertTrue(first_item["use_source_timerange"])
            self.assertEqual(str(audio_path), first_item["audio"])
            self.assertEqual(str(subtitle_path), write_kwargs["subtitle_path"])
            self.assertEqual(str(subtitle_path), result["subtitles"][0])
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,120 @@
import subprocess
import unittest
from unittest import mock
from app.services import merger_video
class MergerVideoConcatTests(unittest.TestCase):
    """Tests for the stream-copy versus re-encode decision in ``merger_video``."""

    def test_can_concat_video_copy_when_signatures_match(self):
        """Two inputs with equal stream signatures are eligible for stream copy."""
        signature = dict(
            codec_name="h264",
            profile="High",
            width=1080,
            height=1920,
            pix_fmt="yuv420p",
            r_frame_rate="30/1",
            avg_frame_rate="30/1",
            time_base="1/15360",
            sample_aspect_ratio="1:1",
        )
        probe = mock.patch.object(
            merger_video,
            "_get_video_stream_signature",
            side_effect=[signature, dict(signature)],
        )

        with probe:
            self.assertTrue(merger_video._can_concat_video_copy(["1.mp4", "2.mp4"]))

    def test_can_concat_video_copy_rejects_mismatched_signature(self):
        """A differing ``r_frame_rate`` makes the inputs ineligible for stream copy."""
        base_signature = dict(
            codec_name="h264",
            profile="High",
            width=1080,
            height=1920,
            pix_fmt="yuv420p",
            r_frame_rate="30/1",
            avg_frame_rate="30/1",
            time_base="1/15360",
            sample_aspect_ratio="1:1",
        )
        mismatch_signature = dict(base_signature, r_frame_rate="24000/1001")
        probe = mock.patch.object(
            merger_video,
            "_get_video_stream_signature",
            side_effect=[base_signature, mismatch_signature],
        )

        with probe:
            self.assertFalse(merger_video._can_concat_video_copy(["1.mp4", "2.mp4"]))

    def test_concat_video_streams_prefers_copy_when_compatible(self):
        """When copy is possible and durations match, the command uses ``-c:v copy``."""
        finished = subprocess.CompletedProcess(args=["ffmpeg"], returncode=0)

        with (
            mock.patch.object(merger_video, "_can_concat_video_copy", return_value=True),
            mock.patch.object(merger_video, "_concat_duration_matches", return_value=True),
            mock.patch.object(merger_video.subprocess, "run", return_value=finished) as run_mock,
        ):
            merger_video._concat_video_streams(
                ["1.mp4", "2.mp4"],
                "concat.txt",
                "video_concat.mp4",
                threads=4,
            )

        command = run_mock.call_args.args[0]
        video_codec = command[command.index("-c:v") + 1]
        self.assertEqual("copy", video_codec)
        self.assertNotIn("libx264", command)

    def test_concat_video_streams_falls_back_when_copy_duration_mismatches(self):
        """A duration mismatch after the copy triggers a second, ``libx264`` run."""
        finished = subprocess.CompletedProcess(args=["ffmpeg"], returncode=0)

        with (
            mock.patch.object(merger_video, "_can_concat_video_copy", return_value=True),
            mock.patch.object(merger_video, "_concat_duration_matches", return_value=False),
            mock.patch.object(merger_video.os.path, "exists", return_value=False),
            mock.patch.object(merger_video.subprocess, "run", return_value=finished) as run_mock,
        ):
            merger_video._concat_video_streams(
                ["1.mp4", "2.mp4"],
                "concat.txt",
                "video_concat.mp4",
                threads=6,
            )

        self.assertEqual(2, run_mock.call_count)
        reencode_cmd = run_mock.call_args_list[1].args[0]
        self.assertEqual("libx264", reencode_cmd[reencode_cmd.index("-c:v") + 1])
        self.assertEqual("6", reencode_cmd[reencode_cmd.index("-threads") + 1])

    def test_concat_video_streams_falls_back_to_reencode_when_copy_fails(self):
        """A ``CalledProcessError`` from the copy run triggers a second, ``libx264`` run."""
        copy_error = subprocess.CalledProcessError(
            returncode=1,
            cmd=["ffmpeg"],
            stderr=b"copy failed",
        )
        finished = subprocess.CompletedProcess(args=["ffmpeg"], returncode=0)

        with (
            mock.patch.object(merger_video, "_can_concat_video_copy", return_value=True),
            mock.patch.object(
                merger_video.subprocess,
                "run",
                side_effect=[copy_error, finished],
            ) as run_mock,
        ):
            merger_video._concat_video_streams(
                ["1.mp4", "2.mp4"],
                "concat.txt",
                "video_concat.mp4",
                threads=8,
            )

        self.assertEqual(2, run_mock.call_count)
        reencode_cmd = run_mock.call_args_list[1].args[0]
        self.assertEqual("libx264", reencode_cmd[reencode_cmd.index("-c:v") + 1])
        self.assertEqual("8", reencode_cmd[reencode_cmd.index("-threads") + 1])
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,101 @@
import json
import os
import tempfile
import unittest
from unittest import mock
from app.services import clip_video
from app.utils import check_script
class TestMultiVideoScriptSources(unittest.TestCase):
    """Tests for clip command construction and multi-video source resolution."""

    def test_clip_command_uses_input_fast_seek(self):
        """``-ss`` precedes ``-i`` and the clip length is passed with ``-t`` rather than ``-to``."""
        command = clip_video._build_ffmpeg_command_with_audio_control(
            input_path="/tmp/input.mp4",
            output_path="/tmp/output.mp4",
            start_time="00:27:32.240",
            end_time="00:27:38.240",
            encoder_config=clip_video.get_safe_encoder_config(None),
            hwaccel_args=[],
            remove_audio=False,
        )

        seek_position = command.index("-ss")
        input_position = command.index("-i")
        self.assertLess(seek_position, input_position)
        self.assertEqual("6", command[command.index("-t") + 1])
        self.assertNotIn("-to", command)

    def test_check_format_accepts_optional_video_source_fields(self):
        """A script item carrying ``video_id`` and ``video_name`` still passes the format check."""
        item = {
            "_id": 1,
            "video_id": 2,
            "video_name": "2.mp4",
            "timestamp": "00:00:00,000-00:00:03,000",
            "picture": "画面",
            "narration": "解说",
            "OST": 0,
        }
        serialized = json.dumps([item], ensure_ascii=False)

        outcome = check_script.check_format(serialized)

        self.assertTrue(outcome["success"])

    def test_clip_video_unified_resolves_source_by_video_id_and_name(self):
        """Items select their source by ``video_id`` (first item) or ``video_name`` (second item)."""
        with tempfile.TemporaryDirectory() as temp_dir:
            video_1 = os.path.join(temp_dir, "1.mp4")
            video_2 = os.path.join(temp_dir, "2.mp4")
            for source_path in (video_1, video_2):
                with open(source_path, "wb") as handle:
                    handle.write(b"video")

            output_dir = os.path.join(temp_dir, "clips")
            used_sources = []

            def fake_process(source_video_path, script_item, output_dir_arg, *_args):
                # Remember which source was chosen and leave a stub clip behind.
                used_sources.append(source_video_path)
                clip_path = os.path.join(output_dir_arg, f"{script_item['_id']}.mp4")
                with open(clip_path, "wb") as handle:
                    handle.write(b"clip")
                return clip_path

            by_video_id = {
                "_id": 1,
                "video_id": 2,
                "timestamp": "00:00:00,000-00:00:03,000",
                "picture": "视频2画面",
                "narration": "播放原片1",
                "OST": 1,
            }
            by_video_name = {
                "_id": 2,
                "video_name": "1.mp4",
                "timestamp": "00:00:03,000-00:00:06,000",
                "picture": "视频1画面",
                "narration": "播放原片2",
                "OST": 1,
            }

            with (
                mock.patch.object(clip_video, "check_hardware_acceleration", return_value=None),
                mock.patch.object(
                    clip_video,
                    "_process_original_audio_segment",
                    side_effect=fake_process,
                ),
            ):
                result = clip_video.clip_video_unified(
                    video_origin_path=video_1,
                    video_origin_paths=[video_1, video_2],
                    script_list=[by_video_id, by_video_name],
                    tts_results=[],
                    output_dir=output_dir,
                    task_id="multi-video-test",
                )

            self.assertEqual([video_2, video_1], used_sources)
            self.assertEqual({1, 2}, set(result.keys()))
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,192 @@
import tempfile
import unittest
from pathlib import Path
from app.services import script_subtitle
class ScriptSubtitleTests(unittest.TestCase):
    """Tests for narration splitting, time-range parsing and SRT generation in ``script_subtitle``."""

    def test_split_narration_prefers_punctuation_boundaries(self):
        """Narration is cut at punctuation and the returned chunks carry no punctuation."""
        chunks = script_subtitle.split_narration(
            "她终于意识到,这场婚姻不是爱情,而是一场交易。",
            max_chars=12,
        )

        self.assertEqual(
            ["她终于意识到", "这场婚姻不是爱情", "而是一场交易"],
            chunks,
        )

    def test_time_range_parsing_supports_milliseconds(self):
        """``HH:MM:SS,mmm-HH:MM:SS,mmm`` parses to start and end values in seconds."""
        start, end = script_subtitle.parse_time_range("00:00:01,500-00:00:03,250")

        self.assertAlmostEqual(1.5, start)
        self.assertAlmostEqual(3.25, end)

    def test_create_script_subtitle_file_skips_original_audio_segments(self):
        """``OST`` 0 and 2 narrations are subtitled; the ``OST`` 1 narration is left out."""
        list_script = [
            {
                "_id": 1,
                "OST": 0,
                "narration": "第一句解说。第二句解说。",
                "editedTimeRange": "00:00:00-00:00:04",
                "duration": 4,
            },
            {
                "_id": 2,
                "OST": 1,
                "narration": "这句是原声,不应该默认生成。",
                "editedTimeRange": "00:00:04-00:00:08",
                "duration": 4,
            },
            {
                "_id": 3,
                "OST": 2,
                "narration": "混合片段也保留解说字幕。",
                "editedTimeRange": "00:00:08-00:00:12",
                "duration": 4,
            },
        ]

        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = Path(temp_dir) / "script_subtitles.srt"

            result = script_subtitle.create_script_subtitle_file(
                task_id="test",
                list_script=list_script,
                output_file=str(output_file),
                max_chars=16,
            )

            self.assertEqual(str(output_file), result)
            content = output_file.read_text(encoding="utf-8")
            self.assertIn("00:00:00,000 -->", content)
            self.assertIn("第一句解说", content)
            self.assertIn("混合片段也保留解说字幕", content)
            self.assertNotIn("这句是原声", content)
            # NOTE(review): two further checks stood here as
            # ``assertNotIn("", content)``. The empty string is a substring of
            # every string, so they could never pass; the intended characters
            # appear to have been lost. Restore them once the original
            # literals are confirmed.

    def test_create_script_subtitle_file_uses_duration_when_edited_range_missing(self):
        """Without ``editedTimeRange`` the cue runs from zero to the item's ``duration``."""
        list_script = [
            {
                "_id": 1,
                "OST": 0,
                "narration": "没有 editedTimeRange 时使用 duration。",
                "duration": 3,
            }
        ]

        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = Path(temp_dir) / "script_subtitles.srt"

            script_subtitle.create_script_subtitle_file(
                task_id="test",
                list_script=list_script,
                output_file=str(output_file),
            )

            content = output_file.read_text(encoding="utf-8")
            self.assertIn("00:00:00,000 -->", content)
            self.assertIn("--> 00:00:03,000", content)

    def test_create_script_subtitle_file_includes_original_audio_subtitles(self):
        """Source cues overlapping an ``OST`` 1 segment are clipped to it and shifted onto the edited timeline."""
        list_script = [
            {
                "_id": 1,
                "OST": 0,
                "narration": "前情解说。",
                "editedTimeRange": "00:00:00-00:00:02",
                "duration": 2,
            },
            {
                "_id": 2,
                "video_id": 1,
                "video_name": "source.mp4",
                "OST": 1,
                "narration": "播放原片2",
                "timestamp": "00:00:10,000-00:00:14,000",
                "sourceTimeRange": "00:00:10,000-00:00:14,000",
                "editedTimeRange": "00:00:02-00:00:06",
                "duration": 4,
            },
        ]
        # Source subtitles: the first and last cues straddle the 10 s - 14 s window.
        original_srt = (
            "1\n"
            "00:00:09,000 --> 00:00:11,000\n"
            "开头会被裁掉一秒\n"
            "2\n"
            "00:00:11,500 --> 00:00:13,000\n"
            "这句原声对白应该出现\n"
            "3\n"
            "00:00:13,500 --> 00:00:15,000\n"
            "结尾只保留半秒\n"
        )

        with tempfile.TemporaryDirectory() as temp_dir:
            subtitle_file = Path(temp_dir) / "source.srt"
            subtitle_file.write_text(original_srt, encoding="utf-8")
            output_file = Path(temp_dir) / "script_subtitles.srt"

            script_subtitle.create_script_subtitle_file(
                task_id="test",
                list_script=list_script,
                output_file=str(output_file),
                original_subtitle_paths=[str(subtitle_file)],
                video_origin_paths=["source.mp4"],
                max_chars=16,
            )

            content = output_file.read_text(encoding="utf-8")
            self.assertIn("前情解说", content)
            self.assertIn("开头会被裁掉一秒", content)
            self.assertIn("这句原声对白应该出现", content)
            self.assertIn("结尾只保留半秒", content)
            # Source 10 s maps to edited 2 s, so each cue moves 8 s earlier.
            self.assertIn("00:00:02,000 --> 00:00:03,000", content)
            self.assertIn("00:00:03,500 --> 00:00:05,000", content)
            self.assertIn("00:00:05,500 --> 00:00:06,000", content)
            self.assertNotIn("播放原片2", content)

    def test_create_script_subtitle_file_uses_matching_video_id_for_original_subtitles(self):
        """An item with ``video_id`` 2 draws its original subtitles from the second subtitle file."""
        list_script = [
            {
                "_id": 1,
                "video_id": 2,
                "video_name": "second.mp4",
                "OST": 1,
                "narration": "播放原片1",
                "timestamp": "00:00:01,000-00:00:03,000",
                "sourceTimeRange": "00:00:01,000-00:00:03,000",
                "editedTimeRange": "00:00:00-00:00:02",
                "duration": 2,
            },
        ]
        first_srt = (
            "1\n"
            "00:00:01,000 --> 00:00:03,000\n"
            "第一个视频的字幕不应该出现\n"
        )
        second_srt = (
            "1\n"
            "00:00:01,000 --> 00:00:03,000\n"
            "第二个视频的字幕应该出现\n"
        )

        with tempfile.TemporaryDirectory() as temp_dir:
            first_file = Path(temp_dir) / "first.srt"
            second_file = Path(temp_dir) / "second.srt"
            output_file = Path(temp_dir) / "script_subtitles.srt"
            first_file.write_text(first_srt, encoding="utf-8")
            second_file.write_text(second_srt, encoding="utf-8")

            script_subtitle.create_script_subtitle_file(
                task_id="test",
                list_script=list_script,
                output_file=str(output_file),
                original_subtitle_paths=[str(first_file), str(second_file)],
                video_origin_paths=["first.mp4", "second.mp4"],
            )

            content = output_file.read_text(encoding="utf-8")
            self.assertIn("第二个视频的字幕应该出现", content)
            self.assertNotIn("第一个视频的字幕不应该出现", content)
# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,290 @@
import unittest
from app.services.short_drama_narration_validation import (
build_subtitle_index,
normalize_script_video_sources,
validate_narration_script_items,
)
# Fixture: combined subtitle text for two videos. Each section opens with a
# "# 视频 N: <file name>" header and a "字幕文件:" line, followed by SRT-style
# cues. It is passed to build_subtitle_index in setUp below.
SUBTITLE_CONTENT = """# 视频 1: first.mp4
字幕文件: first.srt
1
00:00:01,000 --> 00:00:04,000
女主被众人误会
2
00:00:04,000 --> 00:00:08,000
男主冷眼看着她
# 视频 2: second.mp4
字幕文件: second.srt
1
00:00:02,000 --> 00:00:05,000
女主终于拿出证据
2
00:00:05,000 --> 00:00:09,000
众人震惊反派慌了
"""
class ShortDramaNarrationValidationTests(unittest.TestCase):
    """Checks for the short-drama narration script validator on a two-video fixture."""

    def setUp(self):
        self.video_paths = ["/tmp/first.mp4", "/tmp/second.mp4"]
        self.subtitle_index = build_subtitle_index(SUBTITLE_CONTENT, self.video_paths)

    @staticmethod
    def _segment(seg_id, video_id, video_name, timestamp, picture, narration, ost):
        """Build one script item, keeping the key order used by every test here."""
        return {
            "_id": seg_id,
            "video_id": video_id,
            "video_name": video_name,
            "timestamp": timestamp,
            "picture": picture,
            "narration": narration,
            "OST": ost,
        }

    def _validate(self, items):
        """Run the validator against the fixture's subtitle index and video list."""
        return validate_narration_script_items(items, self.subtitle_index, self.video_paths)

    def _assert_rejected(self, items, *fragments):
        """Assert the script is invalid and that each fragment appears in some error."""
        result = self._validate(items)
        self.assertFalse(result.valid)
        for fragment in fragments:
            self.assertTrue(any(fragment in error for error in result.errors))

    def test_build_subtitle_index_preserves_multi_video_sources(self):
        index = self.subtitle_index
        self.assertEqual(4, len(index))
        self.assertEqual({1, 2}, {cue.video_id for cue in index})
        self.assertEqual("first.mp4", index[0].video_name)
        self.assertEqual("second.mp4", index[2].video_name)
        self.assertEqual("00:00:02,000-00:00:05,000", index[2].timestamp)

    def test_valid_script_passes_and_normalizes_video_name(self):
        items = [
            self._segment(
                1, 1, "wrong-name.mp4", "00:00:01,000-00:00:04,000",
                "女主被误会", "她被当众误会。", 0,
            ),
            # Deliberately has no "video_id": normalization must derive it.
            {
                "_id": 2,
                "video_name": "second.mp4",
                "timestamp": "00:00:02,000-00:00:05,000",
                "picture": "女主拿出证据",
                "narration": "播放原片2",
                "OST": 1,
            },
        ]
        normalized = normalize_script_video_sources(items, self.video_paths)
        result = self._validate(normalized)
        self.assertTrue(result.valid, result.errors)
        second = result.items[1]
        self.assertEqual(2, second["video_id"])
        self.assertEqual("second.mp4", second["video_name"])

    def test_invalid_timestamp_and_overlap_fail(self):
        timestamps = (
            "00:00:01,000-00:00:05,000",
            "00:00:04,500-00:00:08,000",
            "bad",
        )
        items = [
            self._segment(seg_id, 1, "first.mp4", timestamp, "画面", "解说", 0)
            for seg_id, timestamp in enumerate(timestamps, start=1)
        ]
        self._assert_rejected(items, "重叠", "时间戳格式")

    def test_invalid_video_id_does_not_default_to_first_video(self):
        items = [
            self._segment(
                1, 99, "missing.mp4", "00:00:01,000-00:00:04,000", "画面", "解说", 0,
            ),
        ]
        self._assert_rejected(items, "video_id=99")

    def test_out_of_range_timestamp_fails(self):
        items = [
            self._segment(
                1, 2, "second.mp4", "00:00:20,000-00:00:25,000", "画面", "解说", 0,
            ),
        ]
        self._assert_rejected(items, "不在视频 2 的字幕范围内")

    def test_three_consecutive_original_audio_segments_fail(self):
        items = [
            self._segment(
                1, 1, "first.mp4", "00:00:01,000-00:00:04,000",
                "女主被误会", "她被当众误会。", 0,
            ),
            self._segment(
                2, 1, "first.mp4", "00:00:04,000-00:00:05,000",
                "男主看着她", "播放原片2", 1,
            ),
            self._segment(
                3, 1, "first.mp4", "00:00:05,000-00:00:06,000",
                "男主看着她", "播放原片3", 1,
            ),
            self._segment(
                4, 1, "first.mp4", "00:00:06,000-00:00:08,000",
                "男主继续观察", "播放原片4", 1,
            ),
        ]
        self._assert_rejected(items, "连续原声过多")

    def test_cross_video_original_audio_requires_narration_bridge(self):
        items = [
            self._segment(
                1, 1, "first.mp4", "00:00:01,000-00:00:04,000",
                "女主被误会", "她被当众误会。", 0,
            ),
            self._segment(
                2, 1, "first.mp4", "00:00:04,000-00:00:08,000",
                "男主看着她", "播放原片2", 1,
            ),
            self._segment(
                3, 2, "second.mp4", "00:00:02,000-00:00:05,000",
                "女主拿出证据", "播放原片3", 1,
            ),
        ]
        self._assert_rejected(items, "跨视频切换缺少 OST=0 解说桥段")

    def test_cross_video_switch_with_narration_bridge_passes(self):
        items = [
            self._segment(
                1, 1, "first.mp4", "00:00:01,000-00:00:04,000",
                "女主被误会", "她被当众误会。", 0,
            ),
            self._segment(
                2, 2, "second.mp4", "00:00:02,000-00:00:05,000",
                "女主拿出证据", "播放原片2", 1,
            ),
        ]
        result = self._validate(items)
        self.assertTrue(result.valid, result.errors)

    def test_first_segment_must_be_narration_hook(self):
        items = [
            self._segment(
                1, 1, "first.mp4", "00:00:01,000-00:00:04,000",
                "女主被误会", "播放原片1", 1,
            ),
        ]
        self._assert_rejected(items, "解说开场钩子")

    def test_dense_narration_fails_when_video_duration_is_too_short(self):
        items = [
            self._segment(
                1, 1, "first.mp4", "00:00:01,000-00:00:04,000",
                "女主被误会",
                "她明明什么都没做却被所有人推到风口浪尖只能独自承受委屈",
                0,
            ),
        ]
        self._assert_rejected(items, "解说过密")
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,100 @@
import json
import tempfile
import unittest
from pathlib import Path
from unittest import mock
from app.services import subtitle_corrector as corrector
SAMPLE_SRT = """1
00:00:01,000 --> 00:00:03,000
今天我们来看张三的顾是
2
00:00:04,000 --> 00:00:06,000
他来到北精找李四
"""
class SubtitleCorrectorTests(unittest.TestCase):
    """Exercise the subtitle corrector with the LLM call replaced by canned JSON."""

    _REGISTER_TARGET = "app.services.subtitle_corrector._ensure_llm_providers_registered"
    _LLM_TARGET = "app.services.subtitle_corrector._run_async_safely"

    @staticmethod
    def _both_lines_fixed():
        """Return an LLM reply that corrects both cues of ``SAMPLE_SRT``."""
        return {
            "items": [
                {"id": 1, "text": "今天我们来看张三的故事"},
                {"id": 2, "text": "他来到北京找李四"},
            ]
        }

    def _canned_llm(self, reply):
        """Patch the async runner so it returns ``reply`` serialized as JSON."""
        return mock.patch(
            self._LLM_TARGET,
            return_value=json.dumps(reply, ensure_ascii=False),
        )

    def test_correct_srt_content_preserves_timecodes_and_rebuilds_text(self):
        with mock.patch(self._REGISTER_TARGET), self._canned_llm(self._both_lines_fixed()) as run_llm:
            corrected = corrector.correct_srt_content(
                SAMPLE_SRT,
                provider="openai",
                api_key="sk-test",
                base_url="https://llm.example/v1",
            )

        # Timecodes survive untouched while the text is replaced.
        self.assertIn("00:00:01,000 --> 00:00:03,000", corrected)
        self.assertIn("今天我们来看张三的故事", corrected)
        self.assertIn("他来到北京找李四", corrected)
        self.assertNotIn("顾是", corrected)

        # The connection settings and prompt were forwarded to the LLM runner.
        sent = run_llm.call_args.kwargs
        self.assertEqual("openai", sent["provider"])
        self.assertEqual("sk-test", sent["api_key"])
        self.assertEqual("https://llm.example/v1", sent["api_base"])
        self.assertEqual("json", sent["response_format"])
        self.assertIn("多语言字幕校对员", sent["system_prompt"])
        self.assertIn("保持原语言", sent["prompt"])

    def test_correct_srt_content_rejects_missing_items(self):
        # Only one of the two cues comes back from the LLM.
        partial_reply = {"items": [{"id": 1, "text": "今天我们来看张三的故事"}]}
        with mock.patch(self._REGISTER_TARGET), self._canned_llm(partial_reply):
            with self.assertRaises(corrector.SubtitleCorrectionError):
                corrector.correct_srt_content(SAMPLE_SRT, provider="openai")

    def test_correct_subtitle_file_writes_corrected_srt(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            source_srt = workdir / "input.srt"
            target_srt = workdir / "output.srt"
            source_srt.write_text(SAMPLE_SRT, encoding="utf-8")
            with mock.patch(self._REGISTER_TARGET), self._canned_llm(self._both_lines_fixed()):
                result_path = corrector.correct_subtitle_file(
                    str(source_srt),
                    str(target_srt),
                    provider="openai",
                )
            self.assertEqual(str(target_srt), result_path)
            self.assertIn("北京", target_srt.read_text(encoding="utf-8"))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,46 @@
import tempfile
import time
import unittest
from pathlib import Path
from app.models.schema import VideoClipParams
from app.services import task
class TaskSubtitleResolutionTests(unittest.TestCase):
    """Tests for how ``task._get_original_subtitle_paths`` resolves subtitle files."""

    def test_get_original_subtitle_paths_falls_back_to_matching_video_name(self):
        """Without explicit paths, the newest subtitle matching the video name is used."""
        original_subtitle_dir = task.utils.subtitle_dir
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            older = temp_path / "01_1080p_fun_asr.srt"
            newer = temp_path / "01_1080p_fun_asr_20260608010240.srt"
            unrelated = temp_path / "other_fun_asr.srt"
            older.write_text("older", encoding="utf-8")
            unrelated.write_text("other", encoding="utf-8")
            # Small pause so "newer" is written measurably after "older".
            time.sleep(0.01)
            newer.write_text("newer", encoding="utf-8")
            # Patch inside the try so the module attribute is restored even if
            # building the params or resolving the paths raises; otherwise the
            # lambda (pointing at a deleted directory) would leak into other tests.
            try:
                task.utils.subtitle_dir = lambda: str(temp_path)
                params = VideoClipParams(
                    video_origin_path="/tmp/01_1080p_20260608113314.mp4",
                )
                subtitle_paths = task._get_original_subtitle_paths(params)
            finally:
                task.utils.subtitle_dir = original_subtitle_dir
        self.assertEqual([str(newer)], subtitle_paths)

    def test_get_original_subtitle_paths_keeps_explicit_params(self):
        """Explicitly provided subtitle paths are returned unchanged."""
        params = VideoClipParams(
            video_origin_path="/tmp/01_1080p_20260608113314.mp4",
            original_subtitle_paths=["/tmp/provided.srt"],
        )
        self.assertEqual(["/tmp/provided.srt"], task._get_original_subtitle_paths(params))
if __name__ == "__main__":
unittest.main()

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import os
import re
import json
@ -6,6 +8,7 @@ import edge_tts
import asyncio
import requests
import uuid
from functools import lru_cache
from loguru import logger
from typing import List, Union, Tuple
from datetime import datetime
@ -20,6 +23,7 @@ except ImportError:
MOVIEPY_AVAILABLE = False
logger.warning("moviepy 未安装,将使用估算方法计算音频时长")
import time
from urllib.parse import urljoin
from app.config import config
from app.utils import utils
@ -282,7 +286,7 @@ Gender: Male
Name: en-AU-NatashaNeural
Gender: Female
Name: en-AU-WilliamNeural
Name: en-AU-WilliamMultilingualNeural
Gender: Male
Name: en-CA-ClaraNeural
@ -369,21 +373,33 @@ Gender: Female
Name: en-US-AndrewNeural
Gender: Male
Name: en-US-AndrewMultilingualNeural
Gender: Male
Name: en-US-AriaNeural
Gender: Female
Name: en-US-AvaNeural
Gender: Female
Name: en-US-AvaMultilingualNeural
Gender: Female
Name: en-US-BrianNeural
Gender: Male
Name: en-US-BrianMultilingualNeural
Gender: Male
Name: en-US-ChristopherNeural
Gender: Male
Name: en-US-EmmaNeural
Gender: Female
Name: en-US-EmmaMultilingualNeural
Gender: Female
Name: en-US-EricNeural
Gender: Male
@ -666,12 +682,24 @@ Gender: Male
Name: it-IT-ElsaNeural
Gender: Female
Name: it-IT-GiuseppeNeural
Name: it-IT-GiuseppeMultilingualNeural
Gender: Male
Name: it-IT-IsabellaNeural
Gender: Female
Name: iu-Cans-CA-SiqiniqNeural
Gender: Female
Name: iu-Cans-CA-TaqqiqNeural
Gender: Male
Name: iu-Latn-CA-SiqiniqNeural
Gender: Female
Name: iu-Latn-CA-TaqqiqNeural
Gender: Male
Name: ja-JP-KeitaNeural
Gender: Male
@ -708,7 +736,7 @@ Gender: Male
Name: kn-IN-SapnaNeural
Gender: Female
Name: ko-KR-HyunsuNeural
Name: ko-KR-HyunsuMultilingualNeural
Gender: Male
Name: ko-KR-InJoonNeural
@ -822,7 +850,7 @@ Gender: Male
Name: pt-BR-FranciscaNeural
Gender: Female
Name: pt-BR-ThalitaNeural
Name: pt-BR-ThalitaMultilingualNeural
Gender: Female
Name: pt-PT-DuarteNeural
@ -1238,6 +1266,8 @@ def doubaotts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
def tts(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str, tts_engine: str
) -> Union[SubMaker, None]:
tts_engine = config.normalize_tts_engine_name(tts_engine)
voice_name = config.normalize_indextts_voice_prefix(voice_name)
logger.info(f"使用 TTS 引擎: '{tts_engine}', 语音: '{voice_name}'")
if tts_engine == "tencent_tts":
@ -1263,9 +1293,17 @@ def tts(
logger.info("分发到 Edge TTS")
return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
if tts_engine == "indextts2":
logger.info("分发到 IndexTTS2")
return indextts2_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == config.INDEXTTS_ENGINE:
logger.info("分发到 IndexTTS-1.5")
return indextts_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == config.INDEXTTS2_ENGINE:
logger.info("分发到 IndexTTS-2")
return indextts2_tts(text, voice_name, voice_file)
if tts_engine == config.OMNIVOICE_ENGINE:
logger.info("分发到 OmniVoice")
return omnivoice_tts(text, voice_name, voice_file, speed=voice_rate)
if tts_engine == "doubaotts":
logger.info("分发到豆包语音 TTS")
@ -1306,6 +1344,52 @@ def get_edge_tts_proxy() -> str | None:
return proxy_url or None
def _run_async_safely(coro_func, *args, **kwargs):
"""在同步代码里安全运行异步 edge_tts 调用。"""
def run_in_new_loop():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
return loop.run_until_complete(coro_func(*args, **kwargs))
finally:
loop.close()
asyncio.set_event_loop(None)
try:
asyncio.get_running_loop()
except RuntimeError:
return run_in_new_loop()
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
return executor.submit(run_in_new_loop).result()
@lru_cache(maxsize=8)
def _get_all_edge_voices_cached(proxy: str | None) -> list[str]:
    """Fetch the Edge TTS voice list and return sorted ``"<ShortName>-<Gender>"`` labels.

    Results are memoized per ``proxy`` value. Entries missing either the short
    name or the gender are skipped.
    """
    async def _fetch_catalogue():
        return await edge_tts.list_voices(proxy=proxy)

    labels = []
    for entry in _run_async_safely(_fetch_catalogue):
        short_name = entry.get("ShortName", "").strip()
        gender = entry.get("Gender", "").strip()
        if not (short_name and gender):
            continue
        labels.append(f"{short_name}-{gender}")
    return sorted(labels)
def get_all_edge_voices() -> list[str]:
    """Return the voices Edge TTS currently offers, falling back to the built-in list.

    If the online lookup raises, a warning is logged and the bundled Azure
    voice list is returned with every entry containing ``"-V2"`` removed.
    """
    try:
        online_voices = _get_all_edge_voices_cached(get_edge_tts_proxy())
    except Exception as e:
        logger.warning(f"获取 Edge TTS 在线音色列表失败,使用内置音色列表: {e}")
        builtin_voices = get_all_azure_voices(filter_locals=[])
        return [voice for voice in builtin_voices if "-V2" not in voice]
    return online_voices
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> Union[SubMaker, None]:
@ -1701,15 +1785,21 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
:param tts_engine: TTS 引擎
:return: 生成的音频文件列表
"""
voice_name = parse_voice_name(voice_name)
tts_engine = config.normalize_tts_engine_name(tts_engine)
voice_name = config.normalize_indextts_voice_prefix(parse_voice_name(voice_name))
output_dir = utils.task_dir(task_id)
tts_results = []
audio_extension = ".wav" if tts_engine in (
config.INDEXTTS_ENGINE,
config.INDEXTTS2_ENGINE,
config.OMNIVOICE_ENGINE,
) else ".mp3"
for item in list_script:
if item['OST'] != 1:
# 将时间戳中的冒号替换为下划线
timestamp = item['timestamp'].replace(':', '_')
audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")
audio_file = os.path.join(output_dir, f"audio_{timestamp}{audio_extension}")
subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt")
text = item['narration']
@ -1729,8 +1819,13 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
f"或者使用其他 tts 引擎")
continue
else:
# SoulVoice、Qwen3、IndexTTS2、豆包语音 引擎不生成字幕文件
if is_soulvoice_voice(voice_name) or is_qwen_engine(tts_engine) or tts_engine == "indextts2" or tts_engine == "doubaotts":
# SoulVoice、Qwen3、IndexTTS、OmniVoice、豆包语音 引擎不生成精确字幕文件
if (
is_soulvoice_voice(voice_name)
or is_qwen_engine(tts_engine)
or tts_engine in (config.INDEXTTS_ENGINE, config.INDEXTTS2_ENGINE, config.OMNIVOICE_ENGINE)
or tts_engine == "doubaotts"
):
# 获取实际音频文件的时长
duration = get_audio_duration_from_file(audio_file)
if duration <= 0:
@ -2148,24 +2243,47 @@ def parse_soulvoice_voice(voice_name: str) -> str:
return voice_name
def parse_indextts2_voice(voice_name: str) -> str:
def parse_indextts_voice(voice_name: str) -> str:
"""
解析 IndexTTS2 语音名称
支持格式indextts2:reference_audio_path
解析 IndexTTS-1.5 语音名称
支持格式indextts:reference_audio_path
返回参考音频文件路径
"""
if voice_name.startswith("indextts2:"):
return voice_name[10:] # 移除 "indextts2:" 前缀
voice_name = config.normalize_indextts_voice_prefix(voice_name)
if voice_name.startswith(config.INDEXTTS_VOICE_PREFIX):
return voice_name[len(config.INDEXTTS_VOICE_PREFIX):]
return voice_name
def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
def parse_indextts2_voice(voice_name: str) -> str:
"""
使用 IndexTTS2 API 进行零样本语音克隆
解析 IndexTTS-2 语音名称
支持格式indextts2:reference_audio_path
返回参考音频文件路径
"""
if isinstance(voice_name, str) and voice_name.startswith(config.INDEXTTS2_VOICE_PREFIX):
return voice_name[len(config.INDEXTTS2_VOICE_PREFIX):]
return voice_name
def parse_omnivoice_voice(voice_name: str) -> str:
    """
    Strip the OmniVoice engine prefix from a voice name.

    Accepted form: ``omnivoice:reference_audio_path``. The part after the
    prefix (a reference audio path or a mode name) is returned; any value that
    is not a string or lacks the prefix is returned unchanged.
    """
    if not isinstance(voice_name, str):
        return voice_name
    prefix = config.OMNIVOICE_VOICE_PREFIX
    if not voice_name.startswith(prefix):
        return voice_name
    return voice_name[len(prefix):]
def indextts_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
"""
使用 IndexTTS-1.5 API 进行零样本语音克隆
Args:
text: 要转换的文本
voice_name: 参考音频路径格式indextts2:path/to/audio.wav
voice_name: 参考音频文件格式indextts:path/to/audio.wav
voice_file: 输出音频文件路径
speed: 语音速度此引擎暂不支持速度调节
@ -2173,20 +2291,20 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
SubMaker: 包含时间戳信息的字幕制作器失败时返回 None
"""
# 获取配置
api_url = config.indextts2.get("api_url", "http://192.168.3.6:8081/tts")
infer_mode = config.indextts2.get("infer_mode", "普通推理")
temperature = config.indextts2.get("temperature", 1.0)
top_p = config.indextts2.get("top_p", 0.8)
top_k = config.indextts2.get("top_k", 30)
do_sample = config.indextts2.get("do_sample", True)
num_beams = config.indextts2.get("num_beams", 3)
repetition_penalty = config.indextts2.get("repetition_penalty", 10.0)
api_url = config.indextts.get("api_url", "http://192.168.3.6:8081/tts")
infer_mode = config.indextts.get("infer_mode", "普通推理")
temperature = config.indextts.get("temperature", 1.0)
top_p = config.indextts.get("top_p", 0.8)
top_k = config.indextts.get("top_k", 30)
do_sample = config.indextts.get("do_sample", True)
num_beams = config.indextts.get("num_beams", 3)
repetition_penalty = config.indextts.get("repetition_penalty", 10.0)
# 解析参考音频路径
reference_audio_path = parse_indextts2_voice(voice_name)
# 解析参考音频文件
reference_audio_path = parse_indextts_voice(voice_name)
if not reference_audio_path or not os.path.exists(reference_audio_path):
logger.error(f"IndexTTS2 参考音频文件不存在: {reference_audio_path}")
logger.error(f"IndexTTS-1.5 参考音频文件不存在: {reference_audio_path}")
return None
# 准备请求数据
@ -2208,7 +2326,7 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
# 重试机制
for attempt in range(3):
try:
logger.info(f"{attempt + 1} 次调用 IndexTTS2 API")
logger.info(f"{attempt + 1} 次调用 IndexTTS-1.5 API")
# 设置代理
proxies = {}
@ -2224,7 +2342,7 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
files=files,
data=data,
proxies=proxies,
timeout=120 # IndexTTS2 推理可能需要较长时间
timeout=120 # IndexTTS-1.5 推理可能需要较长时间
)
if response.status_code == 200:
@ -2232,9 +2350,9 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
with open(voice_file, 'wb') as f:
f.write(response.content)
logger.info(f"IndexTTS2 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节")
logger.info(f"IndexTTS-1.5 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节")
# IndexTTS2 不支持精确字幕生成,返回简单的 SubMaker 对象
# IndexTTS-1.5 不支持精确字幕生成,返回简单的 SubMaker 对象
sub_maker = new_sub_maker()
# 估算音频时长(基于文本长度)
estimated_duration_ms = max(1000, int(len(text) * 200))
@ -2243,14 +2361,14 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
return sub_maker
else:
logger.error(f"IndexTTS2 API 调用失败: {response.status_code} - {response.text}")
logger.error(f"IndexTTS-1.5 API 调用失败: {response.status_code} - {response.text}")
except requests.exceptions.Timeout:
logger.error(f"IndexTTS2 API 调用超时 (尝试 {attempt + 1}/3)")
logger.error(f"IndexTTS-1.5 API 调用超时 (尝试 {attempt + 1}/3)")
except requests.exceptions.RequestException as e:
logger.error(f"IndexTTS2 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
logger.error(f"IndexTTS-1.5 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
except Exception as e:
logger.error(f"IndexTTS2 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")
logger.error(f"IndexTTS-1.5 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")
finally:
# 确保关闭文件
try:
@ -2267,5 +2385,270 @@ def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.
except:
pass
logger.error("IndexTTS2 TTS 生成失败,已达到最大重试次数")
logger.error("IndexTTS-1.5 TTS 生成失败,已达到最大重试次数")
return None
def _normalize_indextts2_api_url(api_url: str) -> str:
api_url = (api_url or "http://192.168.3.6:7863/tts").strip()
if api_url.endswith("/tts"):
return api_url
return f"{api_url.rstrip('/')}/tts"
def _get_configured_proxies() -> dict:
    """Build the ``proxies`` mapping for ``requests`` from the proxy config.

    Returns an empty dict when no HTTP proxy is configured; otherwise the
    HTTPS entry defaults to the HTTP proxy when not set separately.
    """
    http_proxy = config.proxy.get("http")
    if not http_proxy:
        return {}
    https_proxy = config.proxy.get("https", http_proxy)
    return {"http": http_proxy, "https": https_proxy}
def _download_indextts2_audio(response: requests.Response, api_url: str, voice_file: str, proxies: dict) -> bool:
    """Store the audio produced by an IndexTTS-2 call in ``voice_file``.

    A non-JSON response body is written out directly. A JSON response is
    expected to carry ``downloads.wav``, a URL (resolved against ``api_url``)
    that is fetched and written instead.

    Returns:
        True when a non-empty file was written, False when the JSON has no
        download URL or the download does not return HTTP 200.
    """
    def _persist(audio_bytes) -> bool:
        with open(voice_file, "wb") as sink:
            sink.write(audio_bytes)
        return os.path.getsize(voice_file) > 0

    is_json_reply = "application/json" in response.headers.get("content-type", "").lower()
    if not is_json_reply:
        return _persist(response.content)

    result = response.json()
    downloads = result.get("downloads") if isinstance(result, dict) else {}
    download_url = downloads.get("wav") if isinstance(downloads, dict) else ""
    if not download_url:
        logger.error(f"IndexTTS-2 API 响应中没有音频下载地址: {result}")
        return False

    audio_response = requests.get(urljoin(api_url, download_url), proxies=proxies, timeout=120)
    if audio_response.status_code != 200:
        logger.error(f"IndexTTS-2 音频下载失败: {audio_response.status_code} - {audio_response.text}")
        return False
    return _persist(audio_response.content)
def indextts2_tts(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
    """
    Synthesize ``text`` with the IndexTTS-2 zero-shot voice-cloning API.

    The request is a multipart form POST to ``/tts`` (compatible with
    IndexTTS2-Pack). Up to three attempts are made.

    Args:
        text: Text to synthesize; surrounding whitespace is stripped before sending.
        voice_name: Voice identifier, passed through ``parse_indextts2_voice``
            to obtain the reference audio path.
        voice_file: Path the generated audio is written to.

    Returns:
        A SubMaker holding one subtitle event for the whole text, or None when
        a required audio file is missing or every attempt fails.
    """
    api_url = _normalize_indextts2_api_url(config.indextts2.get("api_url", "http://192.168.3.6:7863/tts"))
    reference_audio_path = parse_indextts2_voice(voice_name)
    if not reference_audio_path or not os.path.exists(reference_audio_path):
        logger.error(f"IndexTTS-2 参考音频文件不存在: {reference_audio_path}")
        return None

    emotion_mode = config.indextts2.get("emotion_mode", "speaker")
    emotion_audio_path = config.indextts2.get("emotion_audio", "")
    # Form fields; every value comes from the [indextts2] config with a default.
    data = {
        "text": text.strip(),
        "emotion_mode": emotion_mode,
        "emotion_alpha": config.indextts2.get("emotion_alpha", 0.65),
        "emotion_text": config.indextts2.get("emotion_text", ""),
        # Sent as the lowercase string "true"/"false".
        "use_random": str(bool(config.indextts2.get("use_random", False))).lower(),
        "max_text_tokens_per_segment": config.indextts2.get("max_text_tokens_per_segment", 120),
        "vec_happy": config.indextts2.get("vec_happy", 0.0),
        "vec_angry": config.indextts2.get("vec_angry", 0.0),
        "vec_sad": config.indextts2.get("vec_sad", 0.0),
        "vec_afraid": config.indextts2.get("vec_afraid", 0.0),
        "vec_disgusted": config.indextts2.get("vec_disgusted", 0.0),
        "vec_melancholic": config.indextts2.get("vec_melancholic", 0.0),
        "vec_surprised": config.indextts2.get("vec_surprised", 0.0),
        "vec_calm": config.indextts2.get("vec_calm", 0.8),
        "temperature": config.indextts2.get("temperature", 0.8),
        "top_p": config.indextts2.get("top_p", 0.8),
        "top_k": config.indextts2.get("top_k", 30),
        "num_beams": config.indextts2.get("num_beams", 3),
        "repetition_penalty": config.indextts2.get("repetition_penalty", 10.0),
        "max_mel_tokens": config.indextts2.get("max_mel_tokens", 1500),
    }
    proxies = _get_configured_proxies()

    for attempt in range(3):
        # File handles are reopened on every attempt and closed in ``finally``.
        files = {}
        try:
            files["speaker_audio"] = open(reference_audio_path, "rb")
            if emotion_mode == "audio":
                # "audio" emotion mode needs a second reference file.
                if not emotion_audio_path or not os.path.exists(emotion_audio_path):
                    logger.error(f"IndexTTS-2 情感参考音频文件不存在: {emotion_audio_path}")
                    return None
                files["emotion_audio"] = open(emotion_audio_path, "rb")

            logger.info(f"{attempt + 1} 次调用 IndexTTS-2 API: {api_url}")
            response = requests.post(
                api_url,
                files=files,
                data=data,
                proxies=proxies,
                timeout=180,
            )

            if response.status_code == 200 and _download_indextts2_audio(response, api_url, voice_file, proxies):
                logger.info(f"IndexTTS-2 成功生成音频: {voice_file}, 大小: {os.path.getsize(voice_file)} 字节")
                sub_maker = new_sub_maker()
                # Use the measured audio duration; if it is not positive, estimate
                # 200 ms per character with a 1000 ms minimum.
                duration = get_audio_duration_from_file(voice_file)
                duration_ms = int(duration * 1000) if duration > 0 else max(1000, int(len(text) * 200))
                # NOTE(review): the x10000 factor presumably converts milliseconds
                # to 100 ns ticks — confirm against add_subtitle_event.
                add_subtitle_event(sub_maker, 0, duration_ms * 10000, text)
                return sub_maker

            logger.error(f"IndexTTS-2 API 调用失败: {response.status_code} - {response.text}")
        except requests.exceptions.Timeout:
            logger.error(f"IndexTTS-2 API 调用超时 (尝试 {attempt + 1}/3)")
        except requests.exceptions.RequestException as e:
            logger.error(f"IndexTTS-2 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
        except Exception as e:
            logger.error(f"IndexTTS-2 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")
        finally:
            # Best-effort close of whichever files were opened for this attempt.
            for file_obj in files.values():
                try:
                    file_obj.close()
                except Exception:
                    pass

        # Pause before the next attempt; skipped after the last one.
        if attempt < 2:
            time.sleep(2)

    logger.error("IndexTTS-2 TTS 生成失败,已达到最大重试次数")
    return None
def _normalize_omnivoice_api_url(api_url: str) -> str:
api_url = (api_url or "http://127.0.0.1:7866/tts").strip()
if api_url.endswith("/tts"):
return api_url
if api_url.endswith("/tts/json"):
return f"{api_url[:-len('/tts/json')]}/tts"
return f"{api_url.rstrip('/')}/tts"
def _download_omnivoice_audio(response: requests.Response, api_url: str, voice_file: str, proxies: dict) -> bool:
    """Store the audio produced by an OmniVoice call in ``voice_file``.

    A non-JSON response body is written out directly. A JSON response is
    expected to carry ``audio_url`` (resolved against ``api_url``), which is
    fetched and written instead.

    Returns:
        True when a non-empty file was written, False when the JSON has no
        audio URL or the download does not return HTTP 200.
    """
    def _persist(audio_bytes) -> bool:
        with open(voice_file, "wb") as sink:
            sink.write(audio_bytes)
        return os.path.getsize(voice_file) > 0

    is_json_reply = "application/json" in response.headers.get("content-type", "").lower()
    if not is_json_reply:
        return _persist(response.content)

    result = response.json()
    audio_url = result.get("audio_url") if isinstance(result, dict) else ""
    if not audio_url:
        logger.error(f"OmniVoice API 响应中没有音频下载地址: {result}")
        return False

    audio_response = requests.get(urljoin(api_url, audio_url), proxies=proxies, timeout=180)
    if audio_response.status_code != 200:
        logger.error(f"OmniVoice 音频下载失败: {audio_response.status_code} - {audio_response.text}")
        return False
    return _persist(audio_response.content)
def _optional_omnivoice_generation_data(voice_speed: float) -> dict:
    """Collect the optional OmniVoice generation fields for the request form.

    ``speed`` is always present: the caller's value when truthy, otherwise the
    configured speed (default 1.0). ``num_step``, ``guidance_scale`` and
    ``duration`` are added only when configured with a value other than None
    or ``""``. The boolean switches are added only when their key exists in
    the config, serialized as lowercase ``"true"``/``"false"``.
    """
    omnivoice_config = getattr(config, "omnivoice", {}) or {}
    data = {"speed": voice_speed or omnivoice_config.get("speed", 1.0)}

    for key in ("num_step", "guidance_scale", "duration"):
        value = omnivoice_config.get(key)
        if value not in (None, ""):
            data[key] = value

    data.update(
        {
            key: str(bool(omnivoice_config.get(key))).lower()
            for key in ("denoise", "postprocess_output", "preprocess_prompt")
            if key in omnivoice_config
        }
    )
    return data
def omnivoice_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Synthesize ``text`` through an OmniVoice-Pack FastAPI service.

    Three modes are handled: automatic voice, instructed voice
    (``voice_design``) and reference-audio cloning (``voice_clone``).
    Up to three attempts are made.

    Args:
        text: Text to synthesize; surrounding whitespace is stripped before sending.
        voice_name: Voice identifier, passed through ``parse_omnivoice_voice``;
            if the result is an existing file, clone mode is used.
        voice_file: Path the generated audio is written to.
        speed: Speed forwarded to ``_optional_omnivoice_generation_data``.

    Returns:
        A SubMaker holding one subtitle event for the whole text, or None when
        the reference audio is missing or every attempt fails.
    """
    omnivoice_config = getattr(config, "omnivoice", {}) or {}
    api_url = _normalize_omnivoice_api_url(omnivoice_config.get("api_url", "http://127.0.0.1:7866/tts"))
    mode = omnivoice_config.get("mode", "auto")
    language = (omnivoice_config.get("language", "zh") or "").strip()
    instruct = (omnivoice_config.get("instruct", "") or "").strip()
    ref_text = (omnivoice_config.get("ref_text", "") or "").strip()

    parsed_voice = parse_omnivoice_voice(voice_name)
    # A voice name that resolves to an existing file forces clone mode,
    # whatever mode is configured.
    if mode != "voice_clone" and parsed_voice and os.path.isfile(parsed_voice):
        mode = "voice_clone"

    reference_audio_path = ""
    if mode == "voice_clone":
        # Prefer the file named by the voice; otherwise use the configured
        # ``reference_audio`` entry.
        candidate = parsed_voice
        if candidate and os.path.isfile(candidate):
            reference_audio_path = candidate
        else:
            reference_audio_path = parse_omnivoice_voice(omnivoice_config.get("reference_audio", "") or "")
        if not reference_audio_path or not os.path.exists(reference_audio_path):
            logger.error(f"OmniVoice 参考音频文件不存在: {reference_audio_path}")
            return None
    elif mode != "voice_design":
        # The instruction text is only used in voice_design mode.
        instruct = ""

    data = {
        "text": text.strip(),
        "language": language,
        **_optional_omnivoice_generation_data(speed),
    }
    if mode == "voice_design" and instruct:
        data["instruct"] = instruct
    if mode == "voice_clone" and ref_text:
        data["ref_text"] = ref_text

    proxies = _get_configured_proxies()
    for attempt in range(3):
        # File handles are reopened on every attempt and closed in ``finally``.
        files = {}
        try:
            if reference_audio_path:
                files["ref_audio"] = open(reference_audio_path, "rb")

            logger.info(f"{attempt + 1} 次调用 OmniVoice API: {api_url}, mode={mode}")
            response = requests.post(
                api_url,
                # ``None`` rather than an empty dict when no file is attached.
                files=files or None,
                data=data,
                proxies=proxies,
                timeout=240,
            )

            if response.status_code == 200 and _download_omnivoice_audio(response, api_url, voice_file, proxies):
                logger.info(f"OmniVoice 成功生成音频: {voice_file}, 大小: {os.path.getsize(voice_file)} 字节")
                sub_maker = new_sub_maker()
                # Use the measured audio duration; if it is not positive, estimate
                # 200 ms per character with a 1000 ms minimum.
                duration = get_audio_duration_from_file(voice_file)
                duration_ms = int(duration * 1000) if duration > 0 else max(1000, int(len(text) * 200))
                # NOTE(review): the x10000 factor presumably converts milliseconds
                # to 100 ns ticks — confirm against add_subtitle_event.
                add_subtitle_event(sub_maker, 0, duration_ms * 10000, text)
                return sub_maker

            logger.error(f"OmniVoice API 调用失败: {response.status_code} - {response.text}")
        except requests.exceptions.Timeout:
            logger.error(f"OmniVoice API 调用超时 (尝试 {attempt + 1}/3)")
        except requests.exceptions.RequestException as e:
            logger.error(f"OmniVoice API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
        except Exception as e:
            logger.error(f"OmniVoice TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")
        finally:
            # Best-effort close of whichever files were opened for this attempt.
            for file_obj in files.values():
                try:
                    file_obj.close()
                except Exception:
                    pass

        # Pause before the next attempt; skipped after the last one.
        if attempt < 2:
            time.sleep(2)

    logger.error("OmniVoice TTS 生成失败,已达到最大重试次数")
    return None

View File

@ -57,6 +57,23 @@ def check_format(script_content: str) -> Dict[str, Any]:
'details': f'当前值: {clip["_id"]} (类型: {type(clip["_id"]).__name__})'
}
# 验证可选视频来源字段。旧脚本可以不包含,新脚本用于多视频定位。
if 'video_id' in clip and clip['video_id'] not in ("", None):
if not isinstance(clip['video_id'], int) or clip['video_id'] <= 0:
return {
'success': False,
'message': f'{i+1}个片段的video_id必须是正整数',
'details': f'当前值: {clip["video_id"]} (类型: {type(clip["video_id"]).__name__})'
}
if 'video_name' in clip and clip['video_name'] not in ("", None):
if not isinstance(clip['video_name'], str):
return {
'success': False,
'message': f'{i+1}个片段的video_name必须是字符串',
'details': f'当前值: {clip["video_name"]} (类型: {type(clip["video_name"]).__name__})'
}
# 验证 timestamp 字段格式
timestamp_pattern = r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$'
if not isinstance(clip['timestamp'], str) or not re.match(timestamp_pattern, clip['timestamp']):

View File

@ -0,0 +1,493 @@
"""FFmpeg engine discovery and capability diagnostics."""
from __future__ import annotations
import os
import platform
import re
import shutil
import subprocess
import sys
import tempfile
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from loguru import logger
_FFMPEG_EXE = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
_FFPROBE_EXE = "ffprobe.exe" if os.name == "nt" else "ffprobe"
_SOURCE_PRIORITY = {
"Configured": 0,
"NarratoAI packaged runtime": 1,
"Integrated runtime": 2,
"System PATH": 3,
"Homebrew": 4,
"Python environment": 5,
"Python executable folder": 6,
"IMAGEIO_FFMPEG_EXE": 7,
"imageio-ffmpeg": 8,
"System": 9,
}
@dataclass(frozen=True)
class FFmpegEngine:
    """Immutable record describing one discovered FFmpeg executable."""

    path: str  # location of the ffmpeg executable
    source: str  # where the executable was found; shown first in the label
    ffprobe_path: str
    available: bool  # selects "OK" vs "Unavailable" in the label
    version_line: str  # version banner; "ffmpeg version" is stripped for display

    @property
    def label(self) -> str:
        """Return a one-line description: source, version, path and status."""
        version = self.version_line.replace("ffmpeg version", "").strip()
        if not version:
            version = "unknown version"
        if self.available:
            status = "OK"
        else:
            status = "Unavailable"
        return f"{self.source} - {version} - {self.path} ({status})"

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a dict with the computed ``label`` appended."""
        return {**asdict(self), "label": self.label}
def _run_command(args: list[str], timeout: int = 10) -> subprocess.CompletedProcess[str]:
return subprocess.run(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=False,
timeout=timeout,
)
def _first_line(text: str) -> str:
for line in (text or "").splitlines():
stripped = line.strip()
if stripped:
return stripped
return ""
def _is_executable(path: str) -> bool:
if not path:
return False
if os.name == "nt":
return os.path.isfile(path)
return os.path.isfile(path) and os.access(path, os.X_OK)
def _normalize_path(path: str) -> str:
return str(Path(path).expanduser().resolve())
def _ffmpeg_version_line(ffmpeg_path: str) -> tuple[bool, str]:
    """Run ``<ffmpeg_path> -version`` and report the outcome.

    Returns:
        ``(available, first_line)``: *available* is True only when the command
        exits with status 0, and *first_line* is the first non-blank line of
        its output. ``(False, "")`` is returned when the path is not an
        executable file or the command cannot be run.
    """
    unavailable = (False, "")
    if not _is_executable(ffmpeg_path):
        return unavailable
    try:
        completed = _run_command([ffmpeg_path, "-version"], timeout=8)
    except Exception as exc:
        # Best-effort probe: a broken binary is reported as unavailable, not raised.
        logger.debug(f"FFmpeg version check failed for {ffmpeg_path}: {exc}")
        return unavailable
    banner = completed.stdout or completed.stderr
    return completed.returncode == 0, _first_line(banner)
def _paired_ffprobe_path(ffmpeg_path: str) -> str:
    """Locate the ffprobe executable that belongs with *ffmpeg_path*.

    An executable ffprobe in the same directory wins. Otherwise ffprobe is
    searched on a path made of that directory followed by ``PATH``. Returns
    the normalized location, or "" when nothing is found.
    """
    ffmpeg_file = Path(ffmpeg_path)
    neighbour = str(ffmpeg_file.with_name(_FFPROBE_EXE))
    if _is_executable(neighbour):
        return _normalize_path(neighbour)
    search_path = os.pathsep.join([str(ffmpeg_file.parent), os.environ.get("PATH", "")])
    located = shutil.which(_FFPROBE_EXE, path=search_path)
    if not located:
        return ""
    return _normalize_path(located)
def _candidate_paths(root_dir: str = "", include_system: bool = True) -> list[tuple[str, str]]:
    """Collect ``(source, path)`` pairs where an FFmpeg executable may live.

    Nothing is checked for existence here; callers filter the candidates.

    Args:
        root_dir: Project root used for the bundled-runtime locations; the
            current working directory is used when empty.
        include_system: Also add the ffmpeg found on ``PATH`` and a few fixed
            system locations.
    """
    base = Path(root_dir).expanduser().resolve() if root_dir else Path.cwd().resolve()
    parent = base.parent
    runtime_tail = Path("runtime") / "python" / "bin" / _FFMPEG_EXE

    found: list[tuple[str, str]] = [
        ("Integrated runtime", str(base / runtime_tail)),
        ("Integrated runtime", str(parent / runtime_tail)),
        (
            "NarratoAI packaged runtime",
            str(parent / "NarratoAI-Pack" / "dist" / "NarratoAI-macos-arm64" / runtime_tail),
        ),
        ("Python environment", str(Path(sys.prefix) / "bin" / _FFMPEG_EXE)),
        ("Python executable folder", str(Path(sys.executable).with_name(_FFMPEG_EXE))),
    ]

    from_env = os.environ.get("IMAGEIO_FFMPEG_EXE", "")
    if from_env:
        found.append(("IMAGEIO_FFMPEG_EXE", from_env))

    if include_system:
        on_path = shutil.which(_FFMPEG_EXE)
        if on_path:
            found.append(("System PATH", on_path))
        found.extend(
            [
                ("Homebrew", f"/opt/homebrew/bin/{_FFMPEG_EXE}"),
                ("Homebrew", f"/usr/local/bin/{_FFMPEG_EXE}"),
                ("System", f"/usr/bin/{_FFMPEG_EXE}"),
            ]
        )

    # NOTE(review): assumed to run regardless of include_system -- confirm the
    # intended nesting of this probe against the original file.
    try:
        import imageio_ffmpeg

        found.append(("imageio-ffmpeg", imageio_ffmpeg.get_ffmpeg_exe()))
    except Exception as exc:
        logger.debug(f"imageio-ffmpeg discovery skipped: {exc}")
    return found
def discover_ffmpeg_engines(
    configured_path: str = "",
    root_dir: str = "",
    include_system: bool = True,
) -> list[dict[str, Any]]:
    """Discover available FFmpeg engines from config, packaged runtime and PATH.

    Args:
        configured_path: Explicitly configured ffmpeg path; when given it is
            considered before every other candidate.
        root_dir: Forwarded to ``_candidate_paths``.
        include_system: Forwarded to ``_candidate_paths``.

    Returns:
        One dict per engine (see ``FFmpegEngine.to_dict``), sorted with
        available engines first, then by source priority, then by path.
    """
    pending: list[tuple[str, str]] = [("Configured", configured_path)] if configured_path else []
    pending += _candidate_paths(root_dir=root_dir, include_system=include_system)

    # Explicitly chosen sources are listed even when the binary does not work.
    always_listed = {"Configured", "IMAGEIO_FFMPEG_EXE"}
    found: list[FFmpegEngine] = []
    visited: set[str] = set()
    for origin, candidate in pending:
        if not candidate:
            continue
        try:
            resolved = _normalize_path(candidate)
        except Exception:
            resolved = str(Path(candidate).expanduser())
        # The first source that yields a given path wins; later duplicates are dropped.
        dedupe_key = os.path.normcase(resolved)
        if dedupe_key in visited:
            continue
        visited.add(dedupe_key)

        usable, banner = _ffmpeg_version_line(resolved)
        if usable or origin in always_listed:
            found.append(
                FFmpegEngine(
                    path=resolved,
                    source=origin,
                    ffprobe_path=_paired_ffprobe_path(resolved),
                    available=usable,
                    version_line=banner,
                )
            )

    def sort_key(engine: FFmpegEngine) -> tuple[bool, int, str]:
        return (not engine.available, _SOURCE_PRIORITY.get(engine.source, 99), engine.path)

    return [engine.to_dict() for engine in sorted(found, key=sort_key)]
def _parse_hwaccels(output: str) -> list[str]:
values: list[str] = []
for line in output.splitlines():
item = line.strip().lower()
if not item or item.startswith("hardware acceleration"):
continue
if re.fullmatch(r"[a-z0-9_]+", item):
values.append(item)
return sorted(set(values))
def _parse_ffmpeg_table_names(output: str) -> set[str]:
names: set[str] = set()
for line in output.splitlines():
match = re.match(r"\s*[A-Z.]{2,}\s+([A-Za-z0-9_]+)\b", line)
if match:
names.add(match.group(1).lower())
return names
def _run_optional(args: list[str], timeout: int = 15, max_output_chars: int = 1200) -> tuple[bool, str]:
    """Run a command and report success instead of raising.

    Returns:
        ``(ok, output)``: *ok* is True when the exit status is 0, and *output*
        is stderr followed by stdout. When ``max_output_chars`` is positive
        only the trailing that-many characters are kept; ``0`` keeps all of it.
        On a timeout or any other failure to run, ``(False, <reason>)``.
    """
    try:
        completed = _run_command(args, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, "Command timed out"
    except Exception as exc:
        return False, str(exc)

    streams = [stream for stream in (completed.stderr, completed.stdout) if stream]
    combined = "\n".join(streams)
    tail = combined[-max_output_chars:] if max_output_chars > 0 else combined
    return completed.returncode == 0, tail
def _hardware_candidates() -> list[tuple[str, str, list[str]]]:
system = platform.system().lower()
if system == "darwin":
return [
("videotoolbox", "h264_videotoolbox", ["-c:v", "h264_videotoolbox", "-q:v", "65"]),
]
if system == "windows":
return [
("nvenc", "h264_nvenc", ["-c:v", "h264_nvenc", "-preset", "fast"]),
("qsv", "h264_qsv", ["-c:v", "h264_qsv", "-preset", "fast"]),
("amf", "h264_amf", ["-c:v", "h264_amf"]),
]
return [
("nvenc", "h264_nvenc", ["-c:v", "h264_nvenc", "-preset", "fast"]),
("qsv", "h264_qsv", ["-vf", "format=nv12", "-c:v", "h264_qsv"]),
("vaapi", "h264_vaapi", ["-vf", "format=nv12,hwupload", "-c:v", "h264_vaapi"]),
]
def _detect_hardware_encoding(ffmpeg_path: str, encoders: set[str]) -> dict[str, Any]:
    """Find the first hardware encoder that really works with this FFmpeg.

    Encoders missing from *encoders* are recorded as unavailable without being
    run. The others encode a few frames of a synthetic test source; the first
    one that exits successfully is returned.

    Returns:
        A dict with ``available``, ``type``, ``encoder``, ``message`` and
        ``tested`` (one entry per candidate examined so far).
    """
    # Five frames of a tiny lavfi test pattern are enough to exercise the encoder.
    input_args = [
        ffmpeg_path,
        "-y",
        "-hide_banner",
        "-loglevel",
        "error",
        "-f",
        "lavfi",
        "-i",
        "testsrc=duration=0.5:size=128x72:rate=15",
        "-frames:v",
        "5",
    ]
    # The encoded output is discarded via `-f null -`.
    output_args = ["-pix_fmt", "yuv420p", "-f", "null", "-"]

    attempts: list[dict[str, Any]] = []
    for kind, codec, codec_args in _hardware_candidates():
        if codec.lower() in encoders:
            passed, detail = _run_optional(input_args + codec_args + output_args, timeout=18)
            note = "Hardware encode test passed" if passed else detail
        else:
            passed = False
            note = "Encoder not listed by this FFmpeg build"
        attempts.append(
            {
                "type": kind,
                "encoder": codec,
                "available": passed,
                "message": note,
            }
        )
        if passed:
            return {
                "available": True,
                "type": kind,
                "encoder": codec,
                "message": "Hardware encode test passed",
                "tested": attempts,
            }

    return {
        "available": False,
        "type": None,
        "encoder": None,
        "message": "No hardware encoder passed the runtime test",
        "tested": attempts,
    }
def _escape_filter_path(path: str) -> str:
return path.replace("\\", "\\\\").replace(":", "\\:").replace("'", "\\'")
def _test_subtitle_burn(ffmpeg_path: str, filters: set[str]) -> dict[str, Any]:
    """Check whether this FFmpeg can burn text into video.

    The ``subtitles`` filter is tried first with a throwaway SRT file, then
    ``drawtext`` as a fallback. A filter missing from *filters* is not run.

    Returns:
        A dict with ``available``, ``method`` ("subtitles", "drawtext" or
        None), ``message`` and ``filters`` (presence of each relevant filter).
    """
    supported = {name: name in filters for name in ("subtitles", "ass", "drawtext", "overlay")}

    def render_one_frame(video_filter: str) -> tuple[bool, str]:
        # Render a single black frame through *video_filter* and discard it.
        return _run_optional(
            [
                ffmpeg_path,
                "-y",
                "-hide_banner",
                "-loglevel",
                "error",
                "-f",
                "lavfi",
                "-i",
                "color=black:size=320x180:duration=1",
                "-vf",
                video_filter,
                "-frames:v",
                "1",
                "-f",
                "null",
                "-",
            ],
            timeout=18,
        )

    failures: list[str] = []

    if supported["subtitles"]:
        with tempfile.TemporaryDirectory() as work_dir:
            sample = Path(work_dir) / "subtitle_test.srt"
            sample.write_text(
                "1\n00:00:00,000 --> 00:00:00,800\nNarratoAI FFmpeg subtitle test\n",
                encoding="utf-8",
            )
            passed, detail = render_one_frame(f"subtitles={_escape_filter_path(str(sample))}")
        if passed:
            return {
                "available": True,
                "method": "subtitles",
                "message": "SRT subtitle burn-in test passed",
                "filters": supported,
            }
        failures.append(detail)
    else:
        failures.append("subtitles filter is not listed by this FFmpeg build")

    if supported["drawtext"]:
        passed, detail = render_one_frame(
            "drawtext=text=NarratoAI:x=10:y=10:fontsize=18:fontcolor=white"
        )
        if passed:
            return {
                "available": True,
                "method": "drawtext",
                "message": "drawtext burn-in fallback test passed",
                "filters": supported,
            }
        failures.append(detail)
    else:
        failures.append("drawtext filter is not listed by this FFmpeg build")

    return {
        "available": False,
        "method": None,
        "message": "\n".join(failures).strip(),
        "filters": supported,
    }
def validate_ffmpeg_engine(ffmpeg_path: str) -> dict[str, Any]:
    """Run runtime checks for a selected FFmpeg engine.

    The report covers the version check, the paired ffprobe, the listed
    hardware acceleration methods, software H.264 encoder presence, a hardware
    encode test and a subtitle burn-in test. Problems are collected in
    ``report["errors"]`` rather than raised. If FFmpeg itself cannot be run,
    the report is returned early with the remaining fields at their defaults.
    """
    resolved = _normalize_path(ffmpeg_path)
    errors: list[str] = []
    report: dict[str, Any] = {
        "path": resolved,
        "ffmpeg_available": False,
        "version_line": "",
        "ffprobe_path": "",
        "ffprobe_available": False,
        "ffprobe_version_line": "",
        "hwaccels": [],
        "hardware_acceleration": {
            "available": False,
            "type": None,
            "encoder": None,
            "message": "",
            "tested": [],
        },
        "subtitle_burn": {
            "available": False,
            "method": None,
            "message": "",
            "filters": {},
        },
        "software_encoder_available": False,
        "errors": errors,
    }

    report["ffmpeg_available"], report["version_line"] = _ffmpeg_version_line(resolved)
    if not report["ffmpeg_available"]:
        errors.append("FFmpeg is not executable or failed to run -version")
        return report

    probe = _paired_ffprobe_path(resolved)
    report["ffprobe_path"] = probe
    if probe:
        # ffprobe answers `-version` too, so the same check is reused.
        report["ffprobe_available"], report["ffprobe_version_line"] = _ffmpeg_version_line(probe)

    def query(flag: str) -> tuple[bool, str]:
        # Capability listings are parsed in full, hence no output truncation.
        return _run_optional([resolved, "-hide_banner", flag], timeout=10, max_output_chars=0)

    listed, text = query("-hwaccels")
    if listed:
        report["hwaccels"] = _parse_hwaccels(text)
    else:
        errors.append(f"Failed to list hardware acceleration methods: {text}")

    listed, text = query("-encoders")
    encoder_names = _parse_ffmpeg_table_names(text) if listed else set()
    report["software_encoder_available"] = not encoder_names.isdisjoint({"libx264", "libopenh264"})
    if not listed:
        errors.append(f"Failed to list encoders: {text}")

    listed, text = query("-filters")
    filter_names = _parse_ffmpeg_table_names(text) if listed else set()
    if not listed:
        errors.append(f"Failed to list filters: {text}")

    report["hardware_acceleration"] = _detect_hardware_encoding(resolved, encoder_names)
    report["subtitle_burn"] = _test_subtitle_burn(resolved, filter_names)
    return report

View File

@ -0,0 +1,76 @@
import os
import tempfile
import unittest
from pathlib import Path
from app.utils import ffmpeg_detector
class FFmpegDetectorTests(unittest.TestCase):
    """Exercise ffmpeg_detector against fake shell-script FFmpeg binaries."""

    def _write_fake_binary(self, path: Path, first_line: str) -> None:
        """Write an executable shell script at *path* that imitates FFmpeg.

        The script prints *first_line* for ``-version``, canned listings for
        ``-hwaccels``, ``-encoders`` and ``-filters`` (given as the second
        argument), and exits 0 for everything else.
        """
        script_lines = [
            '#!/bin/sh',
            'if [ "$1" = "-version" ]; then',
            f' echo "{first_line}"',
            ' exit 0',
            'fi',
            'if [ "$2" = "-hwaccels" ]; then',
            ' echo "Hardware acceleration methods:"',
            ' echo "videotoolbox"',
            ' exit 0',
            'fi',
            'if [ "$2" = "-encoders" ]; then',
            ' echo " V....D h264_videotoolbox Apple VideoToolbox H.264"',
            ' echo " V....D h264_nvenc NVIDIA NVENC H.264"',
            ' echo " V....D h264_qsv Intel QSV H.264"',
            ' echo " V....D libx264 libx264 H.264"',
            ' exit 0',
            'fi',
            'if [ "$2" = "-filters" ]; then',
            ' echo " ... subtitles V->V Render text subtitles"',
            ' echo " ... drawtext V->V Draw text"',
            ' echo " ... overlay VV->V Overlay video"',
            ' exit 0',
            'fi',
            'exit 0',
        ]
        path.write_text("\n".join(script_lines) + "\n", encoding="utf-8")
        path.chmod(0o755)

    @unittest.skipIf(os.name == "nt", "shell fake binaries are POSIX-only")
    def test_discover_includes_configured_path(self):
        """A configured path is listed first, paired with its sibling ffprobe."""
        with tempfile.TemporaryDirectory() as sandbox:
            fake_ffmpeg = Path(sandbox) / "ffmpeg"
            fake_ffprobe = Path(sandbox) / "ffprobe"
            self._write_fake_binary(fake_ffmpeg, "ffmpeg version fake-1.0")
            self._write_fake_binary(fake_ffprobe, "ffprobe version fake-1.0")

            engines = ffmpeg_detector.discover_ffmpeg_engines(
                configured_path=str(fake_ffmpeg),
                root_dir=sandbox,
                include_system=False,
            )
            first = engines[0]

            self.assertEqual(first["path"], str(fake_ffmpeg.resolve()))
            self.assertEqual(first["ffprobe_path"], str(fake_ffprobe.resolve()))
            self.assertTrue(first["available"])

    @unittest.skipIf(os.name == "nt", "shell fake binaries are POSIX-only")
    def test_validate_reports_hardware_and_subtitle_support(self):
        """Validation reports hardware encoding and subtitle burn-in as available."""
        with tempfile.TemporaryDirectory() as sandbox:
            fake_ffmpeg = Path(sandbox) / "ffmpeg"
            fake_ffprobe = Path(sandbox) / "ffprobe"
            self._write_fake_binary(fake_ffmpeg, "ffmpeg version fake-1.0")
            self._write_fake_binary(fake_ffprobe, "ffprobe version fake-1.0")

            report = ffmpeg_detector.validate_ffmpeg_engine(str(fake_ffmpeg))

            self.assertTrue(report["ffmpeg_available"])
            self.assertTrue(report["ffprobe_available"])
            self.assertTrue(report["hardware_acceleration"]["available"])
            self.assertTrue(report["subtitle_burn"]["available"])
            self.assertEqual(report["subtitle_burn"]["method"], "subtitles")
if __name__ == "__main__":
unittest.main()

View File

@ -25,6 +25,10 @@
vision_openai_model_name = "Qwen/Qwen3.5-122B-A10B"
vision_openai_api_key = "" # 填入对应 provider 的 API key
vision_openai_base_url = "https://api.siliconflow.cn/v1" # 可选：自定义 API base URL（官方 OpenAI 可留空）
vision_openai_temperature = 1.0
vision_openai_top_p = 0.95
vision_openai_max_tokens = 65536
vision_openai_thinking_level = "auto" # auto/off/low/medium/high
# ===== 文本模型配置 =====
text_llm_provider = "openai"
@ -40,6 +44,16 @@
text_openai_model_name = "Pro/zai-org/GLM-5"
text_openai_api_key = "" # 填入对应 provider 的 API key
text_openai_base_url = "https://api.siliconflow.cn/v1" # 可选：自定义 API base URL（官方 OpenAI 可留空）
text_openai_temperature = 1.0
text_openai_top_p = 0.95
text_openai_max_tokens = 65536
text_openai_thinking_level = "auto" # auto/off/low/medium/high
# ===== Tavily 联网搜索配置 =====
# 用于短剧剧情理解前,按短剧名称检索公开剧情/人物/分集信息
tavily_api_key = "" # 获取地址：https://app.tavily.com
tavily_search_depth = "basic" # basic / advanced / fast / ultra-fast
tavily_max_results = 5
# ===== API Keys 参考 =====
# 主流 LLM Providers API Key 获取地址:
@ -61,6 +75,10 @@
# WebUI 界面是否显示配置项
hide_config = true
# FFmpeg 引擎路径(可选)
# 为空时使用系统 PATH；也可以在系统设置中通过下拉框选择整合包或本机 ffmpeg。
ffmpeg_path = ""
# 官方 OpenAI 默认端点(可选):
# text_openai_base_url = "https://api.openai.com/v1"
@ -95,24 +113,32 @@
model_name = "qwen3-tts-flash"
[fun_asr]
# 阿里百炼 Fun-ASR 字幕转录配置
# 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥
# Fun-ASR 字幕转录配置
# backend = "local" 使用本地 FunASR-Pack API；backend = "firered" 使用本地 FireRedASR2-AED-Pack API；backend = "bailian" 使用阿里百炼在线 fun-asr
auto_transcribe_enabled = false
backend = "local"
api_url = "http://127.0.0.1:7860"
firered_api_url = "http://127.0.0.1:7867"
hotword = ""
enable_spk = false
# 使用阿里百炼在线 fun-asr 时,访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取 API Key
api_key = ""
model = "fun-asr"
[indextts2]
# IndexTTS2 语音克隆配置
[indextts]
# IndexTTS-1.5 语音克隆配置
# 这是一个开源的零样本语音克隆项目,需要自行部署
# 项目地址https://github.com/index-tts/index-tts
# 默认 API 地址(本地部署)
api_url = "http://127.0.0.1:8081/tts"
# 默认参考音频路径(可选)
# 默认参考音频(可选)
reference_audio_source = "resource"
# reference_audio = "/path/to/reference_audio.wav"
# 推理模式:普通推理 / 快速推理
infer_mode = "普通推理"
# 高级参数
temperature = 1.0
top_p = 0.8
@ -120,6 +146,66 @@
do_sample = true
num_beams = 3
repetition_penalty = 10.0
[indextts2]
# IndexTTS-2 语音克隆配置
# 支持 IndexTTS2-Pack FastAPI 接口：POST /tts
api_url = "http://127.0.0.1:7863/tts"
# 默认参考音频(可选),音色列表复用 IndexTTS-1.5 的资源目录
reference_audio_source = "resource"
# reference_audio = "/path/to/reference_audio.wav"
# 情感控制：speaker / audio / vector / text
emotion_mode = "speaker"
emotion_audio = ""
emotion_alpha = 0.65
emotion_text = ""
use_random = false
max_text_tokens_per_segment = 120
# 8 维情感向量顺序：happy, angry, sad, afraid, disgusted, melancholic, surprised, calm
vec_happy = 0.0
vec_angry = 0.0
vec_sad = 0.0
vec_afraid = 0.0
vec_disgusted = 0.0
vec_melancholic = 0.0
vec_surprised = 0.0
vec_calm = 0.8
# 高级生成参数
temperature = 0.8
top_p = 0.8
top_k = 30
num_beams = 3
repetition_penalty = 10.0
max_mel_tokens = 1500
[omnivoice]
# OmniVoice-Pack 语音合成配置
# 支持 OmniVoice-Pack FastAPI 接口：POST /tts
api_url = "http://127.0.0.1:7866/tts"
language = "zh"
# 生成模式：auto / voice_design / voice_clone
mode = "auto"
instruct = ""
# voice_clone 模式下使用,音色列表复用 IndexTTS-1.5 的资源目录
reference_audio_source = "resource"
reference_audio = ""
ref_text = ""
# 高级生成参数
num_step = 32
guidance_scale = 2.0
speed = 1.0
duration = ""
denoise = true
postprocess_output = true
preprocess_prompt = true
[doubaotts]
# 豆包语音 TTS 配置
# 申请流程:
@ -138,8 +224,8 @@
silence_duration = 0.125
[ui]
# TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen, doubaotts)
tts_engine = "edge_tts"
# TTS引擎选择 (indextts, indextts2, omnivoice, edge_tts, qwen3_tts, tencent_tts, doubaotts, azure_speech)
tts_engine = "indextts"
# Edge TTS 配置
edge_voice_name = "zh-CN-XiaoyiNeural-Female"
@ -157,6 +243,23 @@
doubaotts_voice_type = "BV700_V2_streaming"
doubaotts_rate = 1.0
# 字幕遮罩配置:用于在烧录新字幕前遮盖原视频自带字幕
subtitle_mask_enabled = false
subtitle_mask_landscape_x_percent = 10
subtitle_mask_landscape_y_percent = 78
subtitle_mask_landscape_width_percent = 80
subtitle_mask_landscape_height_percent = 14
subtitle_mask_landscape_blur_radius = 18
subtitle_mask_landscape_opacity_percent = 82
subtitle_mask_portrait_x_percent = 8
subtitle_mask_portrait_y_percent = 79
subtitle_mask_portrait_width_percent = 84
subtitle_mask_portrait_height_percent = 16
subtitle_mask_portrait_blur_radius = 26
subtitle_mask_portrait_opacity_percent = 84
subtitle_position_landscape_y_percent = 85
subtitle_position_portrait_y_percent = 82
##########################################
# 代理和网络配置
##########################################

View File

@ -199,7 +199,7 @@ Gender: Male
Name: en-AU-NatashaNeural
Gender: Female
Name: en-AU-WilliamNeural
Name: en-AU-WilliamMultilingualNeural
Gender: Male
Name: en-CA-ClaraNeural
@ -286,21 +286,33 @@ Gender: Female
Name: en-US-AndrewNeural
Gender: Male
Name: en-US-AndrewMultilingualNeural
Gender: Male
Name: en-US-AriaNeural
Gender: Female
Name: en-US-AvaNeural
Gender: Female
Name: en-US-AvaMultilingualNeural
Gender: Female
Name: en-US-BrianNeural
Gender: Male
Name: en-US-BrianMultilingualNeural
Gender: Male
Name: en-US-ChristopherNeural
Gender: Male
Name: en-US-EmmaNeural
Gender: Female
Name: en-US-EmmaMultilingualNeural
Gender: Female
Name: en-US-EricNeural
Gender: Male
@ -583,12 +595,24 @@ Gender: Male
Name: it-IT-ElsaNeural
Gender: Female
Name: it-IT-GiuseppeNeural
Name: it-IT-GiuseppeMultilingualNeural
Gender: Male
Name: it-IT-IsabellaNeural
Gender: Female
Name: iu-Cans-CA-SiqiniqNeural
Gender: Female
Name: iu-Cans-CA-TaqqiqNeural
Gender: Male
Name: iu-Latn-CA-SiqiniqNeural
Gender: Female
Name: iu-Latn-CA-TaqqiqNeural
Gender: Male
Name: ja-JP-KeitaNeural
Gender: Male
@ -625,7 +649,7 @@ Gender: Male
Name: kn-IN-SapnaNeural
Gender: Female
Name: ko-KR-HyunsuNeural
Name: ko-KR-HyunsuMultilingualNeural
Gender: Male
Name: ko-KR-InJoonNeural
@ -739,7 +763,7 @@ Gender: Male
Name: pt-BR-FranciscaNeural
Gender: Female
Name: pt-BR-ThalitaNeural
Name: pt-BR-ThalitaMultilingualNeural
Gender: Female
Name: pt-PT-DuarteNeural

View File

@ -1 +1 @@
0.7.9
0.8.1

View File

@ -2,7 +2,7 @@
requests>=2.32.0
moviepy==2.1.1
edge-tts==7.2.7
streamlit>=1.45.0
streamlit==1.56.0
watchdog==6.0.0
loguru>=0.7.3
tomli>=2.2.1
@ -35,6 +35,3 @@ tenacity>=9.0.0
# torch>=2.0.0
# torchvision>=0.15.0
# torchaudio>=2.0.0
# 剪映草稿导出依赖
pyJianYingDraft>=0.1.0

542
webui.py
View File

@ -2,6 +2,7 @@ import streamlit as st
import os
import sys
import time
from html import escape
from loguru import logger
from app.config import config
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
@ -9,6 +10,7 @@ from webui.components import basic_settings, video_settings, audio_settings, sub
# from webui.utils import cache, file_utils
from app.utils import utils
from app.utils import ffmpeg_utils
from app.models import const
from app.models.schema import VideoClipParams, VideoAspect
@ -128,6 +130,82 @@ def tr(key):
return loc.get("Translation", {}).get(key, key)
# Default label for each video-generation step, in order; the position in the
# list (1-based) is the step number shown by _render_generation_status().
VIDEO_GENERATION_STEP_LABELS = [
    "正在加载剪辑脚本",
    "正在生成 TTS 配音",
    "正在按脚本裁剪视频片段",
    "正在合并配音和字幕",
    "正在合并视频片段",
    "正在合成最终视频",
]
def _safe_int(value, default=0):
try:
return int(value)
except (TypeError, ValueError):
return default
def _format_optional_percent(value):
try:
percent = max(0.0, min(100.0, float(value)))
except (TypeError, ValueError):
return None
if percent.is_integer():
return str(int(percent))
return f"{percent:.1f}"
def _render_generation_status(task: dict | None) -> str:
    """Render the generation progress of *task* as an HTML fragment.

    Before the first step has started (``step_current`` <= 0) a single line
    with the task message, or a default waiting text, is returned. Afterwards
    one line per entry of ``VIDEO_GENERATION_STEP_LABELS`` is produced: the
    current step shows the task message when there is one and is emphasised,
    finished steps are dimmed, pending steps are lighter still. All dynamic
    text is HTML-escaped.

    Args:
        task: Task state dict; the keys read are ``state``, ``step_current``,
            ``step_total``, ``message`` and ``ffmpeg_progress``. ``None`` is
            treated as an empty dict.
    """
    task = task or {}
    state = task.get("state")
    current_step = _safe_int(task.get("step_current"), 0)
    step_total = _safe_int(task.get("step_total"), len(VIDEO_GENERATION_STEP_LABELS))
    message = str(task.get("message") or "")
    ffmpeg_percent = _format_optional_percent(task.get("ffmpeg_progress"))

    if current_step <= 0:
        return f"<div style='font-weight:650;color:#262730;'>{escape(message or '正在生成视频,请稍候...')}</div>"

    # Completion does not change per step, so evaluate it once.
    is_complete = state == const.TASK_STATE_COMPLETE
    lines = []
    for index, default_label in enumerate(VIDEO_GENERATION_STEP_LABELS, start=1):
        is_current = index == current_step
        is_done = is_complete or index < current_step
        label = message if is_current and message else default_label
        suffix = f"{index}/{step_total}"
        # The ffmpeg percentage is only shown on the last step while it is still running.
        if (
            is_current
            and index == step_total
            and ffmpeg_percent is not None
            and not is_complete
        ):
            # Separate the step counter from the ffmpeg progress so the two
            # numbers do not run together ("6/6, ffmpeg 42%").
            suffix = f"{suffix}, ffmpeg {ffmpeg_percent}%"
        color = "#262730" if is_current else "#8b9099" if is_done else "#b9bec7"
        weight = "650" if is_current else "500"
        lines.append(
            "<div style='"
            "font-size:1.02rem;"
            "line-height:1.85;"
            "margin:0.28rem 0;"
            f"color:{color};"
            f"font-weight:{weight};"
            "'>"
            f"{escape(label)} <span style='white-space:nowrap;'>({escape(suffix)})</span>"
            "</div>"
        )
    return "".join(lines)
def get_help_text():
    """Return the help text with the current project version filled in."""
    version_tag = f" v{config.project_version}"
    help_text = tr("Get Help")
    return help_text.replace("🎉🎉🎉", version_tag)
def render_generate_button():
"""渲染生成按钮和处理逻辑"""
if st.button(tr("Generate Video"), use_container_width=True, type="primary"):
@ -143,10 +221,10 @@ def render_generate_button():
# 移除task_id检查 - 现在使用统一裁剪策略,不再需要预裁剪
# 直接检查必要的文件是否存在
if not st.session_state.get('video_clip_json_path'):
st.error(tr("脚本文件不能为空"))
st.error(tr("Script file cannot be empty"))
return
if not st.session_state.get('video_origin_path'):
st.error(tr("视频文件不能为空"))
st.error(tr("Video file cannot be empty"))
return
# 获取所有参数
@ -169,79 +247,189 @@ def render_generate_button():
# 生成一个新的task_id用于本次处理
task_id = str(uuid.uuid4())
# 创建进度条
progress_bar = st.progress(0)
status_text = st.empty()
@st.dialog(tr("Generating Video"), width="large")
def generate_video_dialog():
st.markdown(
"""
<style>
div[data-testid="stDialog"] div[data-testid="stStatusWidget"] {
margin-top: 0.25rem;
}
div[data-testid="stDialog"] div[data-testid="stProgress"] {
margin-bottom: 0.75rem;
}
div[data-testid="stDialog"] video {
max-height: 62vh;
object-fit: contain;
background: #000;
}
</style>
""",
unsafe_allow_html=True,
)
def run_task():
try:
tm.start_subclip_unified(
task_id=task_id,
params=params
progress_bar = st.progress(0)
status_panel = st.status(tr("Generating Video"), expanded=True)
with status_panel:
status_placeholder = st.empty()
status_placeholder.markdown(
_render_generation_status(None),
unsafe_allow_html=True,
)
except Exception as e:
logger.error(f"任务执行失败: {e}")
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, message=str(e))
# 在新线程中启动任务
thread = threading.Thread(target=run_task)
thread.start()
def run_task():
try:
tm.start_subclip_unified(
task_id=task_id,
params=params
)
except Exception as e:
logger.error(f"任务执行失败: {e}")
current_task = sm.state.get_task(task_id) or {}
sm.state.update_task(
task_id,
state=const.TASK_STATE_FAILED,
progress=current_task.get("progress", 0),
message=str(e),
)
# 轮询任务状态
while True:
task = sm.state.get_task(task_id)
if task:
progress = task.get("progress", 0)
state = task.get("state")
# 更新进度条
progress_bar.progress(progress / 100)
status_text.text(f"Processing... {progress}%")
# 在新线程中启动任务
thread = threading.Thread(target=run_task)
thread.start()
last_status_key = None
# 轮询任务状态
while True:
task = sm.state.get_task(task_id)
if task:
progress = task.get("progress", 0)
state = task.get("state")
if state == const.TASK_STATE_COMPLETE:
status_text.text(tr("视频生成完成"))
progress_bar.progress(1.0)
# 显示结果
video_files = task.get("videos", [])
try:
if video_files:
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
st.success(tr("视频生成完成"))
break
elif state == const.TASK_STATE_FAILED:
st.error(f"任务失败: {task.get('message', 'Unknown error')}")
break
time.sleep(0.5)
progress = int(progress)
except (TypeError, ValueError):
progress = 0
progress = max(0, min(progress, 100))
# 更新进度条和阶段状态
progress_bar.progress(progress / 100)
current_message = task.get("message") or f"Processing... {progress}%"
status_key = (
state,
progress,
current_message,
task.get("step_current"),
task.get("step_total"),
task.get("ffmpeg_progress"),
)
if status_key != last_status_key:
status_placeholder.markdown(
_render_generation_status(task),
unsafe_allow_html=True,
)
last_status_key = status_key
if state == const.TASK_STATE_COMPLETE:
status_panel.update(
label=tr("Video Generation Completed"),
state="complete",
expanded=False,
)
progress_bar.progress(1.0)
# 显示结果
video_files = task.get("videos", [])
try:
if video_files:
aspect = getattr(params, "video_aspect", "")
aspect = getattr(aspect, "value", aspect)
preview_width = 320 if aspect in {
VideoAspect.portrait.value,
VideoAspect.portrait_2.value,
} else 600
for url in video_files:
_, preview_col, _ = st.columns([1, 2, 1])
with preview_col:
st.video(url, width=preview_width)
except Exception as e:
logger.error(f"播放视频失败: {e}")
st.success(tr("Video Generation Completed"))
break
if state == const.TASK_STATE_FAILED:
status_panel.update(
label=f"{tr('Task failed')}: {task.get('message', 'Unknown error')}",
state="error",
expanded=True,
)
st.error(f"{tr('Task failed')}: {task.get('message', 'Unknown error')}")
break
time.sleep(0.5)
generate_video_dialog()
def get_voice_name_for_tts_engine(tts_engine: str) -> str:
"""根据TTS引擎获取用户选择的音色"""
if tts_engine == 'edge_tts':
return config.ui.get('edge_voice_name', 'zh-CN-XiaoxiaoNeural-Female')
if tts_engine == 'azure_speech':
return config.ui.get('azure_voice_name', 'zh-CN-XiaoxiaoMultilingualNeural')
if tts_engine == 'tencent_tts':
return f"tencent:{config.ui.get('tencent_voice_type', '101001')}"
if tts_engine == 'qwen3_tts':
return f"qwen3:{config.ui.get('qwen_voice_type', 'Cherry')}"
if tts_engine == config.INDEXTTS2_ENGINE:
reference_audio = config.indextts2.get('reference_audio', '')
if reference_audio:
return f"{config.INDEXTTS2_VOICE_PREFIX}{reference_audio}"
return config.ui.get('voice_name', '')
if config.normalize_tts_engine_name(tts_engine) == config.INDEXTTS_ENGINE:
reference_audio = config.indextts.get('reference_audio', '')
if reference_audio:
return f"{config.INDEXTTS_VOICE_PREFIX}{reference_audio}"
return config.ui.get('voice_name', '')
if tts_engine == config.OMNIVOICE_ENGINE:
mode = config.omnivoice.get('mode', 'auto')
reference_audio = config.omnivoice.get('reference_audio', '')
if mode == 'voice_clone' and reference_audio:
return f"{config.OMNIVOICE_VOICE_PREFIX}{reference_audio}"
return f"{config.OMNIVOICE_VOICE_PREFIX}{mode}"
if tts_engine == 'doubaotts':
return st.session_state.get('voice_name', config.ui.get('doubaotts_voice_type', 'BV700_streaming'))
elif tts_engine == 'azure_speech':
return st.session_state.get('voice_name', config.ui.get('azure_voice_name', 'zh-CN-XiaoxiaoMultilingualNeural'))
else:
return st.session_state.get('voice_name', config.ui.get('edge_voice_name', 'zh-CN-XiaoxiaoNeural-Female'))
return config.ui.get('doubaotts_voice_type', 'BV700_streaming')
if tts_engine == 'soulvoice':
voice_uri = config.soulvoice.get('voice_uri', '')
if voice_uri and not voice_uri.startswith(('soulvoice:', 'speech:')):
return f"soulvoice:{voice_uri}"
return voice_uri
return config.ui.get('voice_name', config.ui.get('edge_voice_name', 'zh-CN-XiaoxiaoNeural-Female'))
def get_jianying_export_params() -> VideoClipParams:
def get_jianying_export_params(draft_name=None) -> VideoClipParams:
"""获取导出到剪映草稿的参数"""
tts_engine = st.session_state.get('tts_engine', 'azure')
tts_engine = st.session_state.get('tts_engine', config.ui.get('tts_engine', 'edge_tts'))
voice_name = get_voice_name_for_tts_engine(tts_engine)
voice_rate = st.session_state.get('voice_rate', 1.0)
voice_pitch = st.session_state.get('voice_pitch', 1.0)
subtitle_paths = st.session_state.get('subtitle_paths', [])
if isinstance(subtitle_paths, str):
subtitle_paths = [subtitle_paths]
subtitle_paths = [
path for path in subtitle_paths
if isinstance(path, str) and path.strip()
]
if not subtitle_paths and st.session_state.get('subtitle_path'):
subtitle_paths = [st.session_state.get('subtitle_path')]
return VideoClipParams(
video_clip_json_path=st.session_state['video_clip_json_path'],
video_origin_path=st.session_state['video_origin_path'],
video_origin_paths=st.session_state.get('video_origin_paths', []),
original_subtitle_path=subtitle_paths[0] if subtitle_paths else "",
original_subtitle_paths=subtitle_paths,
tts_engine=tts_engine,
voice_name=voice_name,
voice_rate=voice_rate,
@ -257,108 +445,208 @@ def get_jianying_export_params() -> VideoClipParams:
tts_volume=st.session_state.get('tts_volume', 1.0),
original_volume=st.session_state.get('original_volume', 0.7),
bgm_volume=st.session_state.get('bgm_volume', 0.3),
draft_name=st.session_state.get('draft_name_input', f"NarratoAI_{int(time.time())}")
draft_name=(
draft_name
if draft_name is not None
else st.session_state.get('draft_name_input', f"NarratoAI_{int(time.time())}")
)
)
def _render_jianying_export_status():
    """Show the outcome of the last Jianying draft export, if there is one.

    A stored result takes precedence over a stored error.
    """
    export_result = st.session_state.get('jianying_export_result')
    export_error = st.session_state.get('jianying_export_error')
    if export_result:
        success_text = tr("Jianying draft exported successfully").format(name=export_result['draft_name'])
        location_text = tr("Draft saved to").format(path=export_result['draft_path'])
        st.success(success_text)
        st.info(location_text)
        return
    if export_error:
        st.error(f"{tr('Failed to export Jianying draft')}: {export_error}")
def _render_jianying_export_dialog() -> None:
    """Confirm the Jianying draft name in a modal dialog and run the export.

    The dialog shows the configured draft folder, asks for a draft name and,
    on confirmation, builds the export parameters and starts the export task.
    The outcome is stored in ``st.session_state['jianying_export_result']`` /
    ``['jianying_export_error']``.
    """
    import uuid
    from loguru import logger
    @st.dialog(tr("Export to Jianying Draft"), width="small")
    def jianying_export_dialog():
        jianying_draft_path = config.ui.get("jianying_draft_path", "")
        # Every dynamic value is HTML-escaped because the markup below is
        # rendered with unsafe_allow_html=True.
        dialog_title = escape(tr("Jianying export dialog title"))
        dialog_description = escape(tr("Jianying export dialog description"))
        destination_label = escape(tr("Jianying export destination"))
        destination_path = escape(jianying_draft_path or "-")
        st.markdown(
            f"""
<style>
.jianying-export-panel {{
display: flex;
gap: 12px;
align-items: flex-start;
padding: 14px;
margin: 2px 0 18px;
border: 1px solid rgba(255, 75, 75, 0.24);
border-radius: 8px;
background: linear-gradient(135deg, rgba(255, 75, 75, 0.10), rgba(255, 255, 255, 0.96));
}}
.jianying-export-icon {{
width: 38px;
height: 38px;
display: flex;
align-items: center;
justify-content: center;
flex: 0 0 auto;
border-radius: 8px;
color: #ffffff;
background: #ff4b4b;
font-size: 20px;
line-height: 1;
}}
.jianying-export-title {{
color: #202534;
font-size: 17px;
font-weight: 700;
line-height: 1.35;
margin-bottom: 4px;
}}
.jianying-export-description {{
color: #5f6575;
font-size: 13px;
line-height: 1.55;
}}
.jianying-export-path {{
padding: 10px 12px;
margin: 2px 0 16px;
border: 1px solid #e4e7ef;
border-radius: 8px;
background: #f8f9fc;
color: #323846;
font-size: 13px;
line-height: 1.45;
word-break: break-all;
}}
.jianying-export-path-label {{
display: block;
color: #7a8192;
font-size: 12px;
margin-bottom: 4px;
}}
</style>
<div class="jianying-export-panel">
<div class="jianying-export-icon">📤</div>
<div>
<div class="jianying-export-title">{dialog_title}</div>
<div class="jianying-export-description">{dialog_description}</div>
</div>
</div>
<div class="jianying-export-path">
<span class="jianying-export-path-label">{destination_label}</span>
{destination_path}
</div>
""",
            unsafe_allow_html=True,
        )
        draft_name = st.text_input(
            tr("Jianying draft name"),
            key="draft_name_input",
            placeholder="NarratoAI_",
        )
        # Show the error stored by a previous export attempt, if any.
        error = st.session_state.get('jianying_export_error')
        if error:
            st.error(f"{tr('Failed to export Jianying draft')}: {error}")
        cancel_col, confirm_col = st.columns(2)
        with cancel_col:
            if st.button(tr("Cancel"), key="cancel_export", use_container_width=True):
                # Clear the stored error and rerun the script.
                st.session_state['jianying_export_error'] = None
                st.rerun()
        with confirm_col:
            if st.button(tr("Confirm Export"), key="confirm_export", type="primary", use_container_width=True):
                draft_name = (draft_name or "").strip()
                if not draft_name:
                    st.error(tr("Please enter draft name"))
                    return
                # Create the task ID
                task_id = str(uuid.uuid4())
                st.session_state['task_id'] = task_id
                # Build the export parameters
                try:
                    params = get_jianying_export_params(draft_name)
                except Exception as e:
                    logger.error(f"构建参数失败: {e}")
                    st.session_state['jianying_export_error'] = f"{tr('Failed to build parameters')}: {e}"
                    st.error(st.session_state['jianying_export_error'])
                    return
                with st.spinner(tr("Exporting to Jianying draft...")):
                    try:
                        from app.services import jianying_task
                        # Run the export-to-Jianying-draft task
                        result = jianying_task.start_export_jianying_draft(task_id, params)
                        # Log the outcome
                        logger.info(f"成功导出到剪映草稿: {result['draft_name']}")
                        logger.info(f"草稿已保存到: {result['draft_path']}")
                        # Store the result in session state, then rerun the script
                        st.session_state['jianying_export_result'] = result
                        st.session_state['jianying_export_error'] = None
                        st.rerun()
                    except Exception as e:
                        logger.error(f"导出到剪映草稿失败: {e}")
                        import traceback
                        logger.error(f"错误详情: {traceback.format_exc()}")
                        st.session_state['jianying_export_error'] = str(e)
                        st.session_state['jianying_export_result'] = None
                        st.error(f"{tr('Failed to export Jianying draft')}: {e}")
    jianying_export_dialog()
def render_export_jianying_button():
"""渲染导出到剪映草稿按钮和处理逻辑"""
import os
import time
import uuid
from loguru import logger
# 初始化session state
if 'show_jianying_export_form' not in st.session_state:
st.session_state['show_jianying_export_form'] = False
if 'jianying_export_result' not in st.session_state:
st.session_state['jianying_export_result'] = None
if 'jianying_export_error' not in st.session_state:
st.session_state['jianying_export_error'] = None
if st.button("📤 导出到剪映草稿", use_container_width=True, type="secondary"):
if st.button(tr("Export to Jianying Draft"), use_container_width=True, type="secondary"):
config.save_config()
if not st.session_state.get('video_clip_json_path'):
st.error("脚本文件不能为空")
st.error(tr("Script file cannot be empty"))
return
if not st.session_state.get('video_origin_path'):
st.error("视频文件不能为空")
st.error(tr("Video file cannot be empty"))
return
jianying_draft_path = config.ui.get("jianying_draft_path", "")
if not jianying_draft_path:
st.error("请在基础设置中配置剪映草稿地址")
st.error(tr("Please configure Jianying draft folder in basic settings"))
return
if not os.path.exists(jianying_draft_path):
st.error(f"剪映草稿文件夹不存在: {jianying_draft_path}")
st.error(tr("Jianying draft folder does not exist").format(path=jianying_draft_path))
return
# 显示导出表单
st.session_state['show_jianying_export_form'] = True
st.session_state['jianying_export_result'] = None
st.session_state['jianying_export_error'] = None
st.session_state['draft_name_input'] = f"NarratoAI_{int(time.time())}"
_render_jianying_export_dialog()
# 显示导出表单
if st.session_state['show_jianying_export_form']:
st.markdown("---")
st.subheader("导出到剪映草稿")
draft_name = st.text_input(
"请输入剪映草稿名称",
value=f"NarratoAI_{int(time.time())}",
key="draft_name_input"
)
if st.button("确认导出", key="confirm_export"):
if not draft_name:
st.error("请输入草稿名称")
return
# 创建任务ID
task_id = str(uuid.uuid4())
st.session_state['task_id'] = task_id
# 构建参数
try:
params = get_jianying_export_params()
except Exception as e:
logger.error(f"构建参数失败: {e}")
st.error(f"参数构建失败: {e}")
return
with st.spinner("正在导出到剪映草稿,请稍候..."):
try:
from app.services import jianying_task
# 调用导出到剪映草稿的任务
result = jianying_task.start_export_jianying_draft(task_id, params)
# 记录日志
logger.info(f"成功导出到剪映草稿: {result['draft_name']}")
logger.info(f"草稿已保存到: {result['draft_path']}")
# 保存结果到session state
st.session_state['jianying_export_result'] = result
st.session_state['jianying_export_error'] = None
st.session_state['show_jianying_export_form'] = False
st.success(f"✅ 成功导出到剪映草稿: {result['draft_name']}")
st.info(f"📁 草稿已保存到: {result['draft_path']}")
except Exception as e:
logger.error(f"导出到剪映草稿失败: {e}")
import traceback
logger.error(f"错误详情: {traceback.format_exc()}")
st.session_state['jianying_export_error'] = str(e)
st.session_state['jianying_export_result'] = None
st.error(f"❌ 导出到剪映草稿失败: {e}")
if st.button("取消", key="cancel_export"):
st.session_state['show_jianying_export_form'] = False
st.session_state['jianying_export_result'] = None
st.session_state['jianying_export_error'] = None
st.rerun()
_render_jianying_export_status()
@ -379,7 +667,7 @@ def main():
logger.error(f"❌ LLM 提供商注册失败: {str(e)}")
import traceback
logger.error(traceback.format_exc())
st.error(f"⚠️ LLM 初始化失败: {str(e)}\n\n请检查配置文件和依赖是否正确安装。")
st.error(tr("LLM initialization failed").format(error=str(e)))
# 不抛出异常,允许应用继续运行(但 LLM 功能不可用)
# 检测FFmpeg硬件加速但只打印一次日志使用 session_state 持久化)
@ -402,7 +690,7 @@ def main():
logger.warning(f"资源初始化时出现警告: {e}")
st.title(f"Narrato:blue[AI]:sunglasses: 📽️")
st.write(tr("Get Help"))
st.write(get_help_text())
# 首先渲染不依赖PyTorch的UI部分
# 渲染基础设置面板

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,8 @@ import streamlit as st
import os
from app.config import config
from app.config.defaults import (
DEFAULT_LLM_GENERATION_CONFIG,
DEFAULT_LLM_THINKING_LEVELS,
DEFAULT_OPENAI_COMPATIBLE_BASE_URL,
DEFAULT_OPENAI_COMPATIBLE_PROVIDER,
DEFAULT_TEXT_LLM_PROVIDER,
@ -26,7 +28,7 @@ OPENAI_COMPATIBLE_GATEWAY_BASE_URLS = {
}
def build_base_url_help(provider: str, model_type: str) -> tuple[str, bool, str]:
def build_base_url_help(provider: str, model_type: str, tr=lambda key: key) -> tuple[str, bool, str]:
"""
根据 provider 返回 Base URL 的帮助文案
@ -35,14 +37,14 @@ def build_base_url_help(provider: str, model_type: str) -> tuple[str, bool, str]
requires_base: 是否强制提示必须填写 Base URL
placeholder: 推荐的默认值可为空字符串
"""
default_help = "自定义 API 端点(可选),当使用自建或第三方代理时需要填写"
default_help = tr("Custom API endpoint help")
provider_key = (provider or "").lower()
example_url = OPENAI_COMPATIBLE_GATEWAY_BASE_URLS.get(provider_key)
if example_url is not None:
extra = f"\n推荐接口地址: {example_url}" if example_url else ""
extra = f"\n{tr('Recommended API endpoint')}: {example_url}" if example_url else ""
help_text = (
f"{model_type} 选择的提供商基于 OpenAI 兼容网关,必须填写完整的接口地址。"
f"{tr('OpenAI compatible gateway help').format(model_type=model_type)}"
f"{extra}"
)
return help_text, True, example_url
@ -87,7 +89,7 @@ def validate_openai_compatible_model_name(model_name: str, model_type: str) -> t
Args:
model_name: 模型名称应为 provider/model 格式
model_type: 模型类型"分析""文案生成"
model_type: 模型类型"分析""文案生成"
Returns:
(是否有效, 错误消息)
@ -140,6 +142,113 @@ def show_config_validation_errors(errors: list):
st.error(error)
def update_app_config_if_changed(key: str, value) -> bool:
    """Write ``value`` to ``config.app[key]`` unless an equal value is already stored.

    Args:
        key: Name of the entry inside ``config.app``.
        value: Candidate value to store.

    Returns:
        ``True`` when the entry was overwritten, ``False`` when the stored
        value already compared equal and nothing was written.
    """
    already_stored = config.app.get(key) == value
    if not already_stored:
        config.app[key] = value
    return not already_stored
def render_openai_compatible_protocol_field(tr, label_key: str, key: str) -> None:
    """Show the OpenAI-compatible protocol as a disabled (read-only) text input.

    Args:
        tr: Translation callable mapping a message key to display text.
        label_key: Translation key used for the field label.
        key: Streamlit widget key for the text input.
    """
    field_label = tr(label_key)
    protocol_text = tr("OpenAI compatible protocol")
    protocol_help = tr("OpenAI compatible protocol help")
    st.text_input(
        field_label,
        value=protocol_text,
        help=protocol_help,
        disabled=True,
        key=key,
    )
def get_generation_config_value(model_prefix: str, param_name: str):
    """Look up one generation parameter for a model, falling back to defaults.

    Lookup order:
      1. ``config.app["<model_prefix>_openai_<param_name>"]`` when that key exists.
      2. For the text model's temperature only: the ``"temperature"`` entry of
         ``st.session_state``, if present.
      3. ``DEFAULT_LLM_GENERATION_CONFIG[param_name]``.
    """
    stored_key = "{}_openai_{}".format(model_prefix, param_name)
    if stored_key in config.app:
        return config.app.get(stored_key)

    shared_default = DEFAULT_LLM_GENERATION_CONFIG[param_name]
    reads_session_temperature = (model_prefix, param_name) == ("text", "temperature")
    if reads_session_temperature:
        return st.session_state.get("temperature", shared_default)
    return shared_default
def _bounded_generation_number(raw_value, fallback, lower, upper, cast=float):
    """Coerce ``raw_value`` with ``cast`` and clamp the result to ``[lower, upper]``.

    ``fallback`` replaces values that cannot be converted (``None``, free-form
    text, infinities when casting to ``int``) or that are NaN.
    """
    try:
        number = cast(raw_value)
    except (TypeError, ValueError, OverflowError):
        number = cast(fallback)
    if number != number:  # NaN is the only value that differs from itself
        number = cast(fallback)
    return min(max(number, lower), upper)


def render_llm_generation_settings(tr, model_prefix: str) -> dict:
    """Render the generation-parameter widgets for one model and return their values.

    The initial widget values come from ``get_generation_config_value``. They
    are coerced and clamped to each widget's range first, because a value that
    is missing, non-numeric or out of range (for example after a manual edit
    of the config file) would otherwise make the widget call fail and take the
    whole settings panel down with it.

    Args:
        tr: Translation callable mapping a message key to display text.
        model_prefix: Prefix used for config and widget keys (the visible
            callers pass ``"vision"`` or ``"text"``).

    Returns:
        A dict with ``temperature`` and ``top_p`` (floats rounded to two
        decimals), ``max_tokens`` (int) and ``thinking_level`` (one of
        ``DEFAULT_LLM_THINKING_LEVELS``).
    """
    defaults = DEFAULT_LLM_GENERATION_CONFIG
    st.markdown(f"**{tr('Generation Settings')}**")

    row1 = st.columns(2)
    with row1[0]:
        temperature = st.slider(
            tr("Sampling Temperature"),
            min_value=0.0,
            max_value=2.0,
            value=_bounded_generation_number(
                get_generation_config_value(model_prefix, "temperature"),
                defaults["temperature"],
                0.0,
                2.0,
            ),
            step=0.05,
            help=tr("Sampling Temperature Help"),
            key=f"{model_prefix}_openai_temperature_input",
        )
    with row1[1]:
        top_p = st.slider(
            tr("Top P"),
            min_value=0.0,
            max_value=1.0,
            value=_bounded_generation_number(
                get_generation_config_value(model_prefix, "top_p"),
                defaults["top_p"],
                0.0,
                1.0,
            ),
            step=0.05,
            help=tr("Top P Help"),
            key=f"{model_prefix}_openai_top_p_input",
        )

    row2 = st.columns(2)
    with row2[0]:
        max_tokens = st.number_input(
            tr("Max Output Tokens"),
            min_value=0,
            max_value=200000,
            value=_bounded_generation_number(
                get_generation_config_value(model_prefix, "max_tokens"),
                defaults["max_tokens"],
                0,
                200000,
                cast=int,
            ),
            step=256,
            help=tr("Max Output Tokens Help"),
            key=f"{model_prefix}_openai_max_tokens_input",
        )
    with row2[1]:
        # Unknown or empty levels fall back to "auto" so .index() below cannot fail.
        current_thinking_level = str(get_generation_config_value(model_prefix, "thinking_level") or "auto")
        if current_thinking_level not in DEFAULT_LLM_THINKING_LEVELS:
            current_thinking_level = "auto"
        thinking_level = st.selectbox(
            tr("Thinking Level"),
            options=DEFAULT_LLM_THINKING_LEVELS,
            index=DEFAULT_LLM_THINKING_LEVELS.index(current_thinking_level),
            format_func=lambda level: tr(f"Thinking Level {level.title()}"),
            help=tr("Thinking Level Help"),
            key=f"{model_prefix}_openai_thinking_level_input",
        )

    params = {
        "temperature": round(float(temperature), 2),
        "top_p": round(float(top_p), 2),
        "max_tokens": int(max_tokens),
        "thinking_level": thinking_level,
    }
    if model_prefix == "text":
        # The text model's temperature is also published under the plain
        # "temperature" session key, which get_generation_config_value reads.
        st.session_state["temperature"] = params["temperature"]
    return params
def save_llm_generation_settings(model_prefix: str, params: dict) -> bool:
    """Copy generation parameters into ``config.app`` and ``st.session_state``.

    Each entry of ``params`` is stored under ``"<model_prefix>_openai_<name>"``.
    The session-state copy is always refreshed; the app config is only written
    when the value differs from what is stored.

    Returns:
        ``True`` if at least one app-config entry changed, otherwise ``False``.
    """
    any_updated = False
    for name in params:
        target_key = "{}_openai_{}".format(model_prefix, name)
        new_value = params[name]
        if update_app_config_if_changed(target_key, new_value):
            any_updated = True
        st.session_state[target_key] = new_value
    return any_updated
def render_basic_settings(tr):
"""渲染基础设置面板"""
with st.expander(tr("Basic Settings"), expanded=False):
@ -151,14 +260,24 @@ def render_basic_settings(tr):
with left_config_panel:
render_language_settings(tr)
render_proxy_settings(tr)
render_tavily_search_settings(tr)
with middle_config_panel:
render_vision_llm_settings(tr) # 视分析模型设置
render_vision_llm_settings(tr) # 视分析模型设置
with right_config_panel:
render_text_llm_settings(tr) # 文案生成模型设置
def render_generation_settings(tr):
    """Render the shared generation section: a divider, a heading and a temperature slider.

    The slider is bound to the ``"temperature"`` session-state key, which is
    seeded from ``DEFAULT_LLM_GENERATION_CONFIG`` when it is not set yet.
    """
    st.divider()
    st.subheader(tr("Generation Settings"))

    session = st.session_state
    if "temperature" not in session:
        session["temperature"] = DEFAULT_LLM_GENERATION_CONFIG["temperature"]

    st.slider("temperature", min_value=0.0, max_value=2.0, key="temperature")
def render_language_settings(tr):
st.subheader(tr("Proxy Settings"))
@ -218,15 +337,41 @@ def render_proxy_settings(tr):
config.proxy["https"] = ""
# 剪映草稿地址设置
st.subheader("剪映草稿设置")
st.subheader(tr("Jianying Draft Settings"))
jianying_draft_path = st.text_input(
"剪映草稿文件夹路径",
tr("Jianying Draft Folder Path"),
value=config.ui.get("jianying_draft_path", ""),
help="剪映草稿文件夹路径例如C:\\Users\\用户名\\Documents\\JianyingPro Drafts"
help=tr("Jianying Draft Folder Path Help")
)
config.ui["jianying_draft_path"] = jianying_draft_path
def render_tavily_search_settings(tr):
    """Render the Tavily API key input and persist the key when it changes.

    The entered key is stripped of surrounding whitespace. When it differs
    from ``config.app["tavily_api_key"]`` the config is saved, the key is
    mirrored into ``st.session_state`` and a success message is shown; a
    failure while saving is reported in the UI and logged.
    """
    st.subheader(tr("Tavily Search Settings"))
    key_url_label = tr('API Key URL')
    st.markdown(f"{key_url_label}: [https://app.tavily.com](https://app.tavily.com)")

    entered_key = st.text_input(
        tr("Tavily API Key"),
        value=config.app.get("tavily_api_key", ""),
        type="password",
        help=tr("Tavily API Key Help"),
        key="tavily_api_key_input",
    )
    normalized_key = str(entered_key or "").strip()

    # Nothing to persist when the stored key already matches.
    if not update_app_config_if_changed("tavily_api_key", normalized_key):
        return

    try:
        config.save_config()
        st.session_state["tavily_api_key"] = normalized_key
        st.success(tr("Tavily config saved"))
    except Exception as e:
        st.error(f"{tr('Failed to save config')}: {str(e)}")
        logger.error(f"保存 Tavily 配置失败: {str(e)}")
def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
"""测试视觉模型连接
@ -435,7 +580,7 @@ def test_openai_compatible_text_model(api_key: str, base_url: str, model_name: s
return False, f"连接失败: {error_msg}"
def render_vision_llm_settings(tr):
"""渲染视分析模型设置OpenAI 兼容 统一配置)"""
"""渲染视分析模型设置OpenAI 兼容 统一配置)"""
st.subheader(tr("Vision Model Settings"))
# 固定使用 OpenAI 兼容 提供商
@ -447,36 +592,35 @@ def render_vision_llm_settings(tr):
vision_base_url = config.app.get("vision_openai_base_url", DEFAULT_OPENAI_COMPATIBLE_BASE_URL)
# 固定 provider 为 openai模型输入框保留完整模型名称
current_provider, current_model = get_openai_compatible_ui_values(
_current_provider, current_model = get_openai_compatible_ui_values(
full_vision_model_name,
DEFAULT_VISION_OPENAI_MODEL_NAME,
provider=DEFAULT_VISION_LLM_PROVIDER,
)
# 定义支持的 provider 列表
OPENAI_COMPATIBLE_PROVIDERS = ["openai"]
selected_provider = DEFAULT_VISION_LLM_PROVIDER
# 渲染配置输入框
col1, col2 = st.columns([1, 2])
with col1:
selected_provider = st.selectbox(
tr("Vision Model Provider"),
options=OPENAI_COMPATIBLE_PROVIDERS,
index=OPENAI_COMPATIBLE_PROVIDERS.index(current_provider) if current_provider in OPENAI_COMPATIBLE_PROVIDERS else 0,
key="vision_provider_select"
render_openai_compatible_protocol_field(
tr,
"Vision Model Provider",
key="vision_openai_protocol_display",
)
with col2:
model_name_input = st.text_input(
tr("Vision Model Name"),
value=current_model,
help="输入完整模型名称\n\n"
"常用示例:\n"
"• Qwen/Qwen3.5-122B-A10B\n"
"• gemini/gemini-2.0-flash-lite\n"
"• gpt-4o\n"
"• Qwen/Qwen2.5-VL-32B-Instruct (SiliconFlow)\n\n"
"支持常见 OpenAI 兼容网关(如 OpenAI/DeepSeek/OpenRouter/SiliconFlow",
help=(
tr("Model Name Input Help")
+ "\n\n"
+ "• Qwen/Qwen3.5-122B-A10B\n"
+ "• gemini/gemini-2.0-flash-lite\n"
+ "• gpt-4o\n"
+ "• Qwen/Qwen2.5-VL-32B-Instruct (SiliconFlow)\n\n"
+ tr("OpenAI compatible providers help")
),
key="vision_model_input"
)
@ -487,16 +631,18 @@ def render_vision_llm_settings(tr):
tr("Vision API Key"),
value=vision_api_key,
type="password",
help="对应 provider 的 API 密钥\n\n"
"获取地址:\n"
"• Gemini: https://makersuite.google.com/app/apikey\n"
"• OpenAI: https://platform.openai.com/api-keys\n"
"• Qwen: https://bailian.console.aliyun.com/\n"
"• SiliconFlow: https://cloud.siliconflow.cn/account/ak"
help=(
tr("Provider API Key Help")
+ "\n\n"
+ "• Gemini: https://makersuite.google.com/app/apikey\n"
+ "• OpenAI: https://platform.openai.com/api-keys\n"
+ "• Qwen: https://bailian.console.aliyun.com/\n"
+ "• SiliconFlow: https://cloud.siliconflow.cn/account/ak"
)
)
vision_base_help, vision_base_required, vision_placeholder = build_base_url_help(
selected_provider, "视频分析模型"
selected_provider, tr("Vision model"), tr
)
st_vision_base_url = st.text_input(
tr("Vision Base URL"),
@ -506,15 +652,17 @@ def render_vision_llm_settings(tr):
)
if vision_base_required and not st_vision_base_url:
info_example = vision_placeholder or "https://your-openai-compatible-endpoint/v1"
st.info(f"请在上方填写 OpenAI 兼容网关地址,例如:{info_example}")
st.info(tr("Please fill OpenAI compatible gateway").format(example=info_example))
vision_generation_params = render_llm_generation_settings(tr, "vision")
# 添加测试连接按钮
if st.button(tr("Test Connection"), key="test_vision_connection"):
test_errors = []
if not st_vision_api_key:
test_errors.append("请先输入 API 密钥")
test_errors.append(tr("Please enter API key"))
if not model_name_input:
test_errors.append("请先输入模型名称")
test_errors.append(tr("Please enter model name"))
if test_errors:
for error in test_errors:
@ -534,8 +682,8 @@ def render_vision_llm_settings(tr):
else:
st.error(message)
except Exception as e:
st.error(f"测试连接时发生错误: {str(e)}")
logger.error(f"OpenAI 兼容 视分析模型连接测试失败: {str(e)}")
st.error(f"{tr('Connection test error')}: {str(e)}")
logger.error(f"OpenAI 兼容 视分析模型连接测试失败: {str(e)}")
# 验证和保存配置
validation_errors = []
@ -544,34 +692,42 @@ def render_vision_llm_settings(tr):
# 验证模型名称
if st_vision_model_name:
# 这里的验证逻辑可能需要微调,因为我们现在是自动组合的
is_valid, error_msg = validate_openai_compatible_model_name(st_vision_model_name, "分析")
is_valid, error_msg = validate_openai_compatible_model_name(st_vision_model_name, "分析")
if is_valid:
config.app["vision_openai_model_name"] = st_vision_model_name
config_changed |= update_app_config_if_changed(
"vision_openai_model_name",
st_vision_model_name
)
st.session_state["vision_openai_model_name"] = st_vision_model_name
config_changed = True
else:
validation_errors.append(error_msg)
# 验证 API 密钥
if st_vision_api_key:
is_valid, error_msg = validate_api_key(st_vision_api_key, "分析")
is_valid, error_msg = validate_api_key(st_vision_api_key, "分析")
if is_valid:
config.app["vision_openai_api_key"] = st_vision_api_key
config_changed |= update_app_config_if_changed(
"vision_openai_api_key",
st_vision_api_key
)
st.session_state["vision_openai_api_key"] = st_vision_api_key
config_changed = True
else:
validation_errors.append(error_msg)
# 验证 Base URL可选
if st_vision_base_url:
is_valid, error_msg = validate_base_url(st_vision_base_url, "分析")
is_valid, error_msg = validate_base_url(st_vision_base_url, "分析")
if is_valid:
config.app["vision_openai_base_url"] = st_vision_base_url
config_changed |= update_app_config_if_changed(
"vision_openai_base_url",
st_vision_base_url
)
st.session_state["vision_openai_base_url"] = st_vision_base_url
config_changed = True
else:
validation_errors.append(error_msg)
config_changed |= save_llm_generation_settings("vision", vision_generation_params)
# 显示验证错误
show_config_validation_errors(validation_errors)
@ -582,10 +738,10 @@ def render_vision_llm_settings(tr):
# 清除缓存,确保下次使用新配置
UnifiedLLMService.clear_cache()
if st_vision_api_key or st_vision_base_url or st_vision_model_name:
st.success(f"视频分析模型配置已保存OpenAI 兼容)")
st.success(tr("Vision model config saved"))
except Exception as e:
st.error(f"保存配置失败: {str(e)}")
logger.error(f"保存视分析配置失败: {str(e)}")
st.error(f"{tr('Failed to save config')}: {str(e)}")
logger.error(f"保存视分析配置失败: {str(e)}")
def test_text_model_connection(api_key, base_url, model_name, provider, tr):
@ -704,36 +860,35 @@ def render_text_llm_settings(tr):
text_base_url = config.app.get("text_openai_base_url", DEFAULT_OPENAI_COMPATIBLE_BASE_URL)
# 固定 provider 为 openai模型输入框保留完整模型名称
current_provider, current_model = get_openai_compatible_ui_values(
_current_provider, current_model = get_openai_compatible_ui_values(
full_text_model_name,
DEFAULT_TEXT_OPENAI_MODEL_NAME,
provider=DEFAULT_TEXT_LLM_PROVIDER,
)
# 定义支持的 provider 列表
OPENAI_COMPATIBLE_PROVIDERS = ["openai"]
selected_provider = DEFAULT_TEXT_LLM_PROVIDER
# 渲染配置输入框
col1, col2 = st.columns([1, 2])
with col1:
selected_provider = st.selectbox(
tr("Text Model Provider"),
options=OPENAI_COMPATIBLE_PROVIDERS,
index=OPENAI_COMPATIBLE_PROVIDERS.index(current_provider) if current_provider in OPENAI_COMPATIBLE_PROVIDERS else 0,
key="text_provider_select"
render_openai_compatible_protocol_field(
tr,
"Text Model Provider",
key="text_openai_protocol_display",
)
with col2:
model_name_input = st.text_input(
tr("Text Model Name"),
value=current_model,
help="输入完整模型名称\n\n"
"常用示例:\n"
"• Pro/zai-org/GLM-5\n"
"• deepseek/deepseek-chat\n"
"• gpt-4o\n"
"• deepseek-ai/DeepSeek-R1 (SiliconFlow)\n\n"
"支持常见 OpenAI 兼容网关(如 OpenAI/DeepSeek/OpenRouter/SiliconFlow",
help=(
tr("Model Name Input Help")
+ "\n\n"
+ "• Pro/zai-org/GLM-5\n"
+ "• deepseek/deepseek-chat\n"
+ "• gpt-4o\n"
+ "• deepseek-ai/DeepSeek-R1 (SiliconFlow)\n\n"
+ tr("OpenAI compatible providers help")
),
key="text_model_input"
)
@ -744,18 +899,20 @@ def render_text_llm_settings(tr):
tr("Text API Key"),
value=text_api_key,
type="password",
help="对应 provider 的 API 密钥\n\n"
"获取地址:\n"
"• DeepSeek: https://platform.deepseek.com/api_keys\n"
"• Gemini: https://makersuite.google.com/app/apikey\n"
"• OpenAI: https://platform.openai.com/api-keys\n"
"• Qwen: https://bailian.console.aliyun.com/\n"
"• SiliconFlow: https://cloud.siliconflow.cn/account/ak\n"
"• Moonshot: https://platform.moonshot.cn/console/api-keys"
help=(
tr("Provider API Key Help")
+ "\n\n"
+ "• DeepSeek: https://platform.deepseek.com/api_keys\n"
+ "• Gemini: https://makersuite.google.com/app/apikey\n"
+ "• OpenAI: https://platform.openai.com/api-keys\n"
+ "• Qwen: https://bailian.console.aliyun.com/\n"
+ "• SiliconFlow: https://cloud.siliconflow.cn/account/ak\n"
+ "• Moonshot: https://platform.moonshot.cn/console/api-keys"
)
)
text_base_help, text_base_required, text_placeholder = build_base_url_help(
selected_provider, "文案生成模型"
selected_provider, tr("Text model"), tr
)
st_text_base_url = st.text_input(
tr("Text Base URL"),
@ -765,15 +922,17 @@ def render_text_llm_settings(tr):
)
if text_base_required and not st_text_base_url:
info_example = text_placeholder or "https://your-openai-compatible-endpoint/v1"
st.info(f"请在上方填写 OpenAI 兼容网关地址,例如:{info_example}")
st.info(tr("Please fill OpenAI compatible gateway").format(example=info_example))
text_generation_params = render_llm_generation_settings(tr, "text")
# 添加测试连接按钮
if st.button(tr("Test Connection"), key="test_text_connection"):
test_errors = []
if not st_text_api_key:
test_errors.append("请先输入 API 密钥")
test_errors.append(tr("Please enter API key"))
if not model_name_input:
test_errors.append("请先输入模型名称")
test_errors.append(tr("Please enter model name"))
if test_errors:
for error in test_errors:
@ -793,7 +952,7 @@ def render_text_llm_settings(tr):
else:
st.error(message)
except Exception as e:
st.error(f"测试连接时发生错误: {str(e)}")
st.error(f"{tr('Connection test error')}: {str(e)}")
logger.error(f"OpenAI 兼容 文案生成模型连接测试失败: {str(e)}")
# 验证和保存配置
@ -804,9 +963,11 @@ def render_text_llm_settings(tr):
if st_text_model_name:
is_valid, error_msg = validate_openai_compatible_model_name(st_text_model_name, "文案生成")
if is_valid:
config.app["text_openai_model_name"] = st_text_model_name
text_config_changed |= update_app_config_if_changed(
"text_openai_model_name",
st_text_model_name
)
st.session_state["text_openai_model_name"] = st_text_model_name
text_config_changed = True
else:
text_validation_errors.append(error_msg)
@ -814,9 +975,11 @@ def render_text_llm_settings(tr):
if st_text_api_key:
is_valid, error_msg = validate_api_key(st_text_api_key, "文案生成")
if is_valid:
config.app["text_openai_api_key"] = st_text_api_key
text_config_changed |= update_app_config_if_changed(
"text_openai_api_key",
st_text_api_key
)
st.session_state["text_openai_api_key"] = st_text_api_key
text_config_changed = True
else:
text_validation_errors.append(error_msg)
@ -824,12 +987,16 @@ def render_text_llm_settings(tr):
if st_text_base_url:
is_valid, error_msg = validate_base_url(st_text_base_url, "文案生成")
if is_valid:
config.app["text_openai_base_url"] = st_text_base_url
text_config_changed |= update_app_config_if_changed(
"text_openai_base_url",
st_text_base_url
)
st.session_state["text_openai_base_url"] = st_text_base_url
text_config_changed = True
else:
text_validation_errors.append(error_msg)
text_config_changed |= save_llm_generation_settings("text", text_generation_params)
# 显示验证错误
show_config_validation_errors(text_validation_errors)
@ -840,9 +1007,9 @@ def render_text_llm_settings(tr):
# 清除缓存,确保下次使用新配置
UnifiedLLMService.clear_cache()
if st_text_api_key or st_text_base_url or st_text_model_name:
st.success(f"文案生成模型配置已保存OpenAI 兼容)")
st.success(tr("Text model config saved"))
except Exception as e:
st.error(f"保存配置失败: {str(e)}")
st.error(f"{tr('Failed to save config')}: {str(e)}")
logger.error(f"保存文案生成配置失败: {str(e)}")
# # Cloudflare 特殊配置

File diff suppressed because it is too large Load Diff

View File

@ -1,47 +1,573 @@
from loguru import logger
import streamlit as st
from app.config import config
from app.utils import utils
from webui.utils.cache import get_fonts_cache
import hashlib
import os
# Built-in subtitle-mask settings, keyed by video orientation and then by field.
# _get_subtitle_mask_value() returns these when config.ui has no stored override.
# NOTE(review): the *_percent fields are presumably percentages of the frame
# size; confirm against _resolve_subtitle_mask_region.
SUBTITLE_MASK_DEFAULTS = {
"landscape": {
"x_percent": 10,
"y_percent": 78,
"width_percent": 80,
"height_percent": 14,
"blur_radius": 18,
"opacity_percent": 82,
},
"portrait": {
"x_percent": 8,
"y_percent": 79,
"width_percent": 84,
"height_percent": 16,
"blur_radius": 26,
"opacity_percent": 84,
},
}
# Built-in subtitle position per orientation; used as the fallback by
# _get_orientation_subtitle_position_value().
SUBTITLE_POSITION_DEFAULTS = {
"landscape": {
"y_percent": 85,
},
"portrait": {
"y_percent": 82,
},
}
# File extensions accepted by the subtitle-mask preview video uploader.
VIDEO_PREVIEW_UPLOAD_TYPES = ["mp4", "mov", "avi", "flv", "mkv", "mpeg4"]
def render_subtitle_panel(tr):
"""渲染字幕设置面板"""
with st.container(border=True):
st.write(tr("Subtitle Settings"))
st.info("💡 提示:目前仅 **edge-tts** 引擎支持自动生成字幕,其他 TTS 引擎暂不支持。")
# 检查是否选择了 SoulVoice qwen3_tts引擎
from app.services import voice
# current_voice = st.session_state.get('voice_name', '')
tts_engine = config.ui.get('tts_engine', '')
is_disabled_subtitle = is_disabled_subtitle_settings(tts_engine)
if is_disabled_subtitle:
# SoulVoice 引擎时显示禁用提示
st.warning(f"⚠️ {tts_engine}不支持精确字幕生成")
st.info("💡 建议使用专业剪辑工具如剪映、PR等手动添加字幕")
st.warning(tr("TTS engine does not support precise subtitles").format(engine=tts_engine))
# 强制禁用字幕
st.session_state['subtitle_enabled'] = False
enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
st.session_state['subtitle_enabled'] = enable_subtitles
# 显示禁用状态的复选框
st.checkbox(
tr("Enable Subtitles"),
value=False,
disabled=True,
help="SoulVoice 引擎不支持字幕生成,请使用其他 TTS 引擎"
)
if enable_subtitles:
render_subtitle_mask_settings(tr)
render_auto_transcription_settings(tr)
render_font_settings(tr)
render_position_settings(tr)
render_style_settings(tr)
else:
# 其他引擎正常显示字幕选项
enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True)
st.session_state['subtitle_enabled'] = enable_subtitles
st.session_state['subtitle_mask_enabled'] = False
config.ui["subtitle_mask_enabled"] = False
st.session_state['subtitle_auto_transcribe_enabled'] = False
config.fun_asr["auto_transcribe_enabled"] = False
if enable_subtitles:
render_font_settings(tr)
render_position_settings(tr)
render_style_settings(tr)
def _subtitle_mask_key(orientation, field):
    """Build the config/session key for one subtitle-mask field of an orientation."""
    return "subtitle_mask_{}_{}".format(orientation, field)
def _get_subtitle_mask_value(orientation, field):
    """Return the configured subtitle-mask value, or its built-in default.

    Reads ``config.ui`` under the key from ``_subtitle_mask_key`` and falls
    back to ``SUBTITLE_MASK_DEFAULTS[orientation][field]``.
    """
    fallback = SUBTITLE_MASK_DEFAULTS[orientation][field]
    return config.ui.get(_subtitle_mask_key(orientation, field), fallback)
def _set_subtitle_mask_value(orientation, field, value):
    """Store a subtitle-mask value in both ``config.ui`` and ``st.session_state``."""
    storage_key = _subtitle_mask_key(orientation, field)
    for store in (config.ui, st.session_state):
        store[storage_key] = value
def _subtitle_position_key(orientation, field):
    """Build the config/session key for one subtitle-position field of an orientation."""
    return "subtitle_position_{}_{}".format(orientation, field)
def _get_orientation_subtitle_position_value(orientation, field):
    """Return the configured subtitle-position value, or its built-in default.

    Reads ``config.ui`` under the key from ``_subtitle_position_key`` and
    falls back to ``SUBTITLE_POSITION_DEFAULTS[orientation][field]``.
    """
    fallback = SUBTITLE_POSITION_DEFAULTS[orientation][field]
    return config.ui.get(_subtitle_position_key(orientation, field), fallback)
def _set_orientation_subtitle_position_value(orientation, field, value):
    """Store a subtitle-position value in both ``config.ui`` and ``st.session_state``."""
    storage_key = _subtitle_position_key(orientation, field)
    for store in (config.ui, st.session_state):
        store[storage_key] = value
def _format_preview_time(seconds):
    """Format a time offset as ``MM:SS.s`` (minutes, seconds with one decimal).

    Args:
        seconds: Offset in seconds. ``None`` and negative values are treated
            as zero.

    Returns:
        A string such as ``"01:15.5"``. Minutes are not capped at 59.
    """
    seconds = max(0.0, float(seconds or 0))
    # Round to tenths *before* splitting into minutes and seconds. Rounding
    # only at format time lets 59.96 s render as "00:60.0" instead of "01:00.0".
    total_tenths = round(seconds * 10)
    minutes, remaining_tenths = divmod(total_tenths, 600)
    return f"{minutes:02d}:{remaining_tenths / 10:04.1f}"
def _get_current_preview_video_path():
    """Pick the video to use for the subtitle-mask preview.

    Candidates are checked in order and the first one that exists on disk wins:
    the uploaded preview video, then ``video_origin_path``, then each entry of
    ``video_origin_paths`` (all read from ``st.session_state``).

    Returns:
        The chosen path, or ``""`` when no candidate exists.
    """
    state = st.session_state

    def exists_on_disk(candidate):
        return isinstance(candidate, str) and bool(candidate) and os.path.exists(candidate)

    uploaded = state.get("subtitle_mask_preview_video_path")
    if uploaded and os.path.exists(uploaded):
        return uploaded

    single_source = state.get("video_origin_path", "")
    if exists_on_disk(single_source):
        return single_source

    source_list = state.get("video_origin_paths", [])
    if not isinstance(source_list, list):
        return ""
    return next((item for item in source_list if exists_on_disk(item)), "")
def _save_subtitle_mask_preview_video(uploaded_file):
    """Write an uploaded preview video to a temp folder and remember its path.

    The upload is identified by ``"<name>:<size>"``. When that signature
    matches the one stored in ``st.session_state`` and the stored file still
    exists, the stored path is returned without writing again.

    Args:
        uploaded_file: Object exposing ``name``, ``size`` and ``getbuffer()``,
            or ``None`` when nothing was uploaded.

    Returns:
        Path of the saved file, or ``""`` when ``uploaded_file`` is ``None``.
    """
    if uploaded_file is None:
        return ""

    state = st.session_state
    upload_signature = "{}:{}".format(uploaded_file.name, uploaded_file.size)
    known_signature = state.get("subtitle_mask_preview_upload_signature")
    known_path = state.get("subtitle_mask_preview_video_path", "")
    if upload_signature == known_signature and known_path and os.path.exists(known_path):
        return known_path

    output_dir = utils.temp_dir("subtitle_mask_preview")
    file_name = os.path.basename(uploaded_file.name).strip() or "preview.mp4"
    # A short hash of the signature is prefixed to the stored file name.
    short_hash = hashlib.md5(upload_signature.encode("utf-8")).hexdigest()[:10]
    destination = os.path.join(output_dir, f"{short_hash}_{file_name}")
    with open(destination, "wb") as handle:
        handle.write(uploaded_file.getbuffer())

    state["subtitle_mask_preview_upload_signature"] = upload_signature
    state["subtitle_mask_preview_video_path"] = destination
    return destination
def _video_mtime(video_path):
    """Return the file's modification time, or ``0`` when it cannot be read (``OSError``)."""
    modified_at = 0
    try:
        modified_at = os.path.getmtime(video_path)
    except OSError:
        pass
    return modified_at
@st.cache_data(show_spinner=False)
def _probe_subtitle_mask_preview_video(video_path, mtime):
    """Open the video and report its duration and frame size.

    Args:
        video_path: Path of the video file to inspect.
        mtime: Not read by the body; it only takes part in the call as an
            argument of the ``st.cache_data``-wrapped function.

    Returns:
        A dict with ``duration`` (float, ``0.0`` when unknown), ``width`` and
        ``height`` (ints).
    """
    from moviepy import VideoFileClip

    clip = VideoFileClip(video_path)
    try:
        duration_seconds = float(clip.duration or 0)
        frame_width = int(clip.w)
        frame_height = int(clip.h)
        return {
            "duration": duration_seconds,
            "width": frame_width,
            "height": frame_height,
        }
    finally:
        # Always release the clip, including when probing fails.
        clip.close()
@st.cache_data(show_spinner=False)
def _extract_subtitle_mask_preview_frame(video_path, timestamp, mtime):
    """Grab one frame of the video as a ``uint8`` NumPy array.

    Args:
        video_path: Path of the video file to read.
        timestamp: Requested position in seconds; clamped to
            ``[0, clip duration]`` before the frame is fetched.
        mtime: Not read by the body; it only takes part in the call as an
            argument of the ``st.cache_data``-wrapped function.
    """
    import numpy as np
    from moviepy import VideoFileClip

    clip = VideoFileClip(video_path)
    try:
        requested = float(timestamp or 0)
        clip_end = max(float(clip.duration or 0), 0.0)
        safe_time = min(max(requested, 0.0), clip_end)

        pixels = np.asarray(clip.get_frame(safe_time))
        if pixels.dtype == np.uint8:
            return pixels
        # Other dtypes are clipped to the 0-255 range and converted.
        return np.clip(pixels, 0, 255).astype(np.uint8)
    finally:
        clip.close()
def _build_subtitle_mask_preview_options():
    """Collect the current mask and subtitle-position settings into one dict.

    The result has ``subtitle_mask_enabled`` set to ``True`` plus, for both
    orientations, every mask field and the subtitle ``y_percent``, keyed with
    the same names used in ``config.ui``.
    """
    mask_fields = (
        "x_percent",
        "y_percent",
        "width_percent",
        "height_percent",
        "blur_radius",
        "opacity_percent",
    )
    options = {"subtitle_mask_enabled": True}
    for orientation in ("landscape", "portrait"):
        options.update(
            (_subtitle_mask_key(orientation, name), _get_subtitle_mask_value(orientation, name))
            for name in mask_fields
        )
        position_key = _subtitle_position_key(orientation, "y_percent")
        options[position_key] = _get_orientation_subtitle_position_value(orientation, "y_percent")
    return options
def _draw_subtitle_mask_preview(frame):
    """Overlay the mask rectangle and the subtitle baseline on a video frame.

    Args:
        frame: Image array accepted by ``PIL.Image.fromarray``.

    Returns:
        ``(image, region)``: the annotated RGB image and the region dict
        returned by ``_resolve_subtitle_mask_region``.
    """
    from PIL import Image, ImageDraw
    from app.services.generate_video import _resolve_subtitle_mask_region

    canvas = Image.fromarray(frame).convert("RGBA")
    region = _resolve_subtitle_mask_region(
        canvas.width,
        canvas.height,
        _build_subtitle_mask_preview_options(),
    )

    layer = Image.new("RGBA", canvas.size, (0, 0, 0, 0))
    pen = ImageDraw.Draw(layer)
    # Stroke width scales with the shorter frame side, with a floor of 2 px.
    stroke = max(2, round(min(canvas.width, canvas.height) * 0.004))

    left, top = region["x"], region["y"]
    box = (left, top, left + region["width"], top + region["height"])
    pen.rounded_rectangle(
        box,
        radius=region["corner_radius"],
        fill=(0, 0, 0, 96),
        outline=(255, 75, 85, 235),
        width=stroke,
    )

    baseline_percent = _get_orientation_subtitle_position_value(region["orientation"], "y_percent")
    baseline_y = round((canvas.height - 1) * baseline_percent / 100)
    pen.line(
        (0, baseline_y, canvas.width, baseline_y),
        fill=(59, 130, 246, 220),
        width=stroke,
    )

    canvas.alpha_composite(layer)
    return canvas.convert("RGB"), region
def _resize_subtitle_mask_preview_image(image, max_width=520, max_height=360):
    """Return a copy of ``image`` reduced via ``thumbnail((max_width, max_height))``.

    The input image itself is left untouched.
    """
    shrunk = image.copy()
    size_limit = (max_width, max_height)
    shrunk.thumbnail(size_limit)
    return shrunk
def _render_subtitle_mask_preview(tr):
    """Render the subtitle-mask preview: video picker, timeline slider and annotated frame.

    A previously uploaded preview video is offered with a "change" button;
    otherwise a file uploader is shown. Without an uploaded file the video
    from ``_get_current_preview_video_path`` is used. The selected frame is
    drawn with the mask overlay and displayed.

    Any failure while probing the video or rendering the frame is logged with
    its traceback and reported to the user as a generic warning, so a broken
    preview never interrupts the rest of the settings panel.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    st.subheader(tr("Subtitle Mask Preview"))
    uploaded_path = st.session_state.get("subtitle_mask_preview_video_path", "")
    if uploaded_path and os.path.exists(uploaded_path):
        preview_cols = st.columns([0.68, 0.32], vertical_alignment="center")
        with preview_cols[0]:
            st.caption(
                tr("Using Subtitle Mask Preview Video").format(
                    file=os.path.basename(uploaded_path)
                )
            )
        with preview_cols[1]:
            if st.button(
                tr("Change Subtitle Mask Preview Video"),
                key="change_subtitle_mask_preview_video",
                use_container_width=True,
            ):
                # Forget the stored upload so the uploader is shown on the next run.
                st.session_state.pop("subtitle_mask_preview_video_path", None)
                st.session_state.pop("subtitle_mask_preview_upload_signature", None)
                st.rerun(scope="fragment")
    else:
        uploaded_file = st.file_uploader(
            tr("Upload Subtitle Mask Preview Video"),
            type=VIDEO_PREVIEW_UPLOAD_TYPES,
            key="subtitle_mask_preview_video_uploader",
            help=tr("Upload Subtitle Mask Preview Video Help"),
        )
        uploaded_path = _save_subtitle_mask_preview_video(uploaded_file)
        if uploaded_path:
            st.rerun(scope="fragment")

    preview_video_path = uploaded_path or _get_current_preview_video_path()
    if not preview_video_path:
        st.info(tr("Subtitle Mask Preview Empty"))
        return

    try:
        mtime = _video_mtime(preview_video_path)
        video_info = _probe_subtitle_mask_preview_video(preview_video_path, mtime)
        duration = max(0.0, video_info["duration"])
        if duration <= 0:
            st.warning(tr("Subtitle Mask Preview Failed"))
            return

        selected_time = st.slider(
            tr("Subtitle Mask Preview Timeline"),
            min_value=0.0,
            max_value=duration,
            # Keep a remembered position inside the current video's length.
            value=min(float(st.session_state.get("subtitle_mask_preview_time", 0.0)), duration),
            step=0.1,
            format="%.1f",
            key="subtitle_mask_preview_time",
            help=tr("Subtitle Mask Preview Timeline Help"),
        )
        frame = _extract_subtitle_mask_preview_frame(preview_video_path, selected_time, mtime)
        preview_image, region = _draw_subtitle_mask_preview(frame)
        preview_image = _resize_subtitle_mask_preview_image(preview_image, max_width=420, max_height=280)
        st.image(
            preview_image,
            caption=tr("Subtitle Mask Preview Frame Caption").format(
                time=_format_preview_time(selected_time),
                orientation=tr("Portrait") if region["orientation"] == "portrait" else tr("Landscape"),
            ),
        )
    except Exception:
        # Best-effort preview: record the cause for diagnosis instead of
        # dropping it, then show the same generic warning as before.
        logger.exception(f"Failed to render subtitle mask preview for {preview_video_path}")
        st.warning(tr("Subtitle Mask Preview Failed"))
def _render_subtitle_mask_region_controls(tr, orientation):
    """Render the sliders that edit the subtitle mask region for one orientation.

    Every slider takes its initial value from ``_get_subtitle_mask_value`` and
    writes the chosen value back through ``_set_subtitle_mask_value``.

    Args:
        tr: Translation callable mapping a message key to display text.
        orientation: Orientation name used in the widget keys and passed to
            the value getter/setter (callers in this file pass
            ``"landscape"`` or ``"portrait"``).
    """

    def _fixed_range_slider(field, label, help_text, upper):
        # Slider over the fixed range [0, upper]; persists and returns the choice.
        stored = int(_get_subtitle_mask_value(orientation, field))
        chosen = st.slider(
            label,
            min_value=0,
            max_value=upper,
            # st.slider rejects a default outside [min_value, max_value], so a
            # stale or hand-edited stored value is clamped instead of crashing.
            value=max(0, min(stored, upper)),
            help=help_text,
            key=f"{orientation}_subtitle_mask_{field}",
        )
        _set_subtitle_mask_value(orientation, field, chosen)
        return chosen

    def _extent_slider(field, label, help_text, offset):
        # Size slider whose upper bound shrinks as the region's offset grows.
        upper = max(2, 100 - offset)
        widget_key = f"{orientation}_subtitle_mask_{field}"
        # Pull any existing widget state back into [2, upper] before the
        # slider is instantiated with the (possibly smaller) new bound.
        if st.session_state.get(widget_key, 2) < 2:
            st.session_state[widget_key] = 2
        if st.session_state.get(widget_key, 0) > upper:
            st.session_state[widget_key] = upper
        stored = int(_get_subtitle_mask_value(orientation, field))
        chosen = st.slider(
            label,
            min_value=2,
            max_value=upper,
            # Clamp on both sides: the original only capped the upper side, so
            # a stored value below 2 made the default fall under min_value.
            value=max(2, min(stored, upper)),
            help=help_text,
            key=widget_key,
        )
        _set_subtitle_mask_value(orientation, field, chosen)
        return chosen

    x_percent = _fixed_range_slider(
        "x_percent",
        tr("Subtitle Mask Left"),
        tr("Subtitle Mask Left Help"),
        99,
    )
    y_percent = _fixed_range_slider(
        "y_percent",
        tr("Subtitle Mask Top"),
        tr("Subtitle Mask Top Help"),
        99,
    )
    # Width is limited by the left offset, height by the top offset.
    _extent_slider(
        "width_percent",
        tr("Subtitle Mask Width"),
        tr("Subtitle Mask Width Help"),
        x_percent,
    )
    _extent_slider(
        "height_percent",
        tr("Subtitle Mask Height"),
        tr("Subtitle Mask Height Help"),
        y_percent,
    )
    _fixed_range_slider(
        "blur_radius",
        tr("Subtitle Mask Blur Radius"),
        tr("Subtitle Mask Blur Radius Help"),
        200,
    )
    _fixed_range_slider(
        "opacity_percent",
        tr("Subtitle Mask Opacity"),
        tr("Subtitle Mask Opacity Help"),
        100,
    )
def _render_subtitle_position_controls(tr, orientation):
    """Render the slider that sets the burned-in subtitle position.

    The initial value comes from ``_get_orientation_subtitle_position_value``
    and the chosen value is written back through
    ``_set_orientation_subtitle_position_value``.

    Args:
        tr: Translation callable mapping a message key to display text.
        orientation: Orientation name used in the widget key and passed to
            the value getter/setter.
    """
    label = tr("Subtitle Burn Position")
    current_y = int(_get_orientation_subtitle_position_value(orientation, "y_percent"))
    help_text = tr("Subtitle Burn Position Help")
    widget_key = f"{orientation}_subtitle_burn_y_percent"

    chosen_y = st.slider(
        label,
        min_value=0,
        max_value=99,
        value=current_y,
        help=help_text,
        key=widget_key,
    )
    _set_orientation_subtitle_position_value(orientation, "y_percent", chosen_y)
def _render_subtitle_mask_dialog(tr):
    """Open the modal dialog for editing subtitle mask and subtitle position settings.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    # NOTE(review): the block nesting below was reconstructed from source whose
    # indentation was lost — confirm which container the tabs and the Save
    # button belong to.
    @st.dialog(tr("Subtitle Mask Settings"), width="large")
    def subtitle_mask_dialog():
        preview_col, settings_col = st.columns([1, 1], vertical_alignment="top")
        # The settings column is filled before the preview column even though
        # the preview column is declared first; presumably so the preview is
        # drawn after the sliders have written their values — TODO confirm.
        with settings_col:
            st.caption(tr("Subtitle Mask Settings Caption"))
            st.caption(tr("Subtitle Mask Preview Caption"))
            landscape_mask_tab, portrait_mask_tab, landscape_position_tab, portrait_position_tab = st.tabs([
                tr("Landscape Subtitle Mask"),
                tr("Portrait Subtitle Mask"),
                tr("Landscape Subtitle Position"),
                tr("Portrait Subtitle Position"),
            ])
            # One tab per (setting kind, orientation) pair.
            with landscape_mask_tab:
                _render_subtitle_mask_region_controls(tr, "landscape")
            with portrait_mask_tab:
                _render_subtitle_mask_region_controls(tr, "portrait")
            with landscape_position_tab:
                _render_subtitle_position_controls(tr, "landscape")
            with portrait_position_tab:
                _render_subtitle_position_controls(tr, "portrait")
        with preview_col:
            _render_subtitle_mask_preview(tr)
        # Saving writes the configuration and triggers a rerun.
        if st.button(tr("Save Subtitle Mask Settings"), type="primary", use_container_width=True):
            config.save_config()
            st.rerun()

    # Calling the decorated function is what opens the dialog.
    subtitle_mask_dialog()
def render_subtitle_mask_settings(tr):
    """Render the original-subtitle mask settings.

    Shows the enable checkbox and mirrors its state into both
    ``st.session_state`` and ``config.ui``. When the mask is enabled, also
    shows a button that opens the mask dialog and a one-line summary of the
    landscape and portrait mask regions.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    enabled = st.checkbox(
        tr("Enable Subtitle Mask"),
        value=bool(config.ui.get("subtitle_mask_enabled", False)),
        help=tr("Enable Subtitle Mask Help"),
        key="subtitle_mask_enabled_checkbox",
    )
    st.session_state['subtitle_mask_enabled'] = enabled
    config.ui["subtitle_mask_enabled"] = enabled

    # Nothing else is shown while the mask is switched off.
    if not enabled:
        return

    button_col, summary_col = st.columns([0.35, 0.65], vertical_alignment="center")

    with button_col:
        open_requested = st.button(
            tr("Set Subtitle Mask"),
            key="set_subtitle_mask",
            use_container_width=True,
        )
        if open_requested:
            _render_subtitle_mask_dialog(tr)

    with summary_col:
        template = tr("Subtitle Mask Summary")
        # Placeholders are named "<orientation>_<axis>", e.g. "landscape_x".
        summary_values = {
            f"{side}_{axis}": _get_subtitle_mask_value(side, f"{axis}_percent")
            for side in ("landscape", "portrait")
            for axis in ("x", "y", "width", "height")
        }
        st.caption(template.format(**summary_values))
def _get_saved_auto_transcribe_backend():
    """Return the saved auto-transcription backend, inferring one if needed.

    Returns:
        The normalized ``backend`` value from ``config.fun_asr`` when it is
        one of ``"local"``, ``"firered"`` or ``"bailian"``. Otherwise
        ``"bailian"`` when an API key is stored without an API URL, and
        ``"local"`` in every other case.
    """
    backend = str(config.fun_asr.get("backend", "")).strip().lower()
    if backend in {"local", "firered", "bailian"}:
        return backend

    # Unknown or missing backend: infer it from which credentials are stored.
    if config.fun_asr.get("api_key") and not config.fun_asr.get("api_url"):
        return "bailian"
    return "local"
def render_auto_transcription_settings(tr):
    """Render the automatic transcription settings for the final video.

    Shows the enable checkbox and, when enabled, a backend selector with the
    inputs for the chosen backend. The resulting values are written to
    ``config.fun_asr`` and mirrored into ``st.session_state`` under the
    ``subtitle_auto_transcribe_*`` keys. When disabled, only the stored
    values are mirrored into session state.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    from app.services import fun_asr_subtitle

    enabled = st.checkbox(
        tr("Enable Auto Transcription"),
        value=bool(config.fun_asr.get("auto_transcribe_enabled", False)),
        help=tr("Enable Auto Transcription Help"),
        key="subtitle_auto_transcribe_enabled_checkbox",
    )
    st.session_state['subtitle_auto_transcribe_enabled'] = enabled
    config.fun_asr["auto_transcribe_enabled"] = enabled

    # Stored values act as defaults for the widgets below.
    backend = _get_saved_auto_transcribe_backend()
    api_url = config.fun_asr.get("api_url", fun_asr_subtitle.LOCAL_FUN_ASR_API_URL)
    firered_api_url = config.fun_asr.get("firered_api_url", fun_asr_subtitle.LOCAL_FIRERED_ASR_API_URL)
    hotword = config.fun_asr.get("hotword", "")
    enable_spk = bool(config.fun_asr.get("enable_spk", False))
    api_key = config.fun_asr.get("api_key", "")

    if not enabled:
        # No widgets are shown; publish the stored values unchanged.
        stored_state = {
            'subtitle_auto_transcribe_backend': backend,
            'subtitle_auto_transcribe_api_url': api_url,
            'subtitle_auto_transcribe_firered_api_url': firered_api_url,
            'subtitle_auto_transcribe_hotword': hotword,
            'subtitle_auto_transcribe_enable_spk': enable_spk,
            'subtitle_auto_transcribe_api_key': api_key,
        }
        for state_key, state_value in stored_state.items():
            st.session_state[state_key] = state_value
        return

    # Maps the translated label shown in the selector to the backend id.
    backend_options = {
        tr("Local FunASR-Pack API"): "local",
        tr("Local FireRedASR API"): "firered",
        tr("Ali Bailian Online Fun-ASR"): "bailian",
    }
    option_labels = list(backend_options)
    preselected = list(backend_options.values()).index(backend)
    chosen_label = st.selectbox(
        tr("Subtitle Processing Method"),
        options=option_labels,
        index=preselected,
        key="subtitle_auto_transcribe_backend_select",
    )
    backend = backend_options[chosen_label]

    if backend == "local":
        st.caption(tr("Auto Transcription Local Caption"))
        api_url = st.text_input(
            tr("Local FunASR-Pack API URL"),
            value=api_url,
            help=tr("Local FunASR-Pack API URL Help"),
            key="subtitle_auto_transcribe_api_url_input",
        )
        hotword = st.text_input(
            tr("Fun-ASR Hotword"),
            value=hotword,
            help=tr("Fun-ASR Hotword Help"),
            key="subtitle_auto_transcribe_hotword_input",
        )
        enable_spk = st.checkbox(
            tr("Enable speaker diarization"),
            value=enable_spk,
            help=tr("Enable speaker diarization Help"),
            key="subtitle_auto_transcribe_enable_spk_checkbox",
        )
    elif backend == "firered":
        st.caption(tr("Auto Transcription FireRed Caption"))
        firered_api_url = st.text_input(
            tr("Local FireRedASR API URL"),
            value=firered_api_url,
            help=tr("Local FireRedASR API URL Help"),
            key="subtitle_auto_transcribe_firered_api_url_input",
        )
    else:
        st.caption(tr("Auto Transcription Online Caption"))
        st.markdown(
            f"{tr('API Key URL')}: "
            "[https://bailian.console.aliyun.com/?tab=model#/api-key]"
            "(https://bailian.console.aliyun.com/?tab=model#/api-key)"
        )
        api_key = st.text_input(
            tr("Ali Bailian API Key"),
            value=api_key,
            type="password",
            help=tr("Ali Bailian API Key Help"),
            key="subtitle_auto_transcribe_api_key_input",
        )

    # (config key, session-state key, cleaned value); text values are stripped.
    cleaned_settings = (
        ("backend", 'subtitle_auto_transcribe_backend', backend),
        ("api_url", 'subtitle_auto_transcribe_api_url', str(api_url).strip()),
        ("firered_api_url", 'subtitle_auto_transcribe_firered_api_url', str(firered_api_url).strip()),
        ("api_key", 'subtitle_auto_transcribe_api_key', str(api_key).strip()),
        ("hotword", 'subtitle_auto_transcribe_hotword', str(hotword).strip()),
        ("enable_spk", 'subtitle_auto_transcribe_enable_spk', bool(enable_spk)),
    )
    for config_key, _, cleaned in cleaned_settings:
        config.fun_asr[config_key] = cleaned
    config.fun_asr["model"] = "fun-asr"
    for _, state_key, cleaned in cleaned_settings:
        st.session_state[state_key] = cleaned
def render_font_settings(tr):
@ -90,7 +616,7 @@ def render_font_settings(tr):
def is_disabled_subtitle_settings(tts_engine: str) -> bool:
    """Return whether subtitle settings are disabled for the given TTS engine.

    Args:
        tts_engine: Engine identifier to check.

    Returns:
        ``True`` for ``"soulvoice"``, ``"qwen3_tts"`` and the engine named by
        ``config.OMNIVOICE_ENGINE``; ``False`` otherwise.
    """
    # The block previously carried two consecutive return statements, which
    # left the OmniVoice check unreachable; only the complete check is kept.
    if tts_engine in ("soulvoice", "qwen3_tts"):
        return True
    return tts_engine == config.OMNIVOICE_ENGINE
def render_position_settings(tr):
"""渲染位置设置"""
@ -154,6 +680,46 @@ def get_subtitle_params():
font_name = st.session_state.get('font_name') or "SimHei"
return {
'subtitle_enabled': st.session_state.get('subtitle_enabled', True),
'subtitle_mask_enabled': st.session_state.get('subtitle_mask_enabled', False),
'subtitle_mask_landscape_x_percent': _get_subtitle_mask_value("landscape", "x_percent"),
'subtitle_mask_landscape_y_percent': _get_subtitle_mask_value("landscape", "y_percent"),
'subtitle_mask_landscape_width_percent': _get_subtitle_mask_value("landscape", "width_percent"),
'subtitle_mask_landscape_height_percent': _get_subtitle_mask_value("landscape", "height_percent"),
'subtitle_mask_landscape_blur_radius': _get_subtitle_mask_value("landscape", "blur_radius"),
'subtitle_mask_landscape_opacity_percent': _get_subtitle_mask_value("landscape", "opacity_percent"),
'subtitle_mask_portrait_x_percent': _get_subtitle_mask_value("portrait", "x_percent"),
'subtitle_mask_portrait_y_percent': _get_subtitle_mask_value("portrait", "y_percent"),
'subtitle_mask_portrait_width_percent': _get_subtitle_mask_value("portrait", "width_percent"),
'subtitle_mask_portrait_height_percent': _get_subtitle_mask_value("portrait", "height_percent"),
'subtitle_mask_portrait_blur_radius': _get_subtitle_mask_value("portrait", "blur_radius"),
'subtitle_mask_portrait_opacity_percent': _get_subtitle_mask_value("portrait", "opacity_percent"),
'subtitle_position_landscape_y_percent': _get_orientation_subtitle_position_value("landscape", "y_percent"),
'subtitle_position_portrait_y_percent': _get_orientation_subtitle_position_value("portrait", "y_percent"),
'subtitle_auto_transcribe_enabled': st.session_state.get('subtitle_auto_transcribe_enabled', False),
'subtitle_auto_transcribe_backend': st.session_state.get(
'subtitle_auto_transcribe_backend',
_get_saved_auto_transcribe_backend()
),
'subtitle_auto_transcribe_api_url': st.session_state.get(
'subtitle_auto_transcribe_api_url',
config.fun_asr.get("api_url", "")
),
'subtitle_auto_transcribe_firered_api_url': st.session_state.get(
'subtitle_auto_transcribe_firered_api_url',
config.fun_asr.get("firered_api_url", "")
),
'subtitle_auto_transcribe_api_key': st.session_state.get(
'subtitle_auto_transcribe_api_key',
config.fun_asr.get("api_key", "")
),
'subtitle_auto_transcribe_hotword': st.session_state.get(
'subtitle_auto_transcribe_hotword',
config.fun_asr.get("hotword", "")
),
'subtitle_auto_transcribe_enable_spk': st.session_state.get(
'subtitle_auto_transcribe_enable_spk',
bool(config.fun_asr.get("enable_spk", False))
),
'font_name': font_name,
'font_size': st.session_state.get('font_size', 60),
'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'),

View File

@ -3,6 +3,8 @@ import os
import shutil
from loguru import logger
from app.config import config
from app.utils import ffmpeg_detector, ffmpeg_utils
from app.utils.utils import storage_dir
@ -27,6 +29,162 @@ def clear_directory(dir_path, tr):
else:
st.warning(tr("Directory does not exist"))
def _format_engine_label(engines_by_path, tr):
    """Build a formatter that turns an FFmpeg path into a display label.

    Args:
        engines_by_path: Mapping from engine path to that engine's info dict.
        tr: Translation callable mapping a message key to display text.

    Returns:
        A one-argument callable producing
        ``"<source> - <version> - <path> (<status>)"`` for a path.
    """
    def formatter(path):
        info = engines_by_path.get(path, {})

        origin = info.get("source", "")
        lookup_key = f"FFmpeg source {origin}"
        translated = tr(lookup_key)
        # A translation equal to its key is treated as "no translation".
        origin_label = origin if translated == lookup_key else translated

        raw_version = str(info.get("version_line", ""))
        version = raw_version.replace("ffmpeg version", "").strip() or "unknown version"

        status = _status_text(info.get("available"), tr)
        return f"{origin_label} - {version} - {path} ({status})"

    return formatter
def _status_text(value, tr):
return tr("Available") if value else tr("Unavailable")
def _render_ffmpeg_report(report, tr):
    """Render a validation report for one FFmpeg engine.

    Shows the path and version, four availability metrics, an overall
    verdict, detail columns for hardware acceleration and subtitle burn-in,
    any reported errors, and the raw report as JSON.

    Args:
        report: Report dict; the keys read here are ``path``,
            ``version_line``, ``ffmpeg_available``, ``ffprobe_available``,
            ``hardware_acceleration``, ``subtitle_burn``, ``hwaccels`` and
            ``errors``.
        tr: Translation callable mapping a message key to display text.
    """
    # Read the nested sections once. `or {}` also covers keys that are present
    # but hold None, which a `.get(key, {})` default would not.
    hwaccel = report.get("hardware_acceleration") or {}
    subtitle_burn = report.get("subtitle_burn") or {}

    st.write(f"**{tr('FFmpeg detection details')}**")
    st.caption(f"{tr('Path')}: {report.get('path', '')}")
    version_line = report.get("version_line")
    if version_line:
        st.caption(f"{tr('Version')}: {version_line}")

    metrics = (
        ("FFmpeg", report.get("ffmpeg_available")),
        ("FFprobe", report.get("ffprobe_available")),
        (tr("Hardware Acceleration"), hwaccel.get("available")),
        (tr("Subtitle Burn-in"), subtitle_burn.get("available")),
    )
    for column, (label, available) in zip(st.columns(4), metrics):
        with column:
            st.metric(label, _status_text(available, tr))

    # Overall verdict: FFmpeg and subtitle burn-in are required; hardware
    # acceleration only downgrades the result to a warning.
    if not (report.get("ffmpeg_available") and subtitle_burn.get("available")):
        st.error(tr("FFmpeg engine check failed"))
    elif hwaccel.get("available"):
        st.success(tr("FFmpeg engine passed all checks"))
    else:
        st.warning(tr("FFmpeg engine works but hardware acceleration is unavailable"))

    hw_col, burn_col = st.columns(2)
    with hw_col:
        st.write(f"**{tr('Hardware acceleration detail')}**")
        st.write(f"- {tr('Type')}: {hwaccel.get('type') or '-'}")
        st.write(f"- {tr('Encoder')}: {hwaccel.get('encoder') or '-'}")
        st.write(f"- {tr('Message')}: {hwaccel.get('message') or '-'}")
        hwaccels = report.get("hwaccels") or []
        st.write(f"- {tr('Supported Hardware Methods')}: {', '.join(hwaccels) if hwaccels else '-'}")
    with burn_col:
        filters = subtitle_burn.get("filters") or {}
        st.write(f"**{tr('Subtitle burn-in detail')}**")
        st.write(f"- {tr('Method')}: {subtitle_burn.get('method') or '-'}")
        st.write(f"- {tr('Message')}: {subtitle_burn.get('message') or '-'}")
        filters_text = ", ".join(
            f"{name}={_status_text(enabled, tr)}"
            for name, enabled in filters.items()
        )
        # Show "-" for an empty filter list, matching the other detail lines.
        st.write(f"- {tr('Subtitle Filters')}: {filters_text or '-'}")

    errors = report.get("errors") or []
    if errors:
        with st.expander(tr("FFmpeg errors")):
            for error in errors:
                st.write(f"- {error}")

    with st.expander(tr("Raw FFmpeg report")):
        st.json(report)
def render_ffmpeg_engine_settings(tr):
    """Render FFmpeg engine discovery, selection and diagnostics.

    Lists the engines returned by ``ffmpeg_detector.discover_ffmpeg_engines``,
    accepts an optional custom path (which takes precedence over the
    selection), and offers buttons to save the effective path to the config
    or to validate it. The most recent validation report kept in
    ``st.session_state["ffmpeg_engine_report"]`` is rendered at the end.

    Args:
        tr: Translation callable mapping a message key to display text.
    """
    st.divider()
    st.subheader(tr("FFmpeg Engine Detection"))

    discovered = ffmpeg_detector.discover_ffmpeg_engines(
        configured_path=config.app.get("ffmpeg_path", ""),
        root_dir=config.root_dir,
    )
    engines_by_path = {}
    for entry in discovered:
        engines_by_path[entry["path"]] = entry
    engine_paths = list(engines_by_path)

    selected_path = ""
    if engine_paths:
        # Preselect the configured engine when it is among the discovered ones.
        saved_path = config.app.get("ffmpeg_path", "")
        start_index = engine_paths.index(saved_path) if saved_path in engines_by_path else 0
        selected_path = st.selectbox(
            tr("FFmpeg Engine"),
            options=engine_paths,
            index=start_index,
            format_func=_format_engine_label(engines_by_path, tr),
            help=tr("FFmpeg Engine Help"),
        )
    else:
        st.warning(tr("No FFmpeg engines found"))

    typed_path = st.text_input(
        tr("Custom FFmpeg Path"),
        value="",
        help=tr("Custom FFmpeg Path Help"),
        placeholder="/path/to/ffmpeg",
    )
    custom_path = typed_path.strip()
    # A non-empty custom path wins over the selectbox choice.
    effective_path = custom_path or selected_path

    active_path = config.app.get("ffmpeg_path", "")
    if active_path:
        st.caption(f"{tr('Current FFmpeg Engine')}: {active_path}")

    save_col, test_col = st.columns(2)

    with save_col:
        save_clicked = st.button(
            tr("Save FFmpeg Engine"),
            use_container_width=True,
            disabled=not effective_path,
        )
        if save_clicked:
            try:
                if os.path.isfile(effective_path):
                    config.app["ffmpeg_path"] = effective_path
                    config.ffmpeg_path = effective_path
                    config.apply_ffmpeg_path(effective_path)
                    config.save_config()
                    ffmpeg_utils.reset_hwaccel_detection()
                    st.success(tr("FFmpeg engine saved"))
                else:
                    st.error(tr("Selected FFmpeg path is invalid"))
            except Exception as e:
                st.error(f"{tr('Failed to save config')}: {str(e)}")
                logger.error(f"保存 FFmpeg 引擎失败: {e}")

    with test_col:
        test_clicked = st.button(
            tr("Test Selected FFmpeg"),
            use_container_width=True,
            disabled=not effective_path,
        )
        if test_clicked:
            with st.spinner(tr("Testing FFmpeg engine")):
                try:
                    st.session_state["ffmpeg_engine_report"] = ffmpeg_detector.validate_ffmpeg_engine(effective_path)
                except Exception as e:
                    st.error(f"{tr('FFmpeg engine check failed')}: {str(e)}")
                    logger.error(f"FFmpeg 引擎检测失败: {e}")

    # Render whatever report is stored in session state, if any.
    latest_report = st.session_state.get("ffmpeg_engine_report")
    if latest_report:
        _render_ffmpeg_report(latest_report, tr)
def render_system_panel(tr):
"""渲染系统设置面板"""
with st.expander(tr("System settings"), expanded=False):
@ -43,3 +201,5 @@ def render_system_panel(tr):
with col3:
if st.button(tr("Clear tasks"), use_container_width=True):
clear_directory(os.path.join(storage_dir(), "tasks"), tr)
render_ffmpeg_engine_settings(tr)

View File

@ -8,11 +8,23 @@
"Script Files": "Script Files",
"Generate Video Script and Keywords": "Click to use AI to generate **Video Script** and **Video Keywords** based on the **subject**",
"Auto Detect": "Auto Detect",
"Auto Generate": "Auto Generate",
"Video Script": "Video Script (:blue[①Optional, use AI to generate ②Proper punctuation helps in generating subtitles])",
"Auto Generate": "Frame Analysis",
"Video Script": "Video Script",
"Edit Video Script": "View/Edit Video Script",
"Video script row count": "{count} script rows",
"Video script table help": "Edit the full script JSON as a table. You can add or delete rows; saving will validate and write the script file again.",
"Raw JSON Preview": "Raw JSON Preview",
"Script Column ID": "ID",
"Script Column Video ID": "Video",
"Script Column Video Name": "Video Name",
"Script Column Timestamp": "Timestamp",
"Script Column Picture": "Picture",
"Script Column Narration": "Narration",
"Script Column OST": "Mark",
"Generation Settings": "Generation Settings",
"Save Script": "Save Script",
"Crop Video": "Crop Video",
"Video File": "Video File (:blue[1⃣Supports uploading video files (limit 2G) 2⃣For large files, it is recommended to directly import them into the ./resource/videos directory])",
"Video File": "Video File",
"Plot Description": "Plot Description (:blue[Can be obtained from https://www.tvmao.com/])",
"Generate Video Keywords": "Click to use AI to generate **Video Keywords** based on the **script**",
"Please Enter the Video Subject": "Please enter the video script first",
@ -41,9 +53,56 @@
"Random Background Music": "Random Background Music",
"Custom Background Music": "Custom Background Music",
"Custom Background Music File": "Please enter the file path of the custom background music",
"Background Music Source": "Background Music Source",
"Background Music Source Help": "Choose background music from the resource directory, upload a new file, or disable background music.",
"Upload Background Music": "Upload Background Music",
"Background Music Path Help": "Choose the background music used for video synthesis.",
"No Background Music Resources Found": "No background music resources found. Please upload a background music file.",
"Preview Background Music Help": "Play the selected background music.",
"Upload Background Music File": "Upload Background Music File",
"Upload Background Music Help": "Upload an audio file to use as background music.",
"Background Music uploaded": "Background music uploaded: {path}",
"Background Music Volume": "Background Music Volume (0.2 represents 20%, background sound should not be too loud)",
"Subtitle Settings": "**Subtitle Settings**",
"Enable Subtitles": "Enable Subtitles (If unchecked, the following settings will not take effect)",
"Enable Subtitle Mask": "Enable Subtitle Mask",
"Enable Subtitle Mask Help": "Before burning in new subtitles, cover the original subtitle area with a soft blurred mask.",
"Set Subtitle Mask": "Set Subtitle Mask",
"Subtitle Mask Summary": "Landscape {landscape_x}%/{landscape_y}% · {landscape_width}%×{landscape_height}%; portrait {portrait_x}%/{portrait_y}% · {portrait_width}%×{portrait_height}%",
"Subtitle Mask Settings": "Subtitle Mask Settings",
"Subtitle Mask Settings Caption": "Save landscape and portrait mask regions as frame percentages. The mask is applied before new subtitles are burned in.",
"Landscape Subtitle Mask": "Landscape Mask",
"Portrait Subtitle Mask": "Portrait Mask",
"Landscape Subtitle Position": "Landscape Subtitle Position",
"Portrait Subtitle Position": "Portrait Subtitle Position",
"Save Subtitle Mask Settings": "Save Subtitle Mask Settings",
"Subtitle Mask Left": "Left Position",
"Subtitle Mask Left Help": "Mask distance from the left edge as a frame percentage.",
"Subtitle Mask Top": "Top Position",
"Subtitle Mask Top Help": "Mask distance from the top edge as a frame percentage.",
"Subtitle Mask Width": "Mask Width",
"Subtitle Mask Width Help": "Width of the covered mask region as a frame percentage.",
"Subtitle Mask Height": "Mask Height",
"Subtitle Mask Height Help": "Height of the covered mask region as a frame percentage.",
"Subtitle Mask Blur Radius": "Blur Radius",
"Subtitle Mask Blur Radius Help": "Blur strength for the mask background and edge.",
"Subtitle Mask Opacity": "Mask Strength",
"Subtitle Mask Opacity Help": "Mask blend strength. Higher values cover source subtitles more strongly.",
"Subtitle Burn Position": "Subtitle Position",
"Subtitle Burn Position Help": "New subtitle distance from the top edge as a frame percentage. The blue line in preview shows this position.",
"Subtitle Mask Preview": "Source Subtitle Mask Preview",
"Subtitle Mask Preview Caption": "Upload a source video for preview, or use the currently selected source video. Uploaded files here are only used for mask preview.",
"Upload Subtitle Mask Preview Video": "Upload Preview Source Video",
"Upload Subtitle Mask Preview Video Help": "Only used for previewing the mask in this dialog. It will not replace the source video used for generation.",
"Using Subtitle Mask Preview Video": "Preview video: {file}",
"Change Subtitle Mask Preview Video": "Change Video",
"Subtitle Mask Preview Empty": "Upload a preview video, or select a source video above first.",
"Subtitle Mask Preview Timeline": "Preview Timeline (seconds)",
"Subtitle Mask Preview Timeline Help": "Drag to a frame where the source subtitles appear, then fine-tune the mask region.",
"Subtitle Mask Preview Frame Caption": "{time} · {orientation} · red outline is the mask, blue line is the subtitle position",
"Subtitle Mask Preview Failed": "Unable to read this video preview. Please try another video file.",
"Enable Auto Transcription": "Enable Auto Transcription",
"Enable Auto Transcription Help": "After the final video is merged, transcribe the whole video into subtitles and burn them into the output.",
"Font": "Subtitle Font",
"Position": "Subtitle Position",
"Top": "Top",
@ -84,8 +143,560 @@
"Synthesizing Voice": "Synthesizing voice, please wait...",
"TTS Provider": "TTS Provider",
"Hide Log": "Hide Log",
"Select from resource directory": "Select from resource directory",
"Select a video from resource videos directory": "Select a video from the ./resource/videos directory",
"Upload a new video file up to 2GB": "Upload a new video file, up to 2GB",
"Upload new video files up to 2GB each": "Upload one or more video files, up to 2GB each",
"Select Video": "Select Video",
"Choose a video file": "Choose a video file",
"Upload Video": "Upload Video",
"No video files found in resource videos directory": "No video files found in the ./resource/videos directory",
"Upload Local Files": "Upload Local Files",
"File Uploaded Successfully": "File Uploaded Successfully",
"Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)"
"Selected videos for processing": "Selected {count} video(s): {files}",
"Frame Interval (seconds)": "Frame Interval (seconds)",
"Generate Video Script": "Generate Video Script",
"Video Theme": "Video Theme",
"Generation Prompt": "Custom Prompt",
"Video LLM Provider": "Video Analysis Model",
"timestamp": "Timestamp",
"Picture description": "Picture Description",
"Narration": "Narration",
"Rebuild": "Regenerate",
"Load Video Script": "Load Video Script",
"Speech Pitch": "Speech Pitch",
"Please Select Script File": "Please Select Script File",
"Check Format": "Check Format",
"Script Loaded Successfully": "Script Loaded Successfully",
"Script loaded successfully": "Script loaded successfully",
"Script format check passed": "Script format check passed",
"Script format check failed": "Script format check failed",
"Failed to Load Script": "Failed to Load Script",
"Failed to load script": "Failed to load script",
"Failed to Save Script": "Failed to Save Script",
"Failed to save script": "Failed to save script",
"Script saved successfully": "Script saved successfully",
"Video Quality": "Video Quality",
"Custom prompt for LLM, leave empty to use default prompt": "Custom prompt for LLM. Leave empty to use the default prompt.",
"Proxy Settings": "Proxy Settings",
"HTTP_PROXY": "HTTP Proxy",
"HTTPs_PROXY": "HTTPS Proxy",
"Vision Model Settings": "Vision Model Settings",
"Vision Model Provider": "API Protocol",
"Vision API Key": "Vision API Key",
"Vision Base URL": "Vision Base URL",
"Vision Model Name": "Vision Model Name",
"Text Generation Model Settings": "Text Generation Model Settings",
"LLM Model Name": "LLM Model Name",
"LLM Model API Key": "LLM Model API Key",
"Text Model Provider": "API Protocol",
"Text API Key": "Text API Key",
"Text Base URL": "Text Base URL",
"Text Model Name": "Text Model Name",
"Top P": "Top P",
"Top K": "Top K",
"Max Output Tokens": "Max Output Tokens",
"Max Output Tokens Help": "Maximum generated output length. 0 uses the provider default.",
"Thinking Level": "Thinking Level",
"Thinking Level Help": "Controls reasoning effort. Auto sends no extra thinking parameter; low/medium/high tries reasoning_effort.",
"Thinking Level Auto": "Auto",
"Thinking Level Off": "Off",
"Thinking Level Low": "Low",
"Thinking Level Medium": "Medium",
"Thinking Level High": "High",
"Skip the first few seconds": "Skip the first few seconds",
"Difference threshold": "Difference Threshold",
"Vision processing batch size": "Vision Processing Batch Size",
"Test Connection": "Test Connection",
"Testing connection...": "Testing connection...",
"gemini model is available": "Gemini model is available",
"gemini model is not available": "Gemini model is not available",
"Unsupported provider": "Unsupported provider",
"0: Keep the audio only, 1: Keep the original sound only, 2: Keep the original sound and audio": "0: Keep the narration only, 1: Keep the original sound only, 2: Keep both original sound and narration",
"Text model is not available": "Text model is not available",
"Text model is available": "Text model is available",
"Upload Script": "Upload Script",
"Upload Script File": "Upload Script File",
"Script Uploaded Successfully": "Script Uploaded Successfully",
"Invalid JSON format": "Invalid JSON format",
"Upload failed": "Upload failed",
"Enable Proxy": "Enable Proxy",
"QwenVL model is available": "QwenVL model is available",
"QwenVL model is not available": "QwenVL model is not available",
"QwenVL model returned invalid response": "QwenVL model returned an invalid response",
"System settings": "System Settings",
"Clear Cache": "Clear Cache",
"Cache cleared": "Cache cleared",
"storage directory does not exist": "Storage directory does not exist",
"Failed to clear cache": "Failed to clear cache",
"Clear frames": "Clear frames",
"Clear clip videos": "Clear clip videos",
"Clear tasks": "Clear tasks",
"Directory cleared": "Directory cleared",
"Directory does not exist": "Directory does not exist",
"Failed to clear directory": "Failed to clear directory",
"FFmpeg Engine Detection": "FFmpeg Engine Detection",
"FFmpeg Engine": "FFmpeg Engine",
"FFmpeg Engine Help": "Choose the ffmpeg executable this app should prefer; the packaged runtime and local PATH are discovered automatically",
"No FFmpeg engines found": "No FFmpeg engines found",
"Custom FFmpeg Path": "Custom FFmpeg Path",
"Custom FFmpeg Path Help": "Paste an absolute path to an ffmpeg executable if the target engine is not listed",
"Current FFmpeg Engine": "Current FFmpeg Engine",
"Save FFmpeg Engine": "Save Engine",
"Test Selected FFmpeg": "Test Selected FFmpeg",
"Testing FFmpeg engine": "Testing FFmpeg engine...",
"FFmpeg engine saved": "FFmpeg engine saved",
"Selected FFmpeg path is invalid": "Selected FFmpeg path is invalid",
"FFmpeg detection details": "FFmpeg detection details",
"FFmpeg source Configured": "Configured",
"FFmpeg source NarratoAI packaged runtime": "NarratoAI packaged runtime",
"FFmpeg source Integrated runtime": "Integrated runtime",
"FFmpeg source System PATH": "System PATH",
"FFmpeg source Homebrew": "Homebrew",
"FFmpeg source Python environment": "Python environment",
"FFmpeg source Python executable folder": "Python executable folder",
"FFmpeg source IMAGEIO_FFMPEG_EXE": "IMAGEIO_FFMPEG_EXE",
"FFmpeg source imageio-ffmpeg": "imageio-ffmpeg",
"FFmpeg source System": "System",
"Version": "Version",
"Path": "Path",
"Available": "Available",
"Unavailable": "Unavailable",
"Hardware Acceleration": "Hardware Acceleration",
"Subtitle Burn-in": "Subtitle Burn-in",
"FFmpeg engine passed all checks": "FFmpeg engine passed all checks: basic execution, hardware acceleration and subtitle burn-in are available",
"FFmpeg engine works but hardware acceleration is unavailable": "FFmpeg and subtitle burn-in work, but hardware acceleration is unavailable; software encoding will be used",
"FFmpeg engine check failed": "FFmpeg engine check failed",
"Hardware acceleration detail": "Hardware acceleration detail",
"Subtitle burn-in detail": "Subtitle burn-in detail",
"Type": "Type",
"Encoder": "Encoder",
"Message": "Message",
"Method": "Method",
"Supported Hardware Methods": "Supported hardware methods",
"Subtitle Filters": "Subtitle filters",
"FFmpeg errors": "FFmpeg errors",
"Raw FFmpeg report": "Raw FFmpeg report",
"Subtitle Preview": "Subtitle Preview",
"One-Click Transcribe": "One-Click Transcribe",
"Transcribing...": "Transcribing...",
"Transcription Complete!": "Transcription Complete!",
"Transcription Failed. Please try again.": "Transcription failed. Please try again.",
"API rate limit exceeded. Please wait about an hour and try again.": "API rate limit exceeded. Please wait about an hour and try again.",
"Resources exhausted. Please try again later.": "Resources exhausted. Please try again later.",
"Transcription Failed": "Transcription Failed",
"Short Generate": "Short Drama Mix",
"Generate Short Video Script": "Generate Short Video Script",
"Adjust the volume of the original audio": "Adjust the volume of the original audio",
"Original Volume": "Original Volume",
"Frame Interval (seconds) (More keyframes consume more tokens)": "Frame Interval (seconds) (More keyframes consume more tokens)",
"Batch Size": "Batch Size",
"Batch Size (More keyframes consume more tokens)": "Batch Size (smaller batches consume more tokens)",
"Short Drama Summary": "Short Drama Summary",
"Film TV Narration": "Film/TV Narration",
"Video Type": "Creation Type",
"Select/Upload Script": "Custom Script",
"原生Gemini模型连接成功": "Native Gemini model connection succeeded",
"原生Gemini模型连接失败": "Native Gemini model connection failed",
"OpenAI兼容Gemini代理连接成功": "OpenAI-compatible Gemini proxy connection succeeded",
"OpenAI兼容Gemini代理连接失败": "OpenAI-compatible Gemini proxy connection failed",
"Connection failed": "Connection failed",
"自定义片段": "Custom Clips",
"设置需要生成的短视频片段数量": "Set the number of short video clips to generate",
"上传字幕文件": "Upload SRT",
"清除已上传字幕": "Clear Uploaded Subtitle",
"无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312": "Unable to read the subtitle file. Please check the file encoding. Supported encodings: UTF-8, UTF-16, GBK, GB2312.",
"字幕文件内容似乎为空,请检查文件": "The subtitle file appears to be empty. Please check the file.",
"字幕上传成功": "Subtitle uploaded successfully",
"短剧名称": "Short Drama Name",
"影视名称": "Film/TV Title",
"解说语言": "Narration Language",
"自定义解说语言": "Custom Narration Language",
"例如:意大利语(意大利)": "For example: Italian (Italy)",
"请输入自定义解说语言": "Please enter a custom narration language",
"简体中文(中国)": "Simplified Chinese (China)",
"英语(美国)": "English (United States)",
"日语(日本)": "Japanese (Japan)",
"韩语(韩国)": "Korean (South Korea)",
"法语(法国)": "French (France)",
"德语(德国)": "German (Germany)",
"西班牙语(西班牙)": "Spanish (Spain)",
"葡萄牙语(巴西)": "Portuguese (Brazil)",
"俄语(俄罗斯)": "Russian (Russia)",
"自定义": "Custom",
"短剧类型": "Short Drama Type",
"自定义短剧类型": "Custom Short Drama Type",
"影视类型": "Film/TV Type",
"自定义影视类型": "Custom Film/TV Type",
"原片占比": "Original Footage Ratio",
"例如:豪门虐恋": "For example: billionaire angst romance",
"例如:悬疑犯罪": "For example: suspense crime",
"请输入自定义短剧类型": "Please enter a custom short drama type",
"请输入自定义影视类型": "Please enter a custom film/TV type",
"逆袭/复仇": "Counterattack / Revenge",
"霸总/甜宠": "CEO Romance / Sweet Romance",
"家庭伦理": "Family Ethics",
"古装/权谋": "Costume / Power Struggle",
"悬疑/犯罪": "Suspense / Crime",
"都市情感": "Urban Romance",
"年代/乡村": "Period / Rural",
"剧情/情感": "Drama / Emotion",
"动作/冒险": "Action / Adventure",
"喜剧/轻松": "Comedy / Light",
"科幻/奇幻": "Sci-Fi / Fantasy",
"历史/战争": "History / War",
"恐怖/惊悚": "Horror / Thriller",
"生成解说文案": "Generate Narration Copy",
"生成剪辑脚本": "Generate Editing Script",
"短剧解说文案": "Short Drama Narration Copy",
"影视解说文案": "Film/TV Narration Copy",
"Narration Copy Help": "Generate the narration copy first, review or rewrite it here, then generate the editing script to match footage and timestamps.",
"Narration copy generated successfully": "Narration copy generated. Please review and edit it.",
"生成短剧解说脚本": "Generate Short Drama Narration Script",
"请输入视频脚本": "Please enter the video script",
"TTS engine does not support precise subtitles": "⚠️ {engine} does not support precise subtitle generation",
"Manual subtitle editing recommendation": "💡 We recommend adding subtitles manually in a professional editor such as CapCut or Premiere Pro.",
"Disabled subtitles help": "This TTS engine does not support subtitle generation. Please use another TTS engine.",
"Tencent Cloud TTS": "Tencent Cloud TTS",
"Tongyi Qwen3 TTS": "Tongyi Qwen3 TTS",
"IndexTTS Voice Clone": "IndexTTS-1.5 Voice Clone",
"Doubao TTS": "Doubao TTS",
"Edge TTS features": "Completely free, but service stability can vary and voice cloning is not supported.",
"Edge TTS use case": "Testing and lightweight use",
"Azure Speech Services features": "Includes a free quota, then pay-as-you-go billing. An overseas credit card may be required.",
"Azure Speech Services use case": "Enterprise use cases that need a stable service",
"Tencent Cloud TTS features": "Includes a free quota, good voice quality, multiple voices, and fast access in mainland China.",
"Tencent Cloud TTS use case": "Personal and enterprise users who need stable Chinese speech synthesis",
"Tongyi Qwen3 TTS features": "Alibaba Cloud Tongyi Qwen speech synthesis with high-quality voices and multiple voice options.",
"High-quality Chinese speech synthesis use case": "Users who need high-quality Chinese speech synthesis",
"IndexTTS features": "A locally or privately deployed IndexTTS-1.5 voice-cloning engine. Choose a resource audio file or upload a reference audio file, then synthesize narration in that voice.",
"IndexTTS use case": "Best for fixed narrator voices, character dubbing, or generating multiple videos with the same voice. Start the IndexTTS-1.5 API service before use. Deployment package: https://pan.quark.cn/s/0767c9bcefd5",
"IndexTTS download link": "Download link: https://pan.quark.cn/s/0767c9bcefd5",
"IndexTTS2 features": "A locally or privately deployed IndexTTS-2 voice-cloning engine with emotion control and fuller generation parameters.",
"IndexTTS2 use case": "Best for fixed voices, emotional narration, and local speech synthesis workflows that need finer sampling controls. Start the IndexTTS-2 API service before use.",
"OmniVoice features": "A locally or privately deployed OmniVoice-Pack multilingual TTS engine with automatic voice generation, voice design, and reference-audio cloning.",
"OmniVoice use case": "Best for local controllable multilingual narration, voice design, or reference-audio cloning. Start the OmniVoice-Pack API service before use.",
"Doubao TTS features": "Volcengine Doubao speech synthesis with multiple voices and emotions, plus fast access in mainland China.",
"Local Deployment": "Local Deployment",
"Cloud Service": "Cloud Service",
"Select TTS Engine": "Select TTS Engine",
"Select TTS Engine Help": "Choose the text-to-speech engine you want to use.",
"TTS Engine Details": "{engine} Details",
"Features": "Features",
"Use Case": "Use Case",
"Registration URL": "Registration URL",
"Voice Selection": "Voice Selection",
"Select Edge TTS Voice": "Select an Edge TTS voice",
"Edge TTS Voice Description": "Edge TTS Voice Notes",
"Loaded voice count": "Loaded {count} voices",
"Female Voice": "Female voice",
"Male Voice": "Male voice",
"Voice Volume": "Voice Volume",
"Voice Volume Help Percent": "Adjust voice volume (0-100)",
"Voice Rate": "Voice Rate",
"Voice Rate Help 0.5-2.0": "Adjust voice speed (0.5-2.0x)",
"Voice Pitch": "Voice Pitch",
"Voice Pitch Help Percent": "Adjust voice pitch (-50% to +50%)",
"Service Region": "Service Region",
"Service Region Placeholder": "e.g. eastus",
"Azure Service Region Help": "Azure Speech Services region, such as eastus, westus2, or eastasia.",
"Azure Speech Key Help": "Azure Speech Services API key",
"Voice Name": "Voice Name",
"Azure Voice Name Help": "Enter an Azure Speech Services voice name. You can use the official voice name directly, such as zh-CN-YunzeNeural.",
"Common Voice Reference": "Common Voice Reference",
"Chinese Voices": "Chinese Voices",
"English Voices": "English Voices",
"Multilingual": "multilingual",
"Azure Voices Docs Notice": "For more voices, see the [Azure Speech Services documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support).",
"Quick Select": "Quick Select",
"Chinese Female Voice": "Chinese Female Voice",
"Chinese Male Voice": "Chinese Male Voice",
"English Female Voice": "English Female Voice",
"Voice name valid": "Voice name is valid: {voice}",
"Voice name format may be invalid": "Voice name format may be incorrect: {voice}",
"Azure voice name format notice": "Azure voice names usually follow this format: [language]-[region]-[name]Neural",
"Azure Speech Services configured": "Azure Speech Services is configured",
"Please configure service region": "Please configure the service region",
"Please configure API Key": "Please configure the API Key",
"Task failed": "Task failed",
"Script file cannot be empty": "Script file cannot be empty",
"Video file cannot be empty": "Video file cannot be empty",
"Export to Jianying Draft": "📤 Export to Jianying Draft",
"Please configure Jianying draft folder in basic settings": "Please configure the Jianying draft folder in Basic Settings",
"Jianying draft folder does not exist": "Jianying draft folder does not exist: {path}",
"Jianying export dialog title": "Confirm draft name",
"Jianying export dialog description": "Confirm the Jianying draft name before exporting. Once complete, you can open it from the Jianying draft folder.",
"Jianying export destination": "Save location",
"Jianying draft name": "Draft name",
"Please enter Jianying draft name": "Please enter the Jianying draft name",
"Confirm Export": "Confirm Export",
"Please enter draft name": "Please enter a draft name",
"Failed to build parameters": "Failed to build parameters",
"Exporting to Jianying draft...": "Exporting to Jianying draft, please wait...",
"Jianying draft exported successfully": "✅ Successfully exported to Jianying draft: {name}",
"Draft saved to": "📁 Draft saved to: {path}",
"Failed to export Jianying draft": "❌ Failed to export Jianying draft",
"Cancel": "Cancel",
"LLM initialization failed": "⚠️ LLM initialization failed: {error}\n\nPlease check whether the configuration file and dependencies are installed correctly.",
"Jianying Draft Settings": "Jianying Draft Settings",
"Jianying Draft Folder Path": "Jianying Draft Folder Path",
"Jianying Draft Folder Path Help": "Jianying draft folder path, for example: C:\\Users\\Username\\Documents\\JianyingPro Drafts",
"Custom API endpoint help": "OpenAI-compatible endpoint URL. Use a full /v1 URL for third-party or self-hosted gateways; leave empty for the official OpenAI API.",
"Recommended API endpoint": "Recommended endpoint",
"OpenAI compatible gateway help": "{model_type} uses an OpenAI-compatible API, so a complete endpoint URL is required.",
"Vision model": "Vision model",
"Text model": "Text model",
"Model Name Input Help": "Enter the full model name.\n\nCommon examples:",
"OpenAI compatible providers help": "The vendor is not limited here; OpenAI, DeepSeek, OpenRouter, SiliconFlow, or a self-hosted gateway all work as long as the endpoint is OpenAI-compatible.",
"OpenAI compatible protocol": "OpenAI-compatible",
"OpenAI compatible protocol help": "This does not require the official OpenAI model; any service that supports the OpenAI Chat Completions compatible API can be used.",
"Provider API Key Help": "API key for the model service.\n\nCommon places to get one:",
"Please fill OpenAI compatible gateway": "Please fill in the OpenAI-compatible gateway URL above, for example: {example}",
"Please enter API key": "Please enter the API key first",
"Please enter model name": "Please enter the model name first",
"Connection test error": "An error occurred while testing the connection",
"Vision model config saved": "Vision model configuration saved (OpenAI compatible)",
"Text model config saved": "Text generation model configuration saved (OpenAI compatible)",
"Failed to save config": "Failed to save configuration",
"Custom Position (% from top)": "Custom Position (% from top)",
"Please enter a value between 0 and 100": "Please enter a value between 0 and 100",
"Please enter a valid number": "Please enter a valid number",
"None": "None",
"Uploaded subtitle": "Uploaded subtitle: {file}",
"Encoding": "Encoding",
"Size": "Size",
"Characters": "characters",
"Ali Bailian Fun-ASR Subtitle Transcription": "Subtitle Processing",
"Subtitle Processing Method": "Subtitle Processing Method",
"Fun-ASR Backend": "Fun-ASR Backend",
"Local FunASR-Pack API": "FunASR (Local)",
"Local FireRedASR API": "FireRedASR2 (Local)",
"Ali Bailian Online Fun-ASR": "FunASR (Online)",
"Local Fun-ASR upload caption": "The current video above will be converted to SRT subtitles through the locally running FunASR-Pack API.",
"Local FireRed-ASR upload caption": "The current video above will be converted to SRT subtitles through the locally running FireRedASR2-AED-Pack API.",
"Fun-ASR upload caption": "The current video above will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.",
"Auto Transcription Local Caption": "After the final video is merged, it will be converted to SRT subtitles through the locally running FunASR-Pack API.",
"Auto Transcription FireRed Caption": "After the final video is merged, it will be converted to SRT subtitles through the locally running FireRedASR2-AED-Pack API.",
"Auto Transcription Online Caption": "After the final video is merged, it will be uploaded to temporary Ali Bailian storage and converted to SRT subtitles with fun-asr.",
"Local FunASR-Pack API URL": "Local FunASR-Pack API URL",
"Local FunASR-Pack API URL Help": "For example, http://127.0.0.1:7860. A full /asr endpoint URL is also supported.",
"Local FireRedASR API URL": "Local ASR API URL",
"Local FireRedASR API URL Help": "For example, http://127.0.0.1:7867. A full /asr endpoint URL is also supported.",
"Fun-ASR Hotword": "Hotword",
"Fun-ASR Hotword Help": "Optional hotwords passed to the local FunASR-Pack API.",
"Enable speaker diarization": "Enable speaker diarization",
"Enable speaker diarization Help": "Requires the local FunASR-Pack service to enable and load the spk model.",
"API Key URL": "API Key URL",
"Ali Bailian API Key": "Ali Bailian API Key",
"Ali Bailian API Key Help": "Enter your Ali Bailian API Key. After saving, it will be written to the local config.toml file.",
"Upload media to transcribe": "Upload audio/video to transcribe",
"Using selected video for subtitle transcription": "Using current video for subtitle transcription: {file}",
"Using selected videos for subtitle transcription": "Using {count} current videos for subtitle transcription: {files}",
"Please select or upload a video first": "Please select or upload a video file above first",
"Selected video file does not exist": "The selected video file does not exist. Please select or upload it again",
"Selected video files do not exist": "These selected video files do not exist. Please select or upload them again: {files}",
"Transcribe subtitles": "Transcribe Subtitles",
"Calibrate subtitles": "Calibrate Subtitles",
"Please enter Ali Bailian API Key": "Please enter the Ali Bailian API Key first",
"Please enter local FunASR-Pack API URL": "Please enter the local FunASR-Pack API URL first",
"Please enter local FireRedASR API URL": "Please enter the local ASR API URL first",
"Please upload media to transcribe": "Please upload the audio or video file to transcribe first",
"Transcribing with local FunASR-Pack...": "Transcribing subtitles with local FunASR-Pack, please wait...",
"Transcribing with local FireRedASR...": "Transcribing subtitles with local ASR, please wait...",
"Transcribing with Fun-ASR...": "Transcribing subtitles with Ali Bailian Fun-ASR, please wait...",
"Fun-ASR failed without subtitle file": "Fun-ASR transcription failed: no subtitle file was generated",
"Subtitle transcription succeeded": "Subtitle transcription succeeded: {file}",
"Subtitle transcription succeeded for multiple files": "Subtitle transcription succeeded for {count} files: {files}",
"Calibrating subtitles...": "Calibrating subtitles with the LLM, please wait...",
"Subtitle calibration succeeded": "Subtitle calibration succeeded: {file}",
"Subtitle calibration succeeded for multiple files": "Subtitle calibration succeeded for {count} files: {files}",
"Subtitle calibration failed": "Subtitle calibration failed",
"Transcribed subtitles storage hint": "Previously transcribed subtitles are saved in {path}; drag a file from that folder to upload",
"Tavily Search Settings": "Tavily Web Search",
"Tavily API Key": "Tavily API Key",
"Tavily API Key Help": "Used for web search before plot analysis. When Web Search is enabled, the app searches plot, character, and background context by title, then combines it with subtitles.",
"Tavily config saved": "Tavily configuration saved",
"联网搜索": "Web Search",
"Enable Web Search Help": "When enabled, plot analysis searches the web with Tavily by title before combining those results with subtitles.",
"Please configure Tavily API Key in Basic Settings": "Please configure the Tavily API Key in Basic Settings first",
"Please enter short drama name before web search": "Please enter the short drama name before enabling web search",
"Please enter film/tv title before web search": "Please enter the film/TV title before enabling web search",
"Searching short drama with Tavily...": "Searching short drama context with Tavily...",
"Tavily search failed": "Tavily search failed",
"剧情理解": "Plot Analysis",
"剧情理解结果": "Plot Analysis Result",
"Analyzing plot...": "Analyzing plot...",
"Plot analysis completed": "Plot analysis completed",
"Please generate or upload subtitles first": "Please transcribe or upload subtitles first",
"Please transcribe or upload subtitles first": "Please transcribe or upload subtitles first",
"Fun-ASR transcription failed": "Fun-ASR transcription failed",
"Validating script format...": "Validating script format...",
"Script format validation failed": "Script format validation failed",
"Error Message": "Error Message",
"Details": "Details",
"Correct script format example": "Correct script format example",
"Script format validation error": "An error occurred during script format validation",
"Script validated and saved successfully": "✅ Script format validated and saved successfully!",
"Tencent Secret ID Help": "Enter your Tencent Cloud Secret ID",
"Tencent Secret Key Help": "Enter your Tencent Cloud Secret Key",
"Tencent Service Region Help": "Select the Tencent Cloud TTS service region",
"Custom Voice": "Custom Voice",
"Select Tencent TTS Voice": "Select a Tencent Cloud TTS voice",
"Tencent Cloud TTS Voice Description": "Tencent Cloud TTS Voice Notes",
"Female Voices": "Female Voices",
"Male Voices": "Male Voices",
"Tencent More Voices Notice": "See the official Tencent Cloud documentation for more voices.",
"Qwen DashScope API Key Help": "Tongyi Qwen DashScope API Key",
"TTS Model Name": "TTS Model Name",
"Qwen TTS Model Help": "Qwen TTS model name, for example qwen3-tts-flash",
"Select Qwen3 TTS Voice": "Select a Qwen3 TTS voice",
"API URL": "API URL",
"IndexTTS API URL Help": "IndexTTS-1.5 API service URL",
"IndexTTS2 API URL Help": "IndexTTS-2 API service URL. You can enter the service root or the full /tts endpoint.",
"OmniVoice API URL Help": "OmniVoice-Pack API service URL. You can enter the service root or the full /tts endpoint.",
"OmniVoice Language Code": "Synthesis Language",
"OmniVoice Language Code Help": "The language parameter sent to OmniVoice-Pack, such as zh or en.",
"OmniVoice Generation Mode": "Generation Mode",
"OmniVoice Generation Mode Help": "Automatic voice needs no extra fields; voice design uses an instruction; reference-audio cloning needs reference audio and matching text.",
"OmniVoice Mode Auto": "Automatic Voice",
"OmniVoice Mode Voice Design": "Voice Design",
"OmniVoice Mode Voice Clone": "Reference Audio Clone",
"OmniVoice Instruct": "Voice Instruction",
"OmniVoice Instruct Help": "Describe the desired voice, such as gender, pitch, accent, or style.",
"OmniVoice Instruct Placeholder": "e.g. female, low pitch, british accent",
"OmniVoice Reference Text": "Reference Audio Text",
"OmniVoice Reference Text Help": "The exact transcript of the reference audio. Required when the deployed service has ASR disabled.",
"OmniVoice Reference Text Placeholder": "Enter the text spoken in the reference audio",
"OmniVoice Num Step Help": "Diffusion generation steps. Higher values usually improve quality but slow generation.",
"OmniVoice Guidance Scale Help": "Controls how strongly text conditions guide generation.",
"OmniVoice Duration": "Target Duration (seconds)",
"OmniVoice Duration Help": "0 lets the model decide the duration automatically.",
"OmniVoice Denoise": "Enable Denoise",
"OmniVoice Denoise Help": "Ask OmniVoice-Pack to denoise the generated output.",
"OmniVoice Postprocess Output": "Postprocess Output",
"OmniVoice Postprocess Output Help": "Enable OmniVoice-Pack output post-processing.",
"OmniVoice Preprocess Prompt": "Preprocess Text",
"OmniVoice Preprocess Prompt Help": "Enable OmniVoice-Pack text preprocessing.",
"Reference Audio Source": "Reference Audio Source",
"Reference Audio Source Help": "Choose a reference audio from the resource directory or upload a new one.",
"Select from Resource Directory": "Select from Resource Directory",
"Upload Reference Audio": "Upload Reference Audio",
"Reference Audio Path": "Reference Audio",
"Reference Audio Path Help": "Choose the reference audio for voice cloning (WAV/MP3, 3-10 seconds recommended)",
"No Reference Audio Resources Found": "No reference audio resources found. Please upload a reference audio file.",
"Preview Reference Audio": "Preview",
"Preview Reference Audio Help": "Play the selected reference audio.",
"Upload Reference Audio File": "Upload Reference Audio File",
"Upload Reference Audio Help": "Upload a clear audio clip for voice cloning",
"Audio uploaded": "Audio uploaded: {path}",
"Inference Mode": "Inference Mode",
"Standard Inference": "Standard Inference",
"Fast Inference": "Fast Inference",
"Inference Mode Help": "Standard inference has higher quality but is slower. Fast inference is faster with slightly lower quality.",
"Advanced Parameters": "Advanced Parameters",
"Sampling Temperature": "Sampling Temperature",
"Sampling Temperature Help": "Controls randomness. Higher values are more random; lower values are more deterministic.",
"Top P Help": "Probability threshold for nucleus sampling. Smaller values make results more deterministic.",
"Top K Help": "The k value for top-k sampling. 0 disables top-k.",
"Num Beams": "Num Beams",
"Num Beams Help": "Number of beams for beam search. Higher values may improve quality but slow generation.",
"Repetition Penalty": "Repetition Penalty",
"Repetition Penalty Help": "Higher values reduce repetition, but overly high values may sound unnatural.",
"Enable Sampling": "Enable Sampling",
"Enable Sampling Help": "Enable sampling for more natural speech.",
"IndexTTS Usage Instructions Title": "IndexTTS-1.5 Usage Instructions",
"IndexTTS Usage Instructions": "**Zero-shot voice cloning**\n\n1. **Prepare reference audio**: upload or specify a clear audio file (3-10 seconds recommended)\n2. **Set API URL**: make sure the IndexTTS-1.5 service is running\n3. **Start synthesis**: the system will use the reference voice to synthesize new speech\n\n**Notes**:\n- Reference audio quality directly affects synthesis quality\n- Use clean audio without background noise when possible\n- Keep text length within a reasonable range\n- The first synthesis may take longer",
"IndexTTS2 Emotion Parameters": "Emotion Parameters",
"Emotion Mode": "Emotion Mode",
"Emotion Mode Help": "Choose the emotion control source for IndexTTS-2.",
"Emotion Mode Speaker": "Same as speaker reference",
"Emotion Mode Audio": "Use emotion reference audio",
"Emotion Mode Vector": "Use emotion vector",
"Emotion Mode Text": "Use emotion text",
"Emotion Alpha": "Emotion Alpha",
"Emotion Alpha Help": "Controls how strongly the emotion condition affects generation. 0 is weak, 1 is strong.",
"Emotion Reference Audio Path": "Emotion Reference Audio Path",
"Emotion Reference Audio Path Help": "Local emotion reference audio path used when emotion_mode=audio.",
"Emotion Text": "Emotion Text",
"Emotion Text Help": "Emotion description used when emotion_mode=text, such as happy, nervous, or aggrieved.",
"Emotion Text Placeholder": "e.g. calm, nervous, happy",
"Use Random Emotion": "Use Random Emotion",
"Use Random Emotion Help": "Let IndexTTS-2 use random emotion sampling during generation.",
"Emotion Happy": "Happy",
"Emotion Angry": "Angry",
"Emotion Sad": "Sad",
"Emotion Afraid": "Afraid",
"Emotion Disgusted": "Disgusted",
"Emotion Melancholic": "Melancholic",
"Emotion Surprised": "Surprised",
"Emotion Calm": "Calm",
"Max Text Tokens Per Segment": "Max Text Tokens Per Segment",
"Max Text Tokens Per Segment Help": "Maximum text tokens per segment for IndexTTS-2 inference.",
"Max Mel Tokens": "Max Mel Tokens",
"Max Mel Tokens Help": "Controls the maximum mel tokens generated in one request. Higher values can produce longer audio.",
"IndexTTS2 Usage Instructions Title": "IndexTTS-2 Usage Instructions",
"IndexTTS2 Usage Instructions": "**IndexTTS-2 voice cloning**\n\n1. **Choose a voice**: reuse IndexTTS-1.5 resource audio or upload a reference audio file\n2. **Set API URL**: for example http://192.168.3.6:7863/tts, or enter the service root\n3. **Tune emotion**: speaker is the default; switch to audio, vector, or text when needed\n4. **Tune generation**: temperature, top_p, top_k, num_beams, repetition_penalty, and max_mel_tokens are sent directly to the IndexTTS-2 API\n\n**Notes**:\n- Reference audio quality directly affects cloning quality\n- The first request may load the model and take longer\n- CPU deployments are much slower than GPU deployments",
"OmniVoice Usage Instructions Title": "OmniVoice Usage Instructions",
"OmniVoice Usage Instructions": "**OmniVoice-Pack speech synthesis**\n\n1. **Automatic voice**: set the API URL and language, then synthesize directly.\n2. **Voice design**: fill instruct with the desired gender, pitch, accent, or style.\n3. **Reference-audio clone**: upload or choose reference audio and fill its matching transcript.\n\n**Notes**:\n- The default service URL is http://127.0.0.1:7866/tts\n- Reference-audio cloning requires reference text when the service has no ASR model loaded\n- OmniVoice returns WAV audio, and NarratoAI estimates subtitle segment timing from the audio duration",
"Volcengine Access Key Help": "Volcengine Access Key",
"Volcengine Secret Key Help": "Volcengine Secret Key",
"Doubao AppID Help": "Doubao TTS application AppID",
"Doubao Token Help": "Doubao TTS application Token",
"Cluster": "Cluster",
"Doubao Cluster Help": "Business cluster. Standard voices use volcano_tts.",
"Select Doubao TTS Voice": "Select a Doubao TTS voice",
"Voice Rate Help 0.2-3.0": "Adjust voice speed (0.2-3.0)",
"Voice Volume Help 0.1-2.0": "Adjust voice volume (0.1-2.0)",
"Voice Pitch Help 0.5-1.5": "Adjust voice pitch (0.5-1.5)",
"Sentence Silence Duration": "Sentence-end Silence Duration (seconds)",
"Sentence Silence Duration Help": "Adjust sentence-end silence duration (0.0-2.0 seconds)",
"Doubao TTS API Key Application Process": "Doubao TTS API Key Application Process",
"Application Steps": "Application Steps",
"Doubao TTS Step 1": "1. Open [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)",
"Doubao TTS Step 2": "2. Create a new Access Key and Secret Key",
"Doubao TTS Step 3": "3. Open [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)",
"Doubao TTS Step 4": "4. Click Start Now",
"Doubao TTS Step 5": "5. In the left API Service Center, find Speech Synthesis under Audio Generation (note: Speech Synthesis, not the speech synthesis large model)",
"Doubao TTS Step 6": "6. Scroll to the bottom to get the APPID and Access Token",
"Doubao TTS Fill Credentials Notice": "Fill the Access Key, Secret Key, AppID, and Token above.",
"Doubao TTS configured": "Doubao TTS is configured",
"Please configure missing fields": "Please configure: {fields}",
"Preview Voice Synthesis": "Preview Voice Synthesis",
"Voice Preview Sample": "Thanks for using NarratoAI. If you have any questions or suggestions, please join the community for help and discussion.",
"Please configure voice settings first": "Please configure voice settings first",
"Voice synthesis successful": "Voice synthesis successful!",
"Voice synthesis failed": "Voice synthesis failed. Please check your configuration.",
"SoulVoice pitch not supported": "SoulVoice does not support pitch adjustment",
"Progress": "Progress",
"Generating script...": "Generating script...",
"Please select video file first": "Please select a video file first",
"Extracting keyframes...": "Extracting keyframes...",
"Script generation completed": "Script generation completed",
"Script generation completed!": "Script generation completed!",
"Video script generated successfully": "✅ Video script generated successfully!",
"Generation error": "❌ An error occurred during generation",
"Please upload subtitle file first": "Please upload a subtitle file first",
"Video": "Video",
"Subtitle": "Subtitle",
"Preparing script generation": "Preparing script generation",
"Script generation failed check logs": "Script generation failed. Please check the logs.",
"Parsing subtitles...": "Parsing subtitles...",
"Analyzing subtitles with model...": "Waiting for the model to analyze subtitles...",
"Subtitle file does not exist": "Subtitle file does not exist",
"Subtitle file is empty or unreadable": "Subtitle file is empty or unreadable",
"Generating narration copy...": "Generating narration copy...",
"Generated narration copy is empty": "The generated narration copy is empty",
"Please generate and review narration copy first": "Please generate and review the narration copy first",
"Matching narration copy to footage...": "Matching narration copy to footage and timestamps...",
"Waiting for model stream...": "Waiting for model stream...",
"Streaming unavailable fallback waiting...": "Streaming is unavailable for this request. Waiting for the full response...",
"LLM stream window title": "Model reasoning / output stream",
"Model reasoning stream": "[Model reasoning]",
"Model output preview": "[Model output preview]",
"Repairing narration script...": "Repairing narration script...",
"Generated narration JSON parse failed": "The generated narration format is invalid and could not be parsed as JSON",
"Generated narration missing items field": "The generated narration is missing the required 'items' field",
"Generated narration validation failed": "The generated narration script failed validation",
"Preparing output...": "Preparing output..."
}
}
}

View File

@ -10,8 +10,9 @@
"Auto Detect": "自动检测",
"Video Theme": "视频主题",
"Generation Prompt": "自定义提示词",
"Generation Settings": "生成参数",
"Save Script": "保存脚本",
"Video File": "视频文件:blue[1⃣支持上传视频文件(限制2G) 2⃣大文件建议直接导入 ./resource/videos 目录]",
"Video File": "视频文件",
"Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])",
"Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键】",
"Please Enter the Video Subject": "请先填写视频文案",
@ -40,9 +41,56 @@
"Random Background Music": "随机背景音乐",
"Custom Background Music": "自定义背景音乐",
"Custom Background Music File": "请输入自定义背景音乐的文件路径",
"Background Music Source": "背景音乐来源",
"Background Music Source Help": "选择资源目录中的背景音乐、上传新的背景音乐,或关闭背景音乐",
"Upload Background Music": "上传背景音乐",
"Background Music Path Help": "选择用于视频合成的背景音乐",
"No Background Music Resources Found": "未找到资源目录中的背景音乐,请上传背景音乐文件",
"Preview Background Music Help": "播放当前背景音乐",
"Upload Background Music File": "上传背景音乐文件",
"Upload Background Music Help": "上传一个音频文件作为背景音乐",
"Background Music uploaded": "背景音乐已上传: {path}",
"Background Music Volume": "背景音乐音量0.2表示20%,背景声音不宜过高)",
"Subtitle Settings": "**字幕设置**",
"Enable Subtitles": "启用字幕(若取消勾选,下面的设置都将不生效)",
"Enable Subtitle Mask": "启用字幕遮罩",
"Enable Subtitle Mask Help": "开启后会在烧录新字幕前,先用模糊遮罩覆盖原视频自带字幕区域",
"Set Subtitle Mask": "设置字幕遮罩",
"Subtitle Mask Summary": "横屏 {landscape_x}%/{landscape_y}% · {landscape_width}%×{landscape_height}%;竖屏 {portrait_x}%/{portrait_y}% · {portrait_width}%×{portrait_height}%",
"Subtitle Mask Settings": "字幕遮罩设置",
"Subtitle Mask Settings Caption": "按画面百分比保存横屏和竖屏遮罩区域;生成视频时会先叠加柔化遮罩,再烧录新字幕。",
"Landscape Subtitle Mask": "横屏遮罩",
"Portrait Subtitle Mask": "竖屏遮罩",
"Landscape Subtitle Position": "横屏字幕位置",
"Portrait Subtitle Position": "竖屏字幕位置",
"Save Subtitle Mask Settings": "保存字幕遮罩设置",
"Subtitle Mask Left": "左侧位置",
"Subtitle Mask Left Help": "遮罩距离画面左侧的百分比",
"Subtitle Mask Top": "顶部位置",
"Subtitle Mask Top Help": "遮罩距离画面顶部的百分比",
"Subtitle Mask Width": "遮罩宽度",
"Subtitle Mask Width Help": "遮罩覆盖区域的宽度百分比",
"Subtitle Mask Height": "遮罩高度",
"Subtitle Mask Height Help": "遮罩覆盖区域的高度百分比",
"Subtitle Mask Blur Radius": "模糊半径",
"Subtitle Mask Blur Radius Help": "遮罩边缘和背景的模糊强度",
"Subtitle Mask Opacity": "遮罩强度",
"Subtitle Mask Opacity Help": "遮罩融合强度,数值越高越容易遮住原字幕",
"Subtitle Burn Position": "字幕位置",
"Subtitle Burn Position Help": "新字幕距离画面顶部的百分比;预览中的蓝线表示当前字幕位置",
"Subtitle Mask Preview": "原字幕遮罩预览",
"Subtitle Mask Preview Caption": "可上传一段原视频作为预览,也可直接使用当前已选择的原视频;上传内容仅用于预览遮罩位置。",
"Upload Subtitle Mask Preview Video": "上传预览原视频",
"Upload Subtitle Mask Preview Video Help": "仅用于在弹窗中预览遮罩,不会替换生成视频使用的原视频",
"Using Subtitle Mask Preview Video": "当前预览视频: {file}",
"Change Subtitle Mask Preview Video": "更换视频",
"Subtitle Mask Preview Empty": "请上传预览视频,或先在上方选择原视频",
"Subtitle Mask Preview Timeline": "预览时间轴(秒)",
"Subtitle Mask Preview Timeline Help": "拖动到原字幕出现的画面,方便微调遮罩区域",
"Subtitle Mask Preview Frame Caption": "{time} · {orientation} · 红框为遮罩区域,蓝线为字幕位置",
"Subtitle Mask Preview Failed": "无法读取该视频预览,请尝试更换视频文件",
"Enable Auto Transcription": "启用自动转录",
"Enable Auto Transcription Help": "开启后会在最终视频合并完成后,对整条视频转录生成字幕并压入成片",
"Font": "字幕字体",
"Position": "字幕位置",
"Top": "顶部",
@ -80,8 +128,17 @@
"Synthesizing Voice": "语音合成中,请稍候...",
"TTS Provider": "语音合成提供商",
"Hide Log": "隐藏日志",
"Select from resource directory": "从资源目录选择",
"Select a video from resource videos directory": "选择 ./resource/videos 目录中的视频",
"Upload a new video file up to 2GB": "上传一个新的视频文件,限制 2GB",
"Upload new video files up to 2GB each": "上传一个或多个视频文件,单个文件限制 2GB",
"Select Video": "选择视频",
"Choose a video file": "选择一个视频文件",
"Upload Video": "上传视频",
"No video files found in resource videos directory": "未在 ./resource/videos 目录中找到视频文件",
"Upload Local Files": "上传本地文件",
"File Uploaded Successfully": "文件上传成功",
"Selected videos for processing": "已选择 {count} 个视频: {files}",
"timestamp": "时间戳",
"Picture description": "图片描述",
"Narration": "视频文案",
@ -97,23 +154,45 @@
"Failed to Save Script": "保存脚本失败",
"Script saved successfully": "脚本保存成功",
"Video Script": "视频脚本",
"Edit Video Script": "查看/编辑视频脚本",
"Video script row count": "共 {count} 条脚本",
"Video script table help": "在表格中编辑完整脚本 JSON。可新增、删除行保存时会重新校验并写入脚本文件。",
"Raw JSON Preview": "原始 JSON 预览",
"Script Column ID": "序号",
"Script Column Video ID": "视频",
"Script Column Video Name": "视频文件",
"Script Column Timestamp": "时间戳",
"Script Column Picture": "画面描述",
"Script Column Narration": "解说台词",
"Script Column OST": "标记",
"Video Quality": "视频质量",
"Custom prompt for LLM, leave empty to use default prompt": "自定义提示词,留空则使用默认提示词",
"Proxy Settings": "代理设置",
"HTTP_PROXY": "HTTP 代理",
"HTTPs_PROXY": "HTTPS 代理",
"Vision Model Settings": "视频分析模型设置",
"Vision Model Provider": "视频分析模型提供商",
"Vision API Key": "视频分析 API 密钥",
"Vision Base URL": "视频分析接口地址",
"Vision Model Name": "视频分析模型名称",
"Vision Model Settings": "视分析模型设置",
"Vision Model Provider": "接口规范",
"Vision API Key": "视分析 API 密钥",
"Vision Base URL": "视分析接口地址",
"Vision Model Name": "视分析模型名称",
"Text Generation Model Settings": "文案生成模型设置",
"LLM Model Name": "大语言模型名称",
"LLM Model API Key": "大语言模型 API 密钥",
"Text Model Provider": "文案生成模型提供商",
"Text Model Provider": "接口规范",
"Text API Key": "文案生成 API 密钥",
"Text Base URL": "文案生成接口地址",
"Text Model Name": "文案生成模型名称",
"Top P": "Top P",
"Top K": "Top K",
"Max Output Tokens": "最大输出 Token",
"Max Output Tokens Help": "单次生成的最大输出长度0 表示使用服务端默认值",
"Thinking Level": "思考等级",
"Thinking Level Help": "控制推理/思考强度。自动表示不额外发送思考参数,低/中/高会尝试传递 reasoning_effort",
"Thinking Level Auto": "自动",
"Thinking Level Off": "关闭",
"Thinking Level Low": "低",
"Thinking Level Medium": "中",
"Thinking Level High": "高",
"Account ID": "账户 ID",
"Skip the first few seconds": "跳过开头多少秒",
"Difference threshold": "差异阈值",
@ -144,6 +223,48 @@
"Directory cleared": "目录清理完成",
"Directory does not exist": "目录不存在",
"Failed to clear directory": "清理目录失败",
"FFmpeg Engine Detection": "FFmpeg 引擎检测",
"FFmpeg Engine": "FFmpeg 引擎",
"FFmpeg Engine Help": "选择当前应用优先使用的 ffmpeg 可执行文件;会自动发现整合包运行时和本机 PATH 中的 ffmpeg",
"No FFmpeg engines found": "未发现可用 FFmpeg 引擎",
"Custom FFmpeg Path": "自定义 FFmpeg 路径",
"Custom FFmpeg Path Help": "如果下拉框没有列出目标引擎,可以粘贴 ffmpeg 可执行文件的绝对路径",
"Current FFmpeg Engine": "当前生效引擎",
"Save FFmpeg Engine": "保存引擎",
"Test Selected FFmpeg": "检测所选 FFmpeg",
"Testing FFmpeg engine": "正在检测 FFmpeg 引擎...",
"FFmpeg engine saved": "FFmpeg 引擎已保存",
"Selected FFmpeg path is invalid": "所选 FFmpeg 路径无效",
"FFmpeg detection details": "FFmpeg 检测详情",
"FFmpeg source Configured": "已配置",
"FFmpeg source NarratoAI packaged runtime": "NarratoAI 整合包运行时",
"FFmpeg source Integrated runtime": "内置运行时",
"FFmpeg source System PATH": "系统 PATH",
"FFmpeg source Homebrew": "Homebrew",
"FFmpeg source Python environment": "Python 环境",
"FFmpeg source Python executable folder": "Python 可执行目录",
"FFmpeg source IMAGEIO_FFMPEG_EXE": "IMAGEIO_FFMPEG_EXE",
"FFmpeg source imageio-ffmpeg": "imageio-ffmpeg",
"FFmpeg source System": "系统路径",
"Version": "版本",
"Path": "路径",
"Available": "可用",
"Unavailable": "不可用",
"Hardware Acceleration": "硬件加速",
"Subtitle Burn-in": "字幕烧录",
"FFmpeg engine passed all checks": "FFmpeg 引擎检测通过:基础功能、硬件加速和字幕烧录均可用",
"FFmpeg engine works but hardware acceleration is unavailable": "FFmpeg 基础功能和字幕烧录可用,但硬件加速不可用,将使用软件编码",
"FFmpeg engine check failed": "FFmpeg 引擎检测失败",
"Hardware acceleration detail": "硬件加速详情",
"Subtitle burn-in detail": "字幕烧录详情",
"Type": "类型",
"Encoder": "编码器",
"Message": "信息",
"Method": "方式",
"Supported Hardware Methods": "支持的硬件加速方法",
"Subtitle Filters": "字幕滤镜",
"FFmpeg errors": "FFmpeg 错误",
"Raw FFmpeg report": "原始 FFmpeg 报告",
"Subtitle Preview": "字幕预览",
"One-Click Transcribe": "一键转录",
"Transcribing...": "正在转录中...",
@ -156,13 +277,422 @@
"Generate Short Video Script": "AI生成短剧混剪脚本",
"Adjust the volume of the original audio": "调整原始音频的音量",
"Original Volume": "视频音量",
"Auto Generate": "逐帧解说",
"Auto Generate": "逐帧分析",
"Frame Interval (seconds)": "帧间隔 (秒)",
"Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)",
"Batch Size": "批处理大小",
"Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多",
"Short Drama Summary": "短剧解说",
"Video Type": "视频类型",
"Select/Upload Script": "选择/上传脚本"
"Film TV Narration": "影视解说",
"Video Type": "创作类型",
"Select/Upload Script": "自定义脚本",
"Script loaded successfully": "脚本加载成功",
"Failed to load script": "加载脚本失败",
"Failed to save script": "保存脚本失败",
"QwenVL model returned invalid response": "QwenVL 模型返回了无效响应",
"Testing connection...": "正在测试连接...",
"Connection failed": "连接失败",
"TTS engine does not support precise subtitles": "⚠️ {engine} 不支持精确字幕生成",
"Manual subtitle editing recommendation": "💡 建议使用专业剪辑工具如剪映、PR 等)手动添加字幕",
"Disabled subtitles help": "当前 TTS 引擎不支持字幕生成,请使用其他 TTS 引擎",
"Tencent Cloud TTS": "腾讯云 TTS",
"Tongyi Qwen3 TTS": "通义千问 Qwen3 TTS",
"IndexTTS Voice Clone": "IndexTTS-1.5 语音克隆",
"Doubao TTS": "豆包语音 TTS",
"Edge TTS features": "完全免费,但服务稳定性一般,不支持语音克隆功能",
"Edge TTS use case": "测试和轻量级使用",
"Azure Speech Services features": "提供一定免费额度,超出后按量付费,需要绑定海外信用卡",
"Azure Speech Services use case": "企业级应用,需要稳定服务",
"Tencent Cloud TTS features": "提供免费额度,音质优秀,支持多种音色,国内访问速度快",
"Tencent Cloud TTS use case": "个人和企业用户,需要稳定的中文语音合成",
"Tongyi Qwen3 TTS features": "阿里云通义千问语音合成,音质优秀,支持多种音色",
"High-quality Chinese speech synthesis use case": "需要高质量中文语音合成的用户",
"IndexTTS features": "本地/私有部署的 IndexTTS-1.5 语音克隆引擎。选择资源目录音频或上传参考音频后,可按该音色合成旁白。",
"IndexTTS use case": "适合需要固定旁白音色、角色配音或批量生成同一音色视频的场景。使用前请先启动 IndexTTS-1.5 API 服务部署包下载https://pan.quark.cn/s/0767c9bcefd5",
"IndexTTS download link": "下载地址https://pan.quark.cn/s/0767c9bcefd5",
"IndexTTS2 features": "本地/私有部署的 IndexTTS-2 语音克隆引擎,支持情感控制和更完整的生成参数。",
"IndexTTS2 use case": "适合需要固定音色、情绪化旁白或更细致采样控制的本地语音合成场景。使用前请先启动 IndexTTS-2 API 服务。",
"OmniVoice features": "本地/私有部署的 OmniVoice-Pack 多语种语音合成引擎,支持自动音色、指令音色和参考音频克隆。",
"OmniVoice use case": "适合需要本地可控、多语言旁白、音色设计或参考音频克隆的场景。使用前请先启动 OmniVoice-Pack API 服务。",
"Doubao TTS features": "火山引擎豆包语音合成,支持多种音色和情感,国内访问速度快",
"Local Deployment": "本地部署",
"Cloud Service": "云端服务",
"Select TTS Engine": "选择 TTS 引擎",
"Select TTS Engine Help": "选择您要使用的文本转语音引擎",
"TTS Engine Details": "{engine} 详细说明",
"Features": "特点",
"Use Case": "适用场景",
"Registration URL": "注册地址",
"Voice Selection": "音色选择",
"Select Edge TTS Voice": "选择 Edge TTS 音色",
"Edge TTS Voice Description": "Edge TTS 音色说明",
"Loaded voice count": "已加载 {count} 个音色",
"Female Voice": "女声",
"Male Voice": "男声",
"Voice Volume": "音量调节",
"Voice Volume Help Percent": "调节语音音量 (0-100)",
"Voice Rate": "语速调节",
"Voice Rate Help 0.5-2.0": "调节语音速度 (0.5-2.0 倍速)",
"Voice Pitch": "语调调节",
"Voice Pitch Help Percent": "调节语音音调 (-50% 到 +50%)",
"Service Region": "服务区域",
"Service Region Placeholder": "例如eastus",
"Azure Service Region Help": "Azure Speech Services 服务区域eastus、westus2、eastasia 等",
"Azure Speech Key Help": "Azure Speech Services API 密钥",
"Voice Name": "音色名称",
"Azure Voice Name Help": "输入 Azure Speech Services 音色名称直接使用官方音色名称即可。例如zh-CN-YunzeNeural",
"Common Voice Reference": "常用音色参考",
"Chinese Voices": "中文音色",
"English Voices": "英文音色",
"Multilingual": "多语言",
"Azure Voices Docs Notice": "更多音色请参考 [Azure Speech Services 官方文档](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support)",
"Quick Select": "快速选择",
"Chinese Female Voice": "中文女声",
"Chinese Male Voice": "中文男声",
"English Female Voice": "英文女声",
"Voice name valid": "音色名称有效: {voice}",
"Voice name format may be invalid": "音色名称格式可能不正确: {voice}",
"Azure voice name format notice": "Azure 音色名称通常格式为: [语言]-[地区]-[名称]Neural",
"Azure Speech Services configured": "Azure Speech Services 配置已设置",
"Please configure service region": "请配置服务区域",
"Please configure API Key": "请配置 API Key",
"Language": "界面语言",
"Task failed": "任务失败",
"Script file cannot be empty": "脚本文件不能为空",
"Video file cannot be empty": "视频文件不能为空",
"Export to Jianying Draft": "📤 导出到剪映草稿",
"Please configure Jianying draft folder in basic settings": "请在基础设置中配置剪映草稿地址",
"Jianying draft folder does not exist": "剪映草稿文件夹不存在: {path}",
"Jianying export dialog title": "确认草稿名称",
"Jianying export dialog description": "导出前请确认剪映草稿名称,完成后可在剪映草稿目录中打开。",
"Jianying export destination": "保存目录",
"Jianying draft name": "草稿名称",
"Please enter Jianying draft name": "请输入剪映草稿名称",
"Confirm Export": "确认导出",
"Please enter draft name": "请输入草稿名称",
"Failed to build parameters": "参数构建失败",
"Exporting to Jianying draft...": "正在导出到剪映草稿,请稍候...",
"Jianying draft exported successfully": "✅ 成功导出到剪映草稿: {name}",
"Draft saved to": "📁 草稿已保存到: {path}",
"Failed to export Jianying draft": "❌ 导出到剪映草稿失败",
"Cancel": "取消",
"LLM initialization failed": "⚠️ LLM 初始化失败: {error}\n\n请检查配置文件和依赖是否正确安装。",
"Jianying Draft Settings": "剪映草稿设置",
"Jianying Draft Folder Path": "剪映草稿文件夹路径",
"Jianying Draft Folder Path Help": "剪映草稿文件夹路径例如C:\\Users\\用户名\\Documents\\JianyingPro Drafts",
"Custom API endpoint help": "OpenAI 兼容接口地址。使用第三方或自建网关时填写完整 /v1 地址;使用 OpenAI 官方接口可留空。",
"Recommended API endpoint": "推荐接口地址",
"OpenAI compatible gateway help": "{model_type} 使用 OpenAI 兼容接口,请填写完整的接口地址。",
"Vision model": "视觉分析模型",
"Text model": "文案生成模型",
"Model Name Input Help": "输入完整模型名称\n\n常用示例:",
"OpenAI compatible providers help": "这里不限定模型厂商OpenAI、DeepSeek、OpenRouter、SiliconFlow 或自建网关均可,只需提供兼容 OpenAI 的接口地址和模型名称。",
"OpenAI compatible protocol": "OpenAI 兼容",
"OpenAI compatible protocol help": "不是限定 OpenAI 官方模型;只要模型服务支持 OpenAI Chat Completions 兼容接口即可。",
"Provider API Key Help": "模型服务的 API 密钥\n\n常见获取地址:",
"Please fill OpenAI compatible gateway": "请在上方填写 OpenAI 兼容网关地址,例如:{example}",
"Please enter API key": "请先输入 API 密钥",
"Please enter model name": "请先输入模型名称",
"Connection test error": "测试连接时发生错误",
"Vision model config saved": "视觉分析模型配置已保存OpenAI 兼容)",
"Text model config saved": "文案生成模型配置已保存OpenAI 兼容)",
"Failed to save config": "保存配置失败",
"Custom Position (% from top)": "自定义位置(距顶部百分比)",
"Please enter a value between 0 and 100": "请输入 0 到 100 之间的值",
"Please enter a valid number": "请输入有效数字",
"None": "无",
"Uploaded subtitle": "已上传字幕: {file}",
"Encoding": "编码",
"Size": "大小",
"Characters": "字符",
"Ali Bailian Fun-ASR Subtitle Transcription": "字幕处理",
"Subtitle Processing Method": "字幕处理方式",
"Fun-ASR Backend": "Fun-ASR 后端",
"Local FunASR-Pack API": "FunASR(本地部署)",
"Local FireRedASR API": "FireRedASR2(本地部署)",
"Ali Bailian Online Fun-ASR": "FunASR(在线服务)",
"Local Fun-ASR upload caption": "将使用上方当前视频,通过本机运行的 FunASR-Pack API 生成 SRT 字幕。",
"Local FireRed-ASR upload caption": "将使用上方当前视频,通过本机运行的 FireRedASR2-AED-Pack API 生成 SRT 字幕。",
"Fun-ASR upload caption": "将使用上方当前视频,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。",
"Auto Transcription Local Caption": "将在最终视频合并完成后,通过本机运行的 FunASR-Pack API 生成 SRT 字幕。",
"Auto Transcription FireRed Caption": "将在最终视频合并完成后,通过本机运行的 FireRedASR2-AED-Pack API 生成 SRT 字幕。",
"Auto Transcription Online Caption": "将在最终视频合并完成后,自动上传到阿里百炼临时存储并通过 fun-asr 生成 SRT 字幕。",
"Local FunASR-Pack API URL": "本地 FunASR-Pack API 地址",
"Local FunASR-Pack API URL Help": "例如 http://127.0.0.1:7860也可以直接填到 /asr 的完整地址。",
"Local FireRedASR API URL": "本地ASR API 地址",
"Local FireRedASR API URL Help": "例如 http://127.0.0.1:7867也可以直接填到 /asr 的完整地址。",
"Fun-ASR Hotword": "热词",
"Fun-ASR Hotword Help": "可选,传给本地 FunASR-Pack 的热词参数。",
"Enable speaker diarization": "启用说话人分段",
"Enable speaker diarization Help": "需要本地 FunASR-Pack 已启用并加载 spk 模型。",
"API Key URL": "API Key 获取地址",
"Ali Bailian API Key": "阿里百炼 API Key",
"Ali Bailian API Key Help": "请输入你自己的阿里百炼 API Key保存配置后会写入本地 config.toml",
"Upload media to transcribe": "上传需要转录的音频/视频",
"Using selected video for subtitle transcription": "将使用当前视频生成字幕: {file}",
"Using selected videos for subtitle transcription": "将使用当前 {count} 个视频生成字幕: {files}",
"Please select or upload a video first": "请先在上方选择或上传视频文件",
"Selected video file does not exist": "当前视频文件不存在,请重新选择或上传",
"Selected video files do not exist": "以下视频文件不存在,请重新选择或上传: {files}",
"Transcribe subtitles": "转录字幕",
"Calibrate subtitles": "校准字幕",
"Please enter Ali Bailian API Key": "请先输入阿里百炼 API Key",
"Please enter local FunASR-Pack API URL": "请先输入本地 FunASR-Pack API 地址",
"Please enter local FireRedASR API URL": "请先输入本地ASR API 地址",
"Please upload media to transcribe": "请先上传需要转录的音频或视频文件",
"Transcribing with local FunASR-Pack...": "正在使用本地 FunASR-Pack 转写字幕,请稍候...",
"Transcribing with local FireRedASR...": "正在使用本地ASR转写字幕请稍候...",
"Transcribing with Fun-ASR...": "正在使用阿里百炼 Fun-ASR 转写字幕,请稍候...",
"Fun-ASR failed without subtitle file": "Fun-ASR 转写失败:未生成字幕文件",
"Subtitle transcription succeeded": "字幕转写成功: {file}",
"Subtitle transcription succeeded for multiple files": "字幕转写成功,共 {count} 个文件: {files}",
"Calibrating subtitles...": "正在使用大模型校准字幕,请稍候...",
"Subtitle calibration succeeded": "字幕校准成功: {file}",
"Subtitle calibration succeeded for multiple files": "字幕校准成功,共 {count} 个文件: {files}",
"Subtitle calibration failed": "字幕校准失败",
"Transcribed subtitles storage hint": "之前转录生成的字幕保存在 {path},可从该目录拖入上传",
"Tavily Search Settings": "Tavily 联网搜索",
"Tavily API Key": "Tavily API Key",
"Tavily API Key Help": "用于剧情理解前的联网检索。开启“联网搜索”后,会先按作品名称检索剧情、人物和背景信息,再结合字幕分析。",
"Tavily config saved": "Tavily 配置已保存",
"联网搜索": "联网搜索",
"Enable Web Search Help": "开启后,剧情理解会先使用 Tavily 按作品名称联网检索,再结合检索结果和字幕分析剧情。",
"Please configure Tavily API Key in Basic Settings": "请先在基础设置中配置 Tavily API Key",
"Please enter short drama name before web search": "开启联网搜索前,请先填写短剧名称",
"Please enter film/tv title before web search": "开启联网搜索前,请先填写影视名称",
"Searching short drama with Tavily...": "正在使用 Tavily 检索短剧信息...",
"Tavily search failed": "Tavily 检索失败",
"剧情理解": "剧情理解",
"剧情理解结果": "剧情理解结果",
"Analyzing plot...": "正在理解剧情...",
"Plot analysis completed": "剧情理解完成",
"Please generate or upload subtitles first": "请先转写或上传字幕",
"Please transcribe or upload subtitles first": "请先转写或上传字幕",
"Fun-ASR transcription failed": "Fun-ASR 字幕转写失败",
"Validating script format...": "正在验证脚本格式...",
"Script format validation failed": "脚本格式验证失败",
"Error Message": "错误信息",
"Details": "详细说明",
"Correct script format example": "正确的脚本格式示例",
"Script format validation error": "格式验证过程中发生错误",
"Script validated and saved successfully": "✅ 脚本格式验证通过,保存成功!",
"Tencent Secret ID Help": "请输入您的腾讯云 Secret ID",
"Tencent Secret Key Help": "请输入您的腾讯云 Secret Key",
"Tencent Service Region Help": "选择腾讯云 TTS 服务地域",
"Custom Voice": "自定义音色",
"Select Tencent TTS Voice": "选择腾讯云 TTS 音色",
"Tencent Cloud TTS Voice Description": "腾讯云 TTS 音色说明",
"Female Voices": "女声音色",
"Male Voices": "男声音色",
"Tencent More Voices Notice": "更多音色请参考腾讯云官方文档",
"Qwen DashScope API Key Help": "通义千问 DashScope API Key",
"TTS Model Name": "模型名称",
"Qwen TTS Model Help": "Qwen TTS 模型名,例如 qwen3-tts-flash",
"Select Qwen3 TTS Voice": "选择 Qwen3 TTS 音色",
"API URL": "API 地址",
"IndexTTS API URL Help": "IndexTTS-1.5 API 服务地址",
"IndexTTS2 API URL Help": "IndexTTS-2 API 服务地址,可填写服务根地址或完整 /tts 地址",
"OmniVoice API URL Help": "OmniVoice-Pack API 服务地址,可填写服务根地址或完整 /tts 地址",
"OmniVoice Language Code": "合成语言",
"OmniVoice Language Code Help": "传给 OmniVoice-Pack 的 language 参数,例如 zh、en。",
"OmniVoice Generation Mode": "生成模式",
"OmniVoice Generation Mode Help": "自动音色无需额外参数;指令音色使用描述词;参考音频克隆需要参考音频和对应文本。",
"OmniVoice Mode Auto": "自动音色",
"OmniVoice Mode Voice Design": "指令音色",
"OmniVoice Mode Voice Clone": "参考音频克隆",
"OmniVoice Instruct": "音色指令",
"OmniVoice Instruct Help": "描述希望生成的音色,例如性别、音高、口音或风格。",
"OmniVoice Instruct Placeholder": "例如female, low pitch, british accent",
"OmniVoice Reference Text": "参考音频文本",
"OmniVoice Reference Text Help": "参考音频对应的逐字文本;当前部署未启用 ASR 时必须填写。",
"OmniVoice Reference Text Placeholder": "请输入参考音频中实际朗读的内容",
"OmniVoice Num Step Help": "扩散生成步数,值越大通常质量更高但速度更慢。",
"OmniVoice Guidance Scale Help": "控制文本条件的引导强度。",
"OmniVoice Duration": "目标时长(秒)",
"OmniVoice Duration Help": "0 表示由模型自动决定时长。",
"OmniVoice Denoise": "启用降噪",
"OmniVoice Denoise Help": "让 OmniVoice-Pack 对生成结果执行降噪处理。",
"OmniVoice Postprocess Output": "后处理输出",
"OmniVoice Postprocess Output Help": "启用 OmniVoice-Pack 的输出后处理。",
"OmniVoice Preprocess Prompt": "预处理文本",
"OmniVoice Preprocess Prompt Help": "启用 OmniVoice-Pack 的文本预处理。",
"Reference Audio Source": "参考音频来源",
"Reference Audio Source Help": "选择从资源目录选择参考音频,或上传新的参考音频",
"Select from Resource Directory": "从资源目录选择",
"Upload Reference Audio": "上传参考音频",
"Reference Audio Path": "参考音频",
"Reference Audio Path Help": "选择用于语音克隆的参考音频WAV/MP3 格式,建议 3-10 秒)",
"No Reference Audio Resources Found": "未找到资源目录中的参考音频,请上传参考音频文件",
"Preview Reference Audio": "试听",
"Preview Reference Audio Help": "播放当前参考音频",
"Upload Reference Audio File": "上传参考音频文件",
"Upload Reference Audio Help": "上传一段清晰的音频用于语音克隆",
"Audio uploaded": "音频已上传: {path}",
"Inference Mode": "推理模式",
"Standard Inference": "普通推理",
"Fast Inference": "快速推理",
"Inference Mode Help": "普通推理质量更高但速度较慢,快速推理速度更快但质量略低",
"Advanced Parameters": "高级参数",
"Sampling Temperature": "采样温度 (Temperature)",
"Sampling Temperature Help": "控制随机性,值越高输出越随机,值越低越确定",
"Top P Help": "nucleus 采样的概率阈值,值越小结果越确定",
"Top K Help": "top-k 采样的 k 值0 表示不使用 top-k",
"Num Beams": "束搜索 (Num Beams)",
"Num Beams Help": "束搜索的 beam 数量,值越大质量可能越好但速度越慢",
"Repetition Penalty": "重复惩罚 (Repetition Penalty)",
"Repetition Penalty Help": "值越大越能避免重复,但过大可能导致不自然",
"Enable Sampling": "启用采样",
"Enable Sampling Help": "启用采样可以获得更自然的语音",
"IndexTTS Usage Instructions Title": "IndexTTS-1.5 使用说明",
"IndexTTS Usage Instructions": "**零样本语音克隆**\n\n1. **准备参考音频**:上传或指定一段清晰的音频文件(建议 3-10 秒)\n2. **设置 API 地址**:确保 IndexTTS-1.5 服务正常运行\n3. **开始合成**:系统会自动使用参考音频的音色合成新语音\n\n**注意事项**\n- 参考音频质量直接影响合成效果\n- 建议使用无背景噪音的清晰音频\n- 文本长度建议控制在合理范围内\n- 首次合成可能需要较长时间",
"IndexTTS2 Emotion Parameters": "情感参数",
"Emotion Mode": "情感控制方式",
"Emotion Mode Help": "选择 IndexTTS-2 的情感控制来源",
"Emotion Mode Speaker": "与音色参考相同",
"Emotion Mode Audio": "使用情感参考音频",
"Emotion Mode Vector": "使用情感向量",
"Emotion Mode Text": "使用情感描述文本",
"Emotion Alpha": "情感权重",
"Emotion Alpha Help": "控制情感条件的影响强度0 表示弱1 表示强",
"Emotion Reference Audio Path": "情感参考音频路径",
"Emotion Reference Audio Path Help": "emotion_mode=audio 时使用的本地情感参考音频路径",
"Emotion Text": "情感描述文本",
"Emotion Text Help": "emotion_mode=text 时使用的情感描述,例如开心、紧张、委屈",
"Emotion Text Placeholder": "例如:沉稳、紧张、开心",
"Use Random Emotion": "启用随机情感",
"Use Random Emotion Help": "让 IndexTTS-2 在生成时使用随机情感采样",
"Emotion Happy": "开心",
"Emotion Angry": "愤怒",
"Emotion Sad": "悲伤",
"Emotion Afraid": "害怕",
"Emotion Disgusted": "厌恶",
"Emotion Melancholic": "忧郁",
"Emotion Surprised": "惊讶",
"Emotion Calm": "平静",
"Max Text Tokens Per Segment": "单段最大文本 Token",
"Max Text Tokens Per Segment Help": "IndexTTS-2 分段推理的最大文本 token 数",
"Max Mel Tokens": "最大 Mel Tokens",
"Max Mel Tokens Help": "控制单次生成的最大 mel token 数,值越大可生成更长音频",
"IndexTTS2 Usage Instructions Title": "IndexTTS-2 使用说明",
"IndexTTS2 Usage Instructions": "**IndexTTS-2 语音克隆**\n\n1. **选择音色**:复用 IndexTTS-1.5 的资源音频或上传参考音频\n2. **设置 API 地址**:例如 http://192.168.3.6:7863/tts也可以填写服务根地址\n3. **调整情感参数**:默认使用 speaker可按需切换到 audio、vector 或 text\n4. **调整生成参数**temperature、top_p、top_k、num_beams、repetition_penalty 和 max_mel_tokens 会直接传给 IndexTTS-2 接口\n\n**注意事项**\n- 参考音频质量会直接影响克隆效果\n- 首次请求可能需要加载模型,耗时更长\n- CPU 部署生成速度会明显慢于 GPU",
"OmniVoice Usage Instructions Title": "OmniVoice 使用说明",
"OmniVoice Usage Instructions": "**OmniVoice-Pack 语音合成**\n\n1. **自动音色**:只需要设置 API 地址和语言,可直接合成。\n2. **指令音色**:填写 instruct 描述想要的性别、音高、口音或风格。\n3. **参考音频克隆**:上传或选择参考音频,并填写该音频对应文本。\n\n**注意事项**\n- 当前默认服务地址为 http://127.0.0.1:7866/tts\n- 参考音频克隆在服务未加载 ASR 模型时必须填写参考文本\n- OmniVoice 返回 WAV 音频,系统会按音频时长估算字幕段落",
"Volcengine Access Key Help": "火山引擎 Access Key",
"Volcengine Secret Key Help": "火山引擎 Secret Key",
"Doubao AppID Help": "豆包语音应用 AppID",
"Doubao Token Help": "豆包语音应用 Token",
"Cluster": "集群",
"Doubao Cluster Help": "业务集群,标准音色使用 volcano_tts",
"Select Doubao TTS Voice": "选择豆包语音 TTS 音色",
"Voice Rate Help 0.2-3.0": "调节语音速度 (0.2-3.0)",
"Voice Volume Help 0.1-2.0": "调节语音音量 (0.1-2.0)",
"Voice Pitch Help 0.5-1.5": "调节语音音高 (0.5-1.5)",
"Sentence Silence Duration": "句尾静音时长 (秒)",
"Sentence Silence Duration Help": "调节句尾静音时长 (0.0-2.0 秒)",
"Doubao TTS API Key Application Process": "豆包语音 TTS API Key申请流程",
"Application Steps": "申请步骤",
"Doubao TTS Step 1": "1. 打开 [https://console.volcengine.com/iam/keymanage](https://console.volcengine.com/iam/keymanage)",
"Doubao TTS Step 2": "2. 新建 Access Key 和 Secret Key",
"Doubao TTS Step 3": "3. 打开 [https://www.volcengine.com/product/voice-tech](https://www.volcengine.com/product/voice-tech)",
"Doubao TTS Step 4": "4. 点击立即使用",
"Doubao TTS Step 5": "5. 在最左边的 API 服务中心找到音频生成下面的语音合成(注意:是语音合成,不是语音合成大模型)",
"Doubao TTS Step 6": "6. 翻到最下面获取 APPID 和 Access Token",
"Doubao TTS Fill Credentials Notice": "请将获取到的 Access Key、Secret Key、AppID 和 Token 填写到上方的配置中",
"Doubao TTS configured": "豆包语音 TTS 配置已设置",
"Please configure missing fields": "请配置: {fields}",
"Preview Voice Synthesis": "试听语音合成",
"Voice Preview Sample": "感谢关注 NarratoAI有任何问题或建议可以加入社区频道求助或讨论",
"Please configure voice settings first": "请先配置语音设置",
"Voice synthesis successful": "语音合成成功!",
"Voice synthesis failed": "语音合成失败,请检查配置",
"SoulVoice pitch not supported": "SoulVoice 引擎不支持音调调节",
"上传字幕文件": "上传字幕",
"清除已上传字幕": "清除已上传字幕",
"无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312": "无法读取字幕文件,请检查文件编码(支持 UTF-8、UTF-16、GBK、GB2312",
"字幕文件内容似乎为空,请检查文件": "字幕文件内容似乎为空,请检查文件",
"字幕上传成功": "字幕上传成功",
"短剧名称": "短剧名称",
"影视名称": "影视名称",
"解说语言": "解说语言",
"自定义解说语言": "自定义解说语言",
"例如:意大利语(意大利)": "例如:意大利语(意大利)",
"请输入自定义解说语言": "请输入自定义解说语言",
"简体中文(中国)": "简体中文(中国)",
"英语(美国)": "英语(美国)",
"日语(日本)": "日语(日本)",
"韩语(韩国)": "韩语(韩国)",
"法语(法国)": "法语(法国)",
"德语(德国)": "德语(德国)",
"西班牙语(西班牙)": "西班牙语(西班牙)",
"葡萄牙语(巴西)": "葡萄牙语(巴西)",
"俄语(俄罗斯)": "俄语(俄罗斯)",
"自定义": "自定义",
"短剧类型": "短剧类型",
"自定义短剧类型": "自定义短剧类型",
"影视类型": "影视类型",
"自定义影视类型": "自定义影视类型",
"原片占比": "原片占比",
"例如:豪门虐恋": "例如:豪门虐恋",
"例如:悬疑犯罪": "例如:悬疑犯罪",
"请输入自定义短剧类型": "请输入自定义短剧类型",
"请输入自定义影视类型": "请输入自定义影视类型",
"逆袭/复仇": "逆袭/复仇",
"霸总/甜宠": "霸总/甜宠",
"家庭伦理": "家庭伦理",
"古装/权谋": "古装/权谋",
"悬疑/犯罪": "悬疑/犯罪",
"都市情感": "都市情感",
"年代/乡村": "年代/乡村",
"剧情/情感": "剧情/情感",
"动作/冒险": "动作/冒险",
"喜剧/轻松": "喜剧/轻松",
"科幻/奇幻": "科幻/奇幻",
"历史/战争": "历史/战争",
"恐怖/惊悚": "恐怖/惊悚",
"生成解说文案": "生成解说文案",
"生成剪辑脚本": "生成剪辑脚本",
"短剧解说文案": "短剧解说文案",
"影视解说文案": "影视解说文案",
"Narration Copy Help": "先点击生成解说文案;审核、删改或重写这段文案后,再点击生成剪辑脚本匹配画面和时间戳。",
"Narration copy generated successfully": "解说文案已生成,可先审核修改",
"生成短剧解说脚本": "生成短剧解说脚本",
"请输入视频脚本": "请输入视频脚本",
"自定义片段": "自定义片段",
"设置需要生成的短视频片段数量": "设置需要生成的短视频片段数量",
"原生Gemini模型连接成功": "原生 Gemini 模型连接成功",
"原生Gemini模型连接失败": "原生 Gemini 模型连接失败",
"OpenAI兼容Gemini代理连接成功": "OpenAI 兼容 Gemini 代理连接成功",
"OpenAI兼容Gemini代理连接失败": "OpenAI 兼容 Gemini 代理连接失败",
"Progress": "进度",
"Generating script...": "正在生成脚本...",
"Please select video file first": "请先选择视频文件",
"Extracting keyframes...": "正在提取关键帧...",
"Script generation completed": "脚本生成完成",
"Script generation completed!": "🎉 脚本生成完成!",
"Video script generated successfully": "✅ 视频脚本生成成功!",
"Generation error": "❌ 生成过程中发生错误",
"Please upload subtitle file first": "请先上传字幕文件",
"Video": "视频",
"Subtitle": "字幕",
"Preparing script generation": "开始准备生成脚本",
"Script generation failed check logs": "生成脚本失败,请检查日志",
"Parsing subtitles...": "正在解析字幕...",
"Analyzing subtitles with model...": "正在等待模型分析字幕...",
"Subtitle file does not exist": "字幕文件不存在",
"Subtitle file is empty or unreadable": "字幕文件内容为空或无法读取",
"Generating narration copy...": "正在生成文案...",
"Generated narration copy is empty": "生成的解说文案为空",
"Please generate and review narration copy first": "请先生成并审核解说文案",
"Matching narration copy to footage...": "正在根据解说文案匹配画面和时间戳...",
"Waiting for model stream...": "正在等待模型流式输出...",
"Streaming unavailable fallback waiting...": "当前接口未返回流式内容,正在等待完整响应...",
"LLM stream window title": "模型思考 / 输出流",
"Model reasoning stream": "【模型思考】",
"Model output preview": "【模型输出预览】",
"Repairing narration script...": "正在修复解说脚本...",
"Generated narration JSON parse failed": "生成的解说文案格式错误,无法解析为 JSON",
"Generated narration missing items field": "生成的解说文案缺少必要的 'items' 字段",
"Generated narration validation failed": "生成的解说脚本校验失败",
"Preparing output...": "整理输出..."
}
}
}

View File

@ -24,7 +24,7 @@ def _normalize_progress_value(progress: float | int) -> int:
return max(0, min(100, int(round(value))))
def generate_script_docu(params):
def generate_script_docu(params, tr=lambda key: key):
"""
生成纪录片视频脚本
要求: 原视频无字幕无配音
@ -39,12 +39,12 @@ def generate_script_docu(params):
if message:
status_text.text(f"🎬 {message}")
else:
status_text.text(f"📊 进度: {normalized_progress}%")
status_text.text(f"📊 {tr('Progress')}: {normalized_progress}%")
try:
with st.spinner("正在生成脚本..."):
with st.spinner(tr("Generating script...")):
if not params.video_origin_path:
st.error("请先选择视频文件")
st.error(tr("Please select video file first"))
return
vision_llm_provider = (
@ -76,7 +76,7 @@ def generate_script_docu(params):
"vision_max_concurrency", 2
)
update_progress(10, "正在提取关键帧...")
update_progress(10, tr("Extracting keyframes..."))
service = DocumentaryFrameAnalysisService()
script_items = asyncio.run(
service.generate_documentary_script(
@ -100,15 +100,15 @@ def generate_script_docu(params):
st.session_state["video_clip_json"] = script
elif isinstance(script, str):
st.session_state["video_clip_json"] = json.loads(script)
update_progress(100, "脚本生成完成")
update_progress(100, tr("Script generation completed"))
time.sleep(0.1)
progress_bar.progress(100)
status_text.text("🎉 脚本生成完成!")
st.success("✅ 视频脚本生成成功!")
status_text.text(tr("Script generation completed!"))
st.success(tr("Video script generated successfully"))
except Exception as err:
st.error(f"❌ 生成过程中发生错误: {str(err)}")
st.error(f"{tr('Generation error')}: {str(err)}")
logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
finally:
time.sleep(2)

View File

@ -27,21 +27,21 @@ def generate_script_short(tr, params, custom_clips=5):
if message:
status_text.text(f"{progress}% - {message}")
else:
status_text.text(f"进度: {progress}%")
status_text.text(f"{tr('Progress')}: {progress}%")
try:
with st.spinner("正在生成脚本..."):
with st.spinner(tr("Generating script...")):
# ========== 严格验证:必须上传视频和字幕(与短剧解说保持一致)==========
# 1. 验证视频文件
video_path = getattr(params, "video_origin_path", None)
if not video_path or not str(video_path).strip():
st.error("请先选择视频文件")
st.error(tr("Please select video file first"))
st.stop()
try:
ensure_existing_file(
str(video_path),
label="视频",
label=tr("Video"),
allowed_exts=(".mp4", ".mov", ".avi", ".flv", ".mkv"),
)
except InputValidationError as e:
@ -51,13 +51,13 @@ def generate_script_short(tr, params, custom_clips=5):
# 2. 验证字幕文件(移除推断逻辑,必须上传)
subtitle_path = st.session_state.get("subtitle_path")
if not subtitle_path or not str(subtitle_path).strip():
st.error("请先上传字幕文件")
st.error(tr("Please upload subtitle file first"))
st.stop()
try:
subtitle_path = ensure_existing_file(
str(subtitle_path),
label="字幕",
label=tr("Subtitle"),
allowed_exts=(".srt",),
)
except InputValidationError as e:
@ -78,7 +78,7 @@ def generate_script_short(tr, params, custom_clips=5):
vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name') or config.app.get(f'vision_{vision_llm_provider}_model_name', "")
vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url') or config.app.get(f'vision_{vision_llm_provider}_base_url', "")
update_progress(20, "开始准备生成脚本")
update_progress(20, tr("Preparing script generation"))
# ========== 调用后端生成脚本 ==========
from app.services.SDP.generate_script_short import generate_script_result
@ -103,7 +103,7 @@ def generate_script_short(tr, params, custom_clips=5):
)
if result.get("status") != "success":
st.error(result.get("message", "生成脚本失败,请检查日志"))
st.error(result.get("message", tr("Script generation failed check logs")))
st.stop()
script = result.get("script")
@ -114,14 +114,14 @@ def generate_script_short(tr, params, custom_clips=5):
elif isinstance(script, str):
st.session_state['video_clip_json'] = json.loads(script)
update_progress(80, "脚本生成完成")
update_progress(80, tr("Script generation completed"))
time.sleep(0.1)
progress_bar.progress(100)
status_text.text("脚本生成完成!")
st.success("视频脚本生成成功!")
status_text.text(tr("Script generation completed!"))
st.success(tr("Video script generated successfully"))
except Exception as err:
progress_bar.progress(100)
st.error(f"生成过程中发生错误: {str(err)}")
st.error(f"{tr('Generation error')}: {str(err)}")
logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")

View File

@ -11,18 +11,92 @@ import os
import json
import time
import traceback
import html
import streamlit as st
from loguru import logger
from app.config import config
from app.services.SDE.short_drama_explanation import analyze_subtitle, generate_narration_script
from app.services.SDE.short_drama_explanation import (
analyze_subtitle,
generate_narration_copy as generate_narration_copy_legacy,
match_narration_copy_to_script as match_narration_copy_to_script_legacy,
)
from app.services.subtitle_text import read_subtitle_text
from app.services.short_drama_narration_validation import (
normalize_script_video_sources,
)
from app.services.tavily_search import TavilySearchError, format_search_context, search_story_context
# 导入新的LLM服务模块 - 确保提供商被注册
import app.services.llm # 这会触发提供商注册
from app.services.llm.migration_adapter import SubtitleAnalyzerAdapter
import re
# Keys kept in each script item by _strip_planner_only_fields; every other key is dropped.
PUBLIC_SCRIPT_FIELDS = ["_id", "video_id", "video_name", "timestamp", "picture", "narration", "OST"]
# Default `prompt_category` for the plot-analysis / narration helpers in this module.
SHORT_DRAMA_PROMPT_CATEGORY = "short_drama_narration"
# NOTE(review): not referenced in the visible part of this module; presumably the
# film/TV counterpart supplied by callers as `prompt_category` -- confirm.
FILM_TV_PROMPT_CATEGORY = "film_tv_narration"
# Default `search_keywords` forwarded to search_story_context for the Tavily lookup.
SHORT_DRAMA_SEARCH_KEYWORDS = "短剧 剧情 介绍 人物 结局"
# NOTE(review): not referenced in the visible part of this module; presumably the
# film/TV counterpart supplied by callers as `search_keywords` -- confirm.
FILM_TV_SEARCH_KEYWORDS = "影视 剧情 介绍 人物 结局 电影 电视剧"
def _normalize_paths(paths):
if isinstance(paths, str):
paths = [paths]
if not paths:
return []
normalized_paths = []
seen = set()
for path in paths:
if not isinstance(path, str):
continue
path = path.strip()
if not path or path in seen:
continue
normalized_paths.append(path)
seen.add(path)
return normalized_paths
def _build_combined_subtitle_content(subtitle_paths, video_paths=None):
    """Merge the text of several subtitle files into one labelled document.

    Each subtitle file becomes a section that starts with a header naming its
    1-based position, the matching video file (when one exists at the same
    position in *video_paths*) and the subtitle file name, followed by the
    subtitle text. Sections are separated by a blank line.

    Args:
        subtitle_paths: One subtitle path or an iterable of them.
        video_paths: Optional video path(s), matched to subtitles by position.

    Returns:
        The combined text, or ``""`` when no subtitle file exists or none of
        them contains any text.
    """
    video_paths = _normalize_paths(video_paths)
    sections = []
    for index, subtitle_path in enumerate(_normalize_paths(subtitle_paths), start=1):
        if not os.path.exists(subtitle_path):
            continue

        subtitle_text = read_subtitle_text(subtitle_path).text
        # Skip files without any text. Emitting a header-only section would make
        # the combined result non-empty and defeat the callers' "subtitle file is
        # empty or unreadable" check.
        if not str(subtitle_text or "").strip():
            continue

        # `index` counts every normalized subtitle path, including skipped ones,
        # so the positional pairing with the video list stays stable.
        video_path = video_paths[index - 1] if index <= len(video_paths) else ""
        header = f"# 视频 {index}"
        if video_path:
            header += f": {os.path.basename(video_path)}"
        header += f"\n字幕文件: {os.path.basename(subtitle_path)}"
        sections.append(f"{header}\n{subtitle_text}".strip())
    return "\n\n".join(sections)
def _normalize_narration_items_video_sources(items, video_paths):
    """Pass *items* and the cleaned video path list to normalize_script_video_sources."""
    cleaned_video_paths = _normalize_paths(video_paths)
    return normalize_script_video_sources(items, cleaned_video_paths)
def _strip_planner_only_fields(items):
    """Reduce every dict in *items* to the keys listed in PUBLIC_SCRIPT_FIELDS.

    Non-dict entries are dropped. Keys keep the order of PUBLIC_SCRIPT_FIELDS,
    and a key missing from an item is simply omitted.
    """
    public_items = []
    for entry in items:
        if not isinstance(entry, dict):
            continue
        public_items.append(
            {key: entry[key] for key in PUBLIC_SCRIPT_FIELDS if key in entry}
        )
    return public_items
def _format_progress_status(progress, message: str = "", tr=lambda key: key):
message = str(message or "").strip()
if message:
return message
return f"{tr('Progress')}: {progress}%"
def parse_and_fix_json(json_string):
"""
解析并修复JSON字符串
@ -114,55 +188,382 @@ def parse_and_fix_json(json_string):
logger.debug(f"综合修复失败: {e}")
pass
# 如果所有方法都失败,尝试创建一个基本的结构
# 如果所有方法都失败,直接返回 None避免生成不可剪辑的默认假脚本
logger.error(f"所有JSON解析方法都失败原始内容: {json_string[:200]}...")
return None
def _get_tavily_api_key() -> str:
    """Return the stripped Tavily API key, or ``""`` when none is set.

    The Streamlit session value wins; the app config is consulted only when
    the session has no truthy value.
    """
    session_key = st.session_state.get("tavily_api_key")
    if session_key:
        return session_key.strip()
    configured_key = config.app.get("tavily_api_key")
    return (configured_key or "").strip()
def _build_tavily_context(
title: str,
tr=lambda key: key,
search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS,
empty_title_message_key: str = "Please enter short drama name before web search",
) -> str | None:
"""Search Tavily for *title* and return the formatted search context.

Shows an ``st.error`` and returns ``None`` when the title is blank, when no
Tavily API key is available, or when the search raises.

Args:
    title: Name of the work to look up; blank values are rejected.
    tr: Translation callable applied to the UI message keys.
    search_keywords: Forwarded to ``search_story_context``.
    empty_title_message_key: Message key used when the title is blank; its
        translation is also passed on as ``empty_name_message``.
"""
title = str(title or "").strip()
if not title:
st.error(tr(empty_title_message_key))
return None
api_key = _get_tavily_api_key()
if not api_key:
st.error(tr("Please configure Tavily API Key in Basic Settings"))
return None
# NOTE(review): from the next comment down to the bare `except Exception:` the
# lines look like removed-side diff residue (the old parse_and_fix_json fallback
# that fabricated a default item) rather than part of this function -- confirm
# against the real file before relying on them.
# Try to extract key information from the text to build a basic structure
try:
# This is a simple fallback
return {
"items": [
{
"_id": 1,
"timestamp": "00:00:00,000-00:00:10,000",
"picture": "解析失败,使用默认内容",
"narration": json_string[:100] + "..." if len(json_string) > 100 else json_string,
"OST": 0
}
]
}
except Exception:
# Search depth and result count come from the app config, defaulting to
# "basic" and 5 respectively.
search_data = search_story_context(
title,
api_key,
search_keywords=search_keywords,
empty_name_message=tr(empty_title_message_key),
search_depth=config.app.get("tavily_search_depth", "basic"),
max_results=config.app.get("tavily_max_results", 5),
)
return format_search_context(search_data)
except TavilySearchError as e:
# Search-specific failure: log the message only and surface it in the UI.
logger.error(f"Tavily 短剧检索失败: {str(e)}")
st.error(f"{tr('Tavily search failed')}: {str(e)}")
return None
except Exception as e:
# Any other failure: log the full traceback, show the same UI error.
logger.error(f"Tavily 短剧检索异常: {traceback.format_exc()}")
st.error(f"{tr('Tavily search failed')}: {str(e)}")
return None
def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature):
def _build_plot_analysis_input(
    subtitle_content: str,
    short_name: str = "",
    enable_web_search: bool = False,
    tr=lambda key: key,
    search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS,
    empty_title_message_key: str = "Please enter short drama name before web search",
    web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概",
) -> str | None:
    """Build the text handed to the plot-analysis model.

    Without web search the stripped subtitle text is returned as is. With web
    search the Tavily context for *short_name* is placed in front of the
    subtitles together with a short instruction block.

    Returns:
        The analysis input, or ``None`` when web search was requested but
        ``_build_tavily_context`` returned ``None``.
    """
    cleaned_subtitles = str(subtitle_content or "").strip()
    if enable_web_search:
        search_context = _build_tavily_context(
            short_name,
            tr,
            search_keywords=search_keywords,
            empty_title_message_key=empty_title_message_key,
        )
        if search_context is None:
            return None
        return f"""# 分析补充说明
请先参考 Tavily 联网检索结果理解{web_search_context_description}再结合原始字幕完成剧情理解
如果联网检索结果与字幕内容冲突请以字幕内容为准时间戳必须只从字幕内容中提取
{search_context}
# 原始字幕
{cleaned_subtitles}"""
    return cleaned_subtitles
def analyze_short_drama_plot(
subtitle_path,
temperature,
tr=lambda key: key,
subtitle_content=None,
short_name: str = "",
enable_web_search: bool = False,
video_paths=None,
prompt_category: str = SHORT_DRAMA_PROMPT_CATEGORY,
search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS,
empty_title_message_key: str = "Please enter short drama name before web search",
web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概",
):
"""Run only the plot-understanding step on the subtitles and return editable analysis text.

Validation failures and a non-success analysis result are reported through
``st.error`` and yield ``None``.

Args:
    subtitle_path: One subtitle path or several; every one must exist.
    temperature: Sampling temperature; only forwarded on the legacy fallback path.
    tr: Translation callable applied to the UI message keys.
    subtitle_content: Pre-read subtitle text; when blank the files are read and combined.
    short_name: Title used for the optional web search.
    enable_web_search: When true, Tavily context is added to the model input.
    video_paths: Video path(s) used to label the combined subtitle sections.
    prompt_category: Passed to the adapter and to the legacy helper.
    search_keywords, empty_title_message_key, web_search_context_description:
        Forwarded to ``_build_plot_analysis_input``.

Returns:
    ``analysis_result["analysis"]`` on success, otherwise ``None``.
"""
subtitle_paths = _normalize_paths(subtitle_path)
if not subtitle_paths:
st.error(tr("Please generate or upload subtitles first"))
return None
missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)]
if missing_subtitle_paths:
st.error(tr("Subtitle file does not exist"))
return None
# Provider-specific text-LLM settings; the provider name defaults to "gemini".
text_provider = config.app.get('text_llm_provider', 'gemini').lower()
text_api_key = config.app.get(f'text_{text_provider}_api_key')
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
# Prefer caller-supplied subtitle text; otherwise read and combine the files.
subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content(
subtitle_paths,
video_paths,
)
if not subtitle_content:
st.error(tr("Subtitle file is empty or unreadable"))
return None
plot_analysis_input = _build_plot_analysis_input(
subtitle_content,
short_name=short_name,
enable_web_search=enable_web_search,
tr=tr,
search_keywords=search_keywords,
empty_title_message_key=empty_title_message_key,
web_search_context_description=web_search_context_description,
)
# None means the web-search step failed; the error was already shown there.
if plot_analysis_input is None:
return None
# Try the adapter first; any exception falls back to the legacy analyze_subtitle helper.
try:
logger.info("使用新的LLM服务架构进行字幕分析")
analyzer = SubtitleAnalyzerAdapter(
text_api_key,
text_model,
text_base_url,
text_provider,
prompt_category=prompt_category,
)
analysis_result = analyzer.analyze_subtitle(plot_analysis_input)
except Exception as e:
logger.warning(f"使用新LLM服务失败回退到旧实现: {str(e)}")
analysis_result = analyze_subtitle(
subtitle_content=plot_analysis_input,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
save_result=True,
temperature=temperature,
provider=text_provider,
prompt_category=prompt_category,
)
# The detailed failure message goes to the log; the UI shows a generic error.
if analysis_result["status"] != "success":
logger.error(f"分析失败: {analysis_result['message']}")
st.error(tr("Script generation failed check logs"))
return None
return analysis_result["analysis"]
def generate_short_drama_narration_copy(
subtitle_path,
video_theme,
temperature,
tr=lambda key: key,
plot_analysis=None,
subtitle_content=None,
enable_web_search: bool = False,
video_paths=None,
narration_language: str = "简体中文(中国)",
drama_genre: str = "逆袭/复仇",
prompt_category: str = SHORT_DRAMA_PROMPT_CATEGORY,
search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS,
empty_title_message_key: str = "Please enter short drama name before web search",
web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概",
):
"""Generate narration copy the user can review and edit; no timestamps are bound.

Validation failures, a non-success model result and an empty narration are
reported through ``st.error`` and yield ``None``.

Args:
    subtitle_path: One subtitle path or several; every one must exist.
    video_theme: Title passed to the model as ``short_name`` and used for web search.
    temperature: Sampling temperature for narration generation.
    tr: Translation callable applied to the UI message keys.
    plot_analysis: Existing plot analysis; when blank, ``analyze_short_drama_plot`` is run.
    subtitle_content: Pre-read subtitle text; when blank the files are read and combined.
    enable_web_search, search_keywords, empty_title_message_key,
    web_search_context_description: Forwarded to ``analyze_short_drama_plot``.
    video_paths: Video path(s) used to label the combined subtitle sections.
    narration_language, drama_genre: Forwarded to the narration generator.
    prompt_category: Passed to the adapter and to the legacy helper.

Returns:
    A dict with ``narration_copy``, ``plot_analysis`` and ``subtitle_content``,
    or ``None`` on failure.
"""
subtitle_paths = _normalize_paths(subtitle_path)
if not subtitle_paths:
st.error(tr("Please generate or upload subtitles first"))
return None
missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)]
if missing_subtitle_paths:
st.error(tr("Subtitle file does not exist"))
return None
selected_video_paths = _normalize_paths(video_paths)
# Prefer caller-supplied subtitle text; otherwise read and combine the files.
subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content(
subtitle_paths,
selected_video_paths,
)
if not subtitle_content:
st.error(tr("Subtitle file is empty or unreadable"))
return None
# Reuse the supplied plot analysis when present; otherwise produce one now.
analysis_text = str(plot_analysis or "").strip()
if not analysis_text:
analysis_text = analyze_short_drama_plot(
subtitle_paths,
temperature,
tr,
subtitle_content=subtitle_content,
short_name=video_theme,
enable_web_search=enable_web_search,
video_paths=selected_video_paths,
prompt_category=prompt_category,
search_keywords=search_keywords,
empty_title_message_key=empty_title_message_key,
web_search_context_description=web_search_context_description,
)
# A falsy result means the analysis step failed and already showed its error.
if not analysis_text:
return None
# Provider-specific text-LLM settings; the provider name defaults to "gemini".
text_provider = config.app.get('text_llm_provider', 'gemini').lower()
text_api_key = config.app.get(f'text_{text_provider}_api_key')
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
# Try the adapter first; any exception falls back to the legacy generator.
try:
logger.info("使用新的LLM服务架构生成可审核解说文案")
analyzer = SubtitleAnalyzerAdapter(
text_api_key,
text_model,
text_base_url,
text_provider,
prompt_category=prompt_category,
)
narration_result = analyzer.generate_narration_copy(
short_name=video_theme,
plot_analysis=analysis_text,
subtitle_content=subtitle_content,
temperature=temperature,
narration_language=narration_language,
drama_genre=drama_genre,
)
except Exception as e:
logger.warning(f"使用新LLM服务生成文案失败回退到旧实现: {str(e)}")
narration_result = generate_narration_copy_legacy(
short_name=video_theme,
plot_analysis=analysis_text,
subtitle_content=subtitle_content,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
temperature=temperature,
provider=text_provider,
narration_language=narration_language,
drama_genre=drama_genre,
prompt_category=prompt_category,
)
# The detailed failure message goes to the log; the UI shows a generic error.
if narration_result.get("status") != "success":
logger.error(f"解说文案正文生成失败: {narration_result.get('message')}")
st.error(tr("Script generation failed check logs"))
return None
narration_copy = str(narration_result.get("narration_copy", "")).strip()
if not narration_copy:
logger.error("模型返回空解说文案正文")
st.error(tr("Generated narration copy is empty"))
return None
# Return the inputs alongside the copy so the caller can reuse them.
return {
"narration_copy": narration_copy,
"plot_analysis": analysis_text,
"subtitle_content": subtitle_content,
}
def generate_script_short_sunmmary(
params,
subtitle_path,
video_theme,
temperature,
tr=lambda key: key,
plot_analysis=None,
subtitle_content=None,
enable_web_search: bool = False,
video_paths=None,
narration_language: str = "简体中文(中国)",
narration_copy: str = "",
drama_genre: str = "逆袭/复仇",
original_sound_ratio: int = 30,
prompt_category: str = SHORT_DRAMA_PROMPT_CATEGORY,
search_keywords: str = SHORT_DRAMA_SEARCH_KEYWORDS,
empty_title_message_key: str = "Please enter short drama name before web search",
web_search_context_description: str = "短剧名称、人物关系、剧情背景和公开剧情梗概",
):
"""
生成 短剧解说 视频脚本
要求: 提供高质量短剧字幕
适合场景: 短剧
"""
progress_bar = st.progress(0)
progress_bar = st.empty()
status_text = st.empty()
stream_text = st.empty()
stream_state = {
"reasoning": "",
"content": "",
"last_update": 0.0,
}
def update_progress(progress: float, message: str = ""):
progress_bar.progress(progress)
status_text.text(_format_progress_status(progress, message, tr))
def update_waiting(message: str = ""):
progress_bar.empty()
if message:
status_text.text(f"{progress}% - {message}")
status_text.text(message)
else:
status_text.text(f"进度: {progress}%")
status_text.empty()
def update_stream_window(event):
event = event or {}
chunk_type = str(event.get("type") or "content")
chunk_text = str(event.get("text") or "")
if chunk_type == "done" or not chunk_text:
return
bucket = "reasoning" if chunk_type == "reasoning" else "content"
stream_state[bucket] += chunk_text
now = time.time()
if now - stream_state["last_update"] < 0.12:
return
stream_state["last_update"] = now
blocks = []
if stream_state["reasoning"].strip():
blocks.append(
f"{tr('Model reasoning stream')}\n"
f"{stream_state['reasoning'][-900:]}"
)
if stream_state["content"].strip():
blocks.append(
f"{tr('Model output preview')}\n"
f"{stream_state['content'][-900:]}"
)
preview = "\n\n".join(blocks)[-1800:]
escaped_preview = html.escape(preview)
stream_text.markdown(
f"""
<div style="height:150px; overflow:hidden; border:1px solid #e5e7eb;
border-radius:8px; padding:10px 12px; background:#f8fafc;
color:#334155;">
<div style="font-size:12px; font-weight:600; color:#64748b; margin-bottom:6px;">
{html.escape(tr('LLM stream window title'))}
</div>
<pre style="white-space:pre-wrap; margin:0; font-size:12px; line-height:1.45;
font-family:ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;">{escaped_preview}</pre>
</div>
""",
unsafe_allow_html=True,
)
try:
with st.spinner("正在生成脚本..."):
if not params.video_origin_path:
st.error("请先选择视频文件")
with st.spinner(tr("Generating script...")):
selected_video_paths = _normalize_paths(
video_paths
or getattr(params, "video_origin_paths", [])
or getattr(params, "video_origin_path", "")
)
if not selected_video_paths:
st.error(tr("Please select video file first"))
return
"""
1. 获取字幕
"""
update_progress(30, "正在解析字幕...")
update_progress(30, tr("Parsing subtitles..."))
# 判断字幕文件是否存在
if not os.path.exists(subtitle_path):
st.error("字幕文件不存在")
subtitle_paths = _normalize_paths(subtitle_path)
missing_subtitle_paths = [path for path in subtitle_paths if not os.path.exists(path)]
if not subtitle_paths or missing_subtitle_paths:
st.error(tr("Subtitle file does not exist"))
return
"""
@ -174,72 +575,118 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu
text_base_url = config.app.get(f'text_{text_provider}_base_url')
# 读取字幕文件内容(无论使用哪种实现都需要)
subtitle_content = read_subtitle_text(subtitle_path).text
subtitle_content = str(subtitle_content or "").strip() or _build_combined_subtitle_content(
subtitle_paths,
selected_video_paths,
)
if not subtitle_content:
st.error("字幕文件内容为空或无法读取")
st.error(tr("Subtitle file is empty or unreadable"))
return
try:
# 优先使用新的LLM服务架构
logger.info("使用新的LLM服务架构进行字幕分析")
analyzer = SubtitleAnalyzerAdapter(text_api_key, text_model, text_base_url, text_provider)
narration_copy = str(narration_copy or "").strip()
if not narration_copy:
st.error(tr("Please generate and review narration copy first"))
return
analysis_result = analyzer.analyze_subtitle(subtitle_content)
except Exception as e:
logger.warning(f"使用新LLM服务失败回退到旧实现: {str(e)}")
# 回退到旧的实现
analysis_result = analyze_subtitle(
subtitle_file_path=subtitle_path,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
save_result=True,
temperature=temperature,
provider=text_provider
)
"""
3. 根据剧情生成解说文案
"""
if analysis_result["status"] == "success":
logger.info("字幕分析成功!")
update_progress(60, "正在生成文案...")
# 根据剧情生成解说文案 - 使用新的LLM服务架构
analyzer = SubtitleAnalyzerAdapter(
text_api_key,
text_model,
text_base_url,
text_provider,
prompt_category=prompt_category,
)
if plot_analysis and str(plot_analysis).strip():
logger.info("使用用户编辑后的剧情理解结果匹配剪辑脚本")
analysis_result = {
"status": "success",
"analysis": str(plot_analysis).strip(),
}
else:
plot_analysis_input = subtitle_content
if enable_web_search:
update_waiting(tr("Searching short drama with Tavily..."))
plot_analysis_input = _build_plot_analysis_input(
subtitle_content,
short_name=video_theme,
enable_web_search=True,
tr=tr,
search_keywords=search_keywords,
empty_title_message_key=empty_title_message_key,
web_search_context_description=web_search_context_description,
)
if plot_analysis_input is None:
return
try:
# 优先使用新的LLM服务架构
logger.info("使用新的LLM服务架构生成解说文案")
narration_result = analyzer.generate_narration_script(
short_name=video_theme,
plot_analysis=analysis_result["analysis"],
subtitle_content=subtitle_content, # 传递原始字幕内容
temperature=temperature
)
logger.info("使用新的LLM服务架构进行字幕分析")
update_waiting(tr("Analyzing subtitles with model..."))
analysis_result = analyzer.analyze_subtitle(plot_analysis_input)
except Exception as e:
logger.warning(f"使用新LLM服务失败回退到旧实现: {str(e)}")
# 回退到旧的实现
narration_result = generate_narration_script(
short_name=video_theme,
plot_analysis=analysis_result["analysis"],
subtitle_content=subtitle_content, # 传递原始字幕内容
update_waiting(tr("Analyzing subtitles with model..."))
analysis_result = analyze_subtitle(
subtitle_content=plot_analysis_input,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
save_result=True,
temperature=temperature,
provider=text_provider
provider=text_provider,
prompt_category=prompt_category,
)
"""
3. 根据用户审核后的文案匹配画面与时间戳
"""
if analysis_result["status"] == "success":
logger.info("字幕分析成功!")
update_waiting()
try:
logger.info("使用新的LLM服务架构将审核文案匹配到字幕画面")
update_waiting(tr("Matching narration copy to footage..."))
stream_text.info(tr("Waiting for model stream..."))
narration_result = analyzer.match_narration_copy_to_script(
short_name=video_theme,
plot_analysis=analysis_result["analysis"],
subtitle_content=subtitle_content,
narration_copy=narration_copy,
temperature=temperature,
narration_language=narration_language,
drama_genre=drama_genre,
original_sound_ratio=original_sound_ratio,
stream_callback=update_stream_window,
)
except Exception as e:
logger.warning(f"使用新LLM服务匹配画面失败回退到旧实现: {str(e)}")
stream_text.info(tr("Streaming unavailable fallback waiting..."))
narration_result = match_narration_copy_to_script_legacy(
short_name=video_theme,
plot_analysis=analysis_result["analysis"],
subtitle_content=subtitle_content,
narration_copy=narration_copy,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
temperature=temperature,
provider=text_provider,
narration_language=narration_language,
drama_genre=drama_genre,
original_sound_ratio=original_sound_ratio,
prompt_category=prompt_category,
)
if narration_result["status"] == "success":
logger.info("\n解说文案生成成功!")
logger.info("\n剪辑脚本匹配成功!")
logger.info(narration_result["narration_script"])
else:
logger.info(f"\n解说文案生成失败: {narration_result['message']}")
st.error("生成脚本失败,请检查日志")
logger.info(f"\n剪辑脚本匹配失败: {narration_result['message']}")
st.error(tr("Script generation failed check logs"))
st.stop()
else:
logger.error(f"分析失败: {analysis_result['message']}")
st.error("生成脚本失败,请检查日志")
st.error(tr("Script generation failed check logs"))
st.stop()
"""
@ -253,37 +700,43 @@ def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperatu
# 增强JSON解析包含错误处理和修复
narration_dict = parse_and_fix_json(narration_script)
if narration_dict is None:
st.error("生成的解说文案格式错误无法解析为JSON")
st.error(tr("Generated narration JSON parse failed"))
logger.error(f"JSON解析失败原始内容: {narration_script}")
st.stop()
# 验证JSON结构
if 'items' not in narration_dict:
st.error("生成的解说文案缺少必要的'items'字段")
st.error(tr("Generated narration missing items field"))
logger.error(f"JSON结构错误缺少items字段: {narration_dict}")
st.stop()
script = json.dumps(narration_dict['items'], ensure_ascii=False, indent=2)
narration_items = _normalize_narration_items_video_sources(
narration_dict['items'],
selected_video_paths,
)
narration_items = _strip_planner_only_fields(narration_items)
script = json.dumps(narration_items, ensure_ascii=False, indent=2)
if script is None:
st.error("生成脚本失败,请检查日志")
st.error(tr("Script generation failed check logs"))
st.stop()
logger.success(f"剪辑脚本生成完成")
if isinstance(script, list):
st.session_state['video_clip_json'] = script
elif isinstance(script, str):
st.session_state['video_clip_json'] = json.loads(script)
update_progress(90, "整理输出...")
update_progress(90, tr("Preparing output..."))
time.sleep(0.1)
progress_bar.progress(100)
status_text.text("脚本生成完成!")
st.success("视频脚本生成成功!")
status_text.text(tr("Script generation completed!"))
st.success(tr("Video script generated successfully"))
except Exception as err:
st.error(f"生成过程中发生错误: {str(err)}")
st.error(f"{tr('Generation error')}: {str(err)}")
logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
finally:
time.sleep(2)
progress_bar.empty()
status_text.empty()
stream_text.empty()

View File

@ -0,0 +1,27 @@
import unittest
from webui.tools.generate_short_summary import _format_progress_status, parse_and_fix_json
class GenerateShortSummaryJsonTests(unittest.TestCase):
def test_progress_message_does_not_prefix_fake_percentage(self):
status = _format_progress_status(60, "正在生成文案...")
self.assertEqual("正在生成文案...", status)
self.assertNotIn("60%", status)
def test_invalid_json_does_not_create_default_fake_script(self):
self.assertIsNone(parse_and_fix_json("not a json response"))
def test_json_code_block_is_parsed(self):
parsed = parse_and_fix_json(
"""```json
{"items": [{"_id": 1, "timestamp": "00:00:01,000-00:00:02,000"}]}
```"""
)
self.assertEqual(1, parsed["items"][0]["_id"])
# Allow running this test module directly as a script.
if __name__ == "__main__":
unittest.main()