NarratoAI/app/models/schema.py
viccy dc12f390bb feat: 新增原片字幕支持并优化视频合并流程
- 为VideoClipParams新增原字幕路径配置字段,支持单条/多条字幕路径
- 完善webui参数获取逻辑,处理字幕路径兼容性并对接前端选择
- 重构后端字幕处理流程,支持自动匹配视频对应原字幕,合并原声字幕
- 优化视频合并逻辑,新增ffmpeg无损copy合并判断,自动回退重编码提升效率
- 新增ffmpeg快速素材合并路径,支持自定义字幕样式与多音轨混合
- 新增多个单元测试覆盖字幕匹配、合并及视频合并场景
2026-06-08 13:05:30 +08:00

234 lines
8.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import warnings
from enum import Enum
from typing import Any, List, Optional, Union
import pydantic
from pydantic import BaseModel, Field
# 忽略 Pydantic 的特定警告
warnings.filterwarnings(
"ignore",
category=UserWarning,
message="Field name.*shadows an attribute in parent.*",
)
class AudioVolumeDefaults:
"""音量配置默认值常量类 - 确保全局一致性"""
# 语音音量默认值
VOICE_VOLUME = 1.0
TTS_VOLUME = 1.0
# 原声音量默认值 - 提高原声音量以平衡TTS
ORIGINAL_VOLUME = 1.2
# 背景音乐音量默认值
BGM_VOLUME = 0.3
# 音量范围
MIN_VOLUME = 0.0
MAX_VOLUME = 2.0 # 允许原声音量超过1.0以平衡TTS
# 智能音量调整
ENABLE_SMART_VOLUME = True # 是否启用智能音量分析和调整
class VideoConcatMode(str, Enum):
random = "random"
sequential = "sequential"
class VideoAspect(str, Enum):
landscape = "16:9"
landscape_2 = "4:3"
portrait = "9:16"
portrait_2 = "3:4"
square = "1:1"
def to_resolution(self):
if self == VideoAspect.landscape.value:
return 1920, 1080
elif self == VideoAspect.portrait.value:
return 1080, 1920
elif self == VideoAspect.square.value:
return 1080, 1080
return 1080, 1920
class _Config:
arbitrary_types_allowed = True
@pydantic.dataclasses.dataclass(config=_Config)
class MaterialInfo:
provider: str = "pexels"
url: str = ""
duration: int = 0
# VoiceNames = [
# # zh-CN
# "female-zh-CN-XiaoxiaoNeural",
# "female-zh-CN-XiaoyiNeural",
# "female-zh-CN-liaoning-XiaobeiNeural",
# "female-zh-CN-shaanxi-XiaoniNeural",
#
# "male-zh-CN-YunjianNeural",
# "male-zh-CN-YunxiNeural",
# "male-zh-CN-YunxiaNeural",
# "male-zh-CN-YunyangNeural",
#
# # "female-zh-HK-HiuGaaiNeural",
# # "female-zh-HK-HiuMaanNeural",
# # "male-zh-HK-WanLungNeural",
# #
# # "female-zh-TW-HsiaoChenNeural",
# # "female-zh-TW-HsiaoYuNeural",
# # "male-zh-TW-YunJheNeural",
#
# # en-US
# "female-en-US-AnaNeural",
# "female-en-US-AriaNeural",
# "female-en-US-AvaNeural",
# "female-en-US-EmmaNeural",
# "female-en-US-JennyNeural",
# "female-en-US-MichelleNeural",
#
# "male-en-US-AndrewNeural",
# "male-en-US-BrianNeural",
# "male-en-US-ChristopherNeural",
# "male-en-US-EricNeural",
# "male-en-US-GuyNeural",
# "male-en-US-RogerNeural",
# "male-en-US-SteffanNeural",
# ]
class VideoParams(BaseModel):
"""
{
"video_subject": "",
"video_aspect": "横屏 16:9西瓜视频",
"voice_name": "女生-晓晓",
"bgm_name": "random",
"font_name": "STHeitiMedium 黑体-中",
"text_color": "#FFFFFF",
"font_size": 60,
"stroke_color": "#000000",
"stroke_width": 1.5
}
"""
video_subject: str
video_script: str = "" # 用于生成视频的脚本
video_terms: Optional[Union[str, list]] = None # 用于生成视频的关键词
video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
video_clip_duration: Optional[int] = 5
video_count: Optional[int] = 1
video_source: Optional[str] = "pexels"
video_materials: Optional[List[MaterialInfo]] = None # 用于生成视频的素材
video_language: Optional[str] = "" # auto detect
voice_name: Optional[str] = ""
voice_volume: Optional[float] = AudioVolumeDefaults.VOICE_VOLUME
voice_rate: Optional[float] = 1.0
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = AudioVolumeDefaults.BGM_VOLUME
subtitle_enabled: Optional[bool] = True
subtitle_position: Optional[str] = "bottom" # top, bottom, center
custom_position: float = 70.0
font_name: Optional[str] = "STHeitiMedium.ttc"
text_fore_color: Optional[str] = "#FFFFFF"
text_background_color: Optional[str] = "transparent"
font_size: int = 60
stroke_color: Optional[str] = "#000000"
stroke_width: float = 1.5
n_threads: Optional[int] = 2
paragraph_number: Optional[int] = 1
class VideoClipParams(BaseModel):
"""
NarratoAI 数据模型
"""
video_clip_json: Optional[list] = Field(default=[], description="LLM 生成的视频剪辑脚本内容")
video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径")
video_origin_path: Optional[str] = Field(default="", description="原视频路径")
video_origin_paths: Optional[List[str]] = Field(default=[], description="原视频路径列表")
original_subtitle_path: Optional[str] = Field(default="", description="原视频字幕路径")
original_subtitle_paths: Optional[List[str]] = Field(default=[], description="原视频字幕路径列表")
video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例")
video_language: Optional[str] = Field(default="zh-CN", description="视频语言")
# video_clip_duration: Optional[int] = 5 # 视频片段时长
# video_count: Optional[int] = 1 # 视频片段数量
# video_source: Optional[str] = "local"
# video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
voice_volume: Optional[float] = Field(default=AudioVolumeDefaults.VOICE_VOLUME, description="解说语音音量")
voice_rate: Optional[float] = Field(default=1.0, description="语速")
voice_pitch: Optional[float] = Field(default=1.0, description="语调")
tts_engine: Optional[str] = Field(default="", description="TTS 引擎")
bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
subtitle_enabled: bool = True
subtitle_mask_enabled: bool = False
subtitle_mask_landscape_x_percent: float = 10.0
subtitle_mask_landscape_y_percent: float = 78.0
subtitle_mask_landscape_width_percent: float = 80.0
subtitle_mask_landscape_height_percent: float = 14.0
subtitle_mask_landscape_blur_radius: int = 18
subtitle_mask_landscape_opacity_percent: int = 82
subtitle_mask_portrait_x_percent: float = 8.0
subtitle_mask_portrait_y_percent: float = 79.0
subtitle_mask_portrait_width_percent: float = 84.0
subtitle_mask_portrait_height_percent: float = 16.0
subtitle_mask_portrait_blur_radius: int = 26
subtitle_mask_portrait_opacity_percent: int = 84
subtitle_position_landscape_y_percent: float = 85.0
subtitle_position_portrait_y_percent: float = 82.0
subtitle_auto_transcribe_enabled: bool = False
subtitle_auto_transcribe_backend: str = "local"
subtitle_auto_transcribe_api_url: str = ""
subtitle_auto_transcribe_firered_api_url: str = ""
subtitle_auto_transcribe_api_key: str = ""
subtitle_auto_transcribe_hotword: str = ""
subtitle_auto_transcribe_enable_spk: bool = False
font_name: str = "SimHei" # 默认使用黑体
font_size: int = 36
text_fore_color: str = "white" # 文本前景色
text_back_color: Optional[str] = None # 文本背景色
stroke_color: str = "black" # 描边颜色
stroke_width: float = 1.5 # 描边宽度
subtitle_position: str = "bottom" # top, bottom, center, custom
custom_position: float = 70.0 # 自定义位置
n_threads: Optional[int] = Field(default=16, description="线程数") # 线程数,有助于提升视频处理速度
tts_volume: Optional[float] = Field(default=AudioVolumeDefaults.TTS_VOLUME, description="解说语音音量(后处理)")
original_volume: Optional[float] = Field(default=AudioVolumeDefaults.ORIGINAL_VOLUME, description="视频原声音量")
bgm_volume: Optional[float] = Field(default=AudioVolumeDefaults.BGM_VOLUME, description="背景音乐音量")
draft_name: Optional[str] = Field(default="", description="剪映草稿名称")
class SubtitlePosition(str, Enum):
TOP = "top"
CENTER = "center"
BOTTOM = "bottom"