NarratoAI/app/models/schema.py
viccy 33c17c2636 feat(subtitle, asr, bgm): 添加字幕遮罩、自动转录功能,优化背景音乐设置
- 新增字幕遮罩功能,可在烧录新字幕前遮盖原视频自带的字幕区域,支持横屏/竖屏自定义配置与预览调试
- 新增自动字幕转录功能,支持本地FunASR和阿里百炼在线转写,在最终视频合并完成后自动生成并压入成片字幕
- 重构背景音乐设置面板,新增从资源目录选择BGM、上传本地BGM文件的功能,新增BGM试听预览,优化交互流程
- 更新配置示例文件、数据Schema与多语言翻译文件,完善前后端参数传递逻辑
2026-06-06 01:08:35 +08:00

229 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import warnings
from enum import Enum
from typing import Any, List, Optional, Union
import pydantic
from pydantic import BaseModel, Field
# Silence one specific Pydantic UserWarning: the "Field name ... shadows an
# attribute in parent ..." message emitted when a model field reuses a name
# already present on a parent class.
warnings.filterwarnings(
    action="ignore",
    message="Field name.*shadows an attribute in parent.*",
    category=UserWarning,
)
class AudioVolumeDefaults:
    """Default audio-volume constants, kept in one place for global consistency."""

    # Voice (narration) volume defaults.
    VOICE_VOLUME = TTS_VOLUME = 1.0

    # Original-audio volume default; raised above 1.0 to balance it against
    # the TTS voice.
    ORIGINAL_VOLUME = 1.2

    # Background-music volume default.
    BGM_VOLUME = 0.3

    # Volume range. The upper bound exceeds 1.0 so the original audio can be
    # boosted to balance the TTS voice.
    MIN_VOLUME, MAX_VOLUME = 0.0, 2.0

    # Smart volume adjustment: whether smart volume analysis and adjustment
    # is enabled.
    ENABLE_SMART_VOLUME = True
class VideoConcatMode(str, Enum):
    """String-valued enum of the available video concatenation modes."""

    random = "random"
    sequential = "sequential"
class VideoAspect(str, Enum):
    """Supported video aspect ratios; each member's value is a ``"W:H"`` string."""

    landscape = "16:9"
    landscape_2 = "4:3"
    portrait = "9:16"
    portrait_2 = "3:4"
    square = "1:1"

    def to_resolution(self):
        """Return the ``(width, height)`` pixel resolution for this aspect ratio.

        Returns:
            A 2-tuple of ints ``(width, height)``. Every declared member has
            an explicit entry; ``(1080, 1920)`` remains the fallback for any
            value without one.
        """
        # Built inside the method on purpose: a dict assigned in an Enum
        # class body would itself be turned into an enum member.
        resolutions = {
            VideoAspect.landscape.value: (1920, 1080),
            # 4:3 and 3:4 previously fell through to the portrait 9:16
            # fallback, so a 4:3 landscape request got a 1080x1920 canvas.
            VideoAspect.landscape_2.value: (1440, 1080),
            VideoAspect.portrait.value: (1080, 1920),
            VideoAspect.portrait_2.value: (1080, 1440),
            VideoAspect.square.value: (1080, 1080),
        }
        return resolutions.get(self.value, (1080, 1920))
class _Config:
    """Config object handed to the pydantic dataclass decorator in this module."""

    # Pydantic option: accept field types that pydantic has no validator for.
    arbitrary_types_allowed = True
@pydantic.dataclasses.dataclass(config=_Config)
class MaterialInfo:
    """Pydantic dataclass describing a single video material.

    Attributes:
        provider: Name of the material source; defaults to ``"pexels"``.
        url: Location of the material; empty by default.
        duration: Length of the material; defaults to ``0``
            (unit not visible here, presumably seconds; confirm with callers).
    """

    provider: str = "pexels"
    url: str = ""
    duration: int = 0
# VoiceNames = [
# # zh-CN
# "female-zh-CN-XiaoxiaoNeural",
# "female-zh-CN-XiaoyiNeural",
# "female-zh-CN-liaoning-XiaobeiNeural",
# "female-zh-CN-shaanxi-XiaoniNeural",
#
# "male-zh-CN-YunjianNeural",
# "male-zh-CN-YunxiNeural",
# "male-zh-CN-YunxiaNeural",
# "male-zh-CN-YunyangNeural",
#
# # "female-zh-HK-HiuGaaiNeural",
# # "female-zh-HK-HiuMaanNeural",
# # "male-zh-HK-WanLungNeural",
# #
# # "female-zh-TW-HsiaoChenNeural",
# # "female-zh-TW-HsiaoYuNeural",
# # "male-zh-TW-YunJheNeural",
#
# # en-US
# "female-en-US-AnaNeural",
# "female-en-US-AriaNeural",
# "female-en-US-AvaNeural",
# "female-en-US-EmmaNeural",
# "female-en-US-JennyNeural",
# "female-en-US-MichelleNeural",
#
# "male-en-US-AndrewNeural",
# "male-en-US-BrianNeural",
# "male-en-US-ChristopherNeural",
# "male-en-US-EricNeural",
# "male-en-US-GuyNeural",
# "male-en-US-RogerNeural",
# "male-en-US-SteffanNeural",
# ]
class VideoParams(BaseModel):
    """Parameters for generating a video.

    Example payload carried over from the original docstring. Note that some
    of its keys (e.g. ``bgm_name``, ``text_color``) do not match the field
    names declared on this model:

        {
          "video_subject": "",
          "video_aspect": "landscape 16:9 (Xigua Video)",
          "voice_name": "female - Xiaoxiao",
          "bgm_name": "random",
          "font_name": "STHeitiMedium (Heiti Medium)",
          "text_color": "#FFFFFF",
          "font_size": 60,
          "stroke_color": "#000000",
          "stroke_width": 1.5
        }
    """

    # --- Content -----------------------------------------------------------
    video_subject: str  # the only field without a default, hence required
    video_script: str = ""  # script used to generate the video
    video_terms: Optional[Union[str, list]] = None  # keywords used to generate the video

    # --- Video assembly ----------------------------------------------------
    video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
    video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
    video_clip_duration: Optional[int] = 5
    video_count: Optional[int] = 1

    # --- Source material ---------------------------------------------------
    video_source: Optional[str] = "pexels"
    video_materials: Optional[List[MaterialInfo]] = None  # materials used to generate the video

    # --- Voice -------------------------------------------------------------
    video_language: Optional[str] = ""  # auto detect
    voice_name: Optional[str] = ""
    voice_volume: Optional[float] = AudioVolumeDefaults.VOICE_VOLUME
    voice_rate: Optional[float] = 1.0

    # --- Background music --------------------------------------------------
    bgm_type: Optional[str] = "random"
    bgm_file: Optional[str] = ""
    bgm_volume: Optional[float] = AudioVolumeDefaults.BGM_VOLUME

    # --- Subtitles ---------------------------------------------------------
    subtitle_enabled: Optional[bool] = True
    subtitle_position: Optional[str] = "bottom"  # top, bottom, center
    custom_position: float = 70.0
    font_name: Optional[str] = "STHeitiMedium.ttc"
    text_fore_color: Optional[str] = "#FFFFFF"
    text_background_color: Optional[str] = "transparent"
    font_size: int = 60
    stroke_color: Optional[str] = "#000000"
    stroke_width: float = 1.5

    # --- Processing --------------------------------------------------------
    n_threads: Optional[int] = 2
    paragraph_number: Optional[int] = 1
class VideoClipParams(BaseModel):
    """NarratoAI data model: parameters for a video-clipping job."""

    # --- Clip script and source video --------------------------------------
    video_clip_json: Optional[list] = Field(
        default=[],
        description="LLM 生成的视频剪辑脚本内容",
    )
    video_clip_json_path: Optional[str] = Field(
        default="",
        description="LLM 生成的视频剪辑脚本路径",
    )
    video_origin_path: Optional[str] = Field(
        default="",
        description="原视频路径",
    )
    video_origin_paths: Optional[List[str]] = Field(
        default=[],
        description="原视频路径列表",
    )
    video_aspect: Optional[VideoAspect] = Field(
        default=VideoAspect.portrait.value,
        description="视频比例",
    )
    video_language: Optional[str] = Field(
        default="zh-CN",
        description="视频语言",
    )

    # Fields kept disabled in the original source:
    # video_clip_duration: Optional[int] = 5  # clip duration
    # video_count: Optional[int] = 1  # number of clips
    # video_source: Optional[str] = "local"
    # video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value

    # --- Narration voice / TTS ---------------------------------------------
    voice_name: Optional[str] = Field(
        default="zh-CN-YunjianNeural",
        description="语音名称",
    )
    voice_volume: Optional[float] = Field(
        default=AudioVolumeDefaults.VOICE_VOLUME,
        description="解说语音音量",
    )
    voice_rate: Optional[float] = Field(default=1.0, description="语速")
    voice_pitch: Optional[float] = Field(default=1.0, description="语调")
    tts_engine: Optional[str] = Field(default="", description="TTS 引擎")

    # --- Background music selection ----------------------------------------
    bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
    bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
    bgm_file: Optional[str] = Field(default="", description="背景音乐文件")

    # --- Subtitle switch ---------------------------------------------------
    subtitle_enabled: bool = True

    # --- Subtitle mask ------------------------------------------------------
    # Separate settings are kept for landscape and portrait videos. The
    # ``*_percent`` names suggest percentages of the frame; confirm against
    # the code that renders the mask.
    subtitle_mask_enabled: bool = False
    subtitle_mask_landscape_x_percent: float = 10.0
    subtitle_mask_landscape_y_percent: float = 78.0
    subtitle_mask_landscape_width_percent: float = 80.0
    subtitle_mask_landscape_height_percent: float = 14.0
    subtitle_mask_landscape_blur_radius: int = 18
    subtitle_mask_landscape_opacity_percent: int = 82
    subtitle_mask_portrait_x_percent: float = 8.0
    subtitle_mask_portrait_y_percent: float = 79.0
    subtitle_mask_portrait_width_percent: float = 84.0
    subtitle_mask_portrait_height_percent: float = 16.0
    subtitle_mask_portrait_blur_radius: int = 26
    subtitle_mask_portrait_opacity_percent: int = 84

    # --- Automatic subtitle transcription ----------------------------------
    subtitle_auto_transcribe_enabled: bool = False
    subtitle_auto_transcribe_backend: str = "local"
    subtitle_auto_transcribe_api_url: str = ""
    subtitle_auto_transcribe_api_key: str = ""
    subtitle_auto_transcribe_hotword: str = ""
    subtitle_auto_transcribe_enable_spk: bool = False

    # --- Subtitle appearance -----------------------------------------------
    font_name: str = "SimHei"  # defaults to the Hei (bold sans) typeface
    font_size: int = 36
    text_fore_color: str = "white"  # text foreground color
    text_back_color: Optional[str] = None  # text background color
    stroke_color: str = "black"  # stroke (outline) color
    stroke_width: float = 1.5  # stroke (outline) width
    subtitle_position: str = "bottom"  # top, bottom, center, custom
    custom_position: float = 70.0  # custom position

    # --- Processing --------------------------------------------------------
    # Thread count; helps speed up video processing.
    n_threads: Optional[int] = Field(default=16, description="线程数")

    # --- Mix volumes -------------------------------------------------------
    tts_volume: Optional[float] = Field(
        default=AudioVolumeDefaults.TTS_VOLUME,
        description="解说语音音量(后处理)",
    )
    original_volume: Optional[float] = Field(
        default=AudioVolumeDefaults.ORIGINAL_VOLUME,
        description="视频原声音量",
    )
    bgm_volume: Optional[float] = Field(
        default=AudioVolumeDefaults.BGM_VOLUME,
        description="背景音乐音量",
    )

    # --- Export ------------------------------------------------------------
    draft_name: Optional[str] = Field(default="", description="剪映草稿名称")
class SubtitlePosition(str, Enum):
    """String-valued enum of the named subtitle positions."""

    TOP = "top"
    CENTER = "center"
    BOTTOM = "bottom"