Merge pull request #134 from linyqh/dev_0.6.0 大版本更新

Dev 0.6.0
This commit is contained in:
viccy 2025-05-08 20:58:12 +08:00 committed by GitHub
commit 9aefe76a8c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
57 changed files with 3368 additions and 1492 deletions

1
.gitignore vendored
View File

@ -32,4 +32,5 @@ resource/fonts/*.ttf
resource/fonts/*.otf
resource/srt/*.srt
app/models/faster-whisper-large-v2/*
app/models/faster-whisper-large-v3/*
app/models/bert/*

View File

@ -4,7 +4,7 @@
<h3 align="center">一站式 AI 影视解说+自动化剪辑工具🎬🎞️ </h3>
<h3>📖 <a href="README-cn.md">English</a> | 简体中文 | <a href="README-ja.md">日本語</a> </h3>
<h3>📖 <a href="README-en.md">English</a> | 简体中文 | <a href="README-ja.md">日本語</a> </h3>
<div align="center">
[//]: # ( <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
@ -83,7 +83,7 @@ _**注意⚠️:近期在 x (推特) 上发现有人冒充作者在 pump.fun
## 配置要求 📦
- 建议最低 CPU 4核或以上内存 8G 或以上,显卡非必须
- Windows 10 或 MacOS 11.0 以上系统
- Windows 10/11 或 MacOS 11.0 以上系统
- [Python 3.10+](https://www.python.org/downloads/)
## 反馈建议 📢

View File

@ -20,7 +20,9 @@ class VideoConcatMode(str, Enum):
class VideoAspect(str, Enum):
landscape = "16:9"
landscape_2 = "4:3"
portrait = "9:16"
portrait_2 = "3:4"
square = "1:1"
def to_resolution(self):
@ -360,13 +362,14 @@ class VideoClipParams(BaseModel):
text_back_color: Optional[str] = None # 文本背景色
stroke_color: str = "black" # 描边颜色
stroke_width: float = 1.5 # 描边宽度
subtitle_position: str = "bottom" # top, bottom, center, custom
subtitle_position: str = "bottom" # top, bottom, center, custom
custom_position: float = 70.0 # 自定义位置
n_threads: Optional[int] = Field(default=16, description="解说语音音量") # 线程<E7BABF><E7A88B><EFBFBD>,有助于提升视频处理速度
n_threads: Optional[int] = Field(default=16, description="线程数") # 线程数,有助于提升视频处理速度
tts_volume: Optional[float] = Field(default=1.0, description="解说语音音量(后处理)")
original_volume: Optional[float] = Field(default=1.0, description="视频原声音量")
bgm_volume: Optional[float] = Field(default=0.6, description="背景音乐音量")
bgm_volume: Optional[float] = Field(default=0.3, description="背景音乐音量")
class VideoTranscriptionRequest(BaseModel):

View File

@ -6,6 +6,7 @@ class GenerateScriptRequest(BaseModel):
video_path: str
video_theme: Optional[str] = ""
custom_prompt: Optional[str] = ""
frame_interval_input: Optional[int] = 5
skip_seconds: Optional[int] = 0
threshold: Optional[int] = 30
vision_batch_size: Optional[int] = 5

Binary file not shown.

Binary file not shown.

View File

@ -18,15 +18,14 @@ def check_ffmpeg():
return False
def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list):
def merge_audio_files(task_id: str, total_duration: float, list_script: list):
"""
合并音频文件根据OST设置处理不同的音频轨道
合并音频文件
Args:
task_id: 任务ID
audio_files: TTS生成的音频文件列表
total_duration: 总时长
list_script: 完整脚本信息包含OST设置
list_script: 完整脚本信息包含duration时长和audio路径
Returns:
str: 合并后的音频文件路径
@ -39,36 +38,38 @@ def merge_audio_files(task_id: str, audio_files: list, total_duration: float, li
# 创建一个空的音频片段
final_audio = AudioSegment.silent(duration=total_duration * 1000) # 总时长以毫秒为单位
# 计算每个片段的开始位置基于duration字段
current_position = 0 # 初始位置(秒)
# 遍历脚本中的每个片段
for segment, audio_file in zip(list_script, audio_files):
for segment in list_script:
try:
# 加载TTS音频文件
tts_audio = AudioSegment.from_file(audio_file)
# 获取片段的开始和结束时间
start_time, end_time = segment['new_timestamp'].split('-')
start_seconds = utils.time_to_seconds(start_time)
end_seconds = utils.time_to_seconds(end_time)
# 根据OST设置处理音频
if segment['OST'] == 0:
# 只使用TTS音频
final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000)
elif segment['OST'] == 1:
# 只使用原声(假设原声已经在视频中)
continue
elif segment['OST'] == 2:
# 混合TTS音频和原声
original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000)
mixed_audio = original_audio.overlay(tts_audio)
final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000)
# 获取片段时长(秒)
duration = segment['duration']
# 检查audio字段是否为空
if segment['audio'] and os.path.exists(segment['audio']):
# 加载TTS音频文件
tts_audio = AudioSegment.from_file(segment['audio'])
# 将TTS音频添加到最终音频
final_audio = final_audio.overlay(tts_audio, position=current_position * 1000)
else:
# audio为空不添加音频仅保留间隔
logger.info(f"片段 {segment.get('timestamp', '')} 没有音频文件,保留 {duration} 秒的间隔")
# 更新下一个片段的开始位置
current_position += duration
except Exception as e:
logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}")
logger.error(f"处理音频片段时出错: {str(e)}")
# 即使处理失败,也要更新位置,确保后续片段位置正确
if 'duration' in segment:
current_position += segment['duration']
continue
# 保存合并后的音频文件
output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3")
output_audio_path = os.path.join(utils.task_dir(task_id), "merger_audio.mp3")
final_audio.export(output_audio_path, format="mp3")
logger.info(f"合并后的音频文件已保存: {output_audio_path}")
@ -93,7 +94,7 @@ def time_to_seconds(time_str):
# 分割时间部分
parts = time_part.split(':')
if len(parts) == 3: # HH:MM:SS
h, m, s = map(int, parts)
seconds = h * 3600 + m * 60 + s
@ -118,11 +119,11 @@ def extract_timestamp(filename):
# 从文件名中提取时间部分
time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06,500-00_24,800" 部分
start_time, end_time = time_part.split('-') # 分割成开始和结束时间
# 将下划线格式转换回冒号格式
start_time = start_time.replace('_', ':')
end_time = end_time.replace('_', ':')
# 将时间戳转换为秒
start_seconds = time_to_seconds(start_time)
end_seconds = time_to_seconds(end_time)
@ -135,17 +136,36 @@ def extract_timestamp(filename):
if __name__ == "__main__":
# 示例用法
audio_files =[
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:06-00:24.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:32-00:38.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:43-00:52.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:52-01:09.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01:13-01:15.mp3",
]
total_duration = 38
video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json"
with open(video_script_path, "r", encoding="utf-8") as f:
video_script = json.load(f)
total_duration = 90
output_file = merge_audio_files("test456", audio_files, total_duration, video_script)
video_script = [
{'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
'timestamp': '00:00:00-00:00:26',
'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
'OST': 0, 'duration': 26,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3'},
{'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!', 'timestamp': '00:01:15-00:01:29',
'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
'OST': 0, 'duration': 14,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3'},
{'picture': '画面切到王启年小心翼翼地向范闲汇报。', 'timestamp': '00:04:41-00:04:58',
'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
'OST': 1, 'duration': 17,
'audio': ''},
{'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
'timestamp': '00:04:58-00:05:20',
'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
'OST': 0, 'duration': 22,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3'},
{'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
'timestamp': '00:05:45-00:05:53',
'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
'OST': 0, 'duration': 8,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'},
{'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。', 'timestamp': '00:06:00-00:06:03',
'narration': '抓刺客',
'OST': 1, 'duration': 3,
'audio': ''}]
output_file = merge_audio_files("test456", total_duration, video_script)
print(output_file)

256
app/services/clip_video.py Normal file
View File

@ -0,0 +1,256 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : clip_video
@Author : 小林同学
@Date : 2025/5/6 下午6:14
'''
import os
import subprocess
import json
import hashlib
from loguru import logger
from typing import Dict, List, Optional
from pathlib import Path
def parse_timestamp(timestamp: str) -> tuple:
    """
    Split a timestamp range string into its start and end parts.

    Args:
        timestamp: range formatted as 'HH:MM:SS-HH:MM:SS' or
            'HH:MM:SS,sss-HH:MM:SS,sss'

    Returns:
        tuple: (start, end), each formatted like the input side
    """
    begin_part, finish_part = timestamp.split('-')
    return begin_part, finish_part
def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str:
    """
    Compute an end timestamp from a start timestamp plus a duration.

    Args:
        start_time: start time, 'HH:MM:SS' or 'HH:MM:SS,sss' (with milliseconds)
        duration: length in seconds
        extra_seconds: safety margin added on top, 1 second by default

    Returns:
        str: the computed end time, in the same format as the input
    """
    # Detect whether the input carries a millisecond component.
    with_ms = ',' in start_time
    if with_ms:
        hms_text, ms_text = start_time.split(',')
        ms_in = int(ms_text)
    else:
        hms_text, ms_in = start_time, 0
    hours, minutes, seconds = (int(piece) for piece in hms_text.split(':'))

    # Work entirely in milliseconds to avoid float drift in the breakdown.
    total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + ms_in
    total_ms += int((duration + extra_seconds) * 1000)

    whole_seconds, ms_out = divmod(total_ms, 1000)
    hours_out, remainder = divmod(whole_seconds, 3600)
    minutes_out, seconds_out = divmod(remainder, 60)

    # Echo the input's format: include milliseconds only if they were given.
    if with_ms:
        return f"{hours_out:02d}:{minutes_out:02d}:{seconds_out:02d},{ms_out:03d}"
    return f"{hours_out:02d}:{minutes_out:02d}:{seconds_out:02d}"
def check_hardware_acceleration() -> Optional[str]:
    """
    Probe which FFmpeg hardware-acceleration backend is available.

    Probes, in preference order, NVIDIA CUDA, macOS VideoToolbox and Intel
    Quick Sync (qsv) by running a tiny FFmpeg command with each backend and
    checking the exit status.

    Returns:
        Optional[str]: the first usable hwaccel name ("cuda", "videotoolbox"
        or "qsv"), or None when none works (or FFmpeg itself is missing).
    """
    # The probing order defines the preference: discrete NVIDIA GPU first,
    # then macOS VideoToolbox, then Intel Quick Sync.
    for accel in ("cuda", "videotoolbox", "qsv"):
        try:
            probe = subprocess.run(
                ["ffmpeg", "-hwaccel", accel, "-i", "/dev/null", "-f", "null", "-"],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
            )
            if probe.returncode == 0:
                return accel
        except Exception:
            # FFmpeg missing or the probe crashed — treat as "not supported"
            # and try the next backend.
            pass

    return None
def clip_video(
        video_origin_path: str,
        tts_result: List[Dict],
        output_dir: Optional[str] = None,
        task_id: Optional[str] = None
) -> Dict[str, str]:
    """
    Cut the source video into sub-clips according to segment timestamps.

    Args:
        video_origin_path: path of the source video
        tts_result: list of segment dicts; each needs a "timestamp"
            ('HH:MM:SS-HH:MM:SS', optionally with ',sss' milliseconds) and a
            "duration" in seconds; an optional "_id" keys the result map
        output_dir: directory clips are written to; auto-generated when None
        task_id: unique id used to build the default output dir; when None it
            is derived from an MD5 of the inputs, so reruns reuse the same dir

    Returns:
        Dict[str, str]: maps each segment's "_id" (falling back to its
        timestamp) to the path of the produced clip

    Raises:
        FileNotFoundError: when the source video does not exist
        RuntimeError: when FFmpeg fails to cut a segment
    """
    if not os.path.exists(video_origin_path):
        raise FileNotFoundError(f"视频文件不存在: {video_origin_path}")

    # Derive a deterministic task id from the inputs when none is given.
    if task_id is None:
        content_for_hash = f"{video_origin_path}_{json.dumps(tts_result)}"
        task_id = hashlib.md5(content_for_hash.encode()).hexdigest()

    # Default output directory: <project-root>/storage/temp/clip_video/<task_id>
    if output_dir is None:
        output_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "storage", "temp", "clip_video", task_id
        )
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Detect hardware acceleration once and reuse it for every segment.
    hwaccel = check_hardware_acceleration()
    hwaccel_args = ["-hwaccel", hwaccel] if hwaccel else []
    if hwaccel:
        logger.info(f"使用硬件加速: {hwaccel}")

    # Maps segment id -> produced clip path.
    result = {}

    for item in tts_result:
        _id = item.get("_id", item.get("timestamp", "unknown"))
        timestamp = item["timestamp"]
        start_time, _ = parse_timestamp(timestamp)

        # Real end time = start + duration + 1s margin (see calculate_end_time);
        # the end from the timestamp itself is deliberately ignored.
        duration = item["duration"]
        calculated_end_time = calculate_end_time(start_time, duration)

        # FFmpeg expects '.' (not ',') as the millisecond separator.
        ffmpeg_start_time = start_time.replace(',', '.')
        ffmpeg_end_time = calculated_end_time.replace(',', '.')

        # Build a filesystem-safe output name (no ':' or ',').
        safe_start_time = start_time.replace(':', '-').replace(',', '-')
        safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
        output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4"
        output_path = os.path.join(output_dir, output_filename)

        # NOTE(review): only VideoToolbox switches the encoder; with CUDA the
        # clip is still encoded by software libx264 (decoding may still be
        # accelerated). Consider h264_nvenc if CUDA encode is desired.
        ffmpeg_cmd = [
            "ffmpeg", "-y", *hwaccel_args,
            "-i", video_origin_path,
            "-ss", ffmpeg_start_time,
            "-to", ffmpeg_end_time,
            "-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264",
            "-c:a", "aac",
            "-strict", "experimental",
            output_path
        ]

        try:
            logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}{ffmpeg_end_time}")
            # check=True raises CalledProcessError on a non-zero exit; the
            # return value of subprocess.run is not needed.
            subprocess.run(
                ffmpeg_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )
            result[_id] = output_path
        except subprocess.CalledProcessError as e:
            logger.error(f"裁剪视频片段失败: {timestamp}")
            logger.error(f"错误信息: {e.stderr}")
            raise RuntimeError(f"视频裁剪失败: {e.stderr}")

    return result
if __name__ == "__main__":
    # Demo: cut a local video according to TTS segment metadata.
    video_origin_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4"
    tts_result = [{'timestamp': '00:00:00-00:01:15',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
                   'duration': 25.55,
                   'text': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!'},
                  {'timestamp': '00:01:15-00:04:40',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
                   'duration': 13.488,
                   'text': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…'},
                  {'timestamp': '00:04:58-00:05:45',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
                   'duration': 21.363,
                   'text': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!'},
                  {'timestamp': '00:05:45-00:06:00',
                   'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
                   'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
                   'duration': 7.675, 'text': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!'}]
    # Sample of the mapping clip_video is expected to produce (timestamp ->
    # clip path). Kept for reference only — it is NOT a valid argument.
    subclip_path_videos = {
        '00:00:00-00:01:15': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-00-00-00-01-15.mp4',
        '00:01:15-00:04:40': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-01-15-00-04-40.mp4',
        '00:04:41-00:04:58': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4',
        '00:04:58-00:05:45': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-58-00-05-45.mp4',
        '00:05:45-00:06:00': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-05-45-00-06-00.mp4',
        '00:06:00-00:06:03': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4',
    }
    try:
        # BUGFIX(review): `subclip_path_videos` used to be passed as the third
        # positional argument, i.e. as `output_dir`, which would crash inside
        # Path(output_dir); let clip_video generate its own output directory.
        result = clip_video(video_origin_path, tts_result)
        print("裁剪结果:")
        print(json.dumps(result, indent=4, ensure_ascii=False))
    except Exception as e:
        print(f"发生错误: {e}")

View File

@ -0,0 +1,264 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : 生成介绍文案
@Author : 小林同学
@Date : 2025/5/8 上午11:33
'''
import json
import os
import traceback
from openai import OpenAI
from loguru import logger
def parse_frame_analysis_to_markdown(json_file_path):
    """
    Convert a video-frame-analysis JSON file into a Markdown report.

    :param json_file_path: path of the JSON file to read
    :return: the report as a Markdown string, or an error-message string when
        the file is missing or cannot be processed
    """
    if not os.path.exists(json_file_path):
        return f"错误: 文件 {json_file_path} 不存在"

    try:
        with open(json_file_path, 'r', encoding='utf-8') as fp:
            payload = json.load(fp)

        summaries = payload.get('overall_activity_summaries', [])
        observations = payload.get('frame_observations', [])

        # Group the per-frame observations by their batch index.
        frames_by_batch = {}
        for obs in observations:
            frames_by_batch.setdefault(obs.get('batch_index'), []).append(obs)

        # Build the Markdown report, one "片段" section per batch summary.
        pieces = []
        for seq, summary in enumerate(summaries, 1):
            batch_key = summary.get('batch_index')
            pieces.append(f"## 片段 {seq}\n")
            pieces.append(f"- 时间范围:{summary.get('time_range', '')}\n")
            pieces.append(f"- 片段描述:{summary.get('summary', '')}\n")
            pieces.append("- 详细描述:\n")
            # Append each frame observation of this batch, verbatim.
            for obs in frames_by_batch.get(batch_key, []):
                pieces.append(f"  - {obs.get('timestamp', '')}: {obs.get('observation', '')}\n")
            pieces.append("\n")

        return "".join(pieces)

    except Exception:
        return f"处理JSON文件时出错: {traceback.format_exc()}"
def generate_narration(markdown_content, api_key, base_url, model):
    """
    Call an OpenAI-compatible chat API to generate narration copy from the
    Markdown frame-analysis content.

    :param markdown_content: Markdown-formatted video frame analysis
    :param api_key: API key
    :param base_url: base URL of the API (for non-official endpoints)
    :param model: model name to use
    :return: the generated narration script (JSON text), or an error-message
        string when the call fails
    """
    try:
        # Build the prompt. The example texts are verbatim reference copy.
        # BUGFIX(review): example 3 was closed with an opening tag
        # `<example_text_3>` instead of `</example_text_3>`, and the
        # `<output>` JSON sample never closed the `items` array — both fixed.
        prompt = """
我是一名荒野建造解说的博主以下是一些同行的对标文案请你深度学习并总结这些文案的风格特点跟内容特点
<example_text_1>
解压助眠的天花板就是荒野建造沉浸丝滑的搭建过程可以说每一帧都是极致享受我保证强迫症来了都找不出一丁点毛病更别说全屋严丝合缝的拼接工艺还能轻松抵御零下二十度气温让你居住的每一天都温暖如春
在家闲不住的西姆今天也打算来一次野外建造行走没多久他就发现许多倒塌的树任由它们自生自灭不如将其利用起来想到这他就开始挥舞铲子要把地基挖掘出来虽然每次只能挖一点点但架不住他体能惊人没多长时间一个 2x3 的深坑就赫然出现这深度住他一人绰绰有余
随后他去附近收集来原木这些都是搭建墙壁的最好材料而在投入使用前自然要把表皮刮掉防止森林中的白蚁蛀虫处理好一大堆后西姆还在两端打孔使用木钉固定在一起这可不是用来做墙壁的而是做庇护所的承重柱只要木头间的缝隙足够紧密那搭建出的木屋就能足够坚固
每向上搭建一层他都会在中间塞入苔藓防寒保证不会泄露一丝热量其他几面也是用相同方法很快西姆就做好了三面墙壁每一根木头都极其工整保证强迫症来了都要点个赞再走
在继续搭建墙壁前西姆决定将壁炉制作出来毕竟森林夜晚的气温会很低保暖措施可是重中之重完成后他找来一块大树皮用来充当庇护所的大门而上面刮掉的木屑还能作为壁炉的引火物可以说再完美不过
测试了排烟没问题后他才开始搭建最后一面墙壁这一面要预留门和窗所以在搭建到一半后还需要在原木中间开出卡口让自己劈砍时能轻松许多此时只需将另外一根如法炮制两端拼接在一起后就是一扇大小适中的窗户而随着随后一层苔藓铺好最后一根原木落位这个庇护所的雏形就算完成
大门的安装他没选择用合页而是在底端雕刻出榫头门框上则雕刻出榫眼只能说西姆的眼就是一把尺这完全就是严丝合缝此时他才开始搭建屋顶这里西姆用的方法不同他先把最外围的原木固定好随后将原木平铺在上面就能得到完美的斜面屋顶等他将四周的围栏也装好后工整的屋顶看起来十分舒服西姆躺上去都不想动
稍作休息后他利用剩余的苔藓对屋顶的缝隙处密封可这样西姆觉得不够保险于是他找来一些黏土再次对原本的缝隙二次加工保管这庇护所冬天也暖和最后只需要平铺上枯叶以及挖掘出的泥土整个屋顶就算完成
考虑到庇护所的美观性自然少不了覆盖上苔藓翠绿的颜色看起来十分舒服就连门口的庭院旁他都移植了许多小树做点缀让这木屋与周边环境融为一体西姆才刚完成好这件事一场大雨就骤然降临好在此时的他已经不用淋雨更别说这屋顶防水十分不错室内没一点雨水渗透进来
等待温度回升的过程西姆利用墙壁本身的凹槽把床框镶嵌在上面只需要铺上苔藓以及自带的床单枕头一张完美的单人床就做好辛苦劳作一整天西姆可不会亏待自己他将自带的牛肉腌制好后直接放到壁炉中烤只需要等待三十分钟就能享受这美味的一顿
在辛苦建造一星期后他终于可以在自己搭建的庇护所中享受最纯正的野外露营后面西姆回家补给了一堆物资再次回来时森林已经大雪纷飞让他原本翠绿的小屋更换上了冬季限定皮肤好在内部设施没受什么影响和他离开时一样整洁
就是房间中已经没多少柴火让西姆今天又得劈柴寒冷干燥的天气让木头劈起来十分轻松没多久他就收集到一大堆这些足够燃烧好几天虽然此时外面大雪纷飞但小屋中却开始逐渐温暖这次他除了带来一些食物外还有几瓶调味料以及一整套被褥让自己的居住舒适度提高一大截
而秋天他有收集干草的缘故只需要塞入枕套中密封起来就能作为靠垫用就这居住条件比一般人在家过的还要奢侈趁着壁炉木头变木炭的过程西姆则开始不紧不慢的处理食物他取出一块牛排改好花刀以后撒上一堆调料腌制起来接着用锡纸包裹好放到壁炉中直接炭烤搭配上自带的红酒是一个非常好的选择
随着时间来到第二天外面的积雪融化了不少西姆简单做顿煎蛋补充体力后决定制作一个室外篝火堆用来晚上驱散周边野兽搭建这玩意没什么技巧只需要找到一大堆木棍利用大树的夹缝将其掰弯然后将其堆积在一起就是一个简易版的篝火堆看这外形有点像帐篷好在西姆没想那么多
等待天色暗淡下来后他才来到室外将其点燃顺便处理下多余的废料只可惜这场景没朋友陪在身边对西姆来说可能是个遗憾而哪怕森林只有他一个人都依旧做了好几个小时等到里面的篝火彻底燃尽后西姆还找来雪球覆盖到上面将火熄灭这防火意识可谓十分好最后在室内二十五度的高温下裹着被子睡觉
</example_text_1>
<example_text_2>
解压助眠的天花板就是荒野建造沉浸丝滑的搭建过程每一帧都是极致享受全屋严丝合缝的拼接工艺能轻松抵御零下二十度气温居住体验温暖如春
在家闲不住的西姆开启野外建造他发现倒塌的树决定加以利用先挖掘出 2x3 的深坑作为地基接着收集原木刮掉表皮防白蚁蛀虫打孔用木钉固定制作承重柱搭建墙壁时每一层都塞入苔藓防寒很快做好三面墙
为应对森林夜晚低温西姆制作壁炉用大树皮当大门刮下的木屑做引火物搭建最后一面墙时预留门窗通过在原木中间开口拼接做出窗户大门采用榫卯结构安装严丝合缝
搭建屋顶时先固定外围原木再平铺原木形成斜面屋顶之后用苔藓黏土密封缝隙铺上枯叶和泥土为美观在木屋覆盖苔藓移植小树点缀完工时遇大雨木屋防水良好
西姆利用墙壁凹槽镶嵌床框铺上苔藓床单枕头做成床劳作一天后他用壁炉烤牛肉享用建造一星期后他开始野外露营
后来西姆回家补给物资回来时森林大雪纷飞他劈柴储备带回食物调味料和被褥提高居住舒适度还用干草做靠垫他用壁炉烤牛排搭配红酒
第二天积雪融化西姆制作室外篝火堆防野兽用大树夹缝掰弯木棍堆积而成晚上点燃处理废料结束后用雪球灭火最后在室内二十五度的环境中裹被入睡
</example_text_2>
<example_text_3>
如果战争到来这个深埋地下十几米的庇护所绝对是 bug 般的存在即使被敌人发现还能通过快速通道一秒逃出里面不仅有竹子地暖地下水井还自制抽水机在解决用水问题的同时甚至自研无土栽培技术过上完全自给自足的生活
阿伟的老婆美如花但阿伟从来不回家来到野外他乐哈哈一言不合就开挖众所周知当战争来临时地下堡垒的安全性是最高的阿伟苦苦研习两载半只为练就一身挖洞本领在这双逆天麒麟臂的加持下如此坚硬的泥土都只能当做炮灰
得到了充足的空间后他便开始对这些边缘进行打磨随后阿伟将细线捆在木棍上以此描绘出圆柱的轮廓接着再一点点铲掉多余的部分虽然是由泥土一体式打造但这样的桌子保准用上千年都不成问题
考虑到十几米的深度进出非常不方便于是阿伟找来两根长达 66.6 米的木头打算为庇护所打造一条快速通道只见他将木桩牢牢地插入地下并顺着洞口的方向延伸出去直到贯穿整个山洞接着在每个木桩的连接处钉入铁钉确保轨道不能有一毫米的偏差完成后再制作一个木质框架从而达到前后滑动的效果
不得不说阿伟这手艺简直就是大钢管子杵青蛙在上面放上一个木制的车斗还能加快搬运泥土的速度没多久庇护所的内部就已经初见雏形为了住起来更加舒适还需要为自己打造一张床虽然深处的泥土同样很坚固但好处就是不用担心垮塌的风险
阿伟不仅设计了更加符合人体工学的拱形并且还在一旁雕刻处壁龛就是这氛围怎么看着有点不太吉利别看阿伟一身腱子肉但这身体里的艺术细菌可不少每个边缘的地方他都做了精雕细琢瞬间让整个卧室的颜值提升一大截
住在地下的好处就是房子面积全靠挖每平方消耗两个半馒头不仅没有了房贷的压力就连买墓地的钱也省了阿伟将中间的墙壁挖空从而得到取暖的壁炉当然最重要的还有排烟问题要想从上往下打通十几米的山体是件极其困难的事好在阿伟年轻时报过忆坤年的古墓派补习班这打洞技术堪比隔壁学校的土拨鼠专业虽然深度长达十几米但排烟效果却一点不受影响一个字专业
随后阿伟继续对壁炉底部雕刻打通了底部放柴火的空间并制作出放锅的灶头完成后阿伟从侧面将壁炉打通并制作出一条导热的通道以此连接到床铺的位置毕竟住在这么一个风湿宝地不注意保暖除湿很容易得老寒腿
阿伟在床面上挖出一条条管道以便于温度能传输到床的每个角落接下来就可以根据这些通道的长度裁切出同样长短的竹子根据竹筒的大小凿出相互连接的孔洞最后再将竹筒内部打通以达到温度传送的效果
而后阿伟将这些管道安装到凹槽内在他严谨的制作工艺下每根竹子刚好都能镶嵌进去在铺设床面之前还需要用木塞把圆孔堵住防止泥土掉落进管道泥土虽然不能隔绝湿气但却是十分优良的导热材料等他把床面都压平后就可以小心的将这些木塞拔出来最后再用黏土把剩余的管道也遮盖起来直到整个墙面恢复原样
接下来还需要测试一下加热效果当他把火点起来后温度很快就传送到了管道内把火力一点点加大直到热气流淌到更远的床面随着小孔里的青烟冒出也预示着阿伟的地暖可以投入使用而后阿伟制作了一些竹条并用细绳将它们喜结连理
千里之行始于足下美好的家园要靠自己双手打造明明可以靠才艺吃饭的阿伟偏偏要用八块腹肌征服大家就问这样的男人哪个野生婆娘不喜欢完成后阿伟还用自己 35 码的大腚感受了一下真烫
随后阿伟来到野区找到一根上好的雷击木他当即就把木头咔嚓成两段并取下两节较为完整的带了回去刚好能和圆桌配套另外一个在里面凿出凹槽并插入木棍连接得到一个夯土的木锤住过农村的小伙伴都知道这样夯出来的地面堪比水泥地不仅坚硬耐磨还不用担心脚底打滑忙碌了一天的阿伟已经饥渴难耐拿出野生小烤肠安安心心住新房光脚爬上大热炕一觉能睡到天亮
第二天阿伟打算将房间扩宽毕竟吃住的地方有了还要解决个人卫生的问题阿伟在另一侧增加了一个房间他打算将这里打造成洗澡的地方为了防止泥土垮塌他将顶部做成圆弧形等挖出足够的空间后旁边的泥土已经堆成了小山
为了方便清理这些泥土阿伟在之前的轨道增加了转弯交接处依然是用铁钉固定一直延伸到房间的最里面有了运输车的帮助这些成吨的泥土也能轻松的运送出去并且还能体验过山车的感觉很快他就完成了清理工作
为了更方便的在里面洗澡他将底部一点点挖空这么大的浴缸看来阿伟并不打算一个人住完成后他将墙面雕刻的凹凸有致让这里看起来更加豪华接着用洛阳铲挖出排水口并用一根相同大小的竹筒作为开关
由于四周都是泥土还不能防水阿伟特意找了一些白蚁巢用来制作可以防水的野生水泥现在就可以将里里外外能接触到水的地方都涂抹一遍细心的阿伟还找来这种 500 克一斤的鹅卵石对池子表面进行装饰
没错水源问题阿伟早已经考虑在内他打算直接在旁边挖个水井毕竟已经挖了这么深再向下挖一挖应该就能到达地下水的深度经过几日的奋战能看得出阿伟已经消瘦了不少但一想到马上就能拥有的豪宅他直接化身为无情的挖土机器很快就挖到了好几米的深度
考虑到自己的弹跳力有限阿伟在一旁定入木桩然后通过绳子爬上爬下随着深度越来越深井底已经开始渗出水来这也预示着打井成功没多久这里面将渗满泉水仅凭一次就能挖到水源看来这里还真是块风湿宝地
随后阿伟在井口四周挖出凹槽以便于井盖的安置这一量才知道井的深度已经达到了足足的 5 阿伟把木板组合在一起再沿着标记切掉多余部分他甚至还给井盖做了把手可是如何从这么深的井里打水还是个问题但从阿伟坚定的眼神来看他应该想到了解决办法
只见他将树桩锯成两半然后用凿子把里面一点点掏空另外一半也是如法炮制接着还要在底部挖出圆孔要想成功将水从 5 米深的地方抽上来那就不得不提到大家熟知的勾股定理没错这跟勾股定理没什么关系
阿伟给竹筒做了一个木塞并在里面打上安装连接轴的孔为了增加密闭性阿伟不得不牺牲了自己的 AJ剪出与木塞相同的大小后再用木钉固定住随后他收集了一些树胶并放到火上加热融化接下来就可以涂在木塞上增加使用寿命
现在将竹筒组装完成就可以利用虹吸原理将水抽上来完成后就可以把井盖盖上去再用泥土在上面覆盖现在就不用担心失足掉下去了
接下来阿伟去采集了一些大漆将它涂抹在木桶接缝处就能将其二合为一完了再接入旁边浴缸的入水口每个连接的地方都要做好密封不然后面很容易漏水随后就可以安装上活塞并用一根木桩作为省力杠杆根据空气压强的原理将井水抽上来
经过半小时的来回拉扯硕大的浴缸终于被灌满阿伟也是忍不住洗了把脸接下来还需要解决排水的问题阿伟在地上挖出沟渠一直贯穿到屋外然后再用竹筒从出水口连接每个接口处都要抹上胶水就连门外的出水口他都做了隐藏
在野外最重要的就是庇护所水源还有食物既然已经完成了前二者那么阿伟还需要拥有可持续发展的食物来源他先是在地上挖了两排地洞然后在每根竹筒的表面都打上无数孔洞这就是他打算用来种植的载体在此之前还需要用大火对竹筒进行杀菌消毒
趁着这时候他去搬了一麻袋的木屑先用芭蕉叶覆盖在上面再铺上厚厚的黏土隔绝温度在火焰的温度下能让里面的木屑达到生长条件
等到第二天所有材料都晾凉后阿伟才将竹筒内部掏空并将木屑一点点地塞入竹筒一切准备就绪就可以将竹筒插入提前挖好的地洞最后再往竹筒里塞入种子依靠房间内的湿度和温度就能达到大棚种植的效果稍加时日这些种子就会慢慢发芽
虽然暂时还吃不上自己培养的食物但好在阿伟从表哥贺强那里学到不少钓鱼本领哪怕只有一根小小的竹竿也能让他钓上两斤半的大鲶鱼新鲜的食材那肯定是少不了高温消毒的过程趁着鱼没熟阿伟直接爬进浴缸冰凉的井水瞬间洗去了身上的疲惫这一刻的阿伟是无比的享受
不久后鱼也烤得差不多了阿伟的生活现在可以说是有滋有味住在十几米的地下不仅能安全感满满哪怕遇到危险还能通过轨道快速逃生
</example_text_3>

<video_frame_description>
%s
</video_frame_description>

我正在尝试做这个内容的解说纪录片视频我需要你以 <video_frame_description> </video_frame_description> 中的内容为解说目标根据我刚才提供给你的对标文案 <example_text> 特点以及你总结的特点帮我生成一段关于荒野建造的解说文案文案需要符合平台受欢迎的解说风格请使用 json 格式进行输出使用 <output> 中的输出格式

<output>
{
    "items": [
        {
            "_id": 1, # 唯一递增id
            "timestamp": "00:00:05,390-00:00:10,430",
            "picture": "画面描述",
            "narration": "解说文案",
        }
    ]
}
</output>

<restriction>
1. 只输出 json 内容不要输出其他任何说明性的文字
2. 解说文案的语言使用 简体中文
3. 严禁虚构画面所有画面只能从 <video_frame_description> 中摘取
</restriction>
""" % (markdown_content)

        # Initialise the client with the OpenAI SDK (works for any
        # OpenAI-compatible endpoint via base_url).
        client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )

        if model not in ["deepseek-reasoner"]:
            # Models that support structured output: request a JSON object.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=1.5,
                response_format={"type": "json_object"},
            )
            # Extract the generated script.
            if response.choices and len(response.choices) > 0:
                narration_script = response.choices[0].message.content
                # Log token usage for cost tracking.
                logger.debug(f"消耗的tokens: {response.usage.total_tokens}")
                return narration_script
            else:
                return "生成解说文案失败: 未获取到有效响应"
        else:
            # deepseek-reasoner does not support JSON output mode: ask for
            # plain text and strip any ```json fences afterwards.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=1.5,
            )
            # Extract the generated script.
            if response.choices and len(response.choices) > 0:
                narration_script = response.choices[0].message.content
                # Log token usage for cost tracking.
                logger.debug(f"文案消耗的tokens: {response.usage.total_tokens}")
                # Remove surrounding ```json ... ``` fences, if any.
                narration_script = narration_script.replace("```json", "").replace("```", "")
                return narration_script
            else:
                return "生成解说文案失败: 未获取到有效响应"

    except Exception:
        return f"调用API生成解说文案时出错: {traceback.format_exc()}"
if __name__ == '__main__':
    # Demo: parse a frame-analysis JSON, then ask the LLM for narration copy.
    # NOTE: unused variables (text_provider, video_frame_description_path)
    # and commented-out prints were removed.
    text_api_key = "sk-xxx"
    text_model = "deepseek-reasoner"
    text_base_url = "https://api.deepseek.com"

    # Convert the frame-analysis JSON into a Markdown report.
    test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1458.json"
    markdown_output = parse_frame_analysis_to_markdown(test_file_path)

    # Save the Markdown so the formatting can be inspected.
    output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/narration_script.md"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_output)

    # Generate the narration script.
    narration = generate_narration(
        markdown_output,
        text_api_key,
        base_url=text_base_url,
        model=text_model
    )

    # Persist the narration script.
    print(narration)
    print(type(narration))
    narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json"
    with open(narration_file, 'w', encoding='utf-8') as f:
        f.write(narration)
    print(f"\n已将解说文案保存到: {narration_file}")

View File

@ -0,0 +1,393 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : generate_video
@Author : 小林同学
@Date : 2025/5/7 上午11:55
'''
import os
import traceback
from typing import Optional, Dict, Any
from loguru import logger
from moviepy import (
VideoFileClip,
AudioFileClip,
CompositeAudioClip,
CompositeVideoClip,
TextClip,
afx
)
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from app.utils import utils
def merge_materials(
video_path: str,
audio_path: str,
output_path: str,
subtitle_path: Optional[str] = None,
bgm_path: Optional[str] = None,
options: Optional[Dict[str, Any]] = None
) -> str:
"""
合并视频音频BGM和字幕素材生成最终视频
参数:
video_path: 视频文件路径
audio_path: 音频文件路径
output_path: 输出文件路径
subtitle_path: 字幕文件路径可选
bgm_path: 背景音乐文件路径可选
options: 其他选项配置可包含以下字段:
- voice_volume: 人声音量默认1.0
- bgm_volume: 背景音乐音量默认0.3
- original_audio_volume: 原始音频音量默认0.0
- keep_original_audio: 是否保留原始音频默认False
- subtitle_font: 字幕字体默认None系统会使用默认字体
- subtitle_font_size: 字幕字体大小默认40
- subtitle_color: 字幕颜色默认白色
- subtitle_bg_color: 字幕背景颜色默认透明
- subtitle_position: 字幕位置可选值'bottom', 'top', 'center'默认'bottom'
- custom_position: 自定义位置
- stroke_color: 描边颜色默认黑色
- stroke_width: 描边宽度默认1
- threads: 处理线程数默认2
- fps: 输出帧率默认30
返回:
输出视频的路径
"""
# 合并选项默认值
if options is None:
options = {}
# 设置默认参数值
voice_volume = options.get('voice_volume', 1.0)
bgm_volume = options.get('bgm_volume', 0.3)
original_audio_volume = options.get('original_audio_volume', 0.0) # 默认为0即不保留原声
keep_original_audio = options.get('keep_original_audio', False) # 是否保留原声
subtitle_font = options.get('subtitle_font', '')
subtitle_font_size = options.get('subtitle_font_size', 40)
subtitle_color = options.get('subtitle_color', '#FFFFFF')
subtitle_bg_color = options.get('subtitle_bg_color', 'transparent')
subtitle_position = options.get('subtitle_position', 'bottom')
custom_position = options.get('custom_position', 70)
stroke_color = options.get('stroke_color', '#000000')
stroke_width = options.get('stroke_width', 1)
threads = options.get('threads', 2)
fps = options.get('fps', 30)
# 处理透明背景色问题 - MoviePy 2.1.1不支持'transparent'值
if subtitle_bg_color == 'transparent':
subtitle_bg_color = None # None在新版MoviePy中表示透明背景
# 创建输出目录(如果不存在)
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
logger.info(f"开始合并素材...")
logger.info(f" ① 视频: {video_path}")
logger.info(f" ② 音频: {audio_path}")
if subtitle_path:
logger.info(f" ③ 字幕: {subtitle_path}")
if bgm_path:
logger.info(f" ④ 背景音乐: {bgm_path}")
logger.info(f" ⑤ 输出: {output_path}")
# 加载视频
try:
video_clip = VideoFileClip(video_path)
logger.info(f"视频尺寸: {video_clip.size[0]}x{video_clip.size[1]}, 时长: {video_clip.duration}")
# 提取视频原声(如果需要)
original_audio = None
if keep_original_audio and original_audio_volume > 0:
try:
original_audio = video_clip.audio
if original_audio:
original_audio = original_audio.with_effects([afx.MultiplyVolume(original_audio_volume)])
logger.info(f"已提取视频原声,音量设置为: {original_audio_volume}")
else:
logger.warning("视频没有音轨,无法提取原声")
except Exception as e:
logger.error(f"提取视频原声失败: {str(e)}")
original_audio = None
# 移除原始音轨,稍后会合并新的音频
video_clip = video_clip.without_audio()
except Exception as e:
logger.error(f"加载视频失败: {str(e)}")
raise
# 处理背景音乐和所有音频轨道合成
audio_tracks = []
# 先添加主音频(配音)
if audio_path and os.path.exists(audio_path):
try:
voice_audio = AudioFileClip(audio_path).with_effects([afx.MultiplyVolume(voice_volume)])
audio_tracks.append(voice_audio)
logger.info(f"已添加配音音频,音量: {voice_volume}")
except Exception as e:
logger.error(f"加载配音音频失败: {str(e)}")
# 添加原声(如果需要)
if original_audio is not None:
audio_tracks.append(original_audio)
logger.info(f"已添加视频原声,音量: {original_audio_volume}")
# 添加背景音乐(如果有)
if bgm_path and os.path.exists(bgm_path):
try:
bgm_clip = AudioFileClip(bgm_path).with_effects([
afx.MultiplyVolume(bgm_volume),
afx.AudioFadeOut(3),
afx.AudioLoop(duration=video_clip.duration),
])
audio_tracks.append(bgm_clip)
logger.info(f"已添加背景音乐,音量: {bgm_volume}")
except Exception as e:
logger.error(f"添加背景音乐失败: \n{traceback.format_exc()}")
# 合成最终的音频轨道
if audio_tracks:
final_audio = CompositeAudioClip(audio_tracks)
video_clip = video_clip.with_audio(final_audio)
logger.info(f"已合成所有音频轨道,共{len(audio_tracks)}")
else:
logger.warning("没有可用的音频轨道,输出视频将没有声音")
# 处理字体路径
font_path = None
if subtitle_path and subtitle_font:
font_path = os.path.join(utils.font_dir(), subtitle_font)
if os.name == "nt":
font_path = font_path.replace("\\", "/")
logger.info(f"使用字体: {font_path}")
# 处理视频尺寸
video_width, video_height = video_clip.size
# 字幕处理函数
def create_text_clip(subtitle_item):
"""创建单个字幕片段"""
phrase = subtitle_item[1]
max_width = video_width * 0.9
# 如果有字体路径,进行文本换行处理
wrapped_txt = phrase
txt_height = 0
if font_path:
wrapped_txt, txt_height = wrap_text(
phrase,
max_width=max_width,
font=font_path,
fontsize=subtitle_font_size
)
# 创建文本片段
try:
_clip = TextClip(
text=wrapped_txt,
font=font_path,
font_size=subtitle_font_size,
color=subtitle_color,
bg_color=subtitle_bg_color, # 这里已经在前面处理过None表示透明
stroke_color=stroke_color,
stroke_width=stroke_width,
)
except Exception as e:
logger.error(f"创建字幕片段失败: {str(e)}, 使用简化参数重试")
# 如果上面的方法失败,尝试使用更简单的参数
_clip = TextClip(
text=wrapped_txt,
font=font_path,
font_size=subtitle_font_size,
color=subtitle_color,
)
# 设置字幕时间
duration = subtitle_item[0][1] - subtitle_item[0][0]
_clip = _clip.with_start(subtitle_item[0][0])
_clip = _clip.with_end(subtitle_item[0][1])
_clip = _clip.with_duration(duration)
# 设置字幕位置
if subtitle_position == "bottom":
_clip = _clip.with_position(("center", video_height * 0.95 - _clip.h))
elif subtitle_position == "top":
_clip = _clip.with_position(("center", video_height * 0.05))
elif subtitle_position == "custom":
margin = 10
max_y = video_height - _clip.h - margin
min_y = margin
custom_y = (video_height - _clip.h) * (custom_position / 100)
custom_y = max(
min_y, min(custom_y, max_y)
)
_clip = _clip.with_position(("center", custom_y))
else: # center
_clip = _clip.with_position(("center", "center"))
return _clip
# 创建TextClip工厂函数
    def make_textclip(text):
        """Factory handed to SubtitlesClip: render plain text with the
        configured font, size and color (no background or stroke)."""
        return TextClip(
            text=text,
            font=font_path,
            font_size=subtitle_font_size,
            color=subtitle_color,
        )
# 处理字幕
if subtitle_path and os.path.exists(subtitle_path):
try:
# 加载字幕文件
sub = SubtitlesClip(
subtitles=subtitle_path,
encoding="utf-8",
make_textclip=make_textclip
)
# 创建每个字幕片段
text_clips = []
for item in sub.subtitles:
clip = create_text_clip(subtitle_item=item)
text_clips.append(clip)
# 合成视频和字幕
video_clip = CompositeVideoClip([video_clip, *text_clips])
logger.info(f"已添加{len(text_clips)}个字幕片段")
except Exception as e:
logger.error(f"处理字幕失败: \n{traceback.format_exc()}")
# 导出最终视频
try:
video_clip.write_videofile(
output_path,
audio_codec="aac",
temp_audiofile_path=output_dir,
threads=threads,
fps=fps,
)
logger.success(f"素材合并完成: {output_path}")
except Exception as e:
logger.error(f"导出视频失败: {str(e)}")
raise
finally:
# 释放资源
video_clip.close()
del video_clip
return output_path
def wrap_text(text, max_width, font="Arial", fontsize=60):
    """
    Wrap text so every rendered line fits within a pixel width.

    Word-based wrapping (splitting on spaces) is tried first; if any single
    word is itself wider than ``max_width`` (typical for CJK text without
    spaces), falls back to character-based wrapping.

    Args:
        text: The text to wrap.
        max_width: Maximum line width in pixels.
        font: Path to a TrueType font file (or a name PIL can resolve).
        fontsize: Font size in points.

    Returns:
        tuple: (wrapped_text, total_height_px) where the height is the
        single-line height multiplied by the number of wrapped lines.
    """
    try:
        font_obj = ImageFont.truetype(font, fontsize)
    except OSError:
        # Fix: was a bare `except:` that also swallowed unrelated errors
        # (e.g. TypeError). Font loading failures raise OSError; fall back
        # to PIL's built-in default font only for those.
        font_obj = ImageFont.load_default()

    def get_text_size(inner_text):
        # Measure the rendered size of the stripped text with this font.
        inner_text = inner_text.strip()
        left, top, right, bottom = font_obj.getbbox(inner_text)
        return right - left, bottom - top

    width, height = get_text_size(text)
    if width <= max_width:
        return text, height

    # --- First pass: wrap on word boundaries ------------------------------
    processed = True
    lines = []
    current = ""
    for word in text.split(" "):
        candidate = current + f"{word} "
        line_width, _ = get_text_size(candidate)
        if line_width <= max_width:
            current = candidate
            continue
        if candidate.strip() == word.strip():
            # A single word alone exceeds the width: word wrapping fails.
            processed = False
            break
        lines.append(current)
        current = f"{word} "
    lines.append(current)

    if processed:
        lines = [line.strip() for line in lines]
        result = "\n".join(lines).strip()
        # Total height = per-line height times the number of lines.
        return result, len(lines) * height

    # --- Fallback: wrap character by character (e.g. CJK text) ------------
    lines = []
    current = ""
    for ch in text:
        current += ch
        line_width, _ = get_text_size(current)
        if line_width > max_width:
            lines.append(current)
            current = ""
    lines.append(current)
    result = "\n".join(lines).strip()
    return result, len(lines) * height
if __name__ == '__main__':
    # Manual smoke test: merge a pre-cut video with its narration audio,
    # merged subtitles and background music. All paths are developer-local.
    merger_mp4 = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger.mp4'
    merger_sub = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merged_subtitle_00_00_00-00_01_30.srt'
    merger_audio = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger_audio.mp3'
    bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
    output_video = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/combined_test.mp4'

    # Example invocation
    options = {
        'voice_volume': 1.0,  # narration audio volume
        'bgm_volume': 0.1,  # background-music volume
        'original_audio_volume': 1.0,  # original video sound volume (0 = drop it)
        'keep_original_audio': True,  # whether to keep the original sound
        'subtitle_font': 'MicrosoftYaHeiNormal.ttc',  # relative font name, resolved under font_dir()
        'subtitle_font_size': 40,
        'subtitle_color': '#FFFFFF',
        'subtitle_bg_color': None,  # None means a transparent subtitle background
        'subtitle_position': 'bottom',
        'threads': 2
    }

    try:
        merge_materials(
            video_path=merger_mp4,
            audio_path=merger_audio,
            subtitle_path=merger_sub,
            bgm_path=bgm_path,
            output_path=output_video,
            options=options
        )
    except Exception as e:
        logger.error(f"合并素材失败: \n{traceback.format_exc()}")

View File

@ -7,7 +7,7 @@ from typing import List
from loguru import logger
from openai import OpenAI
from openai import AzureOpenAI
from moviepy.editor import VideoFileClip
from moviepy import VideoFileClip
from openai.types.chat import ChatCompletion
import google.generativeai as gemini
from googleapiclient.errors import ResumableUploadError

View File

@ -4,9 +4,10 @@ import random
import traceback
from urllib.parse import urlencode
from datetime import datetime
import json
import requests
from typing import List
from typing import List, Optional
from loguru import logger
from moviepy.video.io.VideoFileClip import VideoFileClip
@ -306,7 +307,50 @@ def format_timestamp(seconds: float) -> str:
return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
def _detect_hardware_acceleration() -> Optional[str]:
    """
    Detect which ffmpeg hardware accelerator is usable on this system.

    Probes, in priority order, NVIDIA CUDA, macOS VideoToolbox and Intel
    Quick Sync by asking ffmpeg to decode a dummy input with each
    accelerator enabled.

    Returns:
        Optional[str]: The ffmpeg ``-hwaccel`` value ("cuda",
        "videotoolbox" or "qsv"), or None when no accelerator works.
    """
    # The three probes shared an identical command shape, so loop over the
    # candidate accelerators instead of repeating three try/except blocks.
    for accel in ("cuda", "videotoolbox", "qsv"):
        try:
            probe = subprocess.run(
                ["ffmpeg", "-hwaccel", accel, "-i", "/dev/null", "-f", "null", "-"],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False
            )
            if probe.returncode == 0:
                return accel
        except Exception:
            # ffmpeg missing or the probe crashed: try the next accelerator.
            pass
    return None
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> str:
"""
保存剪辑后的视频
@ -328,29 +372,43 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
if not os.path.exists(save_dir):
os.makedirs(save_dir)
# 生成更规范的视频文件名
video_id = f"vid-{timestamp.replace(':', '-').replace(',', '_')}"
video_path = os.path.join(save_dir, f"{video_id}.mp4")
# 解析时间戳
start_str, end_str = timestamp.split('-')
# 格式化输出文件名(使用连字符替代冒号和逗号)
safe_start_time = start_str.replace(':', '-').replace(',', '-')
safe_end_time = end_str.replace(':', '-').replace(',', '-')
output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4"
video_path = os.path.join(save_dir, output_filename)
# 如果视频已存在,直接返回
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
logger.info(f"video already exists: {video_path}")
return {timestamp: video_path}
logger.info(f"视频已存在: {video_path}")
return video_path
try:
# 加载视频获取总时长
video = VideoFileClip(origin_video)
total_duration = video.duration
# 检查视频是否存在
if not os.path.exists(origin_video):
logger.error(f"源视频文件不存在: {origin_video}")
return ''
# 获取视频总时长
try:
probe_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1", origin_video]
total_duration = float(subprocess.check_output(probe_cmd).decode('utf-8').strip())
except subprocess.CalledProcessError as e:
logger.error(f"获取视频时长失败: {str(e)}")
return ''
# 解析时间戳
start_str, end_str = timestamp.split('-')
# 计算时间点
start = time_to_seconds(start_str)
end = time_to_seconds(end_str)
# 验证时间段
if start >= total_duration:
logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
video.close()
return {}
return ''
if end > total_duration:
logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒),将自动调整为视频结尾")
@ -358,55 +416,74 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
if end <= start:
logger.warning(f"结束时间 {format_timestamp(end)} 必须大于起始时间 {format_timestamp(start)}")
video.close()
return {}
return ''
# 剪辑视频
# 计算剪辑时长
duration = end - start
logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}")
# logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}")
# 剪辑视频
subclip = video.subclip(start, end)
# 检测可用的硬件加速选项
hwaccel = _detect_hardware_acceleration()
hwaccel_args = []
if hwaccel:
hwaccel_args = ["-hwaccel", hwaccel]
logger.info(f"使用硬件加速: {hwaccel}")
try:
# 检查视频是否有音频轨道并写入文件
subclip.write_videofile(
video_path,
codec='libx264',
audio_codec='aac',
temp_audiofile='temp-audio.m4a',
remove_temp=True,
audio=(subclip.audio is not None),
logger=None
)
# 验证生成的视频文件
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
with VideoFileClip(video_path) as clip:
if clip.duration > 0 and clip.fps > 0:
return {timestamp: video_path}
raise ValueError("视频文件验证失败")
except Exception as e:
logger.warning(f"视频文件处理失败: {video_path} => {str(e)}")
# 转换为FFmpeg兼容的时间格式逗号替换为点
ffmpeg_start_time = start_str.replace(',', '.')
ffmpeg_end_time = end_str.replace(',', '.')
# 构建FFmpeg命令
ffmpeg_cmd = [
"ffmpeg", "-y", *hwaccel_args,
"-i", origin_video,
"-ss", ffmpeg_start_time,
"-to", ffmpeg_end_time,
"-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264",
"-c:a", "aac",
"-strict", "experimental",
video_path
]
# 执行FFmpeg命令
# logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}")
# logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}")
process = subprocess.run(
ffmpeg_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=False # 不抛出异常,我们会检查返回码
)
# 检查是否成功
if process.returncode != 0:
logger.error(f"视频剪辑失败: {process.stderr}")
if os.path.exists(video_path):
os.remove(video_path)
return ''
# 验证生成的视频文件
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
# 检查视频是否可播放
probe_cmd = ["ffprobe", "-v", "error", video_path]
validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if validate_result.returncode == 0:
logger.info(f"视频剪辑成功: {video_path}")
return video_path
except Exception as e:
logger.warning(f"视频剪辑失败: \n{str(traceback.format_exc())}")
logger.error("视频文件验证失败")
if os.path.exists(video_path):
os.remove(video_path)
finally:
# 确保视频对象被正确关闭
try:
video.close()
if 'subclip' in locals():
subclip.close()
except:
pass
return {}
return ''
except Exception as e:
logger.error(f"视频剪辑过程中发生错误: \n{str(traceback.format_exc())}")
if os.path.exists(video_path):
os.remove(video_path)
return ''
def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict:
@ -428,8 +505,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro
try:
saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory)
if saved_video_path:
logger.info(f"video saved: {saved_video_path}")
video_paths.update(saved_video_path)
video_paths.update({index+1:saved_video_path})
# 更新进度
if progress_callback:
@ -439,6 +515,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro
return {}
logger.success(f"裁剪 {len(video_paths)} videos")
# logger.debug(json.dumps(video_paths, indent=4, ensure_ascii=False))
return video_paths

View File

@ -0,0 +1,555 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : merger_video
@Author : 小林同学
@Date : 2025/5/6 下午7:38
'''
import os
import shutil
import subprocess
from enum import Enum
from typing import List, Optional, Tuple
from loguru import logger
class VideoAspect(Enum):
    """Supported video aspect ratios and their standard resolutions."""
    landscape = "16:9"    # horizontal 16:9
    landscape_2 = "4:3"   # horizontal 4:3
    portrait = "9:16"     # vertical 9:16
    portrait_2 = "3:4"    # vertical 3:4
    square = "1:1"        # square 1:1

    def to_resolution(self) -> Tuple[int, int]:
        """Return the standard (width, height) for this aspect ratio."""
        resolutions = {
            VideoAspect.portrait: (1080, 1920),
            VideoAspect.portrait_2: (720, 1280),
            VideoAspect.landscape: (1920, 1080),
            VideoAspect.landscape_2: (1280, 720),
            VideoAspect.square: (1080, 1080),
        }
        # Anything unrecognized falls back to the default portrait size.
        return resolutions.get(self, (1080, 1920))
def check_ffmpeg_installation() -> bool:
    """
    Check whether ffmpeg is installed and reachable on PATH.

    Returns:
        bool: True when ``ffmpeg -version`` runs successfully, else False.
    """
    try:
        subprocess.run(
            ['ffmpeg', '-version'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except (subprocess.SubprocessError, FileNotFoundError):
        # Binary missing from PATH or the invocation itself failed.
        logger.error("ffmpeg未安装或不在系统PATH中请安装ffmpeg")
        return False
    return True
def get_hardware_acceleration_option() -> Optional[str]:
    """
    Pick a hardware acceleration backend suited to the current system.

    Queries ffmpeg's supported hwaccel list and returns the first match in
    priority order: cuda, nvenc, qsv (Intel Quick Sync), videotoolbox
    (macOS), vaapi (Linux).

    Returns:
        Optional[str]: The accelerator name, or None for software encoding.
    """
    try:
        hwaccel_query = subprocess.run(
            ['ffmpeg', '-hide_banner', '-hwaccels'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
        available = hwaccel_query.stdout.lower()
        # Return the highest-priority accelerator that ffmpeg reports.
        for candidate in ('cuda', 'nvenc', 'qsv', 'videotoolbox', 'vaapi'):
            if candidate in available:
                return candidate
        logger.info("没有找到支持的硬件加速器,将使用软件编码")
        return None
    except Exception as e:
        logger.warning(f"检测硬件加速器时出错: {str(e)},将使用软件编码")
        return None
def check_video_has_audio(video_path: str) -> bool:
    """
    Determine whether a video file contains an audio stream.

    Args:
        video_path: Path to the video file.

    Returns:
        bool: True when the first audio stream exists; False when the file
        is missing, has no audio, or probing fails.
    """
    if not os.path.exists(video_path):
        logger.warning(f"视频文件不存在: {video_path}")
        return False

    # Ask ffprobe for the codec type of the first audio stream only.
    probe_cmd = [
        'ffprobe', '-v', 'error',
        '-select_streams', 'a:0',
        '-show_entries', 'stream=codec_type',
        '-of', 'csv=p=0',
        video_path
    ]
    try:
        probe = subprocess.run(
            probe_cmd,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            text=True, check=False
        )
    except Exception as e:
        logger.warning(f"检测视频音频流时出错: {str(e)}")
        return False
    # ffprobe prints "audio" exactly when an audio stream is present.
    return probe.stdout.strip() == 'audio'
def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> str:
    """
    Write the list file required by ffmpeg's concat demuxer.

    Args:
        video_paths: Paths of the clips to concatenate, in playback order.
        concat_file_path: Where to write the generated list file.

    Returns:
        str: The path of the written concat file.
    """
    with open(concat_file_path, 'w', encoding='utf-8') as list_file:
        for clip_path in video_paths:
            # Always write absolute paths so ffmpeg resolves them correctly.
            entry = os.path.abspath(clip_path)
            if os.name == 'nt':
                # Windows: ffmpeg prefers forward slashes.
                entry = entry.replace('\\', '/')
            else:
                # Unix/macOS: escape backslashes and colons.
                entry = entry.replace('\\', '\\\\').replace(':', '\\:')
            # Escape any single quotes contained in the path.
            entry = entry.replace("'", "\\'")
            list_file.write(f"file '{entry}'\n")
    return concat_file_path
def process_single_video(
    input_path: str,
    output_path: str,
    target_width: int,
    target_height: int,
    keep_audio: bool = True,
    hwaccel: Optional[str] = None
) -> str:
    """
    Re-encode a single video to the target resolution and frame rate.

    The frame is scaled to fit within (target_width, target_height) while
    keeping its aspect ratio, padded (letter/pillar-boxed) to the exact
    target size, forced to 30 fps, and encoded with the encoder matching
    the chosen acceleration backend.

    Args:
        input_path: Source video path.
        output_path: Destination video path.
        target_width: Output frame width in pixels.
        target_height: Output frame height in pixels.
        keep_audio: Whether to keep the audio track (automatically dropped
            when the source has no audio stream).
        hwaccel: Hardware acceleration backend name, or None for software.

    Returns:
        str: The output video path.

    Raises:
        FileNotFoundError: If the input file does not exist.
        RuntimeError: If the ffmpeg invocation fails.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"找不到视频文件: {input_path}")

    # Base command; -y overwrites any existing output file.
    command = ['ffmpeg', '-y']

    # Hardware-decode flags (must precede the -i input argument).
    if hwaccel:
        if hwaccel == 'cuda' or hwaccel == 'nvenc':
            command.extend(['-hwaccel', 'cuda'])
        elif hwaccel == 'qsv':
            command.extend(['-hwaccel', 'qsv'])
        elif hwaccel == 'videotoolbox':
            command.extend(['-hwaccel', 'videotoolbox'])
        elif hwaccel == 'vaapi':
            command.extend(['-hwaccel', 'vaapi', '-vaapi_device', '/dev/dri/renderD128'])

    # Input file.
    command.extend(['-i', input_path])

    # Audio handling.
    if not keep_audio:
        command.extend(['-an'])  # strip audio
    else:
        # Only encode audio when the source actually has an audio stream.
        has_audio = check_video_has_audio(input_path)
        if has_audio:
            command.extend(['-c:a', 'aac', '-b:a', '128k'])  # encode audio as AAC
        else:
            logger.warning(f"视频 {input_path} 没有音频流,将会忽略音频设置")
            command.extend(['-an'])  # source has no audio: drop audio settings

    # Video filters: scale to fit, then pad to the exact target size.
    scale_filter = f"scale={target_width}:{target_height}:force_original_aspect_ratio=decrease"
    pad_filter = f"pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2"

    command.extend([
        '-vf', f"{scale_filter},{pad_filter}",
        '-r', '30',  # normalize frame rate to 30 fps
    ])

    # Pick the encoder that matches the detected acceleration backend.
    if hwaccel == 'cuda' or hwaccel == 'nvenc':
        command.extend(['-c:v', 'h264_nvenc', '-preset', 'p4', '-profile:v', 'high'])
    elif hwaccel == 'qsv':
        command.extend(['-c:v', 'h264_qsv', '-preset', 'medium'])
    elif hwaccel == 'videotoolbox':
        command.extend(['-c:v', 'h264_videotoolbox', '-profile:v', 'high'])
    elif hwaccel == 'vaapi':
        command.extend(['-c:v', 'h264_vaapi', '-profile', '100'])
    else:
        command.extend(['-c:v', 'libx264', '-preset', 'medium', '-profile:v', 'high'])

    # Bitrate and pixel-format settings.
    command.extend([
        '-b:v', '5M',
        '-maxrate', '8M',
        '-bufsize', '10M',
        '-pix_fmt', 'yuv420p',  # widely compatible pixel format
    ])

    # Output file.
    command.append(output_path)

    # Run ffmpeg.
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"处理视频失败: {e.stderr.decode() if e.stderr else str(e)}")
        raise RuntimeError(f"处理视频失败: {str(e)}")
def combine_clip_videos(
    output_video_path: str,
    video_paths: List[str],
    video_ost_list: List[int],
    video_aspect: VideoAspect = VideoAspect.portrait,
    threads: int = 4,
) -> str:
    """
    Merge clipped sub-videos into a single video.

    Pipeline: (1) normalize every clip to the target resolution/fps into a
    temp dir; (2) concat the video streams without audio; (3) extract the
    audio of clips flagged to keep original sound, delay each to its
    timeline position, mix them over a silent base track; (4) mux the
    concatenated video with the mixed audio. On a subprocess failure a
    simpler audio-less concat is attempted as a fallback.

    Args:
        output_video_path: Where to store the merged video.
        video_paths: Paths of the sub-videos, in playback order.
        video_ost_list: Original-sound flags per clip
            (0: drop original audio, 1: keep only original audio,
            2: keep original audio plus narration).
        video_aspect: Target aspect ratio.
        threads: Number of ffmpeg threads for the video concat step.

    Returns:
        str: Path of the merged video.

    Raises:
        RuntimeError: When ffmpeg is missing or all merge attempts fail.
        ValueError: When no valid clip can be merged.
    """
    # ffmpeg must be available before anything else.
    if not check_ffmpeg_installation():
        raise RuntimeError("未找到ffmpeg请先安装")

    # Prepare the output directory.
    output_dir = os.path.dirname(output_video_path)
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the target resolution from the aspect ratio.
    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()

    # Detect usable hardware acceleration.
    hwaccel = get_hardware_acceleration_option()
    if hwaccel:
        logger.info(f"将使用 {hwaccel} 硬件加速")

    # Re-group clip paths and sound flags into one list of segment configs.
    video_segments = []

    # The two input lists must line up; trim both to the shorter length.
    if len(video_paths) != len(video_ost_list):
        logger.warning(f"视频路径列表({len(video_paths)})和原声设置列表({len(video_ost_list)})长度不匹配")
        min_length = min(len(video_paths), len(video_ost_list))
        video_paths = video_paths[:min_length]
        video_ost_list = video_ost_list[:min_length]

    # Build one processing config per clip.
    for i, (video_path, video_ost) in enumerate(zip(video_paths, video_ost_list)):
        if not os.path.exists(video_path):
            logger.warning(f"视频不存在,跳过: {video_path}")
            continue

        # Whether the clip actually contains an audio stream.
        has_audio = check_video_has_audio(video_path)

        segment = {
            "index": i,
            "path": video_path,
            "ost": video_ost,
            "has_audio": has_audio,
            "keep_audio": video_ost > 0 and has_audio  # keep only when requested AND present
        }

        if video_ost > 0 and not has_audio:
            logger.warning(f"视频 {video_path} 设置为保留原声(ost={video_ost}),但该视频没有音频流")

        video_segments.append(segment)

    # Intermediate files live in a temp dir that is removed in `finally`.
    processed_videos = []
    temp_dir = os.path.join(output_dir, "temp_videos")
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Phase 1: normalize every clip into an intermediate file.
        for segment in video_segments:
            temp_output = os.path.join(temp_dir, f"processed_{segment['index']}.mp4")
            try:
                process_single_video(
                    input_path=segment['path'],
                    output_path=temp_output,
                    target_width=video_width,
                    target_height=video_height,
                    keep_audio=segment['keep_audio'],
                    hwaccel=hwaccel
                )
                processed_videos.append({
                    "index": segment["index"],
                    "path": temp_output,
                    "keep_audio": segment["keep_audio"]
                })
                logger.info(f"视频 {segment['index'] + 1}/{len(video_segments)} 处理完成")
            except Exception as e:
                # A single bad clip is skipped, not fatal.
                logger.error(f"处理视频 {segment['path']} 时出错: {str(e)}")
                continue

        if not processed_videos:
            raise ValueError("没有有效的视频片段可以合并")

        # Restore original playback order after the per-clip processing.
        processed_videos.sort(key=lambda x: x["index"])

        # Phase 2: stepwise merge, avoiding one giant filter_complex graph.
        try:
            # 1. Concat all video streams (audio excluded) into one file.
            video_paths_only = [video["path"] for video in processed_videos]
            video_concat_path = os.path.join(temp_dir, "video_concat.mp4")

            # concat-demuxer list file for the video merge.
            concat_file = os.path.join(temp_dir, "concat_list.txt")
            create_ffmpeg_concat_file(video_paths_only, concat_file)

            # Merge all video streams with no audio track.
            concat_cmd = [
                'ffmpeg', '-y',
                '-f', 'concat',
                '-safe', '0',
                '-i', concat_file,
                '-c:v', 'libx264',
                '-preset', 'medium',
                '-profile:v', 'high',
                '-an',  # no audio in this pass
                '-threads', str(threads),
                video_concat_path
            ]

            subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频流合并完成")

            # 2. Extract audio from the clips that keep their original sound.
            audio_segments = [video for video in processed_videos if video["keep_audio"]]

            if not audio_segments:
                # No audio anywhere: the silent concat IS the final result.
                shutil.copy(video_concat_path, output_video_path)
                logger.info("无音频视频合并完成")
                return output_video_path

            # Per-clip audio intermediates.
            audio_files = []
            for i, segment in enumerate(audio_segments):
                audio_file = os.path.join(temp_dir, f"audio_{i}.aac")
                extract_audio_cmd = [
                    'ffmpeg', '-y',
                    '-i', segment["path"],
                    '-vn',  # no video in this pass
                    '-c:a', 'aac',
                    '-b:a', '128k',
                    audio_file
                ]

                subprocess.run(extract_audio_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                audio_files.append({
                    "index": segment["index"],
                    "path": audio_file
                })
                logger.info(f"提取音频 {i+1}/{len(audio_segments)} 完成")

            # 3. Compute each audio segment's start offset on the timeline
            #    by accumulating the durations of all preceding clips.
            audio_timings = []
            current_time = 0.0

            for i, video in enumerate(processed_videos):
                duration_cmd = [
                    'ffprobe', '-v', 'error',
                    '-show_entries', 'format=duration',
                    '-of', 'csv=p=0',
                    video["path"]
                ]
                result = subprocess.run(duration_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                duration = float(result.stdout.strip())

                # If this clip keeps its audio, remember where it starts.
                if video["keep_audio"]:
                    for audio in audio_files:
                        if audio["index"] == video["index"]:
                            audio_timings.append({
                                "file": audio["path"],
                                "start": current_time,
                                "index": video["index"]
                            })
                            break

                current_time += duration

            # 4. A silent track the length of the full video is the mix base.
            silence_audio = os.path.join(temp_dir, "silence.aac")
            create_silence_cmd = [
                'ffmpeg', '-y',
                '-f', 'lavfi',
                '-i', f'anullsrc=r=44100:cl=stereo',
                '-t', str(current_time),  # full timeline duration
                '-c:a', 'aac',
                '-b:a', '128k',
                silence_audio
            ]
            subprocess.run(create_silence_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # 5. Write the filter script that delays each audio segment to
            #    its timeline position and mixes everything together.
            filter_script = os.path.join(temp_dir, "filter_script.txt")
            with open(filter_script, 'w') as f:
                f.write(f"[0:a]volume=0.0[silence];\n")  # mute the base track

                # adelay shifts each extracted segment to its start offset.
                for i, timing in enumerate(audio_timings):
                    f.write(f"[{i+1}:a]adelay={int(timing['start']*1000)}|{int(timing['start']*1000)}[a{i}];\n")

                # amix combines the silent base with all delayed segments.
                mix_str = "[silence]"
                for i in range(len(audio_timings)):
                    mix_str += f"[a{i}]"
                mix_str += f"amix=inputs={len(audio_timings)+1}:duration=longest[aout]"
                f.write(mix_str)

            # 6. Run the audio mix: silence first, then each segment input.
            audio_inputs = ['-i', silence_audio]
            for timing in audio_timings:
                audio_inputs.extend(['-i', timing["file"]])

            mixed_audio = os.path.join(temp_dir, "mixed_audio.aac")
            audio_mix_cmd = [
                'ffmpeg', '-y'
            ] + audio_inputs + [
                '-filter_complex_script', filter_script,
                '-map', '[aout]',
                '-c:a', 'aac',
                '-b:a', '128k',
                mixed_audio
            ]

            subprocess.run(audio_mix_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("音频混合完成")

            # 7. Mux the concatenated video with the mixed audio track.
            final_cmd = [
                'ffmpeg', '-y',
                '-i', video_concat_path,
                '-i', mixed_audio,
                '-c:v', 'copy',
                '-c:a', 'aac',
                '-map', '0:v:0',
                '-map', '1:a:0',
                '-shortest',
                output_video_path
            ]

            subprocess.run(final_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频最终合并完成")
            return output_video_path

        except subprocess.CalledProcessError as e:
            logger.error(f"合并视频过程中出错: {e.stderr.decode() if e.stderr else str(e)}")

            # Fallback: the simplest possible merge, video only, no re-encode.
            logger.info("尝试备用合并方法 - 无音频合并")
            try:
                concat_file = os.path.join(temp_dir, "concat_list.txt")
                video_paths_only = [video["path"] for video in processed_videos]
                create_ffmpeg_concat_file(video_paths_only, concat_file)

                backup_cmd = [
                    'ffmpeg', '-y',
                    '-f', 'concat',
                    '-safe', '0',
                    '-i', concat_file,
                    '-c:v', 'copy',
                    '-an',  # no audio
                    output_video_path
                ]

                subprocess.run(backup_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                logger.warning("使用备用方法(无音频)成功合并视频")
                return output_video_path
            except Exception as backup_error:
                logger.error(f"备用合并方法也失败: {str(backup_error)}")
                raise RuntimeError(f"无法合并视频: {str(backup_error)}")

    except Exception as e:
        logger.error(f"合并视频时出错: {str(e)}")
        raise
    finally:
        # Always remove the intermediate files, even on failure.
        try:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
                logger.info("已清理临时文件")
        except Exception as e:
            logger.warning(f"清理临时文件时出错: {str(e)}")
if __name__ == '__main__':
    # Manual smoke test with developer-local clip paths: merge six clips,
    # keeping original audio for clips 1, 3 and 6 (ost flag = 1).
    video_paths = [
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-00-00-00-00-26.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-01-15-00-01-29.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-04-58-00-05-20.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-05-45-00-05-53.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4'
    ]
    combine_clip_videos(
        output_video_path="/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/merged_123.mp4",
        video_paths=video_paths,
        video_ost_list=[1, 0, 1, 0, 0, 1],
        video_aspect=VideoAspect.portrait
    )

View File

@ -3,10 +3,11 @@ import json
import time
import asyncio
import requests
from app.utils import video_processor
from loguru import logger
from typing import List, Dict, Any, Callable
from app.utils import utils, gemini_analyzer, video_processor, video_processor_v2
from app.utils import utils, gemini_analyzer, video_processor
from app.utils.script_generator import ScriptProcessor
from app.config import config
@ -21,6 +22,7 @@ class ScriptGenerator:
video_path: str,
video_theme: str = "",
custom_prompt: str = "",
frame_interval_input: int = 5,
skip_seconds: int = 0,
threshold: int = 30,
vision_batch_size: int = 5,
@ -105,20 +107,13 @@ class ScriptGenerator:
os.makedirs(video_keyframes_dir, exist_ok=True)
try:
if config.frames.get("version") == "v2":
processor = video_processor_v2.VideoProcessor(video_path)
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds,
threshold=threshold
)
else:
processor = video_processor.VideoProcessor(video_path)
processor.process_video(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds
)
processor = video_processor.VideoProcessor(video_path)
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds,
threshold=threshold
)
for filename in sorted(os.listdir(video_keyframes_dir)):
if filename.endswith('.jpg'):
keyframe_files.append(os.path.join(video_keyframes_dir, filename))

View File

@ -4,11 +4,11 @@ import re
import traceback
from typing import Optional
from faster_whisper import WhisperModel
# from faster_whisper import WhisperModel
from timeit import default_timer as timer
from loguru import logger
import google.generativeai as genai
from moviepy.editor import VideoFileClip
from moviepy import VideoFileClip
import os
from app.config import config
@ -33,7 +33,7 @@ def create(audio_file, subtitle_file: str = ""):
"""
global model, device, compute_type
if not model:
model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2"
model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v3"
model_bin_file = f"{model_path}/model.bin"
if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
logger.error(
@ -45,12 +45,25 @@ def create(audio_file, subtitle_file: str = ""):
)
return None
# 尝试使用 CUDA如果失败则回退到 CPU
# 首先使用CPU模式不触发CUDA检查
use_cuda = False
try:
import torch
if torch.cuda.is_available():
# 在函数中延迟导入torch而不是在全局范围内
# 使用安全的方式检查CUDA可用性
def check_cuda_available():
try:
import torch
return torch.cuda.is_available()
except (ImportError, RuntimeError) as e:
logger.warning(f"检查CUDA可用性时出错: {e}")
return False
# 仅当明确需要时才检查CUDA
use_cuda = check_cuda_available()
if use_cuda:
logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
try:
logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
model = WhisperModel(
model_size_or_path=model_path,
device="cuda",
@ -63,18 +76,18 @@ def create(audio_file, subtitle_file: str = ""):
except Exception as e:
logger.warning(f"CUDA 加载失败,错误信息: {str(e)}")
logger.warning("回退到 CPU 模式")
device = "cpu"
compute_type = "int8"
use_cuda = False
else:
logger.info("未检测到 CUDA使用 CPU 模式")
device = "cpu"
compute_type = "int8"
except ImportError:
logger.warning("未安装 torch使用 CPU 模式")
logger.info("使用 CPU 模式")
except Exception as e:
logger.warning(f"CUDA检查过程出错: {e}")
logger.warning("默认使用CPU模式")
use_cuda = False
# 如果CUDA不可用或加载失败使用CPU
if not use_cuda:
device = "cpu"
compute_type = "int8"
if device == "cpu":
logger.info(f"使用 CPU 加载模型: {model_path}")
model = WhisperModel(
model_size_or_path=model_path,
@ -403,7 +416,7 @@ def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "")
logger.info("音频提取完成,开始生成字幕")
# 使用create函数生成字幕
create(audio_file, subtitle_file)
create("/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav", subtitle_file)
# 删除临时音频文件
if os.path.exists(audio_file):
@ -422,8 +435,8 @@ if __name__ == "__main__":
task_id = "123456"
task_dir = utils.task_dir(task_id)
subtitle_file = f"{task_dir}/subtitle_123456.srt"
audio_file = f"{task_dir}/audio.wav"
video_file = "/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_1702.mp4"
audio_file = "/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav"
video_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/qyn2-2-720p.mp4"
extract_audio_and_create_subtitle(video_file, subtitle_file)

View File

@ -0,0 +1,202 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : subtitle_merger
@Author : viccy
@Date : 2025/5/6 下午4:00
'''
import re
import os
from datetime import datetime, timedelta
def parse_time(time_str):
    """Parse an SRT timestamp ("HH:MM:SS,mmm") into a timedelta."""
    # Split off the millisecond part first, then the clock fields.
    clock_part, ms_part = time_str.split(',')
    hours, minutes, seconds = clock_part.split(':')
    return timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(seconds),
        milliseconds=int(ms_part),
    )
def format_time(td):
    """Format a timedelta as an SRT timestamp ("HH:MM:SS,mmm")."""
    whole_seconds = int(td.total_seconds())
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Milliseconds come from the sub-second component of the timedelta.
    millis = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
def parse_edited_time_range(time_range_str):
    """
    Extract (start, end) timedeltas from an "HH:MM:SS-HH:MM:SS" range.

    Args:
        time_range_str: Range string such as "00:01:15-00:04:40"; may be
            empty or None.

    Returns:
        tuple: (start, end) as timedelta objects, or (None, None) when the
        input is empty or malformed (wrong shape or non-numeric fields).
    """
    if not time_range_str:
        return None, None

    parts = time_range_str.split('-')
    if len(parts) != 2:
        return None, None

    start_time_str, end_time_str = parts
    try:
        # Each side must be exactly HH:MM:SS with integer fields.
        start_h, start_m, start_s = map(int, start_time_str.split(':'))
        end_h, end_m, end_s = map(int, end_time_str.split(':'))
    except ValueError:
        # Fix: non-numeric or mis-shaped fields previously raised ValueError
        # and crashed callers that expect (None, None) for bad input.
        return None, None

    start_time = timedelta(hours=start_h, minutes=start_m, seconds=start_s)
    end_time = timedelta(hours=end_h, minutes=end_m, seconds=end_s)

    return start_time, end_time
def merge_subtitle_files(subtitle_items, output_file=None):
    """
    Merge multiple SRT subtitle files into one, shifting each file's cue
    times by the start of its editedTimeRange on the final timeline.

    Args:
        subtitle_items: List of dicts, each with a 'subtitle' file path and
            an 'editedTimeRange' ("HH:MM:SS-HH:MM:SS") giving the clip's
            position on the merged timeline.
        output_file: Output path; when None a name is derived from the
            first item's start and the last item's end time.

    Returns:
        Path of the merged subtitle file.
    """
    # Sort items by the start of their editedTimeRange; unparseable ranges
    # sort first (timedelta() == 0) and are skipped in the loop below.
    sorted_items = sorted(subtitle_items,
                          key=lambda x: parse_edited_time_range(x.get('editedTimeRange', ''))[0] or timedelta())

    merged_subtitles = []
    subtitle_index = 1  # cues are renumbered sequentially across all files

    for item in sorted_items:
        if not item.get('subtitle') or not os.path.exists(item.get('subtitle')):
            continue

        # Offset every cue in this file by the clip's timeline start.
        offset_time, _ = parse_edited_time_range(item.get('editedTimeRange', ''))
        if offset_time is None:
            print(f"警告: 无法从项目 {item.get('_id')} 的editedTimeRange中提取时间范围跳过该项")
            continue

        with open(item['subtitle'], 'r', encoding='utf-8') as file:
            content = file.read()

        # Split the SRT content into blank-line separated cue blocks.
        subtitle_blocks = re.split(r'\n\s*\n', content.strip())

        for block in subtitle_blocks:
            lines = block.strip().split('\n')
            if len(lines) < 3:  # index line + time line + at least one text line
                continue

            # Parse the "start --> end" timing line.
            time_line = lines[1]
            time_parts = time_line.split(' --> ')
            if len(time_parts) != 2:
                continue

            start_time = parse_time(time_parts[0])
            end_time = parse_time(time_parts[1])

            # Apply the timeline offset.
            adjusted_start_time = start_time + offset_time
            adjusted_end_time = end_time + offset_time

            # Rebuild the cue block with a fresh sequential index.
            adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}"
            text_lines = lines[2:]

            new_block = [
                str(subtitle_index),
                adjusted_time_line,
                *text_lines
            ]
            merged_subtitles.append('\n'.join(new_block))
            subtitle_index += 1

    # Derive the output path from first start / last end when not given.
    if output_file is None:
        dir_path = os.path.dirname(sorted_items[0]['subtitle'])
        first_start = parse_edited_time_range(sorted_items[0]['editedTimeRange'])[0]
        last_end = parse_edited_time_range(sorted_items[-1]['editedTimeRange'])[1]
        first_start_h, first_start_m, first_start_s = int(first_start.seconds // 3600), int((first_start.seconds % 3600) // 60), int(first_start.seconds % 60)
        last_end_h, last_end_m, last_end_s = int(last_end.seconds // 3600), int((last_end.seconds % 3600) // 60), int(last_end.seconds % 60)
        first_start_str = f"{first_start_h:02d}_{first_start_m:02d}_{first_start_s:02d}"
        last_end_str = f"{last_end_h:02d}_{last_end_m:02d}_{last_end_s:02d}"
        output_file = os.path.join(dir_path, f"merged_subtitle_{first_start_str}-{last_end_str}.srt")

    # Join all cue blocks with blank lines and write the merged file.
    merged_content = '\n\n'.join(merged_subtitles)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(merged_content)

    return output_file
if __name__ == '__main__':
    # Sample script data: each entry carries narration text plus the clip's
    # source range and its edited position on the final timeline, with
    # developer-local audio/subtitle paths.
    test_data = [
        {'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
         'timestamp': '00:00:00-00:01:15',
         'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
         'OST': 0,
         '_id': 1,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
         'sourceTimeRange': '00:00:00-00:00:26',
         'duration': 26,
         'editedTimeRange': '00:00:00-00:00:26'
         },
        {'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!',
         'timestamp': '00:01:15-00:04:40',
         'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
         'OST': 0,
         '_id': 2,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
         'sourceTimeRange': '00:01:15-00:01:29',
         'duration': 14,
         'editedTimeRange': '00:00:26-00:00:40'
         },
        {'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
         'timestamp': '00:04:58-00:05:45',
         'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
         'OST': 0,
         '_id': 4,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
         'sourceTimeRange': '00:04:58-00:05:20',
         'duration': 22,
         'editedTimeRange': '00:00:57-00:01:19'
         },
        {'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
         'timestamp': '00:05:45-00:06:00',
         'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
         'OST': 0,
         '_id': 5,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
         'sourceTimeRange': '00:05:45-00:05:53',
         'duration': 8,
         'editedTimeRange': '00:01:19-00:01:27'
         }
    ]
    # Merge the per-clip subtitle files onto the edited timeline.
    output_file = merge_subtitle_files(test_data)
    print(f"字幕文件已合并至: {output_file}")

View File

@ -9,167 +9,177 @@ from loguru import logger
from app.config import config
from app.models import const
from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams
from app.services import llm, material, subtitle, video, voice, audio_merger
from app.services import (llm, material, subtitle, video, voice, audio_merger,
subtitle_merger, clip_video, merger_video, update_script, generate_video)
from app.services import state as sm
from app.utils import utils
def generate_script(task_id, params):
logger.info("\n\n## generating video script")
video_script = params.video_script.strip()
if not video_script:
video_script = llm.generate_script(
video_subject=params.video_subject,
language=params.video_language,
paragraph_number=params.paragraph_number,
)
else:
logger.debug(f"video script: \n{video_script}")
# def generate_script(task_id, params):
# logger.info("\n\n## generating video script")
# video_script = params.video_script.strip()
# if not video_script:
# video_script = llm.generate_script(
# video_subject=params.video_subject,
# language=params.video_language,
# paragraph_number=params.paragraph_number,
# )
# else:
# logger.debug(f"video script: \n{video_script}")
if not video_script:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video script.")
return None
# if not video_script:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error("failed to generate video script.")
# return None
return video_script
# return video_script
def generate_terms(task_id, params, video_script):
logger.info("\n\n## generating video terms")
video_terms = params.video_terms
if not video_terms:
video_terms = llm.generate_terms(
video_subject=params.video_subject, video_script=video_script, amount=5
)
else:
if isinstance(video_terms, str):
video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
elif isinstance(video_terms, list):
video_terms = [term.strip() for term in video_terms]
else:
raise ValueError("video_terms must be a string or a list of strings.")
# def generate_terms(task_id, params, video_script):
# logger.info("\n\n## generating video terms")
# video_terms = params.video_terms
# if not video_terms:
# video_terms = llm.generate_terms(
# video_subject=params.video_subject, video_script=video_script, amount=5
# )
# else:
# if isinstance(video_terms, str):
# video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
# elif isinstance(video_terms, list):
# video_terms = [term.strip() for term in video_terms]
# else:
# raise ValueError("video_terms must be a string or a list of strings.")
logger.debug(f"video terms: {utils.to_json(video_terms)}")
# logger.debug(f"video terms: {utils.to_json(video_terms)}")
if not video_terms:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video terms.")
return None
# if not video_terms:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error("failed to generate video terms.")
# return None
return video_terms
# return video_terms
def save_script_data(task_id, video_script, video_terms, params):
script_file = path.join(utils.task_dir(task_id), "script.json")
script_data = {
"script": video_script,
"search_terms": video_terms,
"params": params,
}
# def save_script_data(task_id, video_script, video_terms, params):
# script_file = path.join(utils.task_dir(task_id), "script.json")
# script_data = {
# "script": video_script,
# "search_terms": video_terms,
# "params": params,
# }
with open(script_file, "w", encoding="utf-8") as f:
f.write(utils.to_json(script_data))
# with open(script_file, "w", encoding="utf-8") as f:
# f.write(utils.to_json(script_data))
def generate_audio(task_id, params, video_script):
logger.info("\n\n## generating audio")
audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
sub_maker = voice.tts(
text=video_script,
voice_name=voice.parse_voice_name(params.voice_name),
voice_rate=params.voice_rate,
voice_file=audio_file,
)
if sub_maker is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"""failed to generate audio:
1. check if the language of the voice matches the language of the video script.
2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
""".strip()
)
return None, None, None
# def generate_audio(task_id, params, video_script):
# logger.info("\n\n## generating audio")
# audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
# sub_maker = voice.tts(
# text=video_script,
# voice_name=voice.parse_voice_name(params.voice_name),
# voice_rate=params.voice_rate,
# voice_file=audio_file,
# )
# if sub_maker is None:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# """failed to generate audio:
# 1. check if the language of the voice matches the language of the video script.
# 2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
# """.strip()
# )
# return None, None, None
audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
return audio_file, audio_duration, sub_maker
# audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
# return audio_file, audio_duration, sub_maker
def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
if not params.subtitle_enabled:
return ""
# def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
# if not params.subtitle_enabled:
# return ""
subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
# subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
# subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
# logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
subtitle_fallback = False
if subtitle_provider == "edge":
voice.create_subtitle(
text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
)
if not os.path.exists(subtitle_path):
subtitle_fallback = True
logger.warning("subtitle file not found, fallback to whisper")
# subtitle_fallback = False
# if subtitle_provider == "edge":
# voice.create_subtitle(
# text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
# )
# if not os.path.exists(subtitle_path):
# subtitle_fallback = True
# logger.warning("subtitle file not found, fallback to whisper")
if subtitle_provider == "whisper" or subtitle_fallback:
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
logger.info("\n\n## correcting subtitle")
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
# if subtitle_provider == "whisper" or subtitle_fallback:
# subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
# logger.info("\n\n## correcting subtitle")
# subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"subtitle file is invalid: {subtitle_path}")
return ""
# subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
# if not subtitle_lines:
# logger.warning(f"subtitle file is invalid: {subtitle_path}")
# return ""
return subtitle_path
# return subtitle_path
def get_video_materials(task_id, params, video_terms, audio_duration):
if params.video_source == "local":
logger.info("\n\n## preprocess local materials")
materials = video.preprocess_video(
materials=params.video_materials, clip_duration=params.video_clip_duration
)
if not materials:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"no valid materials found, please check the materials and try again."
)
return None
return [material_info.url for material_info in materials]
else:
logger.info(f"\n\n## downloading videos from {params.video_source}")
downloaded_videos = material.download_videos(
task_id=task_id,
search_terms=video_terms,
source=params.video_source,
video_aspect=params.video_aspect,
video_contact_mode=params.video_concat_mode,
audio_duration=audio_duration * params.video_count,
max_clip_duration=params.video_clip_duration,
)
if not downloaded_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
)
return None
return downloaded_videos
# def get_video_materials(task_id, params, video_terms, audio_duration):
# if params.video_source == "local":
# logger.info("\n\n## preprocess local materials")
# materials = video.preprocess_video(
# materials=params.video_materials, clip_duration=params.video_clip_duration
# )
# if not materials:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# "no valid materials found, please check the materials and try again."
# )
# return None
# return [material_info.url for material_info in materials]
# else:
# logger.info(f"\n\n## downloading videos from {params.video_source}")
# downloaded_videos = material.download_videos(
# task_id=task_id,
# search_terms=video_terms,
# source=params.video_source,
# video_aspect=params.video_aspect,
# video_contact_mode=params.video_concat_mode,
# audio_duration=audio_duration * params.video_count,
# max_clip_duration=params.video_clip_duration,
# )
# if not downloaded_videos:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# "failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
# )
# return None
# return downloaded_videos
def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
"""后台任务(自动剪辑视频进行剪辑)"""
"""
后台任务自动剪辑视频进行剪辑
Args:
task_id: 任务ID
params: 视频参数
subclip_path_videos: 视频片段路径
"""
global merged_audio_path, merged_subtitle_path
logger.info(f"\n\n## 开始任务: {task_id}")
# 初始化 ImageMagick
if not utils.init_imagemagick():
logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0)
# tts 角色名称
voice_name = voice.parse_voice_name(params.voice_name)
# # 初始化 ImageMagick
# if not utils.init_imagemagick():
# logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示")
# # tts 角色名称
# voice_name = voice.parse_voice_name(params.voice_name)
"""
1. 加载剪辑脚本
"""
logger.info("\n\n## 1. 加载视频脚本")
video_script_path = path.join(params.video_clip_json_path)
@ -185,174 +195,144 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.debug(f"解说完整脚本: \n{video_script}")
logger.debug(f"解说 OST 列表: \n{video_ost}")
logger.debug(f"解说时间戳列表: \n{time_list}")
# 获取视频总时长(单位 s)
last_timestamp = list_script[-1]['new_timestamp']
end_time = last_timestamp.split("-")[1]
total_duration = utils.time_to_seconds(end_time)
except Exception as e:
logger.error(f"无法读取视频json脚本请检查配置是否正确。{e}")
raise ValueError("无法读取视频json脚本请检查配置是否正确")
logger.error(f"无法读取视频json脚本请检查脚本格式是否正确")
raise ValueError("无法读取视频json脚本请检查脚本格式是否正确")
else:
logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
raise ValueError("解说脚本不存在!请检查配置是否正确。")
"""
2. 使用 TTS 生成音频素材
"""
logger.info("\n\n## 2. 根据OST设置生成音频列表")
# 只为OST=0或2的片段生成TTS音频
# 只为OST=0 or 2的片段生成音频 OST=0 仅保留解说 OST=2 保留解说和原声
tts_segments = [
segment for segment in list_script
if segment['OST'] in [0, 2]
]
logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}")
# 初始化音频文件路径
audio_files = []
final_audio = ""
tts_results = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
voice_name=params.voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
# """
# 3. (可选) 使用 whisper 生成字幕
# """
# if merged_subtitle_path is None:
# if audio_files:
# merged_subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
# subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
# logger.info(f"\n\n使用 {subtitle_provider} 生成字幕")
#
# subtitle.create(
# audio_file=merged_audio_path,
# subtitle_file=merged_subtitle_path,
# )
# subtitle_lines = subtitle.file_to_subtitles(merged_subtitle_path)
# if not subtitle_lines:
# logger.warning(f"字幕文件无效: {merged_subtitle_path}")
#
# sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
"""
3. 裁剪视频 - 将超出音频长度的视频进行裁剪
"""
logger.info("\n\n## 3. 裁剪视频")
video_clip_result = clip_video.clip_video(params.video_origin_path, tts_results)
# 更新 list_script 中的时间戳
tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
subclip_clip_result = {
tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
}
new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)
"""
4. 合并音频和字幕
"""
logger.info("\n\n## 4. 合并音频和字幕")
total_duration = sum([script["duration"] for script in new_script_list])
if tts_segments:
audio_files, sub_maker_list = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
force_regenerate=True
)
if audio_files is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
return
if audio_files:
logger.info(f"合并音频文件: {audio_files}")
try:
# 传入OST信息以便正确处理音频
final_audio = audio_merger.merge_audio_files(
task_id=task_id,
audio_files=audio_files,
total_duration=total_duration,
list_script=list_script # 传入完整脚本以便处理OST
)
logger.info("音频文件合并成功")
except Exception as e:
logger.error(f"合并音频文件失败: {str(e)}")
final_audio = ""
else:
# 如果没有需要生成TTS的片段创建一个空白音频文件
# 这样可以确保后续的音频处理能正确进行
logger.info("没有需要生成TTS的片段将保留原声和背景音乐")
final_audio = path.join(utils.task_dir(task_id), "empty.mp3")
try:
from moviepy.editor import AudioClip
# 创建一个与视频等长的空白音频
empty_audio = AudioClip(make_frame=lambda t: 0, duration=total_duration)
empty_audio.write_audiofile(final_audio, fps=44100)
logger.info(f"已创建空白音频文件: {final_audio}")
except Exception as e:
logger.error(f"创建空白音频文件失败: {str(e)}")
final_audio = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
subtitle_path = ""
if params.subtitle_enabled:
if audio_files:
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
subtitle.create(
audio_file=final_audio,
subtitle_file=subtitle_path,
# 合并音频文件
merged_audio_path = audio_merger.merge_audio_files(
task_id=task_id,
total_duration=total_duration,
list_script=new_script_list
)
logger.info(f"音频文件合并成功->{merged_audio_path}")
# 合并字幕文件
merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
logger.info(f"字幕文件合并成功->{merged_subtitle_path}")
except Exception as e:
logger.error(f"合并音频文件失败: {str(e)}")
else:
logger.warning("没有需要合并的音频/字幕")
merged_audio_path = ""
merged_subtitle_path = ""
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"字幕文件无效: {subtitle_path}")
subtitle_path = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
logger.info("\n\n## 4. 裁剪视频")
subclip_videos = [x for x in subclip_path_videos.values()]
# logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}")
if not subclip_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"裁剪视频失败,可能是 ImageMagick 不可用")
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
"""
5. 合并视频
"""
final_video_paths = []
combined_video_paths = []
_progress = 50
index = 1
combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4")
logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}")
# 如果 new_script_list 中没有 video则使用 subclip_path_videos 中的视频
video_clips = [new_script['video'] if new_script.get('video') else subclip_path_videos.get(new_script.get('_id', '')) for new_script in new_script_list]
video.combine_clip_videos(
combined_video_path=combined_video_path,
video_paths=subclip_videos,
merger_video.combine_clip_videos(
output_video_path=combined_video_path,
video_paths=video_clips,
video_ost_list=video_ost,
list_script=list_script,
video_aspect=params.video_aspect,
threads=params.n_threads # 多线程
threads=params.n_threads
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80)
_progress += 50 / 2
sm.state.update_task(task_id, progress=_progress)
"""
6. 合并字幕/BGM/配音/视频
"""
output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}")
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
# bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
bgm_path = utils.get_bgm_file()
logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
# 获取背景音乐
bgm_path = None
if params.bgm_type or params.bgm_file:
try:
bgm_path = utils.get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_path:
logger.info(f"使用背景音乐: {bgm_path}")
except Exception as e:
logger.error(f"获取背景音乐失败: {str(e)}")
# 示例:自定义字幕样式
subtitle_style = {
'fontsize': params.font_size, # 字体大小
'color': params.text_fore_color, # 字体颜色
'stroke_color': params.stroke_color, # 描边颜色
'stroke_width': params.stroke_width, # 描边宽度, 范围0-10
'bg_color': params.text_back_color, # 半透明黑色背景
'position': (params.subtitle_position, 0.2), # 距离顶部60%的位置
'method': 'caption' # 渲染方法
# 调用示例
options = {
'voice_volume': params.tts_volume, # 配音音量
'bgm_volume': params.bgm_volume, # 背景音乐音量
'original_audio_volume': params.original_volume, # 视频原声音量0表示不保留
'keep_original_audio': True, # 是否保留原声
'subtitle_font': params.font_name, # 这里使用相对字体路径,会自动在 font_dir() 目录下查找
'subtitle_font_size': params.font_size,
'subtitle_color': params.text_fore_color,
'subtitle_bg_color': None, # 直接使用None表示透明背景
'subtitle_position': params.subtitle_position,
'custom_position': params.custom_position,
'threads': params.n_threads
}
# 示例:自定义音量配置
volume_config = {
'original': params.original_volume, # 原声音量80%
'bgm': params.bgm_volume, # BGM音量20%
'narration': params.tts_volume or params.voice_volume, # 解说音量100%
}
font_path = utils.font_dir(params.font_name)
video.generate_video_v3(
generate_video.merge_materials(
video_path=combined_video_path,
subtitle_path=subtitle_path,
audio_path=merged_audio_path,
subtitle_path=merged_subtitle_path,
bgm_path=bgm_path,
narration_path=final_audio,
output_path=final_video_path,
volume_config=volume_config, # 添加音量配置
subtitle_style=subtitle_style,
font_path=font_path
output_path=output_video_path,
options=options
)
_progress += 50 / 2
sm.state.update_task(task_id, progress=_progress)
final_video_paths.append(final_video_path)
final_video_paths.append(output_video_path)
combined_video_paths.append(combined_video_path)
logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.")
@ -400,35 +380,19 @@ def validate_params(video_path, audio_path, output_file, params):
if __name__ == "__main__":
# task_id = "test123"
# subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4',
# '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4',
# '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4',
# '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4',
# '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4',
# '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4',
# '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4',
# '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'}
#
# params = VideoClipParams(
# video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json",
# video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4",
# )
# start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
task_id = "demo"
task_id = "test456"
subclip_path_videos = {'01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4',
'01:58-02:04': './storage/cache_videos/vid-01_58-02_04.mp4',
'02:25-02:31': './storage/cache_videos/vid-02_25-02_31.mp4',
'01:28-01:33': './storage/cache_videos/vid-01_28-01_33.mp4',
'03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4',
'00:24-00:28': './storage/cache_videos/vid-00_24-00_28.mp4',
'03:02-03:08': './storage/cache_videos/vid-03_02-03_08.mp4',
'00:41-00:44': './storage/cache_videos/vid-00_41-00_44.mp4',
'02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4'}
# 提前裁剪是为了方便检查视频
subclip_path_videos = {
1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-05-390@00-00-57-980.mp4',
2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-28-900@00-00-43-700.mp4',
3: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-01-17-840@00-01-27-600.mp4',
4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-02-35-460@00-02-52-380.mp4',
5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-06-59-520@00-07-29-500.mp4',
}
params = VideoClipParams(
video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json",
video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4",
video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/2025-0507-223311.json",
video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_4938.mp4",
)
start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
start_subclip(task_id, params, subclip_path_videos)

View File

@ -0,0 +1,266 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : update_script
@Author : 小林同学
@Date : 2025/5/6 下午11:00
'''
import re
import os
from typing import Dict, List, Any, Tuple, Union
def extract_timestamp_from_video_path(video_path: str) -> str:
    """
    Extract a timestamp range from a clipped video's file name.

    Two file-name layouts are recognised:
      * new: ``vid_HH-MM-SS-mmm@HH-MM-SS-mmm.mp4`` -> ``HH:MM:SS,mmm-HH:MM:SS,mmm``
      * old: ``vid-HH-MM-SS-HH-MM-SS.mp4``         -> ``HH:MM:SS-HH:MM:SS``

    Args:
        video_path: path of the clipped video file

    Returns:
        The timestamp range string, or "" when the name matches neither layout.
    """
    name = os.path.basename(video_path)

    # New layout carries millisecond precision: vid_00-00-00-000@00-00-20-250.mp4
    new_style = re.search(
        r'vid_(\d{2})-(\d{2})-(\d{2})-(\d{3})@(\d{2})-(\d{2})-(\d{2})-(\d{3})\.mp4',
        name,
    )
    if new_style:
        h1, m1, s1, ms1, h2, m2, s2, ms2 = new_style.groups()
        return f"{h1}:{m1}:{s1},{ms1}-{h2}:{m2}:{s2},{ms2}"

    # Old layout has whole-second precision: vid-00-00-00-00-00-00.mp4
    old_style = re.search(r'vid-(\d{2}-\d{2}-\d{2})-(\d{2}-\d{2}-\d{2})\.mp4', name)
    if old_style:
        begin, finish = (part.replace('-', ':') for part in old_style.groups())
        return f"{begin}-{finish}"

    return ""
def calculate_duration(timestamp: str) -> float:
    """
    Compute the length in seconds of a timestamp range.

    The original implementation duplicated the "clock + optional milliseconds"
    parsing for the start and end points; that logic is factored into a single
    nested helper here.

    Args:
        timestamp: range formatted as 'HH:MM:SS-HH:MM:SS' or
            'HH:MM:SS,mmm-HH:MM:SS,mmm' (millisecond variant).

    Returns:
        Duration in seconds rounded to two decimals; 0.0 when the string
        cannot be parsed (malformed input is deliberately treated as an
        empty clip rather than raising).
    """

    def _to_seconds(point: str) -> float:
        # Parse one 'HH:MM:SS' time point with an optional ',mmm' suffix.
        if ',' in point:
            clock, _, ms_text = point.partition(',')
            # float('0.' + '390') == 0.39 — matches the 3-digit ms convention
            # used by extract_timestamp_from_video_path.
            fraction = float('0.' + ms_text) if ms_text else 0
        else:
            clock = point
            fraction = 0
        h, m, s = map(int, clock.split(':'))
        return h * 3600 + m * 60 + s + fraction

    try:
        start_text, end_text = timestamp.split('-')
        return round(_to_seconds(end_text) - _to_seconds(start_text), 2)
    except (ValueError, AttributeError):
        # Unpack failure, non-numeric fields, or a non-string argument.
        return 0.0
def _format_hms(seconds: float) -> str:
    """Render a non-negative second count as zero-padded 'HH:MM:SS' (fraction truncated)."""
    whole = int(seconds)
    return f"{whole // 3600:02d}:{(whole % 3600) // 60:02d}:{whole % 60:02d}"


def _lookup_result(result, item_id, timestamp) -> str:
    """Fetch a path from a result dict, preferring the _id key over the original timestamp key.

    Returns "" when the dict is missing/empty or holds neither key.
    """
    if result:
        if item_id and item_id in result:
            return result[item_id]
        if timestamp in result:
            return result[timestamp]
    return ""


def update_script_timestamps(
    script_list: List[Dict[str, Any]],
    video_result: Dict[Union[str, int], str],
    audio_result: Dict[Union[str, int], str] = None,
    subtitle_result: Dict[Union[str, int], str] = None,
    calculate_edited_timerange: bool = True
) -> List[Dict[str, Any]]:
    """
    Update the timestamps in script_list from the clipped videos in
    video_result, add per-segment durations, and attach audio/subtitle/video
    paths from the optional result dicts.

    Args:
        script_list: original script items (each a dict; not mutated).
        video_result: maps original timestamp or _id -> clipped video path.
        audio_result: maps original timestamp or _id -> audio file path.
        subtitle_result: maps original timestamp or _id -> subtitle file path.
        calculate_edited_timerange: when True, also compute each segment's
            time range within the assembled output video ('editedTimeRange').

    Returns:
        A new list of updated script items; items gain 'audio', 'subtitle',
        'video', and (when resolvable) 'sourceTimeRange', 'duration', and
        'editedTimeRange' keys.
    """
    # Map each video_result key to the timestamp parsed from its file name.
    new_timestamps = {}
    for key, video_path in video_result.items():
        parsed = extract_timestamp_from_video_path(video_path)
        if parsed:
            new_timestamps[key] = parsed

    updated_script = []
    # Running total of segment durations, used to place each segment on the
    # output video's timeline.
    accumulated_duration = 0.0

    for item in script_list:
        item_copy = item.copy()  # never mutate the caller's dicts
        item_id = item_copy.get('_id')
        orig_timestamp = item_copy.get('timestamp', '')

        # Attach per-segment asset paths ("" when not available).
        item_copy['audio'] = _lookup_result(audio_result, item_id, orig_timestamp)
        item_copy['subtitle'] = _lookup_result(subtitle_result, item_id, orig_timestamp)
        item_copy['video'] = _lookup_result(video_result, item_id, orig_timestamp)

        # Resolve the source time range: prefer the clipped video's parsed
        # timestamp (by _id, then by original timestamp), else fall back to
        # the script's own timestamp.
        current_duration = 0.0
        if item_id and item_id in new_timestamps:
            item_copy['sourceTimeRange'] = new_timestamps[item_id]
            current_duration = calculate_duration(item_copy['sourceTimeRange'])
            item_copy['duration'] = current_duration
        elif orig_timestamp in new_timestamps:
            item_copy['sourceTimeRange'] = new_timestamps[orig_timestamp]
            current_duration = calculate_duration(item_copy['sourceTimeRange'])
            item_copy['duration'] = current_duration
        elif orig_timestamp:
            item_copy['sourceTimeRange'] = orig_timestamp
            current_duration = calculate_duration(orig_timestamp)
            item_copy['duration'] = current_duration

        # Position the segment on the output timeline.
        if calculate_edited_timerange and current_duration > 0:
            start_seconds = accumulated_duration
            end_seconds = accumulated_duration + current_duration
            item_copy['editedTimeRange'] = f"{_format_hms(start_seconds)}-{_format_hms(end_seconds)}"
            accumulated_duration = end_seconds

        updated_script.append(item_copy)

    return updated_script
if __name__ == '__main__':
    # Manual smoke test for update_script_timestamps.
    # NOTE(review): all paths below are developer-machine absolute paths and
    # will not exist elsewhere — the demo only exercises string/dict handling,
    # so the files are never opened.
    # Demo script: OST == 0 segments are narration-only (re-clipped + TTS),
    # OST == 1 segments keep the original footage audio — presumably; confirm
    # against the OST handling in the task pipeline.
    list_script = [
        {
            'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
            'timestamp': '00:00:00,001-00:01:15,001',
            'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
            'OST': 0,
            '_id': 1
        },
        {
            'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!',
            'timestamp': '00:01:15,001-00:04:40,001',
            'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
            'OST': 0,
            '_id': 2
        },
        {
            'picture': '画面切到王启年小心翼翼地向范闲汇报。',
            'timestamp': '00:04:41,001-00:04:58,001',
            'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
            'OST': 1,
            '_id': 3
        },
        {
            'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
            'timestamp': '00:04:58,001-00:05:45,001',
            'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
            'OST': 0,
            '_id': 4
        },
        {
            'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
            'timestamp': '00:05:45,001-00:06:00,001',
            'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
            'OST': 0,
            '_id': 5
        },
        {
            'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。',
            'timestamp': '00:06:00,001-00:06:03,001',
            'narration': '抓刺客',
            'OST': 1,
            '_id': 6
        }]
    # Clipped-video paths keyed by _id; note ids 3 and 6 (OST == 1) have no
    # clip, so those items keep their original timestamps.
    video_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-00-000@00-00-20-250.mp4',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-30-000@00-00-48-950.mp4',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-00-000@00-01-15-688.mp4',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-30-000@00-01-49-512.mp4'}
    # TTS audio paths keyed by _id.
    audio_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'}
    # Subtitle paths keyed by _id.
    sub_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt'}
    # Update and print the result.
    updated_list_script = update_script_timestamps(list_script, video_res, audio_res, sub_res)
    for item in updated_list_script:
        print(
            f"ID: {item['_id']} | Picture: {item['picture'][:20]}... | Timestamp: {item['timestamp']} | " +
            f"SourceTimeRange: {item['sourceTimeRange']} | EditedTimeRange: {item.get('editedTimeRange', '')} | " +
            f"Duration: {item['duration']} 秒 | Audio: {item['audio']} | Video: {item['video']} | Subtitle: {item['subtitle']}")

View File

@ -1,13 +1,13 @@
import traceback
import pysrt
# import pysrt
from typing import Optional
from typing import List
from loguru import logger
from moviepy.editor import *
from moviepy import *
from PIL import ImageFont
from contextlib import contextmanager
from moviepy.editor import (
from moviepy import (
VideoFileClip,
AudioFileClip,
TextClip,
@ -105,86 +105,6 @@ def manage_clip(clip):
del clip
def combine_clip_videos(combined_video_path: str,
                        video_paths: List[str],
                        video_ost_list: List[int],
                        list_script: list,
                        video_aspect: VideoAspect = VideoAspect.portrait,
                        threads: int = 2,
                        ) -> str:
    """
    Merge sub-clips into a single video.

    Args:
        combined_video_path: output path for the merged video
        video_paths: paths of the sub-clips to merge
        video_ost_list: original-sound flags per clip
            (0: drop original audio, 1: keep original audio only,
             2: keep original audio plus narration)
        list_script: editing script, used only to log the total audio duration
        video_aspect: target aspect ratio
        threads: number of encoder threads

    Returns:
        str: path of the merged video (same as combined_video_path)

    Raises:
        ValueError: when none of the input clips could be loaded
    """
    from app.utils.utils import calculate_total_duration
    audio_duration = calculate_total_duration(list_script)
    logger.info(f"音频的最大持续时间: {audio_duration} s")
    output_dir = os.path.dirname(combined_video_path)

    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()

    clips = []
    for video_path, video_ost in zip(video_paths, video_ost_list):
        try:
            clip = VideoFileClip(video_path)
            if video_ost == 0:  # drop the original audio track
                clip = clip.without_audio()
            # video_ost 1 or 2 keeps the original audio; nothing special to do
            clip = clip.set_fps(30)

            # Pad/resize any clip that does not match the target resolution
            clip_w, clip_h = clip.size
            if clip_w != video_width or clip_h != video_height:
                clip = resize_video_with_padding(
                    clip,
                    target_width=video_width,
                    target_height=video_height
                )
                logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")

            clips.append(clip)
        except Exception as e:
            logger.error(f"处理视频 {video_path} 时出错: {str(e)}")
            continue

    if not clips:
        raise ValueError("没有有效的视频片段可以合并")

    video_clip = None
    try:
        video_clip = concatenate_videoclips(clips)
        video_clip = video_clip.set_fps(30)
        logger.info("开始合并视频... (过程中出现 UserWarning: 不必理会)")
        video_clip.write_videofile(
            filename=combined_video_path,
            threads=threads,
            audio_codec="aac",
            fps=30,
            temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
        )
    finally:
        # BUGFIX: the original closed video_clip unconditionally here; if
        # concatenate_videoclips() raised, the name was unbound and the real
        # exception was masked by a NameError. Guard with a sentinel instead.
        if video_clip is not None:
            video_clip.close()
        for clip in clips:
            clip.close()

    logger.success("视频合并完成")
    return combined_video_path
def resize_video_with_padding(clip, target_width: int, target_height: int):
"""
调整视频尺寸并添加黑边
@ -443,4 +363,3 @@ def generate_video_v3(
bgm.close()
if narration_path:
narration.close()

View File

@ -4,8 +4,6 @@ from loguru import logger
from typing import Dict, List, Optional, Tuple
from app.services import material
from app.models.schema import VideoClipParams
from app.utils import utils
class VideoService:

View File

@ -5,10 +5,11 @@ import traceback
import edge_tts
import asyncio
from loguru import logger
from typing import List
from typing import List, Union
from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts import submaker, SubMaker
from edge_tts.submaker import mktimestamp
from moviepy.video.tools import subtitles
import time
@ -1036,7 +1037,7 @@ def is_azure_v2_voice(voice_name: str):
def tts(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
) -> Union[SubMaker, None]:
if is_azure_v2_voice(voice_name):
return azure_tts_v2(text, voice_name, voice_file)
return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
@ -1064,7 +1065,7 @@ def convert_pitch_to_percent(rate: float) -> str:
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
) -> Union[SubMaker, None]:
voice_name = parse_voice_name(voice_name)
text = text.strip()
rate_str = convert_rate_to_percent(voice_rate)
@ -1087,11 +1088,6 @@ def azure_tts_v1(
)
return sub_maker, audio_data
# 判断音频文件是否已存在
if os.path.exists(voice_file):
logger.info(f"voice file exists, skip tts: {voice_file}")
continue
# 获取音频数据和字幕信息
sub_maker, audio_data = asyncio.run(_do())
@ -1105,8 +1101,6 @@ def azure_tts_v1(
# 数据有效,写入文件
with open(voice_file, "wb") as file:
file.write(audio_data)
logger.info(f"completed, output file: {voice_file}")
return sub_maker
except Exception as e:
logger.error(f"生成音频文件时出错: {str(e)}")
@ -1115,7 +1109,7 @@ def azure_tts_v1(
return None
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
voice_name = is_azure_v2_voice(voice_name)
if not voice_name:
logger.error(f"invalid voice name: {voice_name}")
@ -1203,11 +1197,14 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
def _format_text(text: str) -> str:
# text = text.replace("\n", " ")
text = text.replace("\n", " ")
text = text.replace("\"", " ")
text = text.replace("[", " ")
text = text.replace("]", " ")
text = text.replace("(", " ")
text = text.replace(")", " ")
text = text.replace("", " ")
text = text.replace("", " ")
text = text.replace("{", " ")
text = text.replace("}", " ")
text = text.strip()
@ -1240,7 +1237,7 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis
if script_item['OST']:
continue
start_time, end_time = script_item['new_timestamp'].split('-')
start_time, end_time = script_item['timestamp'].split('-')
if sub_maker_index >= len(sub_maker_list):
logger.error(f"Sub maker list index out of range: {sub_maker_index}")
break
@ -1317,6 +1314,99 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis
traceback.print_exc()
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
    """
    Build an optimised SRT subtitle file from TTS word timings.

    1. Split the narration text into lines at punctuation marks.
    2. Accumulate SubMaker word fragments and match them line by line
       against the split script.
    3. Write a new subtitle file with one entry per matched line.

    Returns (subtitle_file, duration) on success; implicitly returns None when
    the matched entries do not cover every script line or on error.
    NOTE(review): callers that unpack the result (e.g. ``_, duration = ...``)
    will raise TypeError on the None path — confirm against call sites.
    """
    text = _format_text(text)

    def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
        """
        Render one SRT entry, e.g.:
        1
        00:00:00,000 --> 00:00:02,360
        跑步是一项简单易行的运动
        """
        # mktimestamp emits "HH:MM:SS.mmm"; SRT requires a comma separator
        start_t = mktimestamp(start_time).replace(".", ",")
        end_t = mktimestamp(end_time).replace(".", ",")
        return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n"

    start_time = -1.0  # -1 marks "no entry in progress"
    sub_items = []
    sub_index = 0

    script_lines = utils.split_string_by_punctuations(text)

    def match_line(_sub_line: str, _sub_index: int):
        # Compare the accumulated TTS fragment against the current script
        # line, progressively relaxing the comparison: exact match, then
        # punctuation stripped, then all non-word characters stripped.
        if len(script_lines) <= _sub_index:
            return ""
        _line = script_lines[_sub_index]
        if _sub_line == _line:
            return script_lines[_sub_index].strip()
        _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line)
        _line_ = re.sub(r"[^\w\s]", "", _line)
        if _sub_line_ == _line_:
            return _line_.strip()
        _sub_line_ = re.sub(r"\W+", "", _sub_line)
        _line_ = re.sub(r"\W+", "", _line)
        if _sub_line_ == _line_:
            return _line.strip()
        return ""

    sub_line = ""

    try:
        # sub_maker.offset holds (start, end) tick pairs aligned with the
        # word fragments in sub_maker.subs
        for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)):
            _start_time, end_time = offset
            if start_time < 0:
                start_time = _start_time

            sub = unescape(sub)
            sub_line += sub
            sub_text = match_line(sub_line, sub_index)
            if sub_text:
                # A full script line has been matched: emit one SRT entry
                # and reset the accumulators for the next line.
                sub_index += 1
                line = formatter(
                    idx=sub_index,
                    start_time=start_time,
                    end_time=end_time,
                    sub_text=sub_text,
                )
                sub_items.append(line)
                start_time = -1.0
                sub_line = ""

        if len(sub_items) == len(script_lines):
            with open(subtitle_file, "w", encoding="utf-8") as file:
                file.write("\n".join(sub_items) + "\n")
            try:
                # Re-parse the file to validate it and compute its duration
                sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8")
                duration = max([tb for ((ta, tb), txt) in sbs])
                logger.info(
                    f"已创建字幕文件: {subtitle_file}, duration: {duration}"
                )
                return subtitle_file, duration
            except Exception as e:
                # Invalid output: log and remove the broken subtitle file
                logger.error(f"failed, error: {str(e)}")
                os.remove(subtitle_file)
        else:
            # Matched entries do not cover every script line: dump both
            # lists for debugging and write nothing.
            logger.error(
                f"字幕创建失败, 字幕长度: {len(sub_items)}, script_lines len: {len(script_lines)}"
                f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}"
                f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}"
            )
    except Exception as e:
        logger.error(f"failed, error: {str(e)}")
def get_audio_duration(sub_maker: submaker.SubMaker):
    """
    Return the audio duration in seconds.

    SubMaker offsets are in 100-nanosecond ticks, so the end of the last
    fragment divided by 10_000_000 gives seconds.

    Args:
        sub_maker: SubMaker produced by edge-tts; only its ``offset`` list is read.

    Returns:
        float: duration in seconds, or 0.0 when no fragments were produced.
    """
    # BUGFIX: guard the empty case — indexing offset[-1] raised IndexError
    # when TTS produced no fragments at all.
    if not sub_maker.offset:
        return 0.0
    return sub_maker.offset[-1][1] / 10000000
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float, force_regenerate: bool = True):
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float):
"""
根据JSON文件中的多段文本进行TTS转换
@ -1334,25 +1424,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
:param list_script: 脚本列表
:param voice_name: 语音名称
:param voice_rate: 语音速率
:param force_regenerate: 是否强制重新生成已存在的音频文件
:return: 生成的音频文件列表
"""
voice_name = parse_voice_name(voice_name)
output_dir = utils.task_dir(task_id)
audio_files = []
sub_maker_list = []
tts_results = []
for item in list_script:
if item['OST'] != 1:
# 将时间戳中的冒号替换为下划线
timestamp = item['new_timestamp'].replace(':', '_')
timestamp = item['timestamp'].replace(':', '_')
audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")
# 检查文件是否已存在,如存在且不强制重新生成,则跳过
if os.path.exists(audio_file) and not force_regenerate:
logger.info(f"音频文件已存在,跳过生成: {audio_file}")
audio_files.append(audio_file)
continue
subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt")
text = item['narration']
@ -1369,9 +1452,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
f"如果您在中国请使用VPN; "
f"或者使用其他 tts 引擎")
continue
else:
# 为当前片段生成字幕文件
_, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
audio_files.append(audio_file)
sub_maker_list.append(sub_maker)
tts_results.append({
"_id": item['_id'],
"timestamp": item['timestamp'],
"audio_file": audio_file,
"subtitle_file": subtitle_file,
"duration": duration,
"text": text,
})
logger.info(f"已生成音频文件: {audio_file}")
return audio_files, sub_maker_list
return tts_results

View File

@ -61,7 +61,6 @@ class VisionAnalyzer:
try:
# 加载图片
if isinstance(images[0], str):
logger.info("正在加载图片...")
images = self.load_images(images)
# 验证图片列表
@ -81,11 +80,14 @@ class VisionAnalyzer:
images = valid_images
results = []
total_batches = (len(images) + batch_size - 1) // batch_size
# 视频帧总数除以批量处理大小,如果有小数则+1
batches_needed = len(images) // batch_size
if len(images) % batch_size > 0:
batches_needed += 1
logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed}")
logger.debug(f"{total_batches} 个批次,每批次 {batch_size} 张图片")
with tqdm(total=total_batches, desc="分析进度") as pbar:
with tqdm(total=batches_needed, desc="分析进度") as pbar:
for i in range(0, len(images), batch_size):
batch = images[i:i + batch_size]
retry_count = 0
@ -93,8 +95,8 @@ class VisionAnalyzer:
while retry_count < 3:
try:
# 在每个批次处理前添加小延迟
if i > 0:
await asyncio.sleep(2)
# if i > 0:
# await asyncio.sleep(2)
# 确保每个批次的图片都是有效的
valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]

View File

@ -30,7 +30,7 @@ class QwenAnalyzer:
self.model_name = model_name
self.api_key = api_key
self.base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
self.base_url = base_url
# 配置API客户端
self._configure_client()
@ -80,7 +80,7 @@ class QwenAnalyzer:
# 添加文本提示
content.append({
"type": "text",
"text": prompt
"text": prompt % (len(content), len(content), len(content))
})
# 调用API
@ -102,7 +102,7 @@ class QwenAnalyzer:
async def analyze_images(self,
images: Union[List[str], List[PIL.Image.Image]],
prompt: str,
batch_size: int = 5) -> List[Dict]:
batch_size: int) -> List[Dict]:
"""
批量分析多张图片
Args:
@ -118,7 +118,6 @@ class QwenAnalyzer:
# 加载图片
if isinstance(images[0], str):
logger.info("正在加载图片...")
images = self.load_images(images)
# 验证图片列表
@ -141,9 +140,14 @@ class QwenAnalyzer:
images = valid_images
results = []
total_batches = (len(images) + batch_size - 1) // batch_size
# 视频帧总数除以批量处理大小,如果有小数则+1
batches_needed = len(images) // batch_size
if len(images) % batch_size > 0:
batches_needed += 1
logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed}")
with tqdm(total=total_batches, desc="分析进度") as pbar:
with tqdm(total=batches_needed, desc="分析进度") as pbar:
for i in range(0, len(images), batch_size):
batch = images[i:i + batch_size]
batch_paths = valid_paths[i:i + batch_size] if valid_paths else None
@ -151,9 +155,9 @@ class QwenAnalyzer:
while retry_count < 3:
try:
# 在每个批次处理前添加小延迟
if i > 0:
await asyncio.sleep(2)
# 在每个批次处理前加小延迟
# if i > 0:
# await asyncio.sleep(0.5)
# 确保每个批次的图片都是有效的
valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]
@ -209,7 +213,7 @@ class QwenAnalyzer:
for i, result in enumerate(results):
response_text = result['response']
# 如果有图片路径信息,使用它来生成文件名
# 如果有图片路径信息,用它来生成文件名
if result.get('image_paths'):
image_paths = result['image_paths']
img_name_start = Path(image_paths[0]).stem.split('_')[-1]

View File

@ -2,7 +2,7 @@ import os
import json
import traceback
from loguru import logger
import tiktoken
# import tiktoken
from typing import List, Dict
from datetime import datetime
from openai import OpenAI
@ -94,12 +94,12 @@ class OpenAIGenerator(BaseGenerator):
"user": "script_generator"
}
# 初始化token计数器
try:
self.encoding = tiktoken.encoding_for_model(self.model_name)
except KeyError:
logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器")
self.encoding = tiktoken.get_encoding("cl100k_base")
# # 初始化token计数器
# try:
# self.encoding = tiktoken.encoding_for_model(self.model_name)
# except KeyError:
# logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器")
# self.encoding = tiktoken.get_encoding("cl100k_base")
def _generate(self, messages: list, params: dict) -> any:
"""实现OpenAI特定的生成逻辑"""

View File

@ -197,6 +197,28 @@ def time_convert_seconds_to_hmsm(seconds) -> str:
return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, milliseconds)
def format_time(seconds: float) -> str:
    """
    Convert a number of seconds into a formatted time string (HH:MM:SS,mmm).

    Args:
        seconds: seconds to convert (int or float)

    Returns:
        str: formatted time string in the form HH:MM:SS,mmm
    """
    # Robustness fix: negative input previously produced nonsense such as
    # "-1:59:59,000"; clamp to zero so the output is always a valid timestamp.
    if seconds < 0:
        seconds = 0
    # Split into hours / minutes / remaining seconds
    hours, remainder = divmod(seconds, 3600)
    minutes, remainder = divmod(remainder, 60)
    secs = int(remainder)
    milliseconds = int((remainder - secs) * 1000)  # truncate, matching SRT style
    return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hours), int(minutes), secs, milliseconds)
def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str:
start_time = time_convert_seconds_to_hmsm(start_time)
end_time = time_convert_seconds_to_hmsm(end_time)
@ -506,7 +528,7 @@ def cut_video(params, progress_callback=None):
st.session_state['subclip_videos'] = subclip_videos
for i, video_script in enumerate(video_script_list):
try:
video_script['path'] = subclip_videos[video_script['timestamp']]
video_script['path'] = subclip_videos[i+1]
except KeyError as err:
logger.error(f"裁剪视频失败: {err}")

View File

@ -1,237 +1,339 @@
import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans
"""
视频帧提取工具
这个模块提供了简单高效的视频帧提取功能主要特点
1. 使用ffmpeg进行视频处理支持硬件加速
2. 按指定时间间隔提取视频关键帧
3. 支持多种视频格式
4. 支持高清视频帧输出
5. 直接从原视频提取高质量关键帧
不依赖OpenCV和sklearn等库只使用ffmpeg作为外部依赖降低了安装和使用的复杂度
"""
import os
import re
from typing import List, Tuple, Generator
import time
import subprocess
from typing import List, Dict
from loguru import logger
import gc
from tqdm import tqdm
class VideoProcessor:
def __init__(self, video_path: str, batch_size: int = 100):
def __init__(self, video_path: str):
"""
初始化视频处理器
Args:
video_path: 视频文件路径
batch_size: 批处理大小控制内存使用
"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
self.video_path = video_path
self.batch_size = batch_size
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
raise RuntimeError(f"无法打开视频文件: {video_path}")
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
self.video_info = self._get_video_info()
self.fps = float(self.video_info.get('fps', 25))
self.duration = float(self.video_info.get('duration', 0))
self.width = int(self.video_info.get('width', 0))
self.height = int(self.video_info.get('height', 0))
self.total_frames = int(self.fps * self.duration)
def __del__(self):
"""析构函数,确保视频资源被释放"""
if hasattr(self, 'cap'):
self.cap.release()
gc.collect()
def _get_video_info(self) -> Dict[str, str]:
"""
使用ffprobe获取视频信息
def preprocess_video(self) -> Generator[Tuple[int, np.ndarray], None, None]:
"""
使用生成器方式分批读取视频帧
Yields:
Tuple[int, np.ndarray]: (帧索引, 视频帧)
"""
self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
frame_idx = 0
while self.cap.isOpened():
ret, frame = self.cap.read()
if not ret:
break
# 降低分辨率以减少内存使用
frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
yield frame_idx, frame
frame_idx += 1
# 定期进行垃圾回收
if frame_idx % 1000 == 0:
gc.collect()
def detect_shot_boundaries(self, threshold: int = 70) -> List[int]:
"""
使用批处理方式检测镜头边界
Args:
threshold: 差异阈值
Returns:
List[int]: 镜头边界帧的索引列表
Dict[str, str]: 包含视频基本信息的字典
"""
shot_boundaries = []
prev_frame = None
prev_idx = -1
pbar = tqdm(self.preprocess_video(),
total=self.total_frames,
desc="检测镜头边界",
unit="")
for frame_idx, curr_frame in pbar:
if prev_frame is not None:
prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
diff = np.mean(np.abs(curr_gray.astype(float) - prev_gray.astype(float)))
if diff > threshold:
shot_boundaries.append(frame_idx)
pbar.set_postfix({"检测到边界": len(shot_boundaries)})
prev_frame = curr_frame.copy()
prev_idx = frame_idx
del curr_frame
if frame_idx % 100 == 0:
gc.collect()
return shot_boundaries
cmd = [
"ffprobe",
"-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=width,height,r_frame_rate,duration",
"-of", "default=noprint_wrappers=1:nokey=0",
self.video_path
]
def process_shot(self, shot_frames: List[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, int]:
"""
处理单个镜头的帧
Args:
shot_frames: 镜头中的帧列表
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
lines = result.stdout.strip().split('\n')
info = {}
for line in lines:
if '=' in line:
key, value = line.split('=', 1)
info[key] = value
# 处理帧率(可能是分数形式)
if 'r_frame_rate' in info:
try:
num, den = map(int, info['r_frame_rate'].split('/'))
info['fps'] = str(num / den)
except ValueError:
info['fps'] = info.get('r_frame_rate', '25')
return info
except subprocess.CalledProcessError as e:
logger.error(f"获取视频信息失败: {e.stderr}")
return {
'width': '1280',
'height': '720',
'fps': '25',
'duration': '0'
}
def extract_frames_by_interval(self, output_dir: str, interval_seconds: float = 5.0,
use_hw_accel: bool = True) -> List[int]:
"""
按指定时间间隔提取视频帧
Args:
output_dir: 输出目录
interval_seconds: 帧提取间隔
use_hw_accel: 是否使用硬件加速
Returns:
Tuple[np.ndarray, int]: (关键帧, 帧索引)
List[int]: 提取的帧号列表
"""
if not shot_frames:
return None, -1
frame_features = []
frame_indices = []
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for idx, frame in tqdm(shot_frames,
desc="处理镜头帧",
unit="",
leave=False):
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
resized_gray = cv2.resize(gray, (32, 32))
frame_features.append(resized_gray.flatten())
frame_indices.append(idx)
frame_features = np.array(frame_features)
# 计算起始时间和帧提取点
start_time = 0
end_time = self.duration
extraction_times = []
kmeans = MiniBatchKMeans(n_clusters=1, batch_size=min(len(frame_features), 100),
random_state=0).fit(frame_features)
current_time = start_time
while current_time < end_time:
extraction_times.append(current_time)
current_time += interval_seconds
center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1))
return shot_frames[center_idx][1], frame_indices[center_idx]
if not extraction_times:
logger.warning("未找到需要提取的帧")
return []
def extract_keyframes(self, shot_boundaries: List[int]) -> Generator[Tuple[np.ndarray, int], None, None]:
"""
使用生成器方式提取关键帧
# 确定硬件加速器选项
hw_accel = []
if use_hw_accel:
# 尝试检测可用的硬件加速器
hw_accel_options = self._detect_hw_accelerator()
if hw_accel_options:
hw_accel = hw_accel_options
logger.info(f"使用硬件加速: {' '.join(hw_accel)}")
else:
logger.warning("未检测到可用的硬件加速器,使用软件解码")
Args:
shot_boundaries: 镜头边界列表
# 提取帧
frame_numbers = []
for i, timestamp in enumerate(tqdm(extraction_times, desc="提取视频帧")):
frame_number = int(timestamp * self.fps)
frame_numbers.append(frame_number)
Yields:
Tuple[np.ndarray, int]: (关键帧, 帧索引)
"""
shot_frames = []
current_shot_start = 0
# 格式化时间戳字符串 (HHMMSSmmm)
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
milliseconds = int((timestamp % 1) * 1000)
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
output_path = os.path.join(output_dir, f"keyframe_{frame_number:06d}_{time_str}.jpg")
# 使用ffmpeg提取单帧
cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
]
# 添加硬件加速参数
cmd.extend(hw_accel)
cmd.extend([
"-ss", str(timestamp),
"-i", self.video_path,
"-vframes", "1",
"-q:v", "1", # 最高质量
"-y",
output_path
])
try:
subprocess.run(cmd, check=True, capture_output=True)
except subprocess.CalledProcessError as e:
logger.warning(f"提取帧 {frame_number} 失败: {e.stderr}")
for frame_idx, frame in self.preprocess_video():
if frame_idx in shot_boundaries:
if shot_frames:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
# 清理内存
shot_frames.clear()
gc.collect()
logger.info(f"成功提取了 {len(frame_numbers)} 个视频帧")
return frame_numbers
def _detect_hw_accelerator(self) -> List[str]:
"""
检测系统可用的硬件加速器
Returns:
List[str]: 硬件加速器ffmpeg命令参数
"""
# 检测操作系统
import platform
system = platform.system().lower()
# 测试不同的硬件加速器
accelerators = []
if system == 'darwin': # macOS
# 测试 videotoolbox (Apple 硬件加速)
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "videotoolbox",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "videotoolbox"]
except subprocess.CalledProcessError:
pass
current_shot_start = frame_idx
elif system == 'linux':
# 测试 VAAPI
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "vaapi",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "vaapi"]
except subprocess.CalledProcessError:
pass
shot_frames.append((frame_idx, frame))
# 控制单个镜头的最大帧数
if len(shot_frames) > self.batch_size:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
shot_frames.clear()
gc.collect()
# 尝试 CUDA
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "cuda",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "cuda"]
except subprocess.CalledProcessError:
pass
elif system == 'windows':
# 测试 CUDA
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "cuda",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "cuda"]
except subprocess.CalledProcessError:
pass
# 测试 D3D11VA
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "d3d11va",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "d3d11va"]
except subprocess.CalledProcessError:
pass
# 测试 DXVA2
test_cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
"-hwaccel", "dxva2",
"-i", self.video_path,
"-t", "0.1",
"-f", "null",
"-"
]
try:
subprocess.run(test_cmd, capture_output=True, check=True)
return ["-hwaccel", "dxva2"]
except subprocess.CalledProcessError:
pass
# 处理最后一个镜头
if shot_frames:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
# 如果没有找到可用的硬件加速器
return []
def process_video(self, output_dir: str, skip_seconds: float = 0) -> None:
def process_video_pipeline(self,
output_dir: str,
interval_seconds: float = 5.0, # 帧提取间隔(秒)
use_hw_accel: bool = True) -> None:
"""
处理视频并提取关键帧使用分批处理方式
执行简化的视频处理流程直接从原视频按固定时间间隔提取帧
Args:
output_dir: 输出目录
skip_seconds: 跳过视频开头的秒数
interval_seconds: 帧提取间隔
use_hw_accel: 是否使用硬件加速
"""
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
try:
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
# 计算要跳过的帧数
skip_frames = int(skip_seconds * self.fps)
self.cap.set(cv2.CAP_PROP_POS_FRAMES, skip_frames)
# 检测镜头边界
logger.info("开始检测镜头边界...")
shot_boundaries = self.detect_shot_boundaries()
# 提取关键帧
logger.info("开始提取关键帧...")
frame_count = 0
pbar = tqdm(self.extract_keyframes(shot_boundaries),
desc="提取关键帧",
unit="")
for keyframe, frame_idx in pbar:
if frame_idx < skip_frames:
continue
# 计算时间戳
timestamp = frame_idx / self.fps
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}"
# 保存关键帧
output_path = os.path.join(output_dir,
f'keyframe_{frame_idx:06d}_{time_str}.jpg')
cv2.imwrite(output_path, keyframe)
frame_count += 1
pbar.set_postfix({"已保存": frame_count})
if frame_count % 10 == 0:
gc.collect()
logger.info(f"关键帧提取完成,共保存 {frame_count} 帧到 {output_dir}")
# 直接从原视频提取关键帧
logger.info(f"从视频间隔 {interval_seconds} 秒提取关键帧...")
self.extract_frames_by_interval(
output_dir,
interval_seconds=interval_seconds,
use_hw_accel=use_hw_accel
)
logger.info(f"处理完成!视频帧已保存在: {output_dir}")
except Exception as e:
logger.error(f"视频处理失败: {str(e)}")
import traceback
logger.error(f"视频处理失败: \n{traceback.format_exc()}")
raise
finally:
# 确保资源被释放
self.cap.release()
gc.collect()
if __name__ == "__main__":
import time
start_time = time.time()
# 使用示例
processor = VideoProcessor("./resource/videos/test.mp4")
# 设置间隔为3秒提取帧
processor.process_video_pipeline(
output_dir="output",
interval_seconds=3.0,
use_hw_accel=True
)
end_time = time.time()
print(f"处理完成!总耗时: {end_time - start_time:.2f}")

View File

@ -1,382 +0,0 @@
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import re
from typing import List, Tuple, Generator
from loguru import logger
import subprocess
from tqdm import tqdm
class VideoProcessor:
def __init__(self, video_path: str):
"""
初始化视频处理器
Args:
video_path: 视频文件路径
"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
self.video_path = video_path
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
raise RuntimeError(f"无法打开视频文件: {video_path}")
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
def __del__(self):
"""析构函数,确保视频资源被释放"""
if hasattr(self, 'cap'):
self.cap.release()
def preprocess_video(self) -> Generator[np.ndarray, None, None]:
"""
使用生成器方式读取视频帧
Yields:
np.ndarray: 视频帧
"""
self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # 重置到视频开始
while self.cap.isOpened():
ret, frame = self.cap.read()
if not ret:
break
yield frame
def detect_shot_boundaries(self, frames: List[np.ndarray], threshold: int = 30) -> List[int]:
    """
    Detect shot boundaries using the frame-difference method.

    Args:
        frames: list of video frames (BGR images)
        threshold: difference threshold (default lowered to 30)

    Returns:
        List[int]: indices of frames where a shot boundary occurs; always
        contains at least one index (the last frame) as a fallback.
    """
    shot_boundaries = []
    if len(frames) < 2:  # guard: too few frames to compute a difference
        logger.warning("视频帧数过少,无法检测场景边界")
        return [len(frames) - 1]  # use the last frame as the only boundary
    for i in range(1, len(frames)):
        prev_frame = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY)
        curr_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
        # Mean absolute grayscale difference between consecutive frames
        diff = np.mean(np.abs(curr_frame.astype(float) - prev_frame.astype(float)))
        if diff > threshold:
            shot_boundaries.append(i)
    # If nothing was detected, treat the whole video as a single scene
    if not shot_boundaries:
        logger.warning("未检测到场景边界,将视频作为单个场景处理")
        shot_boundaries.append(len(frames) - 1)
    return shot_boundaries
def extract_keyframes(self, frames: List[np.ndarray], shot_boundaries: List[int]) -> Tuple[
List[np.ndarray], List[int]]:
"""
从每个镜头中提取关键帧
Args:
frames: 视频帧列表
shot_boundaries: 镜头边界列表
Returns:
Tuple[List[np.ndarray], List[int]]: 关键帧列表和对应的帧索引
"""
keyframes = []
keyframe_indices = []
for i in tqdm(range(len(shot_boundaries)), desc="提取关键帧"):
start = shot_boundaries[i - 1] if i > 0 else 0
end = shot_boundaries[i]
shot_frames = frames[start:end]
if not shot_frames:
continue
# 将每一帧转换为灰度图并展平为一维数组
frame_features = np.array([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).flatten()
for frame in shot_frames])
try:
# 尝试使用 KMeans
kmeans = KMeans(n_clusters=1, random_state=0).fit(frame_features)
center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1))
except Exception as e:
logger.warning(f"KMeans 聚类失败,使用备选方案: {str(e)}")
# 备选方案:选择镜头中间的帧作为关键帧
center_idx = len(shot_frames) // 2
keyframes.append(shot_frames[center_idx])
keyframe_indices.append(start + center_idx)
return keyframes, keyframe_indices
def save_keyframes(self, keyframes: List[np.ndarray], keyframe_indices: List[int],
                   output_dir: str, desc: str = "保存关键帧") -> None:
    """
    Save keyframes to the given directory.

    Filenames follow ``keyframe_<frame_idx>_<timestamp>.jpg`` where the
    timestamp has millisecond precision in the form HHMMSSmmm.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for keyframe, frame_idx in tqdm(zip(keyframes, keyframe_indices),
                                    total=len(keyframes),
                                    desc=desc):
        # Millisecond-precision timestamp derived from frame index and fps
        timestamp = frame_idx / self.fps
        hours = int(timestamp // 3600)
        minutes = int((timestamp % 3600) // 60)
        seconds = int(timestamp % 60)
        milliseconds = int((timestamp % 1) * 1000)  # millisecond part
        time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
        output_path = os.path.join(output_dir,
                                   f'keyframe_{frame_idx:06d}_{time_str}.jpg')
        cv2.imwrite(output_path, keyframe)
def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str) -> None:
"""
根据指定的帧号提取帧如果多个帧在同一毫秒内只保留一个
"""
if not frame_numbers:
raise ValueError("未提供帧号列表")
if any(fn >= self.total_frames or fn < 0 for fn in frame_numbers):
raise ValueError("存在无效的帧号")
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# 用于记录已处理的时间戳(毫秒)
processed_timestamps = set()
for frame_number in tqdm(frame_numbers, desc="提取高清帧"):
# 计算精确到毫秒的时间戳
timestamp = frame_number / self.fps
timestamp_ms = int(timestamp * 1000) # 转换为毫秒
# 如果这一毫秒已经处理过,跳过
if timestamp_ms in processed_timestamps:
continue
self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
ret, frame = self.cap.read()
if ret:
# 记录这一毫秒已经处理
processed_timestamps.add(timestamp_ms)
# 计算时间戳字符串
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
output_path = os.path.join(output_folder,
f"keyframe_{frame_number:06d}_{time_str}.jpg")
cv2.imwrite(output_path, frame)
else:
logger.info(f"无法读取帧 {frame_number}")
logger.info(f"共提取了 {len(processed_timestamps)} 个不同时间戳的帧")
@staticmethod
def extract_numbers_from_folder(folder_path: str) -> List[int]:
    """
    Collect frame numbers from keyframe filenames in a folder.

    Args:
        folder_path: directory containing keyframe images

    Returns:
        List[int]: sorted frame numbers parsed from filenames of the form
        ``keyframe_<frame>_<HHMMSSmmm>.jpg``
    """
    # Matches e.g. keyframe_000123_010534123.jpg
    frame_pattern = re.compile(r'keyframe_(\d+)_\d{9}\.jpg$')
    frame_numbers = []
    for f in os.listdir(folder_path):
        if not f.endswith('.jpg'):
            continue
        matched = frame_pattern.search(f)
        if matched is None:
            logger.warning(f"文件名格式不匹配: {f}")
            continue
        frame_numbers.append(int(matched.group(1)))
    if not frame_numbers:
        logger.error(f"在目录 {folder_path} 中未找到有效的关键帧文件")
    frame_numbers.sort()
    return frame_numbers
def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int = 30) -> None:
"""
处理视频并提取关键帧
Args:
output_dir: 输出目录
skip_seconds: 跳过视频开头的秒数
"""
skip_frames = int(skip_seconds * self.fps)
logger.info("读取视频帧...")
frames = []
for frame in tqdm(self.preprocess_video(),
total=self.total_frames,
desc="读取视频"):
frames.append(frame)
frames = frames[skip_frames:]
if not frames:
raise ValueError(f"跳过 {skip_seconds} 秒后没有剩余帧可以处理")
logger.info("检测场景边界...")
shot_boundaries = self.detect_shot_boundaries(frames, threshold)
logger.info(f"检测到 {len(shot_boundaries)} 个场景边界")
keyframes, keyframe_indices = self.extract_keyframes(frames, shot_boundaries)
adjusted_indices = [idx + skip_frames for idx in keyframe_indices]
self.save_keyframes(keyframes, adjusted_indices, output_dir, desc="保存压缩关键帧")
def process_video_pipeline(self,
                           output_dir: str,
                           skip_seconds: float = 0,
                           threshold: int = 20,  # 降低默认阈值
                           compressed_width: int = 320,
                           keep_temp: bool = False) -> None:
    """
    执行完整的视频处理流程。

    流程:
        1. 用 ffmpeg 将原视频压缩到较小分辨率(加速场景检测);
        2. 在压缩视频上提取关键帧编号;
        3. 按编号回到原视频提取高清关键帧。

    Args:
        output_dir: 高清关键帧输出目录(临时文件写入其下的 temp/ 子目录)。
        skip_seconds: 跳过视频开头的秒数。
        threshold: 场景切换差异阈值降低默认阈值为20使场景检测更敏感。
        compressed_width: 压缩后视频短边/长边目标像素(横版约束宽、竖版约束高)。
        keep_temp: 为 True 时保留临时目录(压缩视频与小图关键帧)。

    Raises:
        subprocess.CalledProcessError: ffmpeg 压缩失败。
        ValueError: 压缩视频中未提取到任何关键帧。
    """
    import time
    import shutil

    os.makedirs(output_dir, exist_ok=True)
    temp_dir = os.path.join(output_dir, 'temp')
    compressed_dir = os.path.join(temp_dir, 'compressed')
    mini_frames_dir = os.path.join(temp_dir, 'mini_frames')
    hd_frames_dir = output_dir
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs(compressed_dir, exist_ok=True)
    os.makedirs(mini_frames_dir, exist_ok=True)
    os.makedirs(hd_frames_dir, exist_ok=True)

    mini_processor = None
    compressed_video = None
    try:
        # 1. 压缩视频
        video_name = os.path.splitext(os.path.basename(self.video_path))[0]
        compressed_video = os.path.join(compressed_dir, f"{video_name}_compressed.mp4")
        # 获取原始视频的宽度和高度
        original_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        original_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        logger.info("步骤1: 压缩视频...")
        if original_width > original_height:
            # 横版视频:约束宽度,高度按比例自适应
            scale_filter = f'scale={compressed_width}:-1'
        else:
            # 竖版视频:约束高度,宽度按比例自适应
            scale_filter = f'scale=-1:{compressed_width}'
        ffmpeg_cmd = [
            'ffmpeg', '-i', self.video_path,
            '-vf', scale_filter,
            '-y',
            compressed_video
        ]
        try:
            subprocess.run(ffmpeg_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"FFmpeg 错误输出: {e.stderr}")
            raise

        # 2. 从压缩视频中提取关键帧
        logger.info("\n步骤2: 从压缩视频提取关键帧...")
        mini_processor = VideoProcessor(compressed_video)
        mini_processor.process_video(mini_frames_dir, skip_seconds, threshold)

        # 3. 从原始视频提取高清关键帧
        logger.info("\n步骤3: 提取高清关键帧...")
        frame_numbers = self.extract_numbers_from_folder(mini_frames_dir)
        if not frame_numbers:
            raise ValueError("未能从压缩视频中提取到有效的关键帧")
        self.extract_frames_by_numbers(frame_numbers, hd_frames_dir)
        logger.info(f"处理完成!高清关键帧保存在: {hd_frames_dir}")
    except Exception:
        import traceback
        logger.error(f"视频处理失败: \n{traceback.format_exc()}")
        raise
    finally:
        # 释放资源
        if mini_processor:
            mini_processor.cap.release()
            del mini_processor
        # 确保视频文件句柄被释放
        if hasattr(self, 'cap'):
            self.cap.release()
        # 等待操作系统真正释放文件句柄Windows 下删除被占用文件会失败)
        time.sleep(0.5)
        if not keep_temp:
            try:
                # 先删除压缩视频文件
                if compressed_video and os.path.exists(compressed_video):
                    try:
                        os.remove(compressed_video)
                    except Exception as e:
                        logger.warning(f"删除压缩视频失败: {e}")
                # 再删除临时目录(带重试,应对句柄延迟释放)
                cleaned = True
                if os.path.exists(temp_dir):
                    cleaned = False
                    max_retries = 3
                    for i in range(max_retries):
                        try:
                            shutil.rmtree(temp_dir)
                            cleaned = True
                            break
                        except Exception as e:
                            if i == max_retries - 1:
                                logger.warning(f"清理临时文件失败: {e}")
                            else:
                                time.sleep(1)  # 等待1秒后重试
                # 仅在实际清理成功时报告成功(原实现即使失败也会打印已清理)
                if cleaned:
                    logger.info("临时文件已清理")
            except Exception as e:
                logger.warning(f"清理临时文件时出错: {e}")
if __name__ == "__main__":
    # 简单的命令行入口:对测试视频执行完整关键帧提取流程并计时
    import time

    started_at = time.time()
    video_proc = VideoProcessor("E:\\projects\\NarratoAI\\resource\\videos\\test.mp4")
    video_proc.process_video_pipeline(output_dir="output")
    elapsed = time.time() - started_at
    print(f"处理完成!总耗时: {elapsed:.2f}")

View File

@ -1,10 +1,9 @@
[app]
project_version="0.5.3"
project_version="0.6.0"
# 支持视频理解的大模型提供商
# gemini
# qwenvl
vision_llm_provider="qwenvl"
vision_analysis_prompt = "你是资深视频内容分析专家,擅长分析视频画面信息,分析下面视频画面内容,只输出客观的画面描述不要给任何总结或评价"
########## Vision Gemini API Key
vision_gemini_api_key = ""
@ -173,12 +172,7 @@
speech_region=""
[frames]
skip_seconds = 0
# threshold差异阈值用于判断两个连续帧之间是否发生了场景切换
# 较小的阈值(如 20更敏感能捕捉到细微的场景变化但可能会误判关键帧图片更多
# 较大的阈值(如 40更保守只捕捉明显的场景切换但可能会漏掉渐变场景关键帧图片更少
# 默认值 30在实践中是一个比较平衡的选择
threshold = 30
version = "v2"
# 提取关键帧的间隔时间
frame_interval_input = 3
# 大模型单次处理的关键帧数量
vision_batch_size = 5
vision_batch_size = 10

View File

@ -1,38 +1,46 @@
requests~=2.31.0
moviepy==2.0.0.dev2
faster-whisper~=1.0.1
uvicorn~=0.27.1
fastapi~=0.115.4
tomli~=2.0.1
streamlit~=1.40.0
loguru~=0.7.2
aiohttp~=3.10.10
urllib3~=2.2.1
pydantic~=2.6.3
g4f~=0.3.0.4
dashscope~=1.15.0
google.generativeai>=0.8.3
python-multipart~=0.0.9
redis==5.0.3
opencv-python~=4.10.0.84
# for azure speech
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0
git-changelog~=2.5.2
watchdog==5.0.2
pydub==0.25.1
psutil>=5.9.0
opencv-python~=4.10.0.84
scikit-learn~=1.5.2
google-generativeai~=0.8.3
pillow==10.3.0
python-dotenv~=1.0.1
openai~=1.53.0
tqdm>=4.66.6
tenacity>=9.0.0
tiktoken==0.8.0
yt-dlp==2024.11.18
pysrt==1.1.2
httpx==0.27.2
transformers==4.47.0
# 必须项
requests~=2.32.0
moviepy==2.1.1
edge-tts==6.1.19
streamlit~=1.45.0
watchdog==6.0.0
loguru~=0.7.3
tomli~=2.2.1
pydub==0.25.1
openai~=1.77.0
google-generativeai>=0.8.5
# 待优化项
# opencv-python==4.11.0.86
# scikit-learn==1.6.1
# fastapi~=0.115.4
# uvicorn~=0.27.1
# pydantic~=2.11.4
# faster-whisper~=1.0.1
# tomli~=2.0.1
# aiohttp~=3.10.10
# httpx==0.27.2
# urllib3~=2.2.1
# python-multipart~=0.0.9
# redis==5.0.3
# opencv-python~=4.10.0.84
# azure-cognitiveservices-speech~=1.37.0
# git-changelog~=2.5.2
# watchdog==5.0.2
# pydub==0.25.1
# psutil>=5.9.0
# scikit-learn~=1.5.2
# pillow==10.3.0
# python-dotenv~=1.0.1
# tqdm>=4.66.6
# tenacity>=9.0.0
# tiktoken==0.8.0
# pysrt==1.1.2
# transformers==4.50.0
# yt-dlp==2025.4.30

232
webui.py
View File

@ -1,13 +1,14 @@
import streamlit as st
import os
import sys
from uuid import uuid4
from loguru import logger
from app.config import config
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings, merge_settings, system_settings
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
review_settings, merge_settings, system_settings
from webui.utils import cache, file_utils
from app.utils import utils
from app.models.schema import VideoClipParams, VideoAspect
from webui.utils.performance import PerformanceMonitor
# 初始化配置 - 必须是第一个 Streamlit 命令
st.set_page_config(
@ -17,7 +18,7 @@ st.set_page_config(
initial_sidebar_state="auto",
menu_items={
"Report a bug": "https://github.com/linyqh/NarratoAI/issues",
'About': f"# NarratoAI:sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
'About': f"# Narrato:blue[AI] :sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
f"自动化影视解说视频详情请移步https://github.com/linyqh/NarratoAI"
},
)
@ -28,6 +29,7 @@ hide_streamlit_style = """
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
def init_log():
"""初始化日志配置"""
from loguru import logger
@ -35,17 +37,7 @@ def init_log():
_lvl = "DEBUG"
def format_record(record):
# 增加更多需要过滤的警告消息
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
for msg in ignore_messages:
if msg in record["message"]:
return ""
# 简化日志格式化处理不尝试按特定字符串过滤torch相关内容
file_path = record["file"].path
relative_path = os.path.relpath(file_path, config.root_dir)
record["file"].path = f"./{relative_path}"
@ -57,23 +49,54 @@ def init_log():
'- <level>{message}</>' + "\n"
return _format
# 优化日志过滤器
def log_filter(record):
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
return not any(msg in record["message"] for msg in ignore_messages)
# 替换为更简单的过滤方式避免在过滤时访问message内容
# 此处先不设置复杂的过滤器,等应用启动后再动态添加
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
filter=log_filter
colorize=True
)
# 应用启动后,可以再添加更复杂的过滤器
def setup_advanced_filters():
"""在应用完全启动后设置高级过滤器"""
try:
for handler_id in logger._core.handlers:
logger.remove(handler_id)
# 重新添加带有高级过滤的处理器
def advanced_filter(record):
"""更复杂的过滤器,在应用启动后安全使用"""
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
return not any(msg in record["message"] for msg in ignore_messages)
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
filter=advanced_filter
)
except Exception as e:
# 如果过滤器设置失败,确保日志仍然可用
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True
)
logger.error(f"设置高级日志过滤器失败: {e}")
# 将高级过滤器设置放到启动主逻辑后
import threading
threading.Timer(5.0, setup_advanced_filters).start()
def init_global_state():
"""初始化全局状态"""
if 'video_clip_json' not in st.session_state:
@ -85,6 +108,7 @@ def init_global_state():
if 'subclip_videos' not in st.session_state:
st.session_state['subclip_videos'] = {}
def tr(key):
"""翻译函数"""
i18n_dir = os.path.join(os.path.dirname(__file__), "webui", "i18n")
@ -92,90 +116,94 @@ def tr(key):
loc = locales.get(st.session_state['ui_language'], {})
return loc.get("Translation", {}).get(key, key)
def render_generate_button():
"""渲染生成按钮和处理逻辑"""
if st.button(tr("Generate Video"), use_container_width=True, type="primary"):
from app.services import task as tm
# 重置日志容器和记录
log_container = st.empty()
log_records = []
def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))
from loguru import logger
logger.add(log_received)
config.save_config()
task_id = st.session_state.get('task_id')
if not task_id:
st.error(tr("请先裁剪视频"))
return
if not st.session_state.get('video_clip_json_path'):
st.error(tr("脚本文件不能为空"))
return
if not st.session_state.get('video_origin_path'):
st.error(tr("视频文件不能为空"))
return
st.toast(tr("生成视频"))
logger.info(tr("开始生成视频"))
# 获取所有参数
script_params = script_settings.get_script_params()
video_params = video_settings.get_video_params()
audio_params = audio_settings.get_audio_params()
subtitle_params = subtitle_settings.get_subtitle_params()
# 合并所有参数
all_params = {
**script_params,
**video_params,
**audio_params,
**subtitle_params
}
# 创建参数对象
params = VideoClipParams(**all_params)
result = tm.start_subclip(
task_id=task_id,
params=params,
subclip_path_videos=st.session_state['subclip_videos']
)
video_files = result.get("videos", [])
st.success(tr("视生成完成"))
try:
from app.services import task as tm
import torch
# 重置日志容器和记录
log_container = st.empty()
log_records = []
if video_files:
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))
file_utils.open_task_folder(config.root_dir, task_id)
logger.info(tr("视频生成完成"))
from loguru import logger
logger.add(log_received)
config.save_config()
task_id = st.session_state.get('task_id')
if not task_id:
st.error(tr("请先裁剪视频"))
return
if not st.session_state.get('video_clip_json_path'):
st.error(tr("脚本文件不能为空"))
return
if not st.session_state.get('video_origin_path'):
st.error(tr("视频文件不能为空"))
return
st.toast(tr("生成视频"))
logger.info(tr("开始生成视频"))
# 获取所有参数
script_params = script_settings.get_script_params()
video_params = video_settings.get_video_params()
audio_params = audio_settings.get_audio_params()
subtitle_params = subtitle_settings.get_subtitle_params()
# 合并所有参数
all_params = {
**script_params,
**video_params,
**audio_params,
**subtitle_params
}
# 创建参数对象
params = VideoClipParams(**all_params)
result = tm.start_subclip(
task_id=task_id,
params=params,
subclip_path_videos=st.session_state['subclip_videos']
)
video_files = result.get("videos", [])
st.success(tr("视生成完成"))
try:
if video_files:
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
file_utils.open_task_folder(config.root_dir, task_id)
logger.info(tr("视频生成完成"))
finally:
PerformanceMonitor.cleanup_resources()
def main():
"""主函数"""
init_log()
init_global_state()
utils.init_resources()
st.title(f"NarratoAI :sunglasses:📽️")
# 仅初始化基本资源避免过早地加载依赖PyTorch的资源
# 检查是否能分解utils.init_resources()为基本资源和高级资源(如依赖PyTorch的资源)
try:
utils.init_resources()
except Exception as e:
logger.warning(f"资源初始化时出现警告: {e}")
st.title(f"Narrato:blue[AI]:sunglasses: 📽️")
st.write(tr("Get Help"))
# 首先渲染不依赖PyTorch的UI部分
# 渲染基础设置面板
basic_settings.render_basic_settings(tr)
# 渲染合并设置
@ -190,14 +218,18 @@ def main():
audio_settings.render_audio_panel(tr)
with panel[2]:
subtitle_settings.render_subtitle_panel(tr)
# 渲染系统设置面板
system_settings.render_system_panel(tr)
# 渲染视频审查面板
review_settings.render_review_panel(tr)
# 渲染生成按钮和处理逻辑
# 放到最后渲染可能使用PyTorch的部分
# 渲染系统设置面板
with panel[2]:
system_settings.render_system_panel(tr)
# 放到最后渲染生成按钮和处理逻辑
render_generate_button()
if __name__ == "__main__":
main()

View File

@ -8,7 +8,7 @@ from webui.components import (
audio_settings,
subtitle_settings
)
from webui.utils import cache, file_utils, performance
from webui.utils import cache, file_utils
__all__ = [
'config',
@ -17,6 +17,5 @@ __all__ = [
'audio_settings',
'subtitle_settings',
'cache',
'file_utils',
'performance'
'file_utils'
]

View File

@ -1,7 +1,10 @@
import traceback
import streamlit as st
import os
from app.config import config
from app.utils import utils
from loguru import logger
def render_basic_settings(tr):
@ -266,7 +269,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
elif provider.lower() == 'moonshot':
base_url = "https://api.moonshot.cn/v1"
elif provider.lower() == 'deepseek':
base_url = "https://api.deepseek.com/v1"
base_url = "https://api.deepseek.com"
# 构建测试URL
test_url = f"{base_url.rstrip('/')}/chat/completions"
@ -288,7 +291,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
"messages": [
{"role": "user", "content": "直接回复我文本'当前网络可用'"}
],
"max_tokens": 10
"stream": False
}
# 发送测试请求
@ -296,7 +299,6 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
test_url,
headers=headers,
json=test_data,
timeout=10
)
if response.status_code == 200:
@ -313,7 +315,7 @@ def render_text_llm_settings(tr):
st.subheader(tr("Text Generation Model Settings"))
# 文案生成模型提供商选择
text_providers = ['DeepSeek', 'OpenAI', 'Qwen', 'Moonshot', 'Gemini']
text_providers = ['DeepSeek', 'OpenAI', 'Siliconflow', 'Qwen', 'Moonshot', 'Gemini']
saved_text_provider = config.app.get("text_llm_provider", "DeepSeek").lower()
saved_provider_index = 0
@ -331,9 +333,9 @@ def render_text_llm_settings(tr):
config.app["text_llm_provider"] = text_provider
# 获取已保存的文本模型配置
text_api_key = config.app.get(f"text_{text_provider}_api_key", "")
text_base_url = config.app.get(f"text_{text_provider}_base_url", "")
text_model_name = config.app.get(f"text_{text_provider}_model_name", "")
text_api_key = config.app.get(f"text_{text_provider}_api_key")
text_base_url = config.app.get(f"text_{text_provider}_base_url")
text_model_name = config.app.get(f"text_{text_provider}_model_name")
# 渲染文本模型配置输入框
st_text_api_key = st.text_input(tr("Text API Key"), value=text_api_key, type="password")
@ -342,6 +344,8 @@ def render_text_llm_settings(tr):
# 添加测试按钮
if st.button(tr("Test Connection"), key="test_text_connection"):
logger.debug(st_text_base_url)
logger.debug(st_text_model_name)
with st.spinner(tr("Testing connection...")):
success, message = test_text_model_connection(
api_key=st_text_api_key,
@ -364,11 +368,11 @@ def render_text_llm_settings(tr):
if st_text_model_name:
config.app[f"text_{text_provider}_model_name"] = st_text_model_name
# Cloudflare 特殊配置
if text_provider == 'cloudflare':
st_account_id = st.text_input(
tr("Account ID"),
value=config.app.get(f"text_{text_provider}_account_id", "")
)
if st_account_id:
config.app[f"text_{text_provider}_account_id"] = st_account_id
# # Cloudflare 特殊配置
# if text_provider == 'cloudflare':
# st_account_id = st.text_input(
# tr("Account ID"),
# value=config.app.get(f"text_{text_provider}_account_id", "")
# )
# if st_account_id:
# config.app[f"text_{text_provider}_account_id"] = st_account_id

View File

@ -285,8 +285,8 @@ def render_merge_settings(tr):
error_message = str(e)
if "moviepy" in error_message.lower():
st.error(tr("Error processing video files. Please check if the videos are valid MP4 files."))
elif "pysrt" in error_message.lower():
st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
# elif "pysrt" in error_message.lower():
# st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
else:
st.error(f"{tr('Error during merge')}: {error_message}")

View File

@ -33,7 +33,7 @@ def render_video_item(tr, video_list, subclip_videos, index):
video_script = video_list[index]
# 显示时间戳
timestamp = video_script.get('timestamp', '')
timestamp = video_script.get('_id', '')
st.text_area(
tr("Timestamp"),
value=timestamp,

View File

@ -47,7 +47,7 @@ def render_script_file(tr, params):
(tr("None"), ""),
(tr("Auto Generate"), "auto"),
(tr("Short Generate"), "short"),
(tr("Upload Script"), "upload_script") # 新增上传脚本选项
(tr("Upload Script"), "upload_script")
]
# 获取已有脚本文件
@ -214,38 +214,25 @@ def render_script_buttons(tr, params):
# 根据脚本类型显示不同的设置
if script_path != "short":
# 非短视频模式下显示原有的三个输入框
input_cols = st.columns(3)
input_cols = st.columns(2)
with input_cols[0]:
skip_seconds = st.number_input(
"skip_seconds",
st.number_input(
tr("Frame Interval (seconds)"),
min_value=0,
value=st.session_state.get('skip_seconds', config.frames.get('skip_seconds', 0)),
help=tr("Skip the first few seconds"),
key="skip_seconds_input"
value=st.session_state.get('frame_interval_input', config.frames.get('frame_interval_input', 3)),
help=tr("Frame Interval (seconds) (More keyframes consume more tokens)"),
key="frame_interval_input"
)
st.session_state['skip_seconds'] = skip_seconds
with input_cols[1]:
threshold = st.number_input(
"threshold",
st.number_input(
tr("Batch Size"),
min_value=0,
value=st.session_state.get('threshold', config.frames.get('threshold', 30)),
help=tr("Difference threshold"),
key="threshold_input"
value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 10)),
help=tr("Batch Size (More keyframes consume more tokens)"),
key="vision_batch_size"
)
st.session_state['threshold'] = threshold
with input_cols[2]:
vision_batch_size = st.number_input(
"vision_batch_size",
min_value=1,
max_value=20,
value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 5)),
help=tr("Vision processing batch size"),
key="vision_batch_size_input"
)
st.session_state['vision_batch_size'] = vision_batch_size
# 生成/加载按钮
if script_path == "auto":
@ -259,7 +246,8 @@ def render_script_buttons(tr, params):
if st.button(button_name, key="script_action", disabled=not script_path):
if script_path == "auto":
generate_script_docu(tr, params)
# 执行纪录片视频脚本生成(视频无字幕无配音)
generate_script_docu(params)
elif script_path == "short":
# 获取自定义片段数量参数
custom_clips = st.session_state.get('custom_clips', 5)
@ -366,12 +354,11 @@ def crop_video(tr, params):
utils.cut_video(params, update_progress)
time.sleep(0.5)
progress_bar.progress(100)
status_text.text("剪完成!")
st.success("视频剪辑成功完成!")
except Exception as e:
st.error(f"剪辑过程中发生错误: {str(e)}")
finally:
time.sleep(2)
time.sleep(1)
progress_bar.empty()
status_text.empty()

View File

@ -127,7 +127,7 @@ def get_subtitle_params():
'font_name': st.session_state.get('font_name', ''),
'font_size': st.session_state.get('font_size', 60),
'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'),
'position': st.session_state.get('subtitle_position', 'bottom'),
'subtitle_position': st.session_state.get('subtitle_position', 'bottom'),
'custom_position': st.session_state.get('custom_position', 70.0),
'stroke_color': st.session_state.get('stroke_color', '#000000'),
'stroke_width': st.session_state.get('stroke_width', 1.5),

View File

@ -85,6 +85,7 @@
"TTS Provider": "TTS Provider",
"Hide Log": "Hide Log",
"Upload Local Files": "Upload Local Files",
"File Uploaded Successfully": "File Uploaded Successfully"
"File Uploaded Successfully": "File Uploaded Successfully",
"Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)"
}
}

View File

@ -115,7 +115,6 @@
"Text Generation Model Settings": "文案生成模型设置",
"LLM Model Name": "大语言模型名称",
"LLM Model API Key": "大语言模型 API 密钥",
"Batch Size": "批处理大小",
"Text Model Provider": "文案生成模型提供商",
"Text API Key": "文案生成 API 密钥",
"Text Base URL": "文案生成接口地址",
@ -192,6 +191,10 @@
"Generate Short Video Script": "AI生成短剧混剪脚本",
"Adjust the volume of the original audio": "调整原始音频的音量",
"Original Volume": "视频音量",
"Auto Generate": "纪录片解说 (画面解说)"
"Auto Generate": "纪录片解说 (画面解说)",
"Frame Interval (seconds)": "帧间隔 (秒)",
"Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)",
"Batch Size": "批处理大小",
"Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多"
}
}
}

View File

@ -5,20 +5,23 @@ import time
import asyncio
import traceback
import requests
from app.utils import video_processor
import streamlit as st
from loguru import logger
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datetime import datetime
from app.config import config
from app.utils.script_generator import ScriptProcessor
from app.utils import utils, video_processor, video_processor_v2, qwenvl_analyzer
from app.utils import utils, video_processor, qwenvl_analyzer
from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config
def generate_script_docu(tr, params):
def generate_script_docu(params):
"""
生成 纪录片 视频脚本
要求: 原视频无字幕无配音
适合场景: 纪录片动物搞笑解说荒野建造等
"""
progress_bar = st.progress(0)
status_text = st.empty()
@ -35,8 +38,9 @@ def generate_script_docu(tr, params):
if not params.video_origin_path:
st.error("请先选择视频文件")
return
# ===================提取键帧===================
"""
1. 提取键帧
"""
update_progress(10, "正在提取关键帧...")
# 创建临时目录用于存储关键帧
@ -64,21 +68,12 @@ def generate_script_docu(tr, params):
os.makedirs(video_keyframes_dir, exist_ok=True)
# 初始化视频处理器
if config.frames.get("version") == "v2":
processor = video_processor_v2.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=st.session_state.get('skip_seconds'),
threshold=st.session_state.get('threshold')
)
else:
processor = video_processor.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video(
output_dir=video_keyframes_dir,
skip_seconds=0
)
processor = video_processor.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
interval_seconds=st.session_state.get('frame_interval_input'),
)
# 获取所有关键文件路径
for filename in sorted(os.listdir(video_keyframes_dir)):
@ -101,9 +96,11 @@ def generate_script_docu(tr, params):
raise Exception(f"关键帧提取失败: {str(e)}")
# 根据不同的 LLM 提供商处理
"""
2. 视觉分析(批量分析每一帧)
"""
vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
logger.debug(f"Vision LLM 提供商: {vision_llm_provider}")
logger.debug(f"VLM 视觉大模型提供商: {vision_llm_provider}")
try:
# ===================初始化视觉分析器===================
@ -137,111 +134,240 @@ def generate_script_docu(tr, params):
# 执行异步分析
vision_batch_size = st.session_state.get('vision_batch_size') or config.frames.get("vision_batch_size")
vision_analysis_prompt = """
我提供了 %s 张视频帧它们按时间顺序排列代表一个连续的视频片段请仔细分析每一帧的内容并关注帧与帧之间的变化以理解整个片段的活动
首先请详细描述每一帧的关键视觉信息包含主要内容人物动作和场景
然后基于所有帧的分析请用**简洁的语言**总结整个视频片段中发生的主要活动或事件流程
请务必使用 JSON 格式输出你的结果JSON 结构应如下
{
"frame_observations": [
{
"frame_number": 1, // 或其他标识帧的方式
"observation": "描述每张视频帧中的主要内容、人物、动作和场景。"
},
// ... 更多帧的观察 ...
],
"overall_activity_summary": "在这里填写你总结的整个片段的主要活动,保持简洁。"
}
请务必不要遗漏视频帧我提供了 %s 张视频帧frame_observations 必须包含 %s 个元素
请只返回 JSON 字符串不要包含任何其他解释性文字
"""
results = loop.run_until_complete(
analyzer.analyze_images(
images=keyframe_files,
prompt=config.app.get('vision_analysis_prompt'),
prompt=vision_analysis_prompt,
batch_size=vision_batch_size
)
)
loop.close()
"""
3. 处理分析结果格式化为 json 数据
"""
# ===================处理分析结果===================
update_progress(60, "正在整理分析结果...")
# 合并所有批次的析结果
# 合并所有批次的析结果
frame_analysis = ""
merged_frame_observations = [] # 合并所有批次的帧观察
overall_activity_summaries = [] # 合并所有批次的整体总结
prev_batch_files = None
frame_counter = 1 # 初始化帧计数器,用于给所有帧分配连续的序号
# logger.debug(json.dumps(results, indent=4, ensure_ascii=False))
# 确保分析目录存在
analysis_dir = os.path.join(utils.storage_dir(), "temp", "analysis")
os.makedirs(analysis_dir, exist_ok=True)
origin_res = os.path.join(analysis_dir, "frame_analysis.json")
with open(origin_res, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# 开始处理
for result in results:
if 'error' in result:
logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
# 获取当前批次的文件列表 keyframe_001136_000045.jpg 将 000045 精度提升到 毫秒
continue
# 获取当前批次的文件列表
batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
logger.debug(f"批次 {result['batch_index']} 处理完成,共 {len(batch_files)} 张图片")
# logger.debug(batch_files)
first_timestamp, last_timestamp, _ = get_batch_timestamps(batch_files, prev_batch_files)
# 获取批次的时间戳范围
first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
logger.debug(f"处理时间戳: {first_timestamp}-{last_timestamp}")
# 添加带时间戳的分析结果
frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
frame_analysis += result['response']
frame_analysis += "\n"
# 解析响应中的JSON数据
response_text = result['response']
try:
# 处理可能包含```json```格式的响应
if "```json" in response_text:
json_content = response_text.split("```json")[1].split("```")[0].strip()
elif "```" in response_text:
json_content = response_text.split("```")[1].split("```")[0].strip()
else:
json_content = response_text.strip()
response_data = json.loads(json_content)
# 提取frame_observations和overall_activity_summary
if "frame_observations" in response_data:
frame_obs = response_data["frame_observations"]
overall_summary = response_data.get("overall_activity_summary", "")
# 添加时间戳信息到每个帧观察
for i, obs in enumerate(frame_obs):
if i < len(batch_files):
# 从文件名中提取时间戳
file_path = batch_files[i]
file_name = os.path.basename(file_path)
# 提取时间戳字符串 (格式如: keyframe_000675_000027000.jpg)
# 格式解析: keyframe_帧序号_毫秒时间戳.jpg
timestamp_parts = file_name.split('_')
if len(timestamp_parts) >= 3:
timestamp_str = timestamp_parts[-1].split('.')[0]
try:
# 修正时间戳解析逻辑
# 格式为000100000表示00:01:00,000即1分钟
# 需要按照对应位数进行解析:
# 前两位是小时,中间两位是分钟,后面是秒和毫秒
if len(timestamp_str) >= 9: # 确保格式正确
hours = int(timestamp_str[0:2])
minutes = int(timestamp_str[2:4])
seconds = int(timestamp_str[4:6])
milliseconds = int(timestamp_str[6:9])
# 计算总秒数
timestamp_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000
formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳
else:
# 兼容旧的解析方式
timestamp_seconds = int(timestamp_str) / 1000 # 转换为秒
formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳
except ValueError:
logger.warning(f"无法解析时间戳: {timestamp_str}")
timestamp_seconds = 0
formatted_time = "00:00:00,000"
else:
logger.warning(f"文件名格式不符合预期: {file_name}")
timestamp_seconds = 0
formatted_time = "00:00:00,000"
# 添加额外信息到帧观察
obs["frame_path"] = file_path
obs["timestamp"] = formatted_time
obs["timestamp_seconds"] = timestamp_seconds
obs["batch_index"] = result['batch_index']
# 使用全局递增的帧计数器替换原始的frame_number
if "frame_number" in obs:
obs["original_frame_number"] = obs["frame_number"] # 保留原始编号作为参考
obs["frame_number"] = frame_counter # 赋值连续的帧编号
frame_counter += 1 # 增加帧计数器
# 添加到合并列表
merged_frame_observations.append(obs)
# 添加批次整体总结信息
if overall_summary:
# 从文件名中提取时间戳数值
first_time_str = first_timestamp.split('_')[-1].split('.')[0]
last_time_str = last_timestamp.split('_')[-1].split('.')[0]
# 转换为毫秒并计算持续时间(秒)
try:
# 修正解析逻辑,与上面相同的方式解析时间戳
if len(first_time_str) >= 9 and len(last_time_str) >= 9:
# 解析第一个时间戳
first_hours = int(first_time_str[0:2])
first_minutes = int(first_time_str[2:4])
first_seconds = int(first_time_str[4:6])
first_ms = int(first_time_str[6:9])
first_time_seconds = first_hours * 3600 + first_minutes * 60 + first_seconds + first_ms / 1000
# 解析第二个时间戳
last_hours = int(last_time_str[0:2])
last_minutes = int(last_time_str[2:4])
last_seconds = int(last_time_str[4:6])
last_ms = int(last_time_str[6:9])
last_time_seconds = last_hours * 3600 + last_minutes * 60 + last_seconds + last_ms / 1000
batch_duration = last_time_seconds - first_time_seconds
else:
# 兼容旧的解析方式
first_time_ms = int(first_time_str)
last_time_ms = int(last_time_str)
batch_duration = (last_time_ms - first_time_ms) / 1000
except ValueError:
# 使用 utils.time_to_seconds 函数处理格式化的时间戳
first_time_seconds = utils.time_to_seconds(first_time_str.replace('_', ':').replace('-', ','))
last_time_seconds = utils.time_to_seconds(last_time_str.replace('_', ':').replace('-', ','))
batch_duration = last_time_seconds - first_time_seconds
overall_activity_summaries.append({
"batch_index": result['batch_index'],
"time_range": f"{first_timestamp}-{last_timestamp}",
"duration_seconds": batch_duration,
"summary": overall_summary
})
except Exception as e:
logger.error(f"解析批次 {result['batch_index']} 的响应数据失败: {str(e)}")
# 添加原始响应作为回退
frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
frame_analysis += response_text
frame_analysis += "\n"
# 更新上一个批次的文件
prev_batch_files = batch_files
# 将合并后的结果转为JSON字符串
merged_results = {
"frame_observations": merged_frame_observations,
"overall_activity_summaries": overall_activity_summaries
}
# 使用当前时间创建文件名
now = datetime.now()
timestamp_str = now.strftime("%Y%m%d_%H%M")
# 保存完整的分析结果为JSON
analysis_filename = f"frame_analysis_{timestamp_str}.json"
analysis_json_path = os.path.join(analysis_dir, analysis_filename)
with open(analysis_json_path, 'w', encoding='utf-8') as f:
json.dump(merged_results, f, ensure_ascii=False, indent=2)
logger.info(f"分析结果已保存到: {analysis_json_path}")
if not frame_analysis.strip():
raise Exception("未能生成有效的帧分析结果")
# 保存分析结果
analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt")
with open(analysis_path, 'w', encoding='utf-8') as f:
f.write(frame_analysis)
update_progress(70, "正在生成脚本...")
"""
4. 生成文案
"""
logger.info("开始准备生成解说文案")
update_progress(80, "正在生成文案...")
from app.services.generate_narration_script import parse_frame_analysis_to_markdown, generate_narration
# 从配置中获取文本生成相关配置
text_provider = config.app.get('text_llm_provider', 'gemini').lower()
text_api_key = config.app.get(f'text_{text_provider}_api_key')
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
# 构建帧内容列表
frame_content_list = []
prev_batch_files = None
# 整理帧分析数据
markdown_output = parse_frame_analysis_to_markdown(analysis_json_path)
for i, result in enumerate(results):
if 'error' in result:
continue
batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
_, _, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
frame_content = {
"timestamp": timestamp_range,
"picture": result['response'],
"narration": "",
"OST": 2
}
frame_content_list.append(frame_content)
logger.debug(f"添加帧内容: 时间范围={timestamp_range}, 分析结果长度={len(result['response'])}")
# 更新上一个批次的文件
prev_batch_files = batch_files
if not frame_content_list:
raise Exception("没有有效的帧内容可以处理")
# ===================开始生成文案===================
update_progress(80, "正在生成文案...")
# 校验配置
api_params = {
"vision_api_key": vision_api_key,
"vision_model_name": vision_model,
"vision_base_url": vision_base_url or "",
"text_api_key": text_api_key,
"text_model_name": text_model,
"text_base_url": text_base_url or ""
}
chekc_video_config(api_params)
custom_prompt = st.session_state.get('custom_prompt', '')
processor = ScriptProcessor(
model_name=text_model,
api_key=text_api_key,
prompt=custom_prompt,
base_url=text_base_url or "",
video_theme=st.session_state.get('video_theme', '')
# 生成文案
# 生成解说文案
narration = generate_narration(
markdown_output,
text_api_key,
base_url=text_base_url,
model=text_model
)
# 处理帧内容生成脚本
script_result = processor.process_frames(frame_content_list)
narration_dict = json.loads(narration)['items']
# 为 narration_dict 中每个 item 新增一个 OST: 2 的字段, 代表保留原声和配音
narration_dict = [{**item, "OST": 2} for item in narration_dict]
logger.debug(f"解说文案创作完成:\n{"\n".join([item['narration'] for item in narration_dict])}")
# 结果转换为JSON字符串
script = json.dumps(script_result, ensure_ascii=False, indent=2)
script = json.dumps(narration_dict, ensure_ascii=False, indent=2)
except Exception as e:
logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}")
@ -250,7 +376,7 @@ def generate_script_docu(tr, params):
if script is None:
st.error("生成脚本失败,请检查日志")
st.stop()
logger.info(f"脚本生成完成")
logger.success(f"剪辑脚本生成完成")
if isinstance(script, list):
st.session_state['video_clip_json'] = script
elif isinstance(script, str):

View File

@ -1,8 +0,0 @@
from .performance import monitor_performance, PerformanceMonitor
from .cache import *
from .file_utils import *
__all__ = [
'monitor_performance',
'PerformanceMonitor'
]

View File

@ -1,8 +1,8 @@
"""
合并视频和字幕文件
"""
from moviepy.editor import VideoFileClip, concatenate_videoclips
import pysrt
from moviepy import VideoFileClip, concatenate_videoclips
# import pysrt
import os

View File

@ -1,37 +0,0 @@
import psutil
import os
from loguru import logger
import torch
class PerformanceMonitor:
@staticmethod
def monitor_memory():
process = psutil.Process(os.getpid())
memory_info = process.memory_info()
logger.debug(f"Memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
if torch.cuda.is_available():
gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024
logger.debug(f"GPU Memory usage: {gpu_memory:.2f} MB")
@staticmethod
def cleanup_resources():
if torch.cuda.is_available():
torch.cuda.empty_cache()
import gc
gc.collect()
PerformanceMonitor.monitor_memory()
def monitor_performance(func):
"""性能监控装饰器"""
def wrapper(*args, **kwargs):
try:
PerformanceMonitor.monitor_memory()
result = func(*args, **kwargs)
return result
finally:
PerformanceMonitor.cleanup_resources()
return wrapper