diff --git a/.gitignore b/.gitignore index 4bea0a5..8096610 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,5 @@ resource/fonts/*.ttf resource/fonts/*.otf resource/srt/*.srt app/models/faster-whisper-large-v2/* +app/models/faster-whisper-large-v3/* app/models/bert/* diff --git a/README-cn.md b/README-en.md similarity index 100% rename from README-cn.md rename to README-en.md diff --git a/README.md b/README.md index b969811..7528267 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@

一站式 AI 影视解说+自动化剪辑工具🎬🎞️

-

📖 English | 简体中文 | 日本語

+

📖 English | 简体中文 | 日本語

[//]: # ( harry0703%2FNarratoAI | Trendshift) @@ -83,7 +83,7 @@ _**注意⚠️:近期在 x (推特) 上发现有人冒充作者在 pump.fun ## 配置要求 📦 - 建议最低 CPU 4核或以上,内存 8G 或以上,显卡非必须 -- Windows 10 或 MacOS 11.0 以上系统 +- Windows 10/11 或 MacOS 11.0 以上系统 - [Python 3.10+](https://www.python.org/downloads/) ## 反馈建议 📢 diff --git a/app/models/schema.py b/app/models/schema.py index 5e2e909..ddf0ad1 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -20,7 +20,9 @@ class VideoConcatMode(str, Enum): class VideoAspect(str, Enum): landscape = "16:9" + landscape_2 = "4:3" portrait = "9:16" + portrait_2 = "3:4" square = "1:1" def to_resolution(self): @@ -360,13 +362,14 @@ class VideoClipParams(BaseModel): text_back_color: Optional[str] = None # 文本背景色 stroke_color: str = "black" # 描边颜色 stroke_width: float = 1.5 # 描边宽度 - subtitle_position: str = "bottom" # top, bottom, center, custom + subtitle_position: str = "bottom" # top, bottom, center, custom + custom_position: float = 70.0 # 自定义位置 - n_threads: Optional[int] = Field(default=16, description="解说语音音量") # 线程���,有助于提升视频处理速度 + n_threads: Optional[int] = Field(default=16, description="线程数") # 线程数,有助于提升视频处理速度 tts_volume: Optional[float] = Field(default=1.0, description="解说语音音量(后处理)") original_volume: Optional[float] = Field(default=1.0, description="视频原声音量") - bgm_volume: Optional[float] = Field(default=0.6, description="背景音乐音量") + bgm_volume: Optional[float] = Field(default=0.3, description="背景音乐音量") class VideoTranscriptionRequest(BaseModel): diff --git a/app/models/schema_v2.py b/app/models/schema_v2.py index 1611a3b..8584c75 100644 --- a/app/models/schema_v2.py +++ b/app/models/schema_v2.py @@ -6,6 +6,7 @@ class GenerateScriptRequest(BaseModel): video_path: str video_theme: Optional[str] = "" custom_prompt: Optional[str] = "" + frame_interval_input: Optional[int] = 5 skip_seconds: Optional[int] = 0 threshold: Optional[int] = 30 vision_batch_size: Optional[int] = 5 diff --git a/app/services/SDP/generate_script_short.pyd 
b/app/services/SDP/generate_script_short.pyd index 72c29a7..de8b47c 100644 Binary files a/app/services/SDP/generate_script_short.pyd and b/app/services/SDP/generate_script_short.pyd differ diff --git a/app/services/SDP/generate_script_short.so b/app/services/SDP/generate_script_short.so index fb65efd..d659cd4 100755 Binary files a/app/services/SDP/generate_script_short.so and b/app/services/SDP/generate_script_short.so differ diff --git a/app/services/SDP/utils/short_schema.pyd b/app/services/SDP/utils/short_schema.pyd index e6b7c24..7774303 100644 Binary files a/app/services/SDP/utils/short_schema.pyd and b/app/services/SDP/utils/short_schema.pyd differ diff --git a/app/services/SDP/utils/short_schema.so b/app/services/SDP/utils/short_schema.so index 161acc5..933ef73 100755 Binary files a/app/services/SDP/utils/short_schema.so and b/app/services/SDP/utils/short_schema.so differ diff --git a/app/services/SDP/utils/step1_subtitle_analyzer_openai.pyd b/app/services/SDP/utils/step1_subtitle_analyzer_openai.pyd index 798a5c8..5c1da0b 100644 Binary files a/app/services/SDP/utils/step1_subtitle_analyzer_openai.pyd and b/app/services/SDP/utils/step1_subtitle_analyzer_openai.pyd differ diff --git a/app/services/SDP/utils/step1_subtitle_analyzer_openai.so b/app/services/SDP/utils/step1_subtitle_analyzer_openai.so index 94a963e..f43f7d4 100755 Binary files a/app/services/SDP/utils/step1_subtitle_analyzer_openai.so and b/app/services/SDP/utils/step1_subtitle_analyzer_openai.so differ diff --git a/app/services/SDP/utils/step2_subtitle_analyzer_bert.pyd b/app/services/SDP/utils/step2_subtitle_analyzer_bert.pyd index 1cfa6ea..1e6913d 100644 Binary files a/app/services/SDP/utils/step2_subtitle_analyzer_bert.pyd and b/app/services/SDP/utils/step2_subtitle_analyzer_bert.pyd differ diff --git a/app/services/SDP/utils/step2_subtitle_analyzer_bert.so b/app/services/SDP/utils/step2_subtitle_analyzer_bert.so index 2e67bb1..8b6587d 100755 Binary files 
a/app/services/SDP/utils/step2_subtitle_analyzer_bert.so and b/app/services/SDP/utils/step2_subtitle_analyzer_bert.so differ diff --git a/app/services/SDP/utils/step3_fragment_check.pyd b/app/services/SDP/utils/step3_fragment_check.pyd index bbc015d..38f4991 100644 Binary files a/app/services/SDP/utils/step3_fragment_check.pyd and b/app/services/SDP/utils/step3_fragment_check.pyd differ diff --git a/app/services/SDP/utils/step3_fragment_check.so b/app/services/SDP/utils/step3_fragment_check.so index 2bd0ff3..e57e026 100755 Binary files a/app/services/SDP/utils/step3_fragment_check.so and b/app/services/SDP/utils/step3_fragment_check.so differ diff --git a/app/services/SDP/utils/step4_text_generate.pyd b/app/services/SDP/utils/step4_text_generate.pyd index 77cef03..2454e49 100644 Binary files a/app/services/SDP/utils/step4_text_generate.pyd and b/app/services/SDP/utils/step4_text_generate.pyd differ diff --git a/app/services/SDP/utils/step4_text_generate.so b/app/services/SDP/utils/step4_text_generate.so index 916415f..fd536e7 100755 Binary files a/app/services/SDP/utils/step4_text_generate.so and b/app/services/SDP/utils/step4_text_generate.so differ diff --git a/app/services/SDP/utils/step5_merge_script.pyd b/app/services/SDP/utils/step5_merge_script.pyd index 4ceaf8b..b284950 100644 Binary files a/app/services/SDP/utils/step5_merge_script.pyd and b/app/services/SDP/utils/step5_merge_script.pyd differ diff --git a/app/services/SDP/utils/step5_merge_script.so b/app/services/SDP/utils/step5_merge_script.so index 11e685a..ec8181e 100755 Binary files a/app/services/SDP/utils/step5_merge_script.so and b/app/services/SDP/utils/step5_merge_script.so differ diff --git a/app/services/SDP/utils/utils.pyd b/app/services/SDP/utils/utils.pyd index ad16a2c..8c70c3d 100644 Binary files a/app/services/SDP/utils/utils.pyd and b/app/services/SDP/utils/utils.pyd differ diff --git a/app/services/SDP/utils/utils.so b/app/services/SDP/utils/utils.so index 8d48207..608acd4 100755 Binary 
files a/app/services/SDP/utils/utils.so and b/app/services/SDP/utils/utils.so differ diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py index c7edc77..bedb585 100644 --- a/app/services/audio_merger.py +++ b/app/services/audio_merger.py @@ -18,15 +18,14 @@ def check_ffmpeg(): return False -def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list): +def merge_audio_files(task_id: str, total_duration: float, list_script: list): """ - 合并音频文件,根据OST设置处理不同的音频轨道 + 合并音频文件 Args: task_id: 任务ID - audio_files: TTS生成的音频文件列表 total_duration: 总时长 - list_script: 完整脚本信息,包含OST设置 + list_script: 完整脚本信息,包含duration时长和audio路径 Returns: str: 合并后的音频文件路径 @@ -39,36 +38,38 @@ def merge_audio_files(task_id: str, audio_files: list, total_duration: float, li # 创建一个空的音频片段 final_audio = AudioSegment.silent(duration=total_duration * 1000) # 总时长以毫秒为单位 + # 计算每个片段的开始位置(基于duration字段) + current_position = 0 # 初始位置(秒) + # 遍历脚本中的每个片段 - for segment, audio_file in zip(list_script, audio_files): + for segment in list_script: try: - # 加载TTS音频文件 - tts_audio = AudioSegment.from_file(audio_file) - - # 获取片段的开始和结束时间 - start_time, end_time = segment['new_timestamp'].split('-') - start_seconds = utils.time_to_seconds(start_time) - end_seconds = utils.time_to_seconds(end_time) - - # 根据OST设置处理音频 - if segment['OST'] == 0: - # 只使用TTS音频 - final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000) - elif segment['OST'] == 1: - # 只使用原声(假设原声已经在视频中) - continue - elif segment['OST'] == 2: - # 混合TTS音频和原声 - original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000) - mixed_audio = original_audio.overlay(tts_audio) - final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000) + # 获取片段时长(秒) + duration = segment['duration'] + + # 检查audio字段是否为空 + if segment['audio'] and os.path.exists(segment['audio']): + # 加载TTS音频文件 + tts_audio = AudioSegment.from_file(segment['audio']) + + # 将TTS音频添加到最终音频 + final_audio = 
final_audio.overlay(tts_audio, position=current_position * 1000) + else: + # audio为空,不添加音频,仅保留间隔 + logger.info(f"片段 {segment.get('timestamp', '')} 没有音频文件,保留 {duration} 秒的间隔") + + # 更新下一个片段的开始位置 + current_position += duration except Exception as e: - logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}") + logger.error(f"处理音频片段时出错: {str(e)}") + # 即使处理失败,也要更新位置,确保后续片段位置正确 + if 'duration' in segment: + current_position += segment['duration'] continue # 保存合并后的音频文件 - output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3") + output_audio_path = os.path.join(utils.task_dir(task_id), "merger_audio.mp3") final_audio.export(output_audio_path, format="mp3") logger.info(f"合并后的音频文件已保存: {output_audio_path}") @@ -93,7 +94,7 @@ def time_to_seconds(time_str): # 分割时间部分 parts = time_part.split(':') - + if len(parts) == 3: # HH:MM:SS h, m, s = map(int, parts) seconds = h * 3600 + m * 60 + s @@ -118,11 +119,11 @@ def extract_timestamp(filename): # 从文件名中提取时间部分 time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06,500-00_24,800" 部分 start_time, end_time = time_part.split('-') # 分割成开始和结束时间 - + # 将下划线格式转换回冒号格式 start_time = start_time.replace('_', ':') end_time = end_time.replace('_', ':') - + # 将时间戳转换为秒 start_seconds = time_to_seconds(start_time) end_seconds = time_to_seconds(end_time) @@ -135,17 +136,36 @@ def extract_timestamp(filename): if __name__ == "__main__": # 示例用法 - audio_files =[ - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:06-00:24.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:32-00:38.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:43-00:52.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:52-01:09.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01:13-01:15.mp3", - ] - total_duration = 38 - video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json" - with open(video_script_path, "r", 
encoding="utf-8") as f: - video_script = json.load(f) + total_duration = 90 - output_file = merge_audio_files("test456", audio_files, total_duration, video_script) + video_script = [ + {'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!', + 'timestamp': '00:00:00-00:00:26', + 'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!', + 'OST': 0, 'duration': 26, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3'}, + {'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!', 'timestamp': '00:01:15-00:01:29', + 'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…', + 'OST': 0, 'duration': 14, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3'}, + {'picture': '画面切到王启年小心翼翼地向范闲汇报。', 'timestamp': '00:04:41-00:04:58', + 'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪', + 'OST': 1, 'duration': 17, + 'audio': ''}, + {'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。', + 'timestamp': '00:04:58-00:05:20', + 'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!', + 'OST': 0, 'duration': 22, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3'}, + {'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', + 'timestamp': '00:05:45-00:05:53', + 'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', + 'OST': 0, 'duration': 8, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'}, + {'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。', 'timestamp': '00:06:00-00:06:03', + 'narration': '抓刺客', + 'OST': 1, 'duration': 3, + 'audio': ''}] + + output_file = merge_audio_files("test456", total_duration, video_script) print(output_file) diff --git a/app/services/clip_video.py b/app/services/clip_video.py new 
file mode 100644 index 0000000..1329333 --- /dev/null +++ b/app/services/clip_video.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : clip_video +@Author : 小林同学 +@Date : 2025/5/6 下午6:14 +''' + +import os +import subprocess +import json +import hashlib +from loguru import logger +from typing import Dict, List, Optional +from pathlib import Path + + +def parse_timestamp(timestamp: str) -> tuple: + """ + 解析时间戳字符串,返回开始和结束时间 + + Args: + timestamp: 格式为'HH:MM:SS-HH:MM:SS'或'HH:MM:SS,sss-HH:MM:SS,sss'的时间戳字符串 + + Returns: + tuple: (开始时间, 结束时间) 格式为'HH:MM:SS'或'HH:MM:SS,sss' + """ + start_time, end_time = timestamp.split('-') + return start_time, end_time + + +def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str: + """ + 根据开始时间和持续时间计算结束时间 + + Args: + start_time: 开始时间,格式为'HH:MM:SS'或'HH:MM:SS,sss'(带毫秒) + duration: 持续时间,单位为秒 + extra_seconds: 额外添加的秒数,默认为1秒 + + Returns: + str: 计算后的结束时间,格式与输入格式相同 + """ + # 检查是否包含毫秒 + has_milliseconds = ',' in start_time + milliseconds = 0 + + if has_milliseconds: + time_part, ms_part = start_time.split(',') + h, m, s = map(int, time_part.split(':')) + milliseconds = int(ms_part) + else: + h, m, s = map(int, start_time.split(':')) + + # 转换为总毫秒数 + total_milliseconds = ((h * 3600 + m * 60 + s) * 1000 + milliseconds + + int((duration + extra_seconds) * 1000)) + + # 计算新的时、分、秒、毫秒 + ms_new = total_milliseconds % 1000 + total_seconds = total_milliseconds // 1000 + h_new = int(total_seconds // 3600) + m_new = int((total_seconds % 3600) // 60) + s_new = int(total_seconds % 60) + + # 返回与输入格式一致的时间字符串 + if has_milliseconds: + return f"{h_new:02d}:{m_new:02d}:{s_new:02d},{ms_new:03d}" + else: + return f"{h_new:02d}:{m_new:02d}:{s_new:02d}" + + +def check_hardware_acceleration() -> Optional[str]: + """ + 检查系统支持的硬件加速选项 + + Returns: + Optional[str]: 硬件加速参数,如果不支持则返回None + """ + # 检查NVIDIA GPU支持 + try: + nvidia_check = subprocess.run( + ["ffmpeg", "-hwaccel", "cuda", "-i", 
"/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if nvidia_check.returncode == 0: + return "cuda" + except Exception: + pass + + # 检查MacOS videotoolbox支持 + try: + videotoolbox_check = subprocess.run( + ["ffmpeg", "-hwaccel", "videotoolbox", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if videotoolbox_check.returncode == 0: + return "videotoolbox" + except Exception: + pass + + # 检查Intel Quick Sync支持 + try: + qsv_check = subprocess.run( + ["ffmpeg", "-hwaccel", "qsv", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if qsv_check.returncode == 0: + return "qsv" + except Exception: + pass + + return None + + +def clip_video( + video_origin_path: str, + tts_result: List[Dict], + output_dir: Optional[str] = None, + task_id: Optional[str] = None +) -> Dict[str, str]: + """ + 根据时间戳裁剪视频 + + Args: + video_origin_path: 原始视频的路径 + tts_result: 包含时间戳和持续时间信息的列表 + output_dir: 输出目录路径,默认为None时会自动生成 + task_id: 任务ID,用于生成唯一的输出目录,默认为None时会自动生成 + + Returns: + Dict[str, str]: 时间戳到裁剪后视频路径的映射 + """ + # 检查视频文件是否存在 + if not os.path.exists(video_origin_path): + raise FileNotFoundError(f"视频文件不存在: {video_origin_path}") + + # 如果未提供task_id,则根据输入生成一个唯一ID + if task_id is None: + content_for_hash = f"{video_origin_path}_{json.dumps(tts_result)}" + task_id = hashlib.md5(content_for_hash.encode()).hexdigest() + + # 设置输出目录 + if output_dir is None: + output_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "storage", "temp", "clip_video", task_id + ) + + # 确保输出目录存在 + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # 检查硬件加速支持 + hwaccel = check_hardware_acceleration() + hwaccel_args = [] + if hwaccel: + hwaccel_args = ["-hwaccel", hwaccel] + logger.info(f"使用硬件加速: {hwaccel}") + + # 存储裁剪结果 + result = {} + + for item in tts_result: + _id = 
item.get("_id", item.get("timestamp", "unknown")) + timestamp = item["timestamp"] + start_time, _ = parse_timestamp(timestamp) + + # 根据持续时间计算真正的结束时间(加上1秒余量) + duration = item["duration"] + calculated_end_time = calculate_end_time(start_time, duration) + + # 转换为FFmpeg兼容的时间格式(逗号替换为点) + ffmpeg_start_time = start_time.replace(',', '.') + ffmpeg_end_time = calculated_end_time.replace(',', '.') + + # 格式化输出文件名(使用连字符替代冒号和逗号) + safe_start_time = start_time.replace(':', '-').replace(',', '-') + safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-') + output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4" + output_path = os.path.join(output_dir, output_filename) + + # 构建FFmpeg命令 + ffmpeg_cmd = [ + "ffmpeg", "-y", *hwaccel_args, + "-i", video_origin_path, + "-ss", ffmpeg_start_time, + "-to", ffmpeg_end_time, + "-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264", + "-c:a", "aac", + "-strict", "experimental", + output_path + ] + + # 执行FFmpeg命令 + try: + logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}") + # logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}") + + process = subprocess.run( + ffmpeg_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True + ) + + result[_id] = output_path + + except subprocess.CalledProcessError as e: + logger.error(f"裁剪视频片段失败: {timestamp}") + logger.error(f"错误信息: {e.stderr}") + raise RuntimeError(f"视频裁剪失败: {e.stderr}") + + return result + + +if __name__ == "__main__": + video_origin_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4" + + tts_result = [{'timestamp': '00:00:00-00:01:15', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt', + 'duration': 25.55, + 'text': '好的各位,欢迎回到我的频道!《庆余年 
2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!'}, + {'timestamp': '00:01:15-00:04:40', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt', + 'duration': 13.488, + 'text': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…'}, + {'timestamp': '00:04:58-00:05:45', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt', + 'duration': 21.363, + 'text': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!'}, + {'timestamp': '00:05:45-00:06:00', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt', + 'duration': 7.675, 'text': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!'}] + subclip_path_videos = { + '00:00:00-00:01:15': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-00-00-00-01-15.mp4', + '00:01:15-00:04:40': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-01-15-00-04-40.mp4', + '00:04:41-00:04:58': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4', + '00:04:58-00:05:45': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-58-00-05-45.mp4', + '00:05:45-00:06:00': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-05-45-00-06-00.mp4', + '00:06:00-00:06:03': 
'/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4', + } + + # 使用方法示例 + try: + result = clip_video(video_origin_path, tts_result, subclip_path_videos) + print("裁剪结果:") + print(json.dumps(result, indent=4, ensure_ascii=False)) + except Exception as e: + print(f"发生错误: {e}") diff --git a/app/services/generate_narration_script.py b/app/services/generate_narration_script.py new file mode 100644 index 0000000..f21aa6a --- /dev/null +++ b/app/services/generate_narration_script.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : 生成介绍文案 +@Author : 小林同学 +@Date : 2025/5/8 上午11:33 +''' + +import json +import os +import traceback +from openai import OpenAI +from loguru import logger + + +def parse_frame_analysis_to_markdown(json_file_path): + """ + 解析视频帧分析JSON文件并转换为Markdown格式 + + :param json_file_path: JSON文件路径 + :return: Markdown格式的字符串 + """ + # 检查文件是否存在 + if not os.path.exists(json_file_path): + return f"错误: 文件 {json_file_path} 不存在" + + try: + # 读取JSON文件 + with open(json_file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + # 初始化Markdown字符串 + markdown = "" + + # 获取总结和帧观察数据 + summaries = data.get('overall_activity_summaries', []) + frame_observations = data.get('frame_observations', []) + + # 按批次组织数据 + batch_frames = {} + for frame in frame_observations: + batch_index = frame.get('batch_index') + if batch_index not in batch_frames: + batch_frames[batch_index] = [] + batch_frames[batch_index].append(frame) + + # 生成Markdown内容 + for i, summary in enumerate(summaries, 1): + batch_index = summary.get('batch_index') + time_range = summary.get('time_range', '') + batch_summary = summary.get('summary', '') + + markdown += f"## 片段 {i}\n" + markdown += f"- 时间范围:{time_range}\n" + + # 添加片段描述 + markdown += f"- 片段描述:{batch_summary}\n" if batch_summary else f"- 片段描述:\n" + + markdown += "- 详细描述:\n" + + # 添加该批次的帧观察详情 + frames = 
batch_frames.get(batch_index, []) + for frame in frames: + timestamp = frame.get('timestamp', '') + observation = frame.get('observation', '') + + # 直接使用原始文本,不进行分割 + markdown += f" - {timestamp}: {observation}\n" if observation else f" - {timestamp}: \n" + + markdown += "\n" + + return markdown + + except Exception as e: + return f"处理JSON文件时出错: {traceback.format_exc()}" + + +def generate_narration(markdown_content, api_key, base_url, model): + """ + 调用OpenAI API根据视频帧分析的Markdown内容生成解说文案 + + :param markdown_content: Markdown格式的视频帧分析内容 + :param api_key: OpenAI API密钥 + :param base_url: API基础URL,如果使用非官方API + :param model: 使用的模型名称 + :return: 生成的解说文案 + """ + try: + # 构建提示词 + prompt = """ +我是一名荒野建造解说的博主,以下是一些同行的对标文案,请你深度学习并总结这些文案的风格特点跟内容特点: + + +解压助眠的天花板就是荒野建造,沉浸丝滑的搭建过程可以说每一帧都是极致享受,我保证强迫症来了都找不出一丁点毛病。更别说全屋严丝合缝的拼接工艺,还能轻松抵御零下二十度气温,让你居住的每一天都温暖如春。 +在家闲不住的西姆今天也打算来一次野外建造,行走没多久他就发现许多倒塌的树,任由它们自生自灭不如将其利用起来。想到这他就开始挥舞铲子要把地基挖掘出来,虽然每次只能挖一点点,但架不住他体能惊人。没多长时间一个 2x3 的深坑就赫然出现,这深度住他一人绰绰有余。 +随后他去附近收集来原木,这些都是搭建墙壁的最好材料。而在投入使用前自然要把表皮刮掉,防止森林中的白蚁蛀虫。处理好一大堆后西姆还在两端打孔,使用木钉固定在一起。这可不是用来做墙壁的,而是做庇护所的承重柱。只要木头间的缝隙足够紧密,那搭建出的木屋就能足够坚固。 +每向上搭建一层,他都会在中间塞入苔藓防寒,保证不会泄露一丝热量。其他几面也是用相同方法,很快西姆就做好了三面墙壁,每一根木头都极其工整,保证强迫症来了都要点个赞再走。 +在继续搭建墙壁前西姆决定将壁炉制作出来,毕竟森林夜晚的气温会很低,保暖措施可是重中之重。完成后他找来一块大树皮用来充当庇护所的大门,而上面刮掉的木屑还能作为壁炉的引火物,可以说再完美不过。 +测试了排烟没问题后他才开始搭建最后一面墙壁,这一面要预留门和窗,所以在搭建到一半后还需要在原木中间开出卡口,让自己劈砍时能轻松许多。此时只需将另外一根如法炮制,两端拼接在一起后就是一扇大小适中的窗户。而随着随后一层苔藓铺好,最后一根原木落位,这个庇护所的雏形就算完成。 +大门的安装他没选择用合页,而是在底端雕刻出榫头,门框上则雕刻出榫眼,只能说西姆的眼就是一把尺,这完全就是严丝合缝。此时他才开始搭建屋顶。这里西姆用的方法不同,他先把最外围的原木固定好,随后将原木平铺在上面,就能得到完美的斜面屋顶。等他将四周的围栏也装好后,工整的屋顶看起来十分舒服,西姆躺上去都不想动。 +稍作休息后,他利用剩余的苔藓,对屋顶的缝隙处密封。可这样西姆觉得不够保险,于是他找来一些黏土,再次对原本的缝隙二次加工,保管这庇护所冬天也暖和。最后只需要平铺上枯叶,以及挖掘出的泥土,整个屋顶就算完成。 +考虑到庇护所的美观性,自然少不了覆盖上苔藓,翠绿的颜色看起来十分舒服。就连门口的庭院旁,他都移植了许多小树做点缀,让这木屋与周边环境融为一体。西姆才刚完成好这件事,一场大雨就骤然降临。好在此时的他已经不用淋雨,更别说这屋顶防水十分不错,室内没一点雨水渗透进来。 +等待温度回升的过程,西姆利用墙壁本身的凹槽,把床框镶嵌在上面,只需要铺上苔藓,以及自带的床单枕头,一张完美的单人床就做好。辛苦劳作一整天,西姆可不会亏待自己。他将自带的牛肉腌制好后,直接放到壁炉中烤,只需要等待三十分钟,就能享受这美味的一顿。 
+在辛苦建造一星期后,他终于可以在自己搭建的庇护所中,享受最纯正的野外露营。后面西姆回家补给了一堆物资,再次回来时森林已经大雪纷飞,让他原本翠绿的小屋,更换上了冬季限定皮肤。好在内部设施没受什么影响,和他离开时一样整洁。 +就是房间中已经没多少柴火,让西姆今天又得劈柴。寒冷干燥的天气,让木头劈起来十分轻松。没多久他就收集到一大堆,这些足够燃烧好几天。虽然此时外面大雪纷飞,但小屋中却开始逐渐温暖。这次他除了带来一些食物外,还有几瓶调味料,以及一整套被褥,让自己的居住舒适度提高一大截。 +而秋天他有收集干草的缘故,只需要塞入枕套中密封起来,就能作为靠垫用。就这居住条件,比一般人在家过的还要奢侈。趁着壁炉木头变木炭的过程,西姆则开始不紧不慢的处理食物。他取出一块牛排,改好花刀以后,撒上一堆调料腌制起来。接着用锡纸包裹好,放到壁炉中直接炭烤,搭配上自带的红酒,是一个非常好的选择。 +随着时间来到第二天,外面的积雪融化了不少,西姆简单做顿煎蛋补充体力后,决定制作一个室外篝火堆,用来晚上驱散周边野兽。搭建这玩意没什么技巧,只需要找到一大堆木棍,利用大树的夹缝将其掰弯,然后将其堆积在一起,就是一个简易版的篝火堆。看这外形有点像帐篷,好在西姆没想那么多。 +等待天色暗淡下来后,他才来到室外将其点燃,顺便处理下多余的废料。只可惜这场景没朋友陪在身边,对西姆来说可能是个遗憾。而哪怕森林只有他一个人,都依旧做了好几个小时。等到里面的篝火彻底燃尽后,西姆还找来雪球,覆盖到上面将火熄灭,这防火意识可谓十分好。最后在室内二十五度的高温下,裹着被子睡觉。 + + + +解压助眠的天花板就是荒野建造,沉浸丝滑的搭建过程每一帧都是极致享受,全屋严丝合缝的拼接工艺,能轻松抵御零下二十度气温,居住体验温暖如春。 +在家闲不住的西姆开启野外建造。他发现倒塌的树,决定加以利用。先挖掘出 2x3 的深坑作为地基,接着收集原木,刮掉表皮防白蚁蛀虫,打孔用木钉固定制作承重柱。搭建墙壁时,每一层都塞入苔藓防寒,很快做好三面墙。 +为应对森林夜晚低温,西姆制作壁炉,用大树皮当大门,刮下的木屑做引火物。搭建最后一面墙时预留门窗,通过在原木中间开口拼接做出窗户。大门采用榫卯结构安装,严丝合缝。 +搭建屋顶时,先固定外围原木,再平铺原木形成斜面屋顶,之后用苔藓、黏土密封缝隙,铺上枯叶和泥土。为美观,在木屋覆盖苔藓,移植小树点缀。完工时遇大雨,木屋防水良好。 +西姆利用墙壁凹槽镶嵌床框,铺上苔藓、床单枕头做成床。劳作一天后,他用壁炉烤牛肉享用。建造一星期后,他开始野外露营。 +后来西姆回家补给物资,回来时森林大雪纷飞。他劈柴储备,带回食物、调味料和被褥,提高居住舒适度,还用干草做靠垫。他用壁炉烤牛排,搭配红酒。 +第二天,积雪融化,西姆制作室外篝火堆防野兽。用大树夹缝掰弯木棍堆积而成,晚上点燃处理废料,结束后用雪球灭火,最后在室内二十五度的环境中裹被入睡。 + + + +如果战争到来,这个深埋地下十几米的庇护所绝对是 bug 般的存在。即使被敌人发现,还能通过快速通道一秒逃出。里面不仅有竹子、地暖、地下水井,还自制抽水机。在解决用水问题的同时,甚至自研无土栽培技术,过上完全自给自足的生活。 +阿伟的老婆美如花,但阿伟从来不回家,来到野外他乐哈哈,一言不合就开挖。众所周知当战争来临时,地下堡垒的安全性是最高的。阿伟苦苦研习两载半,只为练就一身挖洞本领。在这双逆天麒麟臂的加持下,如此坚硬的泥土都只能当做炮灰。 +得到了充足的空间后,他便开始对这些边缘进行打磨。随后阿伟将细线捆在木棍上,以此描绘出圆柱的轮廓。接着再一点点铲掉多余的部分。虽然是由泥土一体式打造,但这样的桌子保准用上千年都不成问题。 +考虑到十几米的深度进出非常不方便,于是阿伟找来两根长达 66.6 米的木头,打算为庇护所打造一条快速通道。只见他将木桩牢牢地插入地下,并顺着洞口的方向延伸出去,直到贯穿整个山洞。接着在每个木桩的连接处钉入铁钉,确保轨道不能有一毫米的偏差。完成后再制作一个木质框架,从而达到前后滑动的效果。 +不得不说阿伟这手艺简直就是大钢管子杵青蛙。在上面放上一个木制的车斗,还能加快搬运泥土的速度。没多久庇护所的内部就已经初见雏形。为了住起来更加舒适,还需要为自己打造一张床。虽然深处的泥土同样很坚固,但好处就是不用担心垮塌的风险。 +阿伟不仅设计了更加符合人体工学的拱形,并且还在一旁雕刻处壁龛。就是这氛围怎么看着有点不太吉利。别看阿伟一身腱子肉,但这身体里的艺术细菌可不少。每个边缘的地方他都做了精雕细琢,瞬间让整个卧室的颜值提升一大截。 
+住在地下的好处就是房子面积全靠挖,每平方消耗两个半馒头。不仅没有了房贷的压力,就连买墓地的钱也省了。阿伟将中间的墙壁挖空,从而得到取暖的壁炉。当然最重要的还有排烟问题,要想从上往下打通十几米的山体是件极其困难的事。好在阿伟年轻时报过忆坤年的古墓派补习班,这打洞技术堪比隔壁学校的土拨鼠专业。虽然深度长达十几米,但排烟效果却一点不受影响,一个字专业! +随后阿伟继续对壁炉底部雕刻,打通了底部放柴火的空间,并制作出放锅的灶头。完成后阿伟从侧面将壁炉打通,并制作出一条导热的通道,以此连接到床铺的位置。毕竟住在这么一个风湿宝地,不注意保暖除湿很容易得老寒腿。 +阿伟在床面上挖出一条条管道,以便于温度能传输到床的每个角落。接下来就可以根据这些通道的长度裁切出同样长短的竹子,根据竹筒的大小凿出相互连接的孔洞,最后再将竹筒内部打通,以达到温度传送的效果。 +而后阿伟将这些管道安装到凹槽内,在他严谨的制作工艺下,每根竹子刚好都能镶嵌进去。在铺设床面之前还需要用木塞把圆孔堵住,防止泥土掉落进管道。泥土虽然不能隔绝湿气,但却是十分优良的导热材料。等他把床面都压平后就可以小心的将这些木塞拔出来,最后再用黏土把剩余的管道也遮盖起来,直到整个墙面恢复原样。 +接下来还需要测试一下加热效果,当他把火点起来后,温度很快就传送到了管道内,把火力一点点加大,直到热气流淌到更远的床面。随着小孔里的青烟冒出,也预示着阿伟的地暖可以投入使用。而后阿伟制作了一些竹条,并用细绳将它们喜结连理。 +千里之行始于足下,美好的家园要靠自己双手打造。明明可以靠才艺吃饭的阿伟偏偏要用八块腹肌征服大家,就问这样的男人哪个野生婆娘不喜欢?完成后阿伟还用自己 35 码的大腚感受了一下,真烫! +随后阿伟来到野区找到一根上好的雷击木,他当即就把木头咔嚓成两段,并取下两节较为完整的带了回去,刚好能和圆桌配套。另外一个在里面凿出凹槽,并插入木棍连接,得到一个夯土的木锤。住过农村的小伙伴都知道,这样夯出来的地面堪比水泥地,不仅坚硬耐磨,还不用担心脚底打滑。忙碌了一天的阿伟已经饥渴难耐,拿出野生小烤肠,安安心心住新房,光脚爬上大热炕,一觉能睡到天亮。 +第二天阿伟打算将房间扩宽,毕竟吃住的地方有了,还要解决个人卫生的问题。阿伟在另一侧增加了一个房间,他打算将这里打造成洗澡的地方。为了防止泥土垮塌,他将顶部做成圆弧形,等挖出足够的空间后,旁边的泥土已经堆成了小山。 +为了方便清理这些泥土,阿伟在之前的轨道增加了转弯,交接处依然是用铁钉固定,一直延伸到房间的最里面。有了运输车的帮助,这些成吨的泥土也能轻松的运送出去,并且还能体验过山车的感觉。很快他就完成了清理工作。 +为了更方便的在里面洗澡,他将底部一点点挖空,这么大的浴缸,看来阿伟并不打算一个人住。完成后他将墙面雕刻的凹凸有致,让这里看起来更加豪华。接着用洛阳铲挖出排水口,并用一根相同大小的竹筒作为开关。 +由于四周都是泥土还不能防水,阿伟特意找了一些白蚁巢,用来制作可以防水的野生水泥。现在就可以将里里外外,能接触到水的地方都涂抹一遍。细心的阿伟还找来这种 500 克一斤的鹅卵石,对池子表面进行装饰。 +没错,水源问题阿伟早已经考虑在内,他打算直接在旁边挖个水井,毕竟已经挖了这么深,再向下挖一挖,应该就能到达地下水的深度。经过几日的奋战,能看得出阿伟已经消瘦了不少,但一想到马上就能拥有的豪宅,他直接化身为无情的挖土机器,很快就挖到了好几米的深度。 +考虑到自己的弹跳力有限,阿伟在一旁定入木桩,然后通过绳子爬上爬下。随着深度越来越深,井底已经开始渗出水来,这也预示着打井成功。没多久这里面将渗满泉水,仅凭一次就能挖到水源,看来这里还真是块风湿宝地。 +随后阿伟在井口四周挖出凹槽,以便于井盖的安置。这一量才知道,井的深度已经达到了足足的 5 米。阿伟把木板组合在一起,再沿着标记切掉多余部分,他甚至还给井盖做了把手。可是如何从这么深的井里打水还是个问题,但从阿伟坚定的眼神来看,他应该想到了解决办法。 +只见他将树桩锯成两半,然后用凿子把里面一点点掏空,另外一半也是如法炮制。接着还要在底部挖出圆孔,要想成功将水从 5 米深的地方抽上来,那就不得不提到大家熟知的勾股定理。没错,这跟勾股定理没什么关系。 +阿伟给竹筒做了一个木塞,并在里面打上安装连接轴的孔。为了增加密闭性,阿伟不得不牺牲了自己的 AJ,剪出与木塞相同的大小后,再用木钉固定住。随后他收集了一些树胶,并放到火上加热融化。接下来就可以涂在木塞上增加使用寿命。 +现在将竹筒组装完成,就可以利用虹吸原理将水抽上来。完成后就可以把井盖盖上去,再用泥土在上面覆盖,现在就不用担心失足掉下去了。 
+接下来阿伟去采集了一些大漆,将它涂抹在木桶接缝处,就能将其二合为一。完了再接入旁边浴缸的入水口,每个连接的地方都要做好密封,不然后面很容易漏水。随后就可以安装上活塞,并用一根木桩作为省力杠杆,根据空气压强的原理将井水抽上来。 +经过半小时的来回拉扯,硕大的浴缸终于被灌满,阿伟也是忍不住洗了把脸。接下来还需要解决排水的问题,阿伟在地上挖出沟渠,一直贯穿到屋外,然后再用竹筒从出水口连接,每个接口处都要抹上胶水,就连门外的出水口他都做了隐藏。 +在野外最重要的就是庇护所、水源还有食物。既然已经完成了前二者,那么阿伟还需要拥有可持续发展的食物来源。他先是在地上挖了两排地洞,然后在每根竹筒的表面都打上无数孔洞,这就是他打算用来种植的载体。在此之前,还需要用大火对竹筒进行杀菌消毒。 +趁着这时候,他去搬了一麻袋的木屑,先用芭蕉叶覆盖在上面,再铺上厚厚的黏土隔绝温度。在火焰的温度下,能让里面的木屑达到生长条件。 +等到第二天所有材料都晾凉后,阿伟才将竹筒内部掏空,并将木屑一点点地塞入竹筒。一切准备就绪,就可以将竹筒插入提前挖好的地洞。最后再往竹筒里塞入种子,依靠房间内的湿度和温度,就能达到大棚种植的效果。稍加时日,这些种子就会慢慢发芽。 +虽然暂时还吃不上自己培养的食物,但好在阿伟从表哥贺强那里学到不少钓鱼本领,哪怕只有一根小小的竹竿,也能让他钓上两斤半的大鲶鱼。新鲜的食材,那肯定是少不了高温消毒的过程。趁着鱼没熟,阿伟直接爬进浴缸,冰凉的井水瞬间洗去了身上的疲惫。这一刻的阿伟是无比的享受。 +不久后鱼也烤得差不多了,阿伟的生活现在可以说是有滋有味。住在十几米的地下,不仅能安全感满满,哪怕遇到危险,还能通过轨道快速逃生。 + + + +%s + + +我正在尝试做这个内容的解说纪录片视频,我需要你以 中的内容为解说目标,根据我刚才提供给你的对标文案 特点,以及你总结的特点,帮我生成一段关于荒野建造的解说文案,文案需要符合平台受欢迎的解说风格,请使用 json 格式进行输出;使用 中的输出格式: + + +{ + "items": [ + { + "_id": 1, # 唯一递增id + "timestamp": "00:00:05,390-00:00:10,430", + "picture": "画面描述", + "narration": "解说文案", + } +} + + + +1. 只输出 json 内容,不要输出其他任何说明性的文字 +2. 解说文案的语言使用 简体中文 +3. 
严禁虚构画面,所有画面只能从 中摘取 + +""" % (markdown_content) + + # 使用OpenAI SDK初始化客户端 + client = OpenAI( + api_key=api_key, + base_url=base_url + ) + + # 使用SDK发送请求 + if model not in ["deepseek-reasoner"]: + # deepseek-reasoner 不支持 json 输出 + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"}, + {"role": "user", "content": prompt} + ], + temperature=1.5, + response_format={"type": "json_object"}, + ) + # 提取生成的文案 + if response.choices and len(response.choices) > 0: + narration_script = response.choices[0].message.content + # 打印消耗的tokens + logger.debug(f"消耗的tokens: {response.usage.total_tokens}") + return narration_script + else: + return "生成解说文案失败: 未获取到有效响应" + else: + # 不支持 json 输出,需要多一步处理 ```json ``` 的步骤 + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"}, + {"role": "user", "content": prompt} + ], + temperature=1.5, + ) + # 提取生成的文案 + if response.choices and len(response.choices) > 0: + narration_script = response.choices[0].message.content + # 打印消耗的tokens + logger.debug(f"文案消耗的tokens: {response.usage.total_tokens}") + # 清理 narration_script 字符串前后的 ```json ``` 字符串 + narration_script = narration_script.replace("```json", "").replace("```", "") + return narration_script + else: + return "生成解说文案失败: 未获取到有效响应" + + except Exception as e: + return f"调用API生成解说文案时出错: {traceback.format_exc()}" + + +if __name__ == '__main__': + text_provider = 'openai' + text_api_key = "sk-xxx" + text_model = "deepseek-reasoner" + text_base_url = "https://api.deepseek.com" + video_frame_description_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1139.json" + + # 测试新的JSON文件 + test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1458.json" + markdown_output = parse_frame_analysis_to_markdown(test_file_path) + # print(markdown_output) + + # 输出到文件以便检查格式 + output_file = 
"/Users/apple/Desktop/home/NarratoAI/storage/temp/narration_script.md" + with open(output_file, 'w', encoding='utf-8') as f: + f.write(markdown_output) + # print(f"\n已将Markdown输出保存到: {output_file}") + + # 生成解说文案 + narration = generate_narration( + markdown_output, + text_api_key, + base_url=text_base_url, + model=text_model + ) + + # 保存解说文案 + print(narration) + print(type(narration)) + narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json" + with open(narration_file, 'w', encoding='utf-8') as f: + f.write(narration) + print(f"\n已将解说文案保存到: {narration_file}") diff --git a/app/services/generate_video.py b/app/services/generate_video.py new file mode 100644 index 0000000..f125c05 --- /dev/null +++ b/app/services/generate_video.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : generate_video +@Author : 小林同学 +@Date : 2025/5/7 上午11:55 +''' + +import os +import traceback +from typing import Optional, Dict, Any +from loguru import logger +from moviepy import ( + VideoFileClip, + AudioFileClip, + CompositeAudioClip, + CompositeVideoClip, + TextClip, + afx +) +from moviepy.video.tools.subtitles import SubtitlesClip +from PIL import ImageFont + +from app.utils import utils + + +def merge_materials( + video_path: str, + audio_path: str, + output_path: str, + subtitle_path: Optional[str] = None, + bgm_path: Optional[str] = None, + options: Optional[Dict[str, Any]] = None +) -> str: + """ + 合并视频、音频、BGM和字幕素材生成最终视频 + + 参数: + video_path: 视频文件路径 + audio_path: 音频文件路径 + output_path: 输出文件路径 + subtitle_path: 字幕文件路径,可选 + bgm_path: 背景音乐文件路径,可选 + options: 其他选项配置,可包含以下字段: + - voice_volume: 人声音量,默认1.0 + - bgm_volume: 背景音乐音量,默认0.3 + - original_audio_volume: 原始音频音量,默认0.0 + - keep_original_audio: 是否保留原始音频,默认False + - subtitle_font: 字幕字体,默认None,系统会使用默认字体 + - subtitle_font_size: 字幕字体大小,默认40 + - subtitle_color: 字幕颜色,默认白色 + - subtitle_bg_color: 字幕背景颜色,默认透明 + - subtitle_position: 字幕位置,可选值'bottom', 'top', 
'center',默认'bottom' + - custom_position: 自定义位置 + - stroke_color: 描边颜色,默认黑色 + - stroke_width: 描边宽度,默认1 + - threads: 处理线程数,默认2 + - fps: 输出帧率,默认30 + + 返回: + 输出视频的路径 + """ + # 合并选项默认值 + if options is None: + options = {} + + # 设置默认参数值 + voice_volume = options.get('voice_volume', 1.0) + bgm_volume = options.get('bgm_volume', 0.3) + original_audio_volume = options.get('original_audio_volume', 0.0) # 默认为0,即不保留原声 + keep_original_audio = options.get('keep_original_audio', False) # 是否保留原声 + subtitle_font = options.get('subtitle_font', '') + subtitle_font_size = options.get('subtitle_font_size', 40) + subtitle_color = options.get('subtitle_color', '#FFFFFF') + subtitle_bg_color = options.get('subtitle_bg_color', 'transparent') + subtitle_position = options.get('subtitle_position', 'bottom') + custom_position = options.get('custom_position', 70) + stroke_color = options.get('stroke_color', '#000000') + stroke_width = options.get('stroke_width', 1) + threads = options.get('threads', 2) + fps = options.get('fps', 30) + + # 处理透明背景色问题 - MoviePy 2.1.1不支持'transparent'值 + if subtitle_bg_color == 'transparent': + subtitle_bg_color = None # None在新版MoviePy中表示透明背景 + + # 创建输出目录(如果不存在) + output_dir = os.path.dirname(output_path) + os.makedirs(output_dir, exist_ok=True) + + logger.info(f"开始合并素材...") + logger.info(f" ① 视频: {video_path}") + logger.info(f" ② 音频: {audio_path}") + if subtitle_path: + logger.info(f" ③ 字幕: {subtitle_path}") + if bgm_path: + logger.info(f" ④ 背景音乐: {bgm_path}") + logger.info(f" ⑤ 输出: {output_path}") + + # 加载视频 + try: + video_clip = VideoFileClip(video_path) + logger.info(f"视频尺寸: {video_clip.size[0]}x{video_clip.size[1]}, 时长: {video_clip.duration}秒") + + # 提取视频原声(如果需要) + original_audio = None + if keep_original_audio and original_audio_volume > 0: + try: + original_audio = video_clip.audio + if original_audio: + original_audio = original_audio.with_effects([afx.MultiplyVolume(original_audio_volume)]) + logger.info(f"已提取视频原声,音量设置为: {original_audio_volume}") + else: + 
logger.warning("视频没有音轨,无法提取原声") + except Exception as e: + logger.error(f"提取视频原声失败: {str(e)}") + original_audio = None + + # 移除原始音轨,稍后会合并新的音频 + video_clip = video_clip.without_audio() + + except Exception as e: + logger.error(f"加载视频失败: {str(e)}") + raise + + # 处理背景音乐和所有音频轨道合成 + audio_tracks = [] + + # 先添加主音频(配音) + if audio_path and os.path.exists(audio_path): + try: + voice_audio = AudioFileClip(audio_path).with_effects([afx.MultiplyVolume(voice_volume)]) + audio_tracks.append(voice_audio) + logger.info(f"已添加配音音频,音量: {voice_volume}") + except Exception as e: + logger.error(f"加载配音音频失败: {str(e)}") + + # 添加原声(如果需要) + if original_audio is not None: + audio_tracks.append(original_audio) + logger.info(f"已添加视频原声,音量: {original_audio_volume}") + + # 添加背景音乐(如果有) + if bgm_path and os.path.exists(bgm_path): + try: + bgm_clip = AudioFileClip(bgm_path).with_effects([ + afx.MultiplyVolume(bgm_volume), + afx.AudioFadeOut(3), + afx.AudioLoop(duration=video_clip.duration), + ]) + audio_tracks.append(bgm_clip) + logger.info(f"已添加背景音乐,音量: {bgm_volume}") + except Exception as e: + logger.error(f"添加背景音乐失败: \n{traceback.format_exc()}") + + # 合成最终的音频轨道 + if audio_tracks: + final_audio = CompositeAudioClip(audio_tracks) + video_clip = video_clip.with_audio(final_audio) + logger.info(f"已合成所有音频轨道,共{len(audio_tracks)}个") + else: + logger.warning("没有可用的音频轨道,输出视频将没有声音") + + # 处理字体路径 + font_path = None + if subtitle_path and subtitle_font: + font_path = os.path.join(utils.font_dir(), subtitle_font) + if os.name == "nt": + font_path = font_path.replace("\\", "/") + logger.info(f"使用字体: {font_path}") + + # 处理视频尺寸 + video_width, video_height = video_clip.size + + # 字幕处理函数 + def create_text_clip(subtitle_item): + """创建单个字幕片段""" + phrase = subtitle_item[1] + max_width = video_width * 0.9 + + # 如果有字体路径,进行文本换行处理 + wrapped_txt = phrase + txt_height = 0 + if font_path: + wrapped_txt, txt_height = wrap_text( + phrase, + max_width=max_width, + font=font_path, + fontsize=subtitle_font_size + ) + + # 创建文本片段 + 
try: + _clip = TextClip( + text=wrapped_txt, + font=font_path, + font_size=subtitle_font_size, + color=subtitle_color, + bg_color=subtitle_bg_color, # 这里已经在前面处理过,None表示透明 + stroke_color=stroke_color, + stroke_width=stroke_width, + ) + except Exception as e: + logger.error(f"创建字幕片段失败: {str(e)}, 使用简化参数重试") + # 如果上面的方法失败,尝试使用更简单的参数 + _clip = TextClip( + text=wrapped_txt, + font=font_path, + font_size=subtitle_font_size, + color=subtitle_color, + ) + + # 设置字幕时间 + duration = subtitle_item[0][1] - subtitle_item[0][0] + _clip = _clip.with_start(subtitle_item[0][0]) + _clip = _clip.with_end(subtitle_item[0][1]) + _clip = _clip.with_duration(duration) + + # 设置字幕位置 + if subtitle_position == "bottom": + _clip = _clip.with_position(("center", video_height * 0.95 - _clip.h)) + elif subtitle_position == "top": + _clip = _clip.with_position(("center", video_height * 0.05)) + elif subtitle_position == "custom": + margin = 10 + max_y = video_height - _clip.h - margin + min_y = margin + custom_y = (video_height - _clip.h) * (custom_position / 100) + custom_y = max( + min_y, min(custom_y, max_y) + ) + _clip = _clip.with_position(("center", custom_y)) + else: # center + _clip = _clip.with_position(("center", "center")) + + return _clip + + # 创建TextClip工厂函数 + def make_textclip(text): + return TextClip( + text=text, + font=font_path, + font_size=subtitle_font_size, + color=subtitle_color, + ) + + # 处理字幕 + if subtitle_path and os.path.exists(subtitle_path): + try: + # 加载字幕文件 + sub = SubtitlesClip( + subtitles=subtitle_path, + encoding="utf-8", + make_textclip=make_textclip + ) + + # 创建每个字幕片段 + text_clips = [] + for item in sub.subtitles: + clip = create_text_clip(subtitle_item=item) + text_clips.append(clip) + + # 合成视频和字幕 + video_clip = CompositeVideoClip([video_clip, *text_clips]) + logger.info(f"已添加{len(text_clips)}个字幕片段") + except Exception as e: + logger.error(f"处理字幕失败: \n{traceback.format_exc()}") + + # 导出最终视频 + try: + video_clip.write_videofile( + output_path, + audio_codec="aac", 
+ temp_audiofile_path=output_dir, + threads=threads, + fps=fps, + ) + logger.success(f"素材合并完成: {output_path}") + except Exception as e: + logger.error(f"导出视频失败: {str(e)}") + raise + finally: + # 释放资源 + video_clip.close() + del video_clip + + return output_path + + +def wrap_text(text, max_width, font="Arial", fontsize=60): + """ + 文本换行函数,使长文本适应指定宽度 + + 参数: + text: 需要换行的文本 + max_width: 最大宽度(像素) + font: 字体路径 + fontsize: 字体大小 + + 返回: + 换行后的文本和文本高度 + """ + # 创建ImageFont对象 + try: + font_obj = ImageFont.truetype(font, fontsize) + except: + # 如果无法加载指定字体,使用默认字体 + font_obj = ImageFont.load_default() + + def get_text_size(inner_text): + inner_text = inner_text.strip() + left, top, right, bottom = font_obj.getbbox(inner_text) + return right - left, bottom - top + + width, height = get_text_size(text) + if width <= max_width: + return text, height + + processed = True + + _wrapped_lines_ = [] + words = text.split(" ") + _txt_ = "" + for word in words: + _before = _txt_ + _txt_ += f"{word} " + _width, _height = get_text_size(_txt_) + if _width <= max_width: + continue + else: + if _txt_.strip() == word.strip(): + processed = False + break + _wrapped_lines_.append(_before) + _txt_ = f"{word} " + _wrapped_lines_.append(_txt_) + if processed: + _wrapped_lines_ = [line.strip() for line in _wrapped_lines_] + result = "\n".join(_wrapped_lines_).strip() + height = len(_wrapped_lines_) * height + return result, height + + _wrapped_lines_ = [] + chars = list(text) + _txt_ = "" + for word in chars: + _txt_ += word + _width, _height = get_text_size(_txt_) + if _width <= max_width: + continue + else: + _wrapped_lines_.append(_txt_) + _txt_ = "" + _wrapped_lines_.append(_txt_) + result = "\n".join(_wrapped_lines_).strip() + height = len(_wrapped_lines_) * height + return result, height + + +if __name__ == '__main__': + merger_mp4 = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger.mp4' + merger_sub = 
'/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merged_subtitle_00_00_00-00_01_30.srt' + merger_audio = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger_audio.mp3' + bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3' + output_video = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/combined_test.mp4' + + # 调用示例 + options = { + 'voice_volume': 1.0, # 配音音量 + 'bgm_volume': 0.1, # 背景音乐音量 + 'original_audio_volume': 1.0, # 视频原声音量,0表示不保留 + 'keep_original_audio': True, # 是否保留原声 + 'subtitle_font': 'MicrosoftYaHeiNormal.ttc', # 这里使用相对字体路径,会自动在 font_dir() 目录下查找 + 'subtitle_font_size': 40, + 'subtitle_color': '#FFFFFF', + 'subtitle_bg_color': None, # 直接使用None表示透明背景 + 'subtitle_position': 'bottom', + 'threads': 2 + } + + try: + merge_materials( + video_path=merger_mp4, + audio_path=merger_audio, + subtitle_path=merger_sub, + bgm_path=bgm_path, + output_path=output_video, + options=options + ) + except Exception as e: + logger.error(f"合并素材失败: \n{traceback.format_exc()}") diff --git a/app/services/llm.py b/app/services/llm.py index d054eb1..0db7920 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -7,7 +7,7 @@ from typing import List from loguru import logger from openai import OpenAI from openai import AzureOpenAI -from moviepy.editor import VideoFileClip +from moviepy import VideoFileClip from openai.types.chat import ChatCompletion import google.generativeai as gemini from googleapiclient.errors import ResumableUploadError diff --git a/app/services/material.py b/app/services/material.py index 2a84f85..c048a92 100644 --- a/app/services/material.py +++ b/app/services/material.py @@ -4,9 +4,10 @@ import random import traceback from urllib.parse import urlencode from datetime import datetime +import json import requests -from typing import List +from typing import List, Optional from loguru import logger from moviepy.video.io.VideoFileClip import VideoFileClip @@ -306,7 +307,50 @@ def 
format_timestamp(seconds: float) -> str: return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}" -def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict: +def _detect_hardware_acceleration() -> Optional[str]: + """ + 检测系统可用的硬件加速器 + + Returns: + Optional[str]: 硬件加速参数,如果不支持则返回None + """ + # 检查NVIDIA GPU支持 + try: + nvidia_check = subprocess.run( + ["ffmpeg", "-hwaccel", "cuda", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if nvidia_check.returncode == 0: + return "cuda" + except Exception: + pass + + # 检查MacOS videotoolbox支持 + try: + videotoolbox_check = subprocess.run( + ["ffmpeg", "-hwaccel", "videotoolbox", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if videotoolbox_check.returncode == 0: + return "videotoolbox" + except Exception: + pass + + # 检查Intel Quick Sync支持 + try: + qsv_check = subprocess.run( + ["ffmpeg", "-hwaccel", "qsv", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if qsv_check.returncode == 0: + return "qsv" + except Exception: + pass + + return None + + +def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> str: """ 保存剪辑后的视频 @@ -328,29 +372,43 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di if not os.path.exists(save_dir): os.makedirs(save_dir) - # 生成更规范的视频文件名 - video_id = f"vid-{timestamp.replace(':', '-').replace(',', '_')}" - video_path = os.path.join(save_dir, f"{video_id}.mp4") + # 解析时间戳 + start_str, end_str = timestamp.split('-') + + # 格式化输出文件名(使用连字符替代冒号和逗号) + safe_start_time = start_str.replace(':', '-').replace(',', '-') + safe_end_time = end_str.replace(':', '-').replace(',', '-') + output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4" + video_path = os.path.join(save_dir, output_filename) + # 如果视频已存在,直接返回 if 
os.path.exists(video_path) and os.path.getsize(video_path) > 0: - logger.info(f"video already exists: {video_path}") - return {timestamp: video_path} + logger.info(f"视频已存在: {video_path}") + return video_path try: - # 加载视频获取总时长 - video = VideoFileClip(origin_video) - total_duration = video.duration + # 检查视频是否存在 + if not os.path.exists(origin_video): + logger.error(f"源视频文件不存在: {origin_video}") + return '' + + # 获取视频总时长 + try: + probe_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", origin_video] + total_duration = float(subprocess.check_output(probe_cmd).decode('utf-8').strip()) + except subprocess.CalledProcessError as e: + logger.error(f"获取视频时长失败: {str(e)}") + return '' - # 解析时间戳 - start_str, end_str = timestamp.split('-') + # 计算时间点 start = time_to_seconds(start_str) end = time_to_seconds(end_str) # 验证时间段 if start >= total_duration: logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)") - video.close() - return {} + return '' if end > total_duration: logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒),将自动调整为视频结尾") @@ -358,55 +416,74 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di if end <= start: logger.warning(f"结束时间 {format_timestamp(end)} 必须大于起始时间 {format_timestamp(start)}") - video.close() - return {} + return '' - # 剪辑视频 + # 计算剪辑时长 duration = end - start - logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}") + # logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}") - # 剪辑视频 - subclip = video.subclip(start, end) + # 检测可用的硬件加速选项 + hwaccel = _detect_hardware_acceleration() + hwaccel_args = [] + if hwaccel: + hwaccel_args = ["-hwaccel", hwaccel] + logger.info(f"使用硬件加速: {hwaccel}") - try: - # 检查视频是否有音频轨道并写入文件 
- subclip.write_videofile( - video_path, - codec='libx264', - audio_codec='aac', - temp_audiofile='temp-audio.m4a', - remove_temp=True, - audio=(subclip.audio is not None), - logger=None - ) - - # 验证生成的视频文件 - if os.path.exists(video_path) and os.path.getsize(video_path) > 0: - with VideoFileClip(video_path) as clip: - if clip.duration > 0 and clip.fps > 0: - return {timestamp: video_path} - - raise ValueError("视频文件验证失败") - - except Exception as e: - logger.warning(f"视频文件处理失败: {video_path} => {str(e)}") + # 转换为FFmpeg兼容的时间格式(逗号替换为点) + ffmpeg_start_time = start_str.replace(',', '.') + ffmpeg_end_time = end_str.replace(',', '.') + + # 构建FFmpeg命令 + ffmpeg_cmd = [ + "ffmpeg", "-y", *hwaccel_args, + "-i", origin_video, + "-ss", ffmpeg_start_time, + "-to", ffmpeg_end_time, + "-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264", + "-c:a", "aac", + "-strict", "experimental", + video_path + ] + + # 执行FFmpeg命令 + # logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}") + # logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}") + + process = subprocess.run( + ffmpeg_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False # 不抛出异常,我们会检查返回码 + ) + + # 检查是否成功 + if process.returncode != 0: + logger.error(f"视频剪辑失败: {process.stderr}") if os.path.exists(video_path): os.remove(video_path) + return '' + + # 验证生成的视频文件 + if os.path.exists(video_path) and os.path.getsize(video_path) > 0: + # 检查视频是否可播放 + probe_cmd = ["ffprobe", "-v", "error", video_path] + validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if validate_result.returncode == 0: + logger.info(f"视频剪辑成功: {video_path}") + return video_path - except Exception as e: - logger.warning(f"视频剪辑失败: \n{str(traceback.format_exc())}") + logger.error("视频文件验证失败") if os.path.exists(video_path): os.remove(video_path) - finally: - # 确保视频对象被正确关闭 - try: - video.close() - if 'subclip' in locals(): - subclip.close() - except: - pass - - return {} + 
return '' + + except Exception as e: + logger.error(f"视频剪辑过程中发生错误: \n{str(traceback.format_exc())}") + if os.path.exists(video_path): + os.remove(video_path) + return '' def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict: @@ -428,8 +505,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro try: saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory) if saved_video_path: - logger.info(f"video saved: {saved_video_path}") - video_paths.update(saved_video_path) + video_paths.update({index+1:saved_video_path}) # 更新进度 if progress_callback: @@ -439,6 +515,7 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro return {} logger.success(f"裁剪 {len(video_paths)} videos") + # logger.debug(json.dumps(video_paths, indent=4, ensure_ascii=False)) return video_paths diff --git a/app/services/merger_video.py b/app/services/merger_video.py new file mode 100644 index 0000000..66b58de --- /dev/null +++ b/app/services/merger_video.py @@ -0,0 +1,555 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : merger_video +@Author : 小林同学 +@Date : 2025/5/6 下午7:38 +''' + +import os +import shutil +import subprocess +from enum import Enum +from typing import List, Optional, Tuple +from loguru import logger + + +class VideoAspect(Enum): + """视频宽高比枚举""" + landscape = "16:9" # 横屏 16:9 + landscape_2 = "4:3" + portrait = "9:16" # 竖屏 9:16 + portrait_2 = "3:4" + square = "1:1" # 方形 1:1 + + def to_resolution(self) -> Tuple[int, int]: + """根据宽高比返回标准分辨率""" + if self == VideoAspect.portrait: + return 1080, 1920 # 竖屏 9:16 + elif self == VideoAspect.portrait_2: + return 720, 1280 # 竖屏 4:3 + elif self == VideoAspect.landscape: + return 1920, 1080 # 横屏 16:9 + elif self == VideoAspect.landscape_2: + return 1280, 720 # 横屏 4:3 + elif self == VideoAspect.square: + return 1080, 1080 # 方形 1:1 + else: + return 1080, 
1920 # 默认竖屏 + + +def check_ffmpeg_installation() -> bool: + """ + 检查ffmpeg是否已安装 + + Returns: + bool: 如果安装则返回True,否则返回False + """ + try: + subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + return True + except (subprocess.SubprocessError, FileNotFoundError): + logger.error("ffmpeg未安装或不在系统PATH中,请安装ffmpeg") + return False + + +def get_hardware_acceleration_option() -> Optional[str]: + """ + 根据系统环境选择合适的硬件加速选项 + + Returns: + Optional[str]: 硬件加速参数,如果不支持则返回None + """ + try: + # 检查NVIDIA GPU支持 + nvidia_check = subprocess.run( + ['ffmpeg', '-hide_banner', '-hwaccels'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True + ) + output = nvidia_check.stdout.lower() + + if 'cuda' in output: + return 'cuda' + elif 'nvenc' in output: + return 'nvenc' + elif 'qsv' in output: # Intel Quick Sync + return 'qsv' + elif 'videotoolbox' in output: # macOS + return 'videotoolbox' + elif 'vaapi' in output: # Linux VA-API + return 'vaapi' + else: + logger.info("没有找到支持的硬件加速器,将使用软件编码") + return None + except Exception as e: + logger.warning(f"检测硬件加速器时出错: {str(e)},将使用软件编码") + return None + + +def check_video_has_audio(video_path: str) -> bool: + """ + 检查视频是否包含音频流 + + Args: + video_path: 视频文件路径 + + Returns: + bool: 如果视频包含音频流则返回True,否则返回False + """ + if not os.path.exists(video_path): + logger.warning(f"视频文件不存在: {video_path}") + return False + + probe_cmd = [ + 'ffprobe', '-v', 'error', + '-select_streams', 'a:0', + '-show_entries', 'stream=codec_type', + '-of', 'csv=p=0', + video_path + ] + + try: + result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False) + return result.stdout.strip() == 'audio' + except Exception as e: + logger.warning(f"检测视频音频流时出错: {str(e)}") + return False + + +def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> str: + """ + 创建ffmpeg合并所需的concat文件 + + Args: + video_paths: 需要合并的视频文件路径列表 + concat_file_path: concat文件的输出路径 + + Returns: + str: 
concat文件的路径 + """ + with open(concat_file_path, 'w', encoding='utf-8') as f: + for video_path in video_paths: + # 获取绝对路径 + abs_path = os.path.abspath(video_path) + # 在Windows上将反斜杠替换为正斜杠 + if os.name == 'nt': # Windows系统 + abs_path = abs_path.replace('\\', '/') + else: # Unix/Mac系统 + # 转义特殊字符 + abs_path = abs_path.replace('\\', '\\\\').replace(':', '\\:') + + # 处理路径中的单引号 (如果有) + abs_path = abs_path.replace("'", "\\'") + + f.write(f"file '{abs_path}'\n") + return concat_file_path + + +def process_single_video( + input_path: str, + output_path: str, + target_width: int, + target_height: int, + keep_audio: bool = True, + hwaccel: Optional[str] = None +) -> str: + """ + 处理单个视频:调整分辨率、帧率等 + + Args: + input_path: 输入视频路径 + output_path: 输出视频路径 + target_width: 目标宽度 + target_height: 目标高度 + keep_audio: 是否保留音频 + hwaccel: 硬件加速选项 + + Returns: + str: 处理后的视频路径 + """ + if not os.path.exists(input_path): + raise FileNotFoundError(f"找不到视频文件: {input_path}") + + # 构建基本命令 + command = ['ffmpeg', '-y'] + + # 添加硬件加速参数 + if hwaccel: + if hwaccel == 'cuda' or hwaccel == 'nvenc': + command.extend(['-hwaccel', 'cuda']) + elif hwaccel == 'qsv': + command.extend(['-hwaccel', 'qsv']) + elif hwaccel == 'videotoolbox': + command.extend(['-hwaccel', 'videotoolbox']) + elif hwaccel == 'vaapi': + command.extend(['-hwaccel', 'vaapi', '-vaapi_device', '/dev/dri/renderD128']) + + # 输入文件 + command.extend(['-i', input_path]) + + # 处理音频 + if not keep_audio: + command.extend(['-an']) # 移除音频 + else: + # 检查输入视频是否有音频流 + has_audio = check_video_has_audio(input_path) + if has_audio: + command.extend(['-c:a', 'aac', '-b:a', '128k']) # 音频编码为AAC + else: + logger.warning(f"视频 {input_path} 没有音频流,将会忽略音频设置") + command.extend(['-an']) # 没有音频流时移除音频设置 + + # 视频处理参数:缩放并添加填充以保持比例 + scale_filter = f"scale={target_width}:{target_height}:force_original_aspect_ratio=decrease" + pad_filter = f"pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2" + command.extend([ + '-vf', f"{scale_filter},{pad_filter}", + '-r', '30', # 
设置帧率为30fps + ]) + + # 选择编码器 + if hwaccel == 'cuda' or hwaccel == 'nvenc': + command.extend(['-c:v', 'h264_nvenc', '-preset', 'p4', '-profile:v', 'high']) + elif hwaccel == 'qsv': + command.extend(['-c:v', 'h264_qsv', '-preset', 'medium']) + elif hwaccel == 'videotoolbox': + command.extend(['-c:v', 'h264_videotoolbox', '-profile:v', 'high']) + elif hwaccel == 'vaapi': + command.extend(['-c:v', 'h264_vaapi', '-profile', '100']) + else: + command.extend(['-c:v', 'libx264', '-preset', 'medium', '-profile:v', 'high']) + + # 设置视频比特率和其他参数 + command.extend([ + '-b:v', '5M', + '-maxrate', '8M', + '-bufsize', '10M', + '-pix_fmt', 'yuv420p', # 兼容性更好的颜色格式 + ]) + + # 输出文件 + command.append(output_path) + + # 执行命令 + try: + subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return output_path + except subprocess.CalledProcessError as e: + logger.error(f"处理视频失败: {e.stderr.decode() if e.stderr else str(e)}") + raise RuntimeError(f"处理视频失败: {str(e)}") + + +def combine_clip_videos( + output_video_path: str, + video_paths: List[str], + video_ost_list: List[int], + video_aspect: VideoAspect = VideoAspect.portrait, + threads: int = 4, +) -> str: + """ + 合并子视频 + Args: + output_video_path: 合并后的存储路径 + video_paths: 子视频路径列表 + video_ost_list: 原声播放列表 (0: 不保留原声, 1: 只保留原声, 2: 保留原声并保留解说) + video_aspect: 屏幕比例 + threads: 线程数 + + Returns: + str: 合并后的视频路径 + """ + # 检查ffmpeg是否安装 + if not check_ffmpeg_installation(): + raise RuntimeError("未找到ffmpeg,请先安装") + + # 准备输出目录 + output_dir = os.path.dirname(output_video_path) + os.makedirs(output_dir, exist_ok=True) + + # 获取目标分辨率 + aspect = VideoAspect(video_aspect) + video_width, video_height = aspect.to_resolution() + + # 检测可用的硬件加速选项 + hwaccel = get_hardware_acceleration_option() + if hwaccel: + logger.info(f"将使用 {hwaccel} 硬件加速") + + # 重组视频路径和原声设置为一个字典列表结构 + video_segments = [] + + # 检查视频路径和原声设置列表长度是否匹配 + if len(video_paths) != len(video_ost_list): + 
logger.warning(f"视频路径列表({len(video_paths)})和原声设置列表({len(video_ost_list)})长度不匹配") + # 调整长度以匹配较短的列表 + min_length = min(len(video_paths), len(video_ost_list)) + video_paths = video_paths[:min_length] + video_ost_list = video_ost_list[:min_length] + + # 创建视频处理配置字典列表 + for i, (video_path, video_ost) in enumerate(zip(video_paths, video_ost_list)): + if not os.path.exists(video_path): + logger.warning(f"视频不存在,跳过: {video_path}") + continue + + # 检查是否有音频流 + has_audio = check_video_has_audio(video_path) + + # 构建视频片段配置 + segment = { + "index": i, + "path": video_path, + "ost": video_ost, + "has_audio": has_audio, + "keep_audio": video_ost > 0 and has_audio # 只有当ost>0且实际有音频时才保留 + } + + # 记录日志 + if video_ost > 0 and not has_audio: + logger.warning(f"视频 {video_path} 设置为保留原声(ost={video_ost}),但该视频没有音频流") + + video_segments.append(segment) + + # 处理每个视频片段 + processed_videos = [] + temp_dir = os.path.join(output_dir, "temp_videos") + os.makedirs(temp_dir, exist_ok=True) + + try: + # 第一阶段:处理所有视频片段到中间文件 + for segment in video_segments: + # 处理单个视频,去除或保留音频 + temp_output = os.path.join(temp_dir, f"processed_{segment['index']}.mp4") + try: + process_single_video( + input_path=segment['path'], + output_path=temp_output, + target_width=video_width, + target_height=video_height, + keep_audio=segment['keep_audio'], + hwaccel=hwaccel + ) + processed_videos.append({ + "index": segment["index"], + "path": temp_output, + "keep_audio": segment["keep_audio"] + }) + logger.info(f"视频 {segment['index'] + 1}/{len(video_segments)} 处理完成") + except Exception as e: + logger.error(f"处理视频 {segment['path']} 时出错: {str(e)}") + continue + + if not processed_videos: + raise ValueError("没有有效的视频片段可以合并") + + # 按原始索引排序处理后的视频 + processed_videos.sort(key=lambda x: x["index"]) + + # 第二阶段:分步骤合并视频 - 避免复杂的filter_complex滤镜 + try: + # 1. 
首先,将所有没有音频的视频或音频被禁用的视频合并到一个临时文件中 + video_paths_only = [video["path"] for video in processed_videos] + video_concat_path = os.path.join(temp_dir, "video_concat.mp4") + + # 创建concat文件,用于合并视频流 + concat_file = os.path.join(temp_dir, "concat_list.txt") + create_ffmpeg_concat_file(video_paths_only, concat_file) + + # 合并所有视频流,但不包含音频 + concat_cmd = [ + 'ffmpeg', '-y', + '-f', 'concat', + '-safe', '0', + '-i', concat_file, + '-c:v', 'libx264', + '-preset', 'medium', + '-profile:v', 'high', + '-an', # 不包含音频 + '-threads', str(threads), + video_concat_path + ] + + subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("视频流合并完成") + + # 2. 提取并合并有音频的片段 + audio_segments = [video for video in processed_videos if video["keep_audio"]] + + if not audio_segments: + # 如果没有音频片段,直接使用无音频的合并视频作为最终结果 + shutil.copy(video_concat_path, output_video_path) + logger.info("无音频视频合并完成") + return output_video_path + + # 创建音频中间文件 + audio_files = [] + for i, segment in enumerate(audio_segments): + # 提取音频 + audio_file = os.path.join(temp_dir, f"audio_{i}.aac") + extract_audio_cmd = [ + 'ffmpeg', '-y', + '-i', segment["path"], + '-vn', # 不包含视频 + '-c:a', 'aac', + '-b:a', '128k', + audio_file + ] + subprocess.run(extract_audio_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + audio_files.append({ + "index": segment["index"], + "path": audio_file + }) + logger.info(f"提取音频 {i+1}/{len(audio_segments)} 完成") + + # 3. 
计算每个音频片段的时间位置 + audio_timings = [] + current_time = 0.0 + + # 获取每个视频片段的时长 + for i, video in enumerate(processed_videos): + duration_cmd = [ + 'ffprobe', '-v', 'error', + '-show_entries', 'format=duration', + '-of', 'csv=p=0', + video["path"] + ] + result = subprocess.run(duration_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + duration = float(result.stdout.strip()) + + # 如果当前片段需要保留音频,记录时间位置 + if video["keep_audio"]: + for audio in audio_files: + if audio["index"] == video["index"]: + audio_timings.append({ + "file": audio["path"], + "start": current_time, + "index": video["index"] + }) + break + + current_time += duration + + # 4. 创建静音音频轨道作为基础 + silence_audio = os.path.join(temp_dir, "silence.aac") + create_silence_cmd = [ + 'ffmpeg', '-y', + '-f', 'lavfi', + '-i', f'anullsrc=r=44100:cl=stereo', + '-t', str(current_time), # 总时长 + '-c:a', 'aac', + '-b:a', '128k', + silence_audio + ] + subprocess.run(create_silence_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # 5. 创建复杂滤镜命令以混合音频 + filter_script = os.path.join(temp_dir, "filter_script.txt") + with open(filter_script, 'w') as f: + f.write(f"[0:a]volume=0.0[silence];\n") # 首先静音背景轨道 + + # 添加每个音频文件 + for i, timing in enumerate(audio_timings): + f.write(f"[{i+1}:a]adelay={int(timing['start']*1000)}|{int(timing['start']*1000)}[a{i}];\n") + + # 混合所有音频 + mix_str = "[silence]" + for i in range(len(audio_timings)): + mix_str += f"[a{i}]" + mix_str += f"amix=inputs={len(audio_timings)+1}:duration=longest[aout]" + f.write(mix_str) + + # 6. 
构建音频合并命令 + audio_inputs = ['-i', silence_audio] + for timing in audio_timings: + audio_inputs.extend(['-i', timing["file"]]) + + mixed_audio = os.path.join(temp_dir, "mixed_audio.aac") + audio_mix_cmd = [ + 'ffmpeg', '-y' + ] + audio_inputs + [ + '-filter_complex_script', filter_script, + '-map', '[aout]', + '-c:a', 'aac', + '-b:a', '128k', + mixed_audio + ] + + subprocess.run(audio_mix_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("音频混合完成") + + # 7. 将合并的视频和混合的音频组合在一起 + final_cmd = [ + 'ffmpeg', '-y', + '-i', video_concat_path, + '-i', mixed_audio, + '-c:v', 'copy', + '-c:a', 'aac', + '-map', '0:v:0', + '-map', '1:a:0', + '-shortest', + output_video_path + ] + + subprocess.run(final_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("视频最终合并完成") + + return output_video_path + + except subprocess.CalledProcessError as e: + logger.error(f"合并视频过程中出错: {e.stderr.decode() if e.stderr else str(e)}") + + # 尝试备用合并方法 - 最简单的无音频合并 + logger.info("尝试备用合并方法 - 无音频合并") + try: + concat_file = os.path.join(temp_dir, "concat_list.txt") + video_paths_only = [video["path"] for video in processed_videos] + create_ffmpeg_concat_file(video_paths_only, concat_file) + + backup_cmd = [ + 'ffmpeg', '-y', + '-f', 'concat', + '-safe', '0', + '-i', concat_file, + '-c:v', 'copy', + '-an', # 无音频 + output_video_path + ] + + subprocess.run(backup_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.warning("使用备用方法(无音频)成功合并视频") + return output_video_path + except Exception as backup_error: + logger.error(f"备用合并方法也失败: {str(backup_error)}") + raise RuntimeError(f"无法合并视频: {str(backup_error)}") + + except Exception as e: + logger.error(f"合并视频时出错: {str(e)}") + raise + finally: + # 清理临时文件 + try: + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + logger.info("已清理临时文件") + except Exception as e: + logger.warning(f"清理临时文件时出错: {str(e)}") + + +if __name__ == '__main__': + video_paths = [ + 
'/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-00-00-00-00-26.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-01-15-00-01-29.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-04-58-00-05-20.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-05-45-00-05-53.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4' + ] + + combine_clip_videos( + output_video_path="/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/merged_123.mp4", + video_paths=video_paths, + video_ost_list=[1, 0, 1, 0, 0, 1], + video_aspect=VideoAspect.portrait + ) diff --git a/app/services/script_service.py b/app/services/script_service.py index 37644a7..461978b 100644 --- a/app/services/script_service.py +++ b/app/services/script_service.py @@ -3,10 +3,11 @@ import json import time import asyncio import requests +from app.utils import video_processor from loguru import logger from typing import List, Dict, Any, Callable -from app.utils import utils, gemini_analyzer, video_processor, video_processor_v2 +from app.utils import utils, gemini_analyzer, video_processor from app.utils.script_generator import ScriptProcessor from app.config import config @@ -21,6 +22,7 @@ class ScriptGenerator: video_path: str, video_theme: str = "", custom_prompt: str = "", + frame_interval_input: int = 5, skip_seconds: int = 0, threshold: int = 30, vision_batch_size: int = 5, @@ -105,20 +107,13 @@ class ScriptGenerator: os.makedirs(video_keyframes_dir, exist_ok=True) try: - if config.frames.get("version") == "v2": - processor = video_processor_v2.VideoProcessor(video_path) - 
processor.process_video_pipeline( - output_dir=video_keyframes_dir, - skip_seconds=skip_seconds, - threshold=threshold - ) - else: - processor = video_processor.VideoProcessor(video_path) - processor.process_video( - output_dir=video_keyframes_dir, - skip_seconds=skip_seconds - ) - + processor = video_processor.VideoProcessor(video_path) + processor.process_video_pipeline( + output_dir=video_keyframes_dir, + skip_seconds=skip_seconds, + threshold=threshold + ) + for filename in sorted(os.listdir(video_keyframes_dir)): if filename.endswith('.jpg'): keyframe_files.append(os.path.join(video_keyframes_dir, filename)) diff --git a/app/services/subtitle.py b/app/services/subtitle.py index e7f037d..c443c3f 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -4,11 +4,11 @@ import re import traceback from typing import Optional -from faster_whisper import WhisperModel +# from faster_whisper import WhisperModel from timeit import default_timer as timer from loguru import logger import google.generativeai as genai -from moviepy.editor import VideoFileClip +from moviepy import VideoFileClip import os from app.config import config @@ -33,7 +33,7 @@ def create(audio_file, subtitle_file: str = ""): """ global model, device, compute_type if not model: - model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2" + model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v3" model_bin_file = f"{model_path}/model.bin" if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file): logger.error( @@ -45,12 +45,25 @@ def create(audio_file, subtitle_file: str = ""): ) return None - # 尝试使用 CUDA,如果失败则回退到 CPU + # 首先使用CPU模式,不触发CUDA检查 + use_cuda = False try: - import torch - if torch.cuda.is_available(): + # 在函数中延迟导入torch,而不是在全局范围内 + # 使用安全的方式检查CUDA可用性 + def check_cuda_available(): + try: + import torch + return torch.cuda.is_available() + except (ImportError, RuntimeError) as e: + logger.warning(f"检查CUDA可用性时出错: {e}") + return False + + # 
仅当明确需要时才检查CUDA + use_cuda = check_cuda_available() + + if use_cuda: + logger.info(f"尝试使用 CUDA 加载模型: {model_path}") try: - logger.info(f"尝试使用 CUDA 加载模型: {model_path}") model = WhisperModel( model_size_or_path=model_path, device="cuda", @@ -63,18 +76,18 @@ def create(audio_file, subtitle_file: str = ""): except Exception as e: logger.warning(f"CUDA 加载失败,错误信息: {str(e)}") logger.warning("回退到 CPU 模式") - device = "cpu" - compute_type = "int8" + use_cuda = False else: - logger.info("未检测到 CUDA,使用 CPU 模式") - device = "cpu" - compute_type = "int8" - except ImportError: - logger.warning("未安装 torch,使用 CPU 模式") + logger.info("使用 CPU 模式") + except Exception as e: + logger.warning(f"CUDA检查过程出错: {e}") + logger.warning("默认使用CPU模式") + use_cuda = False + + # 如果CUDA不可用或加载失败,使用CPU + if not use_cuda: device = "cpu" compute_type = "int8" - - if device == "cpu": logger.info(f"使用 CPU 加载模型: {model_path}") model = WhisperModel( model_size_or_path=model_path, @@ -403,7 +416,7 @@ def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "") logger.info("音频提取完成,开始生成字幕") # 使用create函数生成字幕 - create(audio_file, subtitle_file) + create("/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav", subtitle_file) # 删除临时音频文件 if os.path.exists(audio_file): @@ -422,8 +435,8 @@ if __name__ == "__main__": task_id = "123456" task_dir = utils.task_dir(task_id) subtitle_file = f"{task_dir}/subtitle_123456.srt" - audio_file = f"{task_dir}/audio.wav" - video_file = "/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_1702.mp4" + audio_file = "/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav" + video_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/qyn2-2-720p.mp4" extract_audio_and_create_subtitle(video_file, subtitle_file) diff --git a/app/services/subtitle_merger.py b/app/services/subtitle_merger.py new file mode 100644 index 0000000..9097586 --- /dev/null +++ b/app/services/subtitle_merger.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 
'''
Merge per-segment SRT subtitle files into a single SRT on the edited timeline.

@Project: NarratoAI
@File   : subtitle_merger
@Author : viccy
@Date   : 2025/5/6 4:00 PM
'''

import re
import os
from datetime import datetime, timedelta

# One clock value of an editedTimeRange: "HH:MM:SS" with an optional
# ",mmm" / ".mmm" fractional part.
_CLOCK_RE = re.compile(r'^(\d+):(\d+):(\d+)(?:[,.](\d{1,3}))?$')


def parse_time(time_str):
    """Parse an SRT timestamp ("HH:MM:SS,mmm") into a timedelta.

    Raises:
        ValueError: if the string does not have the "HH:MM:SS,mmm" shape.
    """
    hours, minutes, seconds_ms = time_str.split(':')
    seconds, milliseconds = seconds_ms.split(',')

    return timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(seconds),
        milliseconds=int(milliseconds)
    )


def format_time(td):
    """Format a timedelta as an SRT timestamp ("HH:MM:SS,mmm")."""
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = td.microseconds // 1000

    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def _parse_clock(text):
    """Convert one "HH:MM:SS[,mmm]" value to a timedelta, or None if malformed."""
    match = _CLOCK_RE.match(text.strip())
    if match is None:
        return None
    hours, minutes, seconds, fraction = match.groups()
    # "5" after the separator means .5 s, so right-pad the fraction to 3 digits.
    milliseconds = int(fraction.ljust(3, '0')) if fraction else 0
    return timedelta(hours=int(hours), minutes=int(minutes),
                     seconds=int(seconds), milliseconds=milliseconds)


def _clock_for_filename(td):
    """Render a timedelta as "HH_MM_SS" (whole seconds) for use in a file name."""
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}_{minutes:02d}_{seconds:02d}"


def parse_edited_time_range(time_range_str):
    """Extract the (start, end) timedeltas from an editedTimeRange string.

    Accepts "HH:MM:SS-HH:MM:SS" as well as the millisecond form
    "HH:MM:SS,mmm-HH:MM:SS,mmm".

    Returns:
        (start, end) as timedeltas, or (None, None) when the string is empty
        or cannot be parsed, so callers can skip the item instead of crashing.
    """
    if not time_range_str:
        return None, None

    parts = time_range_str.split('-')
    if len(parts) != 2:
        return None, None

    start_time = _parse_clock(parts[0])
    end_time = _parse_clock(parts[1])
    if start_time is None or end_time is None:
        return None, None

    return start_time, end_time


def merge_subtitle_files(subtitle_items, output_file=None):
    """
    Merge several SRT subtitle files into one.

    Each item's cues are shifted by the start of its 'editedTimeRange' and
    renumbered consecutively. Items whose subtitle file is missing, or whose
    range cannot be parsed, are skipped.

    Args:
        subtitle_items: list of dicts, each with a 'subtitle' file path and an
            'editedTimeRange' string.
        output_file: path of the merged file; when None a name of the form
            "merged_subtitle_HH_MM_SS-HH_MM_SS.srt" is generated next to the
            first item's subtitle file.

    Returns:
        Path of the merged subtitle file.

    Raises:
        ValueError: if output_file is None and no item provides both a
            subtitle path and a parseable editedTimeRange to derive it from.
    """
    # Order items by the start of their editedTimeRange; unparseable ranges sort first.
    sorted_items = sorted(
        subtitle_items,
        key=lambda x: parse_edited_time_range(x.get('editedTimeRange', ''))[0] or timedelta())

    merged_subtitles = []
    subtitle_index = 1

    for item in sorted_items:
        subtitle_path = item.get('subtitle')
        if not subtitle_path or not os.path.exists(subtitle_path):
            continue

        # The start of editedTimeRange is the offset applied to every cue of this item.
        offset_time, _ = parse_edited_time_range(item.get('editedTimeRange', ''))

        if offset_time is None:
            print(f"警告: 无法从项目 {item.get('_id')} 的editedTimeRange中提取时间范围,跳过该项")
            continue

        with open(subtitle_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # SRT cues are separated by blank lines.
        for block in re.split(r'\n\s*\n', content.strip()):
            lines = block.strip().split('\n')
            if len(lines) < 3:  # need index line, time line and at least one text line
                continue

            time_parts = lines[1].split(' --> ')
            if len(time_parts) != 2:
                continue

            adjusted_start_time = parse_time(time_parts[0]) + offset_time
            adjusted_end_time = parse_time(time_parts[1]) + offset_time

            # Rebuild the cue with a fresh running index and shifted times.
            adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}"
            merged_subtitles.append('\n'.join([str(subtitle_index), adjusted_time_line, *lines[2:]]))
            subtitle_index += 1

    # Derive the output path when the caller did not supply one.
    if output_file is None:
        named = []
        for item in sorted_items:
            if not item.get('subtitle'):
                continue
            start, end = parse_edited_time_range(item.get('editedTimeRange', ''))
            if start is not None:
                named.append((item['subtitle'], start, end))
        if not named:
            raise ValueError(
                "cannot derive output_file: no item has both 'subtitle' and a valid 'editedTimeRange'")

        dir_path = os.path.dirname(named[0][0])
        first_start_str = _clock_for_filename(named[0][1])
        last_end_str = _clock_for_filename(named[-1][2])
        output_file = os.path.join(dir_path, f"merged_subtitle_{first_start_str}-{last_end_str}.srt")

    merged_content = '\n\n'.join(merged_subtitles)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(merged_content)

    return output_file
测试数据 + test_data = [ + {'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!', + 'timestamp': '00:00:00-00:01:15', + 'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!', + 'OST': 0, + '_id': 1, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3', + 'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt', + 'sourceTimeRange': '00:00:00-00:00:26', + 'duration': 26, + 'editedTimeRange': '00:00:00-00:00:26' + }, + {'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!', + 'timestamp': '00:01:15-00:04:40', + 'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…', + 'OST': 0, + '_id': 2, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3', + 'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt', + 'sourceTimeRange': '00:01:15-00:01:29', + 'duration': 14, + 'editedTimeRange': '00:00:26-00:00:40' + }, + {'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。', + 'timestamp': '00:04:58-00:05:45', + 'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!', + 'OST': 0, + '_id': 4, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3', + 'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt', + 'sourceTimeRange': '00:04:58-00:05:20', + 'duration': 22, + 'editedTimeRange': '00:00:57-00:01:19' + }, + {'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', + 'timestamp': '00:05:45-00:06:00', + 'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', + 'OST': 0, + '_id': 5, + 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3', + 'subtitle': 
'/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt', + 'sourceTimeRange': '00:05:45-00:05:53', + 'duration': 8, + 'editedTimeRange': '00:01:19-00:01:27' + } + ] + + output_file = merge_subtitle_files(test_data) + print(f"字幕文件已合并至: {output_file}") diff --git a/app/services/task.py b/app/services/task.py index 6704f0d..c257d39 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -9,167 +9,177 @@ from loguru import logger from app.config import config from app.models import const from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams -from app.services import llm, material, subtitle, video, voice, audio_merger +from app.services import (llm, material, subtitle, video, voice, audio_merger, + subtitle_merger, clip_video, merger_video, update_script, generate_video) from app.services import state as sm from app.utils import utils -def generate_script(task_id, params): - logger.info("\n\n## generating video script") - video_script = params.video_script.strip() - if not video_script: - video_script = llm.generate_script( - video_subject=params.video_subject, - language=params.video_language, - paragraph_number=params.paragraph_number, - ) - else: - logger.debug(f"video script: \n{video_script}") +# def generate_script(task_id, params): +# logger.info("\n\n## generating video script") +# video_script = params.video_script.strip() +# if not video_script: +# video_script = llm.generate_script( +# video_subject=params.video_subject, +# language=params.video_language, +# paragraph_number=params.paragraph_number, +# ) +# else: +# logger.debug(f"video script: \n{video_script}") - if not video_script: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error("failed to generate video script.") - return None +# if not video_script: +# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) +# logger.error("failed to generate video script.") +# return None - return video_script +# 
return video_script -def generate_terms(task_id, params, video_script): - logger.info("\n\n## generating video terms") - video_terms = params.video_terms - if not video_terms: - video_terms = llm.generate_terms( - video_subject=params.video_subject, video_script=video_script, amount=5 - ) - else: - if isinstance(video_terms, str): - video_terms = [term.strip() for term in re.split(r"[,,]", video_terms)] - elif isinstance(video_terms, list): - video_terms = [term.strip() for term in video_terms] - else: - raise ValueError("video_terms must be a string or a list of strings.") +# def generate_terms(task_id, params, video_script): +# logger.info("\n\n## generating video terms") +# video_terms = params.video_terms +# if not video_terms: +# video_terms = llm.generate_terms( +# video_subject=params.video_subject, video_script=video_script, amount=5 +# ) +# else: +# if isinstance(video_terms, str): +# video_terms = [term.strip() for term in re.split(r"[,,]", video_terms)] +# elif isinstance(video_terms, list): +# video_terms = [term.strip() for term in video_terms] +# else: +# raise ValueError("video_terms must be a string or a list of strings.") - logger.debug(f"video terms: {utils.to_json(video_terms)}") +# logger.debug(f"video terms: {utils.to_json(video_terms)}") - if not video_terms: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error("failed to generate video terms.") - return None +# if not video_terms: +# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) +# logger.error("failed to generate video terms.") +# return None - return video_terms +# return video_terms -def save_script_data(task_id, video_script, video_terms, params): - script_file = path.join(utils.task_dir(task_id), "script.json") - script_data = { - "script": video_script, - "search_terms": video_terms, - "params": params, - } +# def save_script_data(task_id, video_script, video_terms, params): +# script_file = path.join(utils.task_dir(task_id), "script.json") +# 
script_data = { +# "script": video_script, +# "search_terms": video_terms, +# "params": params, +# } - with open(script_file, "w", encoding="utf-8") as f: - f.write(utils.to_json(script_data)) +# with open(script_file, "w", encoding="utf-8") as f: +# f.write(utils.to_json(script_data)) -def generate_audio(task_id, params, video_script): - logger.info("\n\n## generating audio") - audio_file = path.join(utils.task_dir(task_id), "audio.mp3") - sub_maker = voice.tts( - text=video_script, - voice_name=voice.parse_voice_name(params.voice_name), - voice_rate=params.voice_rate, - voice_file=audio_file, - ) - if sub_maker is None: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error( - """failed to generate audio: -1. check if the language of the voice matches the language of the video script. -2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode. - """.strip() - ) - return None, None, None +# def generate_audio(task_id, params, video_script): +# logger.info("\n\n## generating audio") +# audio_file = path.join(utils.task_dir(task_id), "audio.mp3") +# sub_maker = voice.tts( +# text=video_script, +# voice_name=voice.parse_voice_name(params.voice_name), +# voice_rate=params.voice_rate, +# voice_file=audio_file, +# ) +# if sub_maker is None: +# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) +# logger.error( +# """failed to generate audio: +# 1. check if the language of the voice matches the language of the video script. +# 2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode. 
+# """.strip() +# ) +# return None, None, None - audio_duration = math.ceil(voice.get_audio_duration(sub_maker)) - return audio_file, audio_duration, sub_maker +# audio_duration = math.ceil(voice.get_audio_duration(sub_maker)) +# return audio_file, audio_duration, sub_maker -def generate_subtitle(task_id, params, video_script, sub_maker, audio_file): - if not params.subtitle_enabled: - return "" +# def generate_subtitle(task_id, params, video_script, sub_maker, audio_file): +# if not params.subtitle_enabled: +# return "" - subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt") - subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() - logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}") +# subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt") +# subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() +# logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}") - subtitle_fallback = False - if subtitle_provider == "edge": - voice.create_subtitle( - text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path - ) - if not os.path.exists(subtitle_path): - subtitle_fallback = True - logger.warning("subtitle file not found, fallback to whisper") +# subtitle_fallback = False +# if subtitle_provider == "edge": +# voice.create_subtitle( +# text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path +# ) +# if not os.path.exists(subtitle_path): +# subtitle_fallback = True +# logger.warning("subtitle file not found, fallback to whisper") - if subtitle_provider == "whisper" or subtitle_fallback: - subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) - logger.info("\n\n## correcting subtitle") - subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) +# if subtitle_provider == "whisper" or subtitle_fallback: +# subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) +# logger.info("\n\n## correcting subtitle") 
+# subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) - subtitle_lines = subtitle.file_to_subtitles(subtitle_path) - if not subtitle_lines: - logger.warning(f"subtitle file is invalid: {subtitle_path}") - return "" +# subtitle_lines = subtitle.file_to_subtitles(subtitle_path) +# if not subtitle_lines: +# logger.warning(f"subtitle file is invalid: {subtitle_path}") +# return "" - return subtitle_path +# return subtitle_path -def get_video_materials(task_id, params, video_terms, audio_duration): - if params.video_source == "local": - logger.info("\n\n## preprocess local materials") - materials = video.preprocess_video( - materials=params.video_materials, clip_duration=params.video_clip_duration - ) - if not materials: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error( - "no valid materials found, please check the materials and try again." - ) - return None - return [material_info.url for material_info in materials] - else: - logger.info(f"\n\n## downloading videos from {params.video_source}") - downloaded_videos = material.download_videos( - task_id=task_id, - search_terms=video_terms, - source=params.video_source, - video_aspect=params.video_aspect, - video_contact_mode=params.video_concat_mode, - audio_duration=audio_duration * params.video_count, - max_clip_duration=params.video_clip_duration, - ) - if not downloaded_videos: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error( - "failed to download videos, maybe the network is not available. if you are in China, please use a VPN." 
- ) - return None - return downloaded_videos +# def get_video_materials(task_id, params, video_terms, audio_duration): +# if params.video_source == "local": +# logger.info("\n\n## preprocess local materials") +# materials = video.preprocess_video( +# materials=params.video_materials, clip_duration=params.video_clip_duration +# ) +# if not materials: +# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) +# logger.error( +# "no valid materials found, please check the materials and try again." +# ) +# return None +# return [material_info.url for material_info in materials] +# else: +# logger.info(f"\n\n## downloading videos from {params.video_source}") +# downloaded_videos = material.download_videos( +# task_id=task_id, +# search_terms=video_terms, +# source=params.video_source, +# video_aspect=params.video_aspect, +# video_contact_mode=params.video_concat_mode, +# audio_duration=audio_duration * params.video_count, +# max_clip_duration=params.video_clip_duration, +# ) +# if not downloaded_videos: +# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) +# logger.error( +# "failed to download videos, maybe the network is not available. if you are in China, please use a VPN." 
+# ) +# return None +# return downloaded_videos def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict): - """后台任务(自动剪辑视频进行剪辑)""" + """ + 后台任务(自动剪辑视频进行剪辑) + Args: + task_id: 任务ID + params: 视频参数 + subclip_path_videos: 视频片段路径 + """ + global merged_audio_path, merged_subtitle_path + logger.info(f"\n\n## 开始任务: {task_id}") - - # 初始化 ImageMagick - if not utils.init_imagemagick(): - logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示") - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0) - # tts 角色名称 - voice_name = voice.parse_voice_name(params.voice_name) + # # 初始化 ImageMagick + # if not utils.init_imagemagick(): + # logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示") + # # tts 角色名称 + # voice_name = voice.parse_voice_name(params.voice_name) + """ + 1. 加载剪辑脚本 + """ logger.info("\n\n## 1. 加载视频脚本") video_script_path = path.join(params.video_clip_json_path) @@ -185,174 +195,144 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di logger.debug(f"解说完整脚本: \n{video_script}") logger.debug(f"解说 OST 列表: \n{video_ost}") logger.debug(f"解说时间戳列表: \n{time_list}") - - # 获取视频总时长(单位 s) - last_timestamp = list_script[-1]['new_timestamp'] - end_time = last_timestamp.split("-")[1] - total_duration = utils.time_to_seconds(end_time) - except Exception as e: - logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}") - raise ValueError("无法读取视频json脚本,请检查配置是否正确") + logger.error(f"无法读取视频json脚本,请检查脚本格式是否正确") + raise ValueError("无法读取视频json脚本,请检查脚本格式是否正确") else: logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc()) raise ValueError("解说脚本不存在!请检查配置是否正确。") + """ + 2. 使用 TTS 生成音频素材 + """ logger.info("\n\n## 2. 
根据OST设置生成音频列表") - # 只为OST=0或2的片段生成TTS音频 + # 只为OST=0 or 2的判断生成音频, OST=0 仅保留解说 OST=2 保留解说和原声 tts_segments = [ segment for segment in list_script if segment['OST'] in [0, 2] ] logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}") - - # 初始化音频文件路径 - audio_files = [] - final_audio = "" - + + tts_results = voice.tts_multiple( + task_id=task_id, + list_script=tts_segments, # 只传入需要TTS的片段 + voice_name=params.voice_name, + voice_rate=params.voice_rate, + voice_pitch=params.voice_pitch, + ) + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20) + + # """ + # 3. (可选) 使用 whisper 生成字幕 + # """ + # if merged_subtitle_path is None: + # if audio_files: + # merged_subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") + # subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() + # logger.info(f"\n\n使用 {subtitle_provider} 生成字幕") + # + # subtitle.create( + # audio_file=merged_audio_path, + # subtitle_file=merged_subtitle_path, + # ) + # subtitle_lines = subtitle.file_to_subtitles(merged_subtitle_path) + # if not subtitle_lines: + # logger.warning(f"字幕文件无效: {merged_subtitle_path}") + # + # sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40) + + """ + 3. 裁剪视频 - 将超出音频长度的视频进行裁剪 + """ + logger.info("\n\n## 3. 裁剪视频") + video_clip_result = clip_video.clip_video(params.video_origin_path, tts_results) + # 更新 list_script 中的时间戳 + tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results} + subclip_clip_result = { + tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results + } + new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result) + + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60) + + """ + 4. 合并音频和字幕 + """ + logger.info("\n\n## 4. 
合并音频和字幕") + total_duration = sum([script["duration"] for script in new_script_list]) if tts_segments: - audio_files, sub_maker_list = voice.tts_multiple( - task_id=task_id, - list_script=tts_segments, # 只传入需要TTS的片段 - voice_name=voice_name, - voice_rate=params.voice_rate, - voice_pitch=params.voice_pitch, - force_regenerate=True - ) - if audio_files is None: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.") - return - - if audio_files: - logger.info(f"合并音频文件: {audio_files}") - try: - # 传入OST信息以便正确处理音频 - final_audio = audio_merger.merge_audio_files( - task_id=task_id, - audio_files=audio_files, - total_duration=total_duration, - list_script=list_script # 传入完整脚本以便处理OST - ) - logger.info("音频文件合并成功") - except Exception as e: - logger.error(f"合并音频文件失败: {str(e)}") - final_audio = "" - else: - # 如果没有需要生成TTS的片段,创建一个空白音频文件 - # 这样可以确保后续的音频处理能正确进行 - logger.info("没有需要生成TTS的片段,将保留原声和背景音乐") - final_audio = path.join(utils.task_dir(task_id), "empty.mp3") try: - from moviepy.editor import AudioClip - # 创建一个与视频等长的空白音频 - empty_audio = AudioClip(make_frame=lambda t: 0, duration=total_duration) - empty_audio.write_audiofile(final_audio, fps=44100) - logger.info(f"已创建空白音频文件: {final_audio}") - except Exception as e: - logger.error(f"创建空白音频文件失败: {str(e)}") - final_audio = "" - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) - - subtitle_path = "" - if params.subtitle_enabled: - if audio_files: - subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") - subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() - logger.info(f"\n\n## 3. 
生成字幕、提供程序是: {subtitle_provider}") - - subtitle.create( - audio_file=final_audio, - subtitle_file=subtitle_path, + # 合并音频文件 + merged_audio_path = audio_merger.merge_audio_files( + task_id=task_id, + total_duration=total_duration, + list_script=new_script_list ) + logger.info(f"音频文件合并成功->{merged_audio_path}") + # 合并字幕文件 + merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list) + logger.info(f"字幕文件合并成功->{merged_subtitle_path}") + except Exception as e: + logger.error(f"合并音频文件失败: {str(e)}") + else: + logger.warning("没有需要合并的音频/字幕") + merged_audio_path = "" + merged_subtitle_path = "" - subtitle_lines = subtitle.file_to_subtitles(subtitle_path) - if not subtitle_lines: - logger.warning(f"字幕文件无效: {subtitle_path}") - subtitle_path = "" - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40) - - logger.info("\n\n## 4. 裁剪视频") - subclip_videos = [x for x in subclip_path_videos.values()] - # logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}") - - if not subclip_videos: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error( - "裁剪视频失败,可能是 ImageMagick 不可用") - return - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50) - + """ + 5. 合并视频 + """ final_video_paths = [] combined_video_paths = [] - _progress = 50 - index = 1 - combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") + combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4") logger.info(f"\n\n## 5. 
合并视频: => {combined_video_path}") + # 如果 new_script_list 中没有 video,则使用 subclip_path_videos 中的视频 + video_clips = [new_script['video'] if new_script.get('video') else subclip_path_videos.get(new_script.get('_id', '')) for new_script in new_script_list] - video.combine_clip_videos( - combined_video_path=combined_video_path, - video_paths=subclip_videos, + merger_video.combine_clip_videos( + output_video_path=combined_video_path, + video_paths=video_clips, video_ost_list=video_ost, - list_script=list_script, video_aspect=params.video_aspect, - threads=params.n_threads # 多线程 + threads=params.n_threads ) + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80) - _progress += 50 / 2 - sm.state.update_task(task_id, progress=_progress) + """ + 6. 合并字幕/BGM/配音/视频 + """ + output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") + logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}") - final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") + # bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3' + bgm_path = utils.get_bgm_file() - logger.info(f"\n\n## 6. 
最后合成: {index} => {final_video_path}") - - # 获取背景音乐 - bgm_path = None - if params.bgm_type or params.bgm_file: - try: - bgm_path = utils.get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) - if bgm_path: - logger.info(f"使用背景音乐: {bgm_path}") - except Exception as e: - logger.error(f"获取背景音乐失败: {str(e)}") - - # 示例:自定义字幕样式 - subtitle_style = { - 'fontsize': params.font_size, # 字体大小 - 'color': params.text_fore_color, # 字体颜色 - 'stroke_color': params.stroke_color, # 描边颜色 - 'stroke_width': params.stroke_width, # 描边宽度, 范围0-10 - 'bg_color': params.text_back_color, # 半透明黑色背景 - 'position': (params.subtitle_position, 0.2), # 距离顶部60%的位置 - 'method': 'caption' # 渲染方法 + # 调用示例 + options = { + 'voice_volume': params.tts_volume, # 配音音量 + 'bgm_volume': params.bgm_volume, # 背景音乐音量 + 'original_audio_volume': params.original_volume, # 视频原声音量,0表示不保留 + 'keep_original_audio': True, # 是否保留原声 + 'subtitle_font': params.font_name, # 这里使用相对字体路径,会自动在 font_dir() 目录下查找 + 'subtitle_font_size': params.font_size, + 'subtitle_color': params.text_fore_color, + 'subtitle_bg_color': None, # 直接使用None表示透明背景 + 'subtitle_position': params.subtitle_position, + 'custom_position': params.custom_position, + 'threads': params.n_threads } - - # 示例:自定义音量配置 - volume_config = { - 'original': params.original_volume, # 原声音量80% - 'bgm': params.bgm_volume, # BGM音量20% - 'narration': params.tts_volume or params.voice_volume, # 解说音量100% - } - font_path = utils.font_dir(params.font_name) - video.generate_video_v3( + generate_video.merge_materials( video_path=combined_video_path, - subtitle_path=subtitle_path, + audio_path=merged_audio_path, + subtitle_path=merged_subtitle_path, bgm_path=bgm_path, - narration_path=final_audio, - output_path=final_video_path, - volume_config=volume_config, # 添加音量配置 - subtitle_style=subtitle_style, - font_path=font_path + output_path=output_video_path, + options=options ) - _progress += 50 / 2 - sm.state.update_task(task_id, progress=_progress) - - 
final_video_paths.append(final_video_path) + final_video_paths.append(output_video_path) combined_video_paths.append(combined_video_path) logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.") @@ -400,35 +380,19 @@ def validate_params(video_path, audio_path, output_file, params): if __name__ == "__main__": - # task_id = "test123" - # subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4', - # '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4', - # '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4', - # '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4', - # '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4', - # '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4', - # '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4', - # '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'} - # - # params = VideoClipParams( - # video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json", - # video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4", - # ) - # start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) + task_id = "demo" - task_id = "test456" - subclip_path_videos = {'01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4', - '01:58-02:04': './storage/cache_videos/vid-01_58-02_04.mp4', - '02:25-02:31': './storage/cache_videos/vid-02_25-02_31.mp4', - '01:28-01:33': './storage/cache_videos/vid-01_28-01_33.mp4', - '03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4', - '00:24-00:28': './storage/cache_videos/vid-00_24-00_28.mp4', - '03:02-03:08': './storage/cache_videos/vid-03_02-03_08.mp4', - '00:41-00:44': './storage/cache_videos/vid-00_41-00_44.mp4', - '02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4'} + # 
提前裁剪是为了方便检查视频 + subclip_path_videos = { + 1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-05-390@00-00-57-980.mp4', + 2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-28-900@00-00-43-700.mp4', + 3: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-01-17-840@00-01-27-600.mp4', + 4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-02-35-460@00-02-52-380.mp4', + 5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-06-59-520@00-07-29-500.mp4', + } params = VideoClipParams( - video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json", - video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4", + video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/2025-0507-223311.json", + video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_4938.mp4", ) - start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) + start_subclip(task_id, params, subclip_path_videos) diff --git a/app/services/update_script.py b/app/services/update_script.py new file mode 100644 index 0000000..2eb9663 --- /dev/null +++ b/app/services/update_script.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : update_script +@Author : 小林同学 +@Date : 2025/5/6 下午11:00 +''' + +import re +import os +from typing import Dict, List, Any, Tuple, Union + + +def extract_timestamp_from_video_path(video_path: str) -> str: + """ + 从视频文件路径中提取时间戳 + + Args: + video_path: 视频文件路径 + + Returns: + 提取出的时间戳,格式为 'HH:MM:SS-HH:MM:SS' 或 'HH:MM:SS,sss-HH:MM:SS,sss' + """ + # 使用正则表达式从文件名中提取时间戳 + filename = os.path.basename(video_path) + + # 匹配新格式: vid_00-00-00-000@00-00-20-250.mp4 + match_new = 
re.search(r'vid_(\d{2})-(\d{2})-(\d{2})-(\d{3})@(\d{2})-(\d{2})-(\d{2})-(\d{3})\.mp4', filename) + if match_new: + # 提取并格式化时间戳(包含毫秒) + start_h, start_m, start_s, start_ms = match_new.group(1), match_new.group(2), match_new.group(3), match_new.group(4) + end_h, end_m, end_s, end_ms = match_new.group(5), match_new.group(6), match_new.group(7), match_new.group(8) + return f"{start_h}:{start_m}:{start_s},{start_ms}-{end_h}:{end_m}:{end_s},{end_ms}" + + # 匹配旧格式: vid-00-00-00-00-00-00.mp4 + match_old = re.search(r'vid-(\d{2}-\d{2}-\d{2})-(\d{2}-\d{2}-\d{2})\.mp4', filename) + if match_old: + # 提取并格式化时间戳 + start_time = match_old.group(1).replace('-', ':') + end_time = match_old.group(2).replace('-', ':') + return f"{start_time}-{end_time}" + + return "" + + +def calculate_duration(timestamp: str) -> float: + """ + 计算时间戳范围的持续时间(秒) + + Args: + timestamp: 格式为 'HH:MM:SS-HH:MM:SS' 或 'HH:MM:SS,sss-HH:MM:SS,sss' 的时间戳 + + Returns: + 持续时间(秒) + """ + try: + start_time, end_time = timestamp.split('-') + + # 处理毫秒部分 + if ',' in start_time: + start_parts = start_time.split(',') + start_time_parts = start_parts[0].split(':') + start_ms = float('0.' + start_parts[1]) if len(start_parts) > 1 else 0 + start_h, start_m, start_s = map(int, start_time_parts) + else: + start_h, start_m, start_s = map(int, start_time.split(':')) + start_ms = 0 + + if ',' in end_time: + end_parts = end_time.split(',') + end_time_parts = end_parts[0].split(':') + end_ms = float('0.' 
+ end_parts[1]) if len(end_parts) > 1 else 0 + end_h, end_m, end_s = map(int, end_time_parts) + else: + end_h, end_m, end_s = map(int, end_time.split(':')) + end_ms = 0 + + # 转换为秒 + start_seconds = start_h * 3600 + start_m * 60 + start_s + start_ms + end_seconds = end_h * 3600 + end_m * 60 + end_s + end_ms + + # 计算时间差(秒) + return round(end_seconds - start_seconds, 2) + except (ValueError, AttributeError): + return 0.0 + + +def update_script_timestamps( + script_list: List[Dict[str, Any]], + video_result: Dict[Union[str, int], str], + audio_result: Dict[Union[str, int], str] = None, + subtitle_result: Dict[Union[str, int], str] = None, + calculate_edited_timerange: bool = True +) -> List[Dict[str, Any]]: + """ + 根据 video_result 中的视频文件更新 script_list 中的时间戳,添加持续时间, + 并根据 audio_result 添加音频路径,根据 subtitle_result 添加字幕路径 + + Args: + script_list: 原始脚本列表 + video_result: 视频结果字典,键为原时间戳或_id,值为视频文件路径 + audio_result: 音频结果字典,键为原时间戳或_id,值为音频文件路径 + subtitle_result: 字幕结果字典,键为原时间戳或_id,值为字幕文件路径 + calculate_edited_timerange: 是否计算并添加成品视频中的时间范围 + + Returns: + 更新后的脚本列表 + """ + # 创建副本,避免修改原始数据 + updated_script = [] + + # 建立ID和时间戳到视频路径和新时间戳的映射 + id_timestamp_mapping = {} + for key, video_path in video_result.items(): + new_timestamp = extract_timestamp_from_video_path(video_path) + if new_timestamp: + id_timestamp_mapping[key] = { + 'new_timestamp': new_timestamp, + 'video_path': video_path + } + + # 计算累积时长,用于生成成品视频中的时间范围 + accumulated_duration = 0.0 + + # 更新脚本中的时间戳 + for item in script_list: + item_copy = item.copy() + item_id = item_copy.get('_id') + orig_timestamp = item_copy.get('timestamp', '') + + # 初始化音频和字幕路径为空字符串 + item_copy['audio'] = "" + item_copy['subtitle'] = "" + item_copy['video'] = "" # 初始化视频路径为空字符串 + + # 如果提供了音频结果字典且ID存在于音频结果中,直接使用对应的音频路径 + if audio_result: + if item_id and item_id in audio_result: + item_copy['audio'] = audio_result[item_id] + elif orig_timestamp in audio_result: + item_copy['audio'] = audio_result[orig_timestamp] + + # 如果提供了字幕结果字典且ID存在于字幕结果中,直接使用对应的字幕路径 + 
if subtitle_result: + if item_id and item_id in subtitle_result: + item_copy['subtitle'] = subtitle_result[item_id] + elif orig_timestamp in subtitle_result: + item_copy['subtitle'] = subtitle_result[orig_timestamp] + + # 添加视频路径 + if item_id and item_id in video_result: + item_copy['video'] = video_result[item_id] + elif orig_timestamp in video_result: + item_copy['video'] = video_result[orig_timestamp] + + # 更新时间戳和计算持续时间 + current_duration = 0.0 + if item_id and item_id in id_timestamp_mapping: + # 根据ID找到对应的新时间戳 + item_copy['sourceTimeRange'] = id_timestamp_mapping[item_id]['new_timestamp'] + current_duration = calculate_duration(item_copy['sourceTimeRange']) + item_copy['duration'] = current_duration + elif orig_timestamp in id_timestamp_mapping: + # 根据原始时间戳找到对应的新时间戳 + item_copy['sourceTimeRange'] = id_timestamp_mapping[orig_timestamp]['new_timestamp'] + current_duration = calculate_duration(item_copy['sourceTimeRange']) + item_copy['duration'] = current_duration + elif orig_timestamp: + # 对于未更新的时间戳,也计算并添加持续时间 + item_copy['sourceTimeRange'] = orig_timestamp + current_duration = calculate_duration(orig_timestamp) + item_copy['duration'] = current_duration + + # 计算片段在成品视频中的时间范围 + if calculate_edited_timerange and current_duration > 0: + start_time_seconds = accumulated_duration + end_time_seconds = accumulated_duration + current_duration + + # 将秒数转换为 HH:MM:SS 格式 + start_h = int(start_time_seconds // 3600) + start_m = int((start_time_seconds % 3600) // 60) + start_s = int(start_time_seconds % 60) + + end_h = int(end_time_seconds // 3600) + end_m = int((end_time_seconds % 3600) // 60) + end_s = int(end_time_seconds % 60) + + item_copy['editedTimeRange'] = f"{start_h:02d}:{start_m:02d}:{start_s:02d}-{end_h:02d}:{end_m:02d}:{end_s:02d}" + + # 更新累积时长 + accumulated_duration = end_time_seconds + + updated_script.append(item_copy) + + return updated_script + + +if __name__ == '__main__': + list_script = [ + { + 'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 
2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!', + 'timestamp': '00:00:00,001-00:01:15,001', + 'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!', + 'OST': 0, + '_id': 1 + }, + { + 'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!', + 'timestamp': '00:01:15,001-00:04:40,001', + 'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…', + 'OST': 0, + '_id': 2 + }, + { + 'picture': '画面切到王启年小心翼翼地向范闲汇报。', + 'timestamp': '00:04:41,001-00:04:58,001', + 'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪', + 'OST': 1, + '_id': 3 + }, + { + 'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。', + 'timestamp': '00:04:58,001-00:05:45,001', + 'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!', + 'OST': 0, + '_id': 4 + }, + { + 'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', + 'timestamp': '00:05:45,001-00:06:00,001', + 'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', + 'OST': 0, + '_id': 5 + }, + { + 'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。', + 'timestamp': '00:06:00,001-00:06:03,001', + 'narration': '抓刺客', + 'OST': 1, + '_id': 6 + }] + video_res = { + 1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-00-000@00-00-20-250.mp4', + 2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-30-000@00-00-48-950.mp4', + 4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-00-000@00-01-15-688.mp4', + 5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-30-000@00-01-49-512.mp4'} + audio_res = { + 1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3', + 2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3', + 
4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3', + 5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'} + sub_res = { + 1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt', + 2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt', + 4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt', + 5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt'} + + # 更新并打印结果 + updated_list_script = update_script_timestamps(list_script, video_res, audio_res, sub_res) + for item in updated_list_script: + print( + f"ID: {item['_id']} | Picture: {item['picture'][:20]}... | Timestamp: {item['timestamp']} | " + + f"SourceTimeRange: {item['sourceTimeRange']} | EditedTimeRange: {item.get('editedTimeRange', '')} | " + + f"Duration: {item['duration']} 秒 | Audio: {item['audio']} | Video: {item['video']} | Subtitle: {item['subtitle']}") diff --git a/app/services/video.py b/app/services/video.py index f840c66..087dbdf 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -1,13 +1,13 @@ import traceback -import pysrt +# import pysrt from typing import Optional from typing import List from loguru import logger -from moviepy.editor import * +from moviepy import * from PIL import ImageFont from contextlib import contextmanager -from moviepy.editor import ( +from moviepy import ( VideoFileClip, AudioFileClip, TextClip, @@ -105,86 +105,6 @@ def manage_clip(clip): del clip -def combine_clip_videos(combined_video_path: str, - video_paths: List[str], - video_ost_list: List[int], - list_script: list, - video_aspect: VideoAspect = VideoAspect.portrait, - threads: int = 2, - ) -> str: - """ - 合并子视频 - Args: - combined_video_path: 合并后的存储路径 - video_paths: 子视频路径列表 - video_ost_list: 原声播放列表 (0: 不保留原声, 1: 只保留原声, 2: 保留原声并保留解说) - list_script: 
剪辑脚本 - video_aspect: 屏幕比例 - threads: 线程数 - - Returns: - str: 合并后的视频路径 - """ - from app.utils.utils import calculate_total_duration - audio_duration = calculate_total_duration(list_script) - logger.info(f"音频的最大持续时间: {audio_duration} s") - - output_dir = os.path.dirname(combined_video_path) - aspect = VideoAspect(video_aspect) - video_width, video_height = aspect.to_resolution() - - clips = [] - for video_path, video_ost in zip(video_paths, video_ost_list): - try: - clip = VideoFileClip(video_path) - - if video_ost == 0: # 不保留原声 - clip = clip.without_audio() - # video_ost 为 1 或 2 时都保留原声,不需要特殊处理 - - clip = clip.set_fps(30) - - # 处理视频尺寸 - clip_w, clip_h = clip.size - if clip_w != video_width or clip_h != video_height: - clip = resize_video_with_padding( - clip, - target_width=video_width, - target_height=video_height - ) - logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}") - - clips.append(clip) - - except Exception as e: - logger.error(f"处理视频 {video_path} 时出错: {str(e)}") - continue - - if not clips: - raise ValueError("没有有效的视频片段可以合并") - - try: - video_clip = concatenate_videoclips(clips) - video_clip = video_clip.set_fps(30) - - logger.info("开始合并视频... 
(过程中出现 UserWarning: 不必理会)") - video_clip.write_videofile( - filename=combined_video_path, - threads=threads, - audio_codec="aac", - fps=30, - temp_audiofile=os.path.join(output_dir, "temp-audio.m4a") - ) - finally: - # 确保资源被正确放 - video_clip.close() - for clip in clips: - clip.close() - - logger.success("视频合并完成") - return combined_video_path - - def resize_video_with_padding(clip, target_width: int, target_height: int): """ 调整视频尺寸并添加黑边 @@ -443,4 +363,3 @@ def generate_video_v3( bgm.close() if narration_path: narration.close() - diff --git a/app/services/video_service.py b/app/services/video_service.py index 2a0a9a6..1b2ddf0 100644 --- a/app/services/video_service.py +++ b/app/services/video_service.py @@ -4,8 +4,6 @@ from loguru import logger from typing import Dict, List, Optional, Tuple from app.services import material -from app.models.schema import VideoClipParams -from app.utils import utils class VideoService: diff --git a/app/services/voice.py b/app/services/voice.py index eba3c6d..31f6d66 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -5,10 +5,11 @@ import traceback import edge_tts import asyncio from loguru import logger -from typing import List +from typing import List, Union from datetime import datetime from xml.sax.saxutils import unescape from edge_tts import submaker, SubMaker +from edge_tts.submaker import mktimestamp from moviepy.video.tools import subtitles import time @@ -1036,7 +1037,7 @@ def is_azure_v2_voice(voice_name: str): def tts( text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str -) -> [SubMaker, None]: +) -> Union[SubMaker, None]: if is_azure_v2_voice(voice_name): return azure_tts_v2(text, voice_name, voice_file) return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file) @@ -1064,7 +1065,7 @@ def convert_pitch_to_percent(rate: float) -> str: def azure_tts_v1( text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str -) -> [SubMaker, None]: +) -> 
Union[SubMaker, None]: voice_name = parse_voice_name(voice_name) text = text.strip() rate_str = convert_rate_to_percent(voice_rate) @@ -1087,11 +1088,6 @@ def azure_tts_v1( ) return sub_maker, audio_data - # 判断音频文件是否已存在 - if os.path.exists(voice_file): - logger.info(f"voice file exists, skip tts: {voice_file}") - continue - # 获取音频数据和字幕信息 sub_maker, audio_data = asyncio.run(_do()) @@ -1105,8 +1101,6 @@ def azure_tts_v1( # 数据有效,写入文件 with open(voice_file, "wb") as file: file.write(audio_data) - - logger.info(f"completed, output file: {voice_file}") return sub_maker except Exception as e: logger.error(f"生成音频文件时出错: {str(e)}") @@ -1115,7 +1109,7 @@ def azure_tts_v1( return None -def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]: +def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]: voice_name = is_azure_v2_voice(voice_name) if not voice_name: logger.error(f"invalid voice name: {voice_name}") @@ -1203,11 +1197,14 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None def _format_text(text: str) -> str: - # text = text.replace("\n", " ") + text = text.replace("\n", " ") + text = text.replace("\"", " ") text = text.replace("[", " ") text = text.replace("]", " ") text = text.replace("(", " ") text = text.replace(")", " ") + text = text.replace(")", " ") + text = text.replace("(", " ") text = text.replace("{", " ") text = text.replace("}", " ") text = text.strip() @@ -1240,7 +1237,7 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis if script_item['OST']: continue - start_time, end_time = script_item['new_timestamp'].split('-') + start_time, end_time = script_item['timestamp'].split('-') if sub_maker_index >= len(sub_maker_list): logger.error(f"Sub maker list index out of range: {sub_maker_index}") break @@ -1317,6 +1314,99 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis traceback.print_exc() +def 
create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str): + """ + 优化字幕文件 + 1. 将字幕文件按照标点符号分割成多行 + 2. 逐行匹配字幕文件中的文本 + 3. 生成新的字幕文件 + """ + + text = _format_text(text) + + def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str: + """ + 1 + 00:00:00,000 --> 00:00:02,360 + 跑步是一项简单易行的运动 + """ + start_t = mktimestamp(start_time).replace(".", ",") + end_t = mktimestamp(end_time).replace(".", ",") + return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n" + + start_time = -1.0 + sub_items = [] + sub_index = 0 + + script_lines = utils.split_string_by_punctuations(text) + + def match_line(_sub_line: str, _sub_index: int): + if len(script_lines) <= _sub_index: + return "" + + _line = script_lines[_sub_index] + if _sub_line == _line: + return script_lines[_sub_index].strip() + + _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line) + _line_ = re.sub(r"[^\w\s]", "", _line) + if _sub_line_ == _line_: + return _line_.strip() + + _sub_line_ = re.sub(r"\W+", "", _sub_line) + _line_ = re.sub(r"\W+", "", _line) + if _sub_line_ == _line_: + return _line.strip() + + return "" + + sub_line = "" + + try: + for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)): + _start_time, end_time = offset + if start_time < 0: + start_time = _start_time + + sub = unescape(sub) + sub_line += sub + sub_text = match_line(sub_line, sub_index) + if sub_text: + sub_index += 1 + line = formatter( + idx=sub_index, + start_time=start_time, + end_time=end_time, + sub_text=sub_text, + ) + sub_items.append(line) + start_time = -1.0 + sub_line = "" + + if len(sub_items) == len(script_lines): + with open(subtitle_file, "w", encoding="utf-8") as file: + file.write("\n".join(sub_items) + "\n") + try: + sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") + duration = max([tb for ((ta, tb), txt) in sbs]) + logger.info( + f"已创建字幕文件: {subtitle_file}, duration: {duration}" + ) + return subtitle_file, duration + except Exception as e: + 
logger.error(f"failed, error: {str(e)}") + os.remove(subtitle_file) + else: + logger.error( + f"字幕创建失败, 字幕长度: {len(sub_items)}, script_lines len: {len(script_lines)}" + f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}" + f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}" + ) + + except Exception as e: + logger.error(f"failed, error: {str(e)}") + + def get_audio_duration(sub_maker: submaker.SubMaker): """ 获取音频时长 @@ -1326,7 +1416,7 @@ def get_audio_duration(sub_maker: submaker.SubMaker): return sub_maker.offset[-1][1] / 10000000 -def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float, force_regenerate: bool = True): +def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float): """ 根据JSON文件中的多段文本进行TTS转换 @@ -1334,25 +1424,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f :param list_script: 脚本列表 :param voice_name: 语音名称 :param voice_rate: 语音速率 - :param force_regenerate: 是否强制重新生成已存在的音频文件 :return: 生成的音频文件列表 """ voice_name = parse_voice_name(voice_name) output_dir = utils.task_dir(task_id) - audio_files = [] - sub_maker_list = [] + tts_results = [] for item in list_script: if item['OST'] != 1: # 将时间戳中的冒号替换为下划线 - timestamp = item['new_timestamp'].replace(':', '_') + timestamp = item['timestamp'].replace(':', '_') audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") - - # 检查文件是否已存在,如存在且不强制重新生成,则跳过 - if os.path.exists(audio_file) and not force_regenerate: - logger.info(f"音频文件已存在,跳过生成: {audio_file}") - audio_files.append(audio_file) - continue + subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt") text = item['narration'] @@ -1369,9 +1452,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"如果您在中国,请使用VPN; " f"或者使用其他 tts 引擎") continue + else: + # 为当前片段生成字幕文件 + _, duration = create_subtitle(sub_maker=sub_maker, text=text, 
subtitle_file=subtitle_file) - audio_files.append(audio_file) - sub_maker_list.append(sub_maker) + tts_results.append({ + "_id": item['_id'], + "timestamp": item['timestamp'], + "audio_file": audio_file, + "subtitle_file": subtitle_file, + "duration": duration, + "text": text, + }) logger.info(f"已生成音频文件: {audio_file}") - return audio_files, sub_maker_list + return tts_results diff --git a/app/utils/gemini_analyzer.py b/app/utils/gemini_analyzer.py index 07306c5..7236a9e 100644 --- a/app/utils/gemini_analyzer.py +++ b/app/utils/gemini_analyzer.py @@ -61,7 +61,6 @@ class VisionAnalyzer: try: # 加载图片 if isinstance(images[0], str): - logger.info("正在加载图片...") images = self.load_images(images) # 验证图片列表 @@ -81,11 +80,14 @@ class VisionAnalyzer: images = valid_images results = [] - total_batches = (len(images) + batch_size - 1) // batch_size + # 视频帧总数除以批量处理大小,如果有小数则+1 + batches_needed = len(images) // batch_size + if len(images) % batch_size > 0: + batches_needed += 1 + + logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed} 次") - logger.debug(f"共 {total_batches} 个批次,每批次 {batch_size} 张图片") - - with tqdm(total=total_batches, desc="分析进度") as pbar: + with tqdm(total=batches_needed, desc="分析进度") as pbar: for i in range(0, len(images), batch_size): batch = images[i:i + batch_size] retry_count = 0 @@ -93,8 +95,8 @@ class VisionAnalyzer: while retry_count < 3: try: # 在每个批次处理前添加小延迟 - if i > 0: - await asyncio.sleep(2) + # if i > 0: + # await asyncio.sleep(2) # 确保每个批次的图片都是有效的 valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)] diff --git a/app/utils/qwenvl_analyzer.py b/app/utils/qwenvl_analyzer.py index 54e6e36..6d1669a 100644 --- a/app/utils/qwenvl_analyzer.py +++ b/app/utils/qwenvl_analyzer.py @@ -30,7 +30,7 @@ class QwenAnalyzer: self.model_name = model_name self.api_key = api_key - self.base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1" + self.base_url = base_url # 配置API客户端 self._configure_client() @@ 
-80,7 +80,7 @@ class QwenAnalyzer: # 添加文本提示 content.append({ "type": "text", - "text": prompt + "text": prompt % (len(content), len(content), len(content)) }) # 调用API @@ -102,7 +102,7 @@ class QwenAnalyzer: async def analyze_images(self, images: Union[List[str], List[PIL.Image.Image]], prompt: str, - batch_size: int = 5) -> List[Dict]: + batch_size: int) -> List[Dict]: """ 批量分析多张图片 Args: @@ -118,7 +118,6 @@ class QwenAnalyzer: # 加载图片 if isinstance(images[0], str): - logger.info("正在加载图片...") images = self.load_images(images) # 验证图片列表 @@ -141,9 +140,14 @@ class QwenAnalyzer: images = valid_images results = [] - total_batches = (len(images) + batch_size - 1) // batch_size + # 视频帧总数除以批量处理大小,如果有小数则+1 + batches_needed = len(images) // batch_size + if len(images) % batch_size > 0: + batches_needed += 1 + + logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed} 次") - with tqdm(total=total_batches, desc="分析进度") as pbar: + with tqdm(total=batches_needed, desc="分析进度") as pbar: for i in range(0, len(images), batch_size): batch = images[i:i + batch_size] batch_paths = valid_paths[i:i + batch_size] if valid_paths else None @@ -151,9 +155,9 @@ class QwenAnalyzer: while retry_count < 3: try: - # 在每个批次处理前��加小延迟 - if i > 0: - await asyncio.sleep(2) + # 在每个批次处理前添加小延迟 + # if i > 0: + # await asyncio.sleep(0.5) # 确保每个批次的图片都是有效的 valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)] @@ -209,7 +213,7 @@ class QwenAnalyzer: for i, result in enumerate(results): response_text = result['response'] - # 如果有图片路径信息,���用它来生成文件名 + # 如果有图片路径信息,用它来生成文件名 if result.get('image_paths'): image_paths = result['image_paths'] img_name_start = Path(image_paths[0]).stem.split('_')[-1] diff --git a/app/utils/script_generator.py b/app/utils/script_generator.py index 6493e82..7020782 100644 --- a/app/utils/script_generator.py +++ b/app/utils/script_generator.py @@ -2,7 +2,7 @@ import os import json import traceback from loguru import logger -import tiktoken +# import 
tiktoken from typing import List, Dict from datetime import datetime from openai import OpenAI @@ -94,12 +94,12 @@ class OpenAIGenerator(BaseGenerator): "user": "script_generator" } - # 初始化token计数器 - try: - self.encoding = tiktoken.encoding_for_model(self.model_name) - except KeyError: - logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器") - self.encoding = tiktoken.get_encoding("cl100k_base") + # # 初始化token计数器 + # try: + # self.encoding = tiktoken.encoding_for_model(self.model_name) + # except KeyError: + # logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器") + # self.encoding = tiktoken.get_encoding("cl100k_base") def _generate(self, messages: list, params: dict) -> any: """实现OpenAI特定的生成逻辑""" diff --git a/app/utils/utils.py b/app/utils/utils.py index 49d44be..56eba09 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -197,6 +197,28 @@ def time_convert_seconds_to_hmsm(seconds) -> str: return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, milliseconds) +def format_time(seconds: float) -> str: + """ + 将秒数转换为格式化的时间字符串 (HH:MM:SS,mmm) + + 参数: + seconds: 需要转换的秒数,可以是整数或浮点数 + + 返回: + 格式化的时间字符串,格式为 HH:MM:SS,mmm + """ + # 计算小时、分钟、秒和毫秒 + hours = int(seconds // 3600) + remaining_seconds = seconds % 3600 + minutes = int(remaining_seconds // 60) + remaining_seconds = remaining_seconds % 60 + secs = int(remaining_seconds) + milliseconds = int((remaining_seconds - secs) * 1000) + + # 格式化为时间字符串 + return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, milliseconds) + + def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str: start_time = time_convert_seconds_to_hmsm(start_time) end_time = time_convert_seconds_to_hmsm(end_time) @@ -506,7 +528,7 @@ def cut_video(params, progress_callback=None): st.session_state['subclip_videos'] = subclip_videos for i, video_script in enumerate(video_script_list): try: - video_script['path'] = subclip_videos[video_script['timestamp']] + video_script['path'] = subclip_videos[i+1] 
except KeyError as err: logger.error(f"裁剪视频失败: {err}") diff --git a/app/utils/video_processor.py b/app/utils/video_processor.py index 5949e6b..1d3dd9b 100644 --- a/app/utils/video_processor.py +++ b/app/utils/video_processor.py @@ -1,237 +1,339 @@ -import cv2 -import numpy as np -from sklearn.cluster import MiniBatchKMeans +""" +视频帧提取工具 + +这个模块提供了简单高效的视频帧提取功能。主要特点: +1. 使用ffmpeg进行视频处理,支持硬件加速 +2. 按指定时间间隔提取视频关键帧 +3. 支持多种视频格式 +4. 支持高清视频帧输出 +5. 直接从原视频提取高质量关键帧 + +不依赖OpenCV和sklearn等库,只使用ffmpeg作为外部依赖,降低了安装和使用的复杂度。 +""" + import os import re -from typing import List, Tuple, Generator +import time +import subprocess +from typing import List, Dict from loguru import logger -import gc from tqdm import tqdm class VideoProcessor: - def __init__(self, video_path: str, batch_size: int = 100): + def __init__(self, video_path: str): """ 初始化视频处理器 - + Args: video_path: 视频文件路径 - batch_size: 批处理大小,控制内存使用 """ if not os.path.exists(video_path): raise FileNotFoundError(f"视频文件不存在: {video_path}") - + self.video_path = video_path - self.batch_size = batch_size - self.cap = cv2.VideoCapture(video_path) - - if not self.cap.isOpened(): - raise RuntimeError(f"无法打开视频文件: {video_path}") - - self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) - self.fps = int(self.cap.get(cv2.CAP_PROP_FPS)) + self.video_info = self._get_video_info() + self.fps = float(self.video_info.get('fps', 25)) + self.duration = float(self.video_info.get('duration', 0)) + self.width = int(self.video_info.get('width', 0)) + self.height = int(self.video_info.get('height', 0)) + self.total_frames = int(self.fps * self.duration) - def __del__(self): - """析构函数,确保视频资源被释放""" - if hasattr(self, 'cap'): - self.cap.release() - gc.collect() + def _get_video_info(self) -> Dict[str, str]: + """ + 使用ffprobe获取视频信息 - def preprocess_video(self) -> Generator[Tuple[int, np.ndarray], None, None]: - """ - 使用生成器方式分批读取视频帧 - - Yields: - Tuple[int, np.ndarray]: (帧索引, 视频帧) - """ - self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0) - frame_idx = 0 - - 
while self.cap.isOpened(): - ret, frame = self.cap.read() - if not ret: - break - - # 降低分辨率以减少内存使用 - frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5) - yield frame_idx, frame - - frame_idx += 1 - - # 定期进行垃圾回收 - if frame_idx % 1000 == 0: - gc.collect() - - def detect_shot_boundaries(self, threshold: int = 70) -> List[int]: - """ - 使用批处理方式检测镜头边界 - - Args: - threshold: 差异阈值 - Returns: - List[int]: 镜头边界帧的索引列表 + Dict[str, str]: 包含视频基本信息的字典 """ - shot_boundaries = [] - prev_frame = None - prev_idx = -1 - - pbar = tqdm(self.preprocess_video(), - total=self.total_frames, - desc="检测镜头边界", - unit="帧") - - for frame_idx, curr_frame in pbar: - if prev_frame is not None: - prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY) - curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY) - - diff = np.mean(np.abs(curr_gray.astype(float) - prev_gray.astype(float))) - if diff > threshold: - shot_boundaries.append(frame_idx) - pbar.set_postfix({"检测到边界": len(shot_boundaries)}) - - prev_frame = curr_frame.copy() - prev_idx = frame_idx - - del curr_frame - if frame_idx % 100 == 0: - gc.collect() - - return shot_boundaries + cmd = [ + "ffprobe", + "-v", "error", + "-select_streams", "v:0", + "-show_entries", "stream=width,height,r_frame_rate,duration", + "-of", "default=noprint_wrappers=1:nokey=0", + self.video_path + ] - def process_shot(self, shot_frames: List[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, int]: - """ - 处理单个镜头的帧 - - Args: - shot_frames: 镜头中的帧列表 + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + lines = result.stdout.strip().split('\n') + info = {} + for line in lines: + if '=' in line: + key, value = line.split('=', 1) + info[key] = value + # 处理帧率(可能是分数形式) + if 'r_frame_rate' in info: + try: + num, den = map(int, info['r_frame_rate'].split('/')) + info['fps'] = str(num / den) + except ValueError: + info['fps'] = info.get('r_frame_rate', '25') + + return info + + except subprocess.CalledProcessError as e: + logger.error(f"获取视频信息失败: 
{e.stderr}") + return { + 'width': '1280', + 'height': '720', + 'fps': '25', + 'duration': '0' + } + + def extract_frames_by_interval(self, output_dir: str, interval_seconds: float = 5.0, + use_hw_accel: bool = True) -> List[int]: + """ + 按指定时间间隔提取视频帧 + + Args: + output_dir: 输出目录 + interval_seconds: 帧提取间隔(秒) + use_hw_accel: 是否使用硬件加速 + Returns: - Tuple[np.ndarray, int]: (关键帧, 帧索引) + List[int]: 提取的帧号列表 """ - if not shot_frames: - return None, -1 - - frame_features = [] - frame_indices = [] + if not os.path.exists(output_dir): + os.makedirs(output_dir) - for idx, frame in tqdm(shot_frames, - desc="处理镜头帧", - unit="帧", - leave=False): - gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) - resized_gray = cv2.resize(gray, (32, 32)) - frame_features.append(resized_gray.flatten()) - frame_indices.append(idx) - - frame_features = np.array(frame_features) + # 计算起始时间和帧提取点 + start_time = 0 + end_time = self.duration + extraction_times = [] - kmeans = MiniBatchKMeans(n_clusters=1, batch_size=min(len(frame_features), 100), - random_state=0).fit(frame_features) + current_time = start_time + while current_time < end_time: + extraction_times.append(current_time) + current_time += interval_seconds - center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1)) - - return shot_frames[center_idx][1], frame_indices[center_idx] + if not extraction_times: + logger.warning("未找到需要提取的帧") + return [] - def extract_keyframes(self, shot_boundaries: List[int]) -> Generator[Tuple[np.ndarray, int], None, None]: - """ - 使用生成器方式提取关键帧 + # 确定硬件加速器选项 + hw_accel = [] + if use_hw_accel: + # 尝试检测可用的硬件加速器 + hw_accel_options = self._detect_hw_accelerator() + if hw_accel_options: + hw_accel = hw_accel_options + logger.info(f"使用硬件加速: {' '.join(hw_accel)}") + else: + logger.warning("未检测到可用的硬件加速器,使用软件解码") - Args: - shot_boundaries: 镜头边界列表 + # 提取帧 + frame_numbers = [] + for i, timestamp in enumerate(tqdm(extraction_times, desc="提取视频帧")): + frame_number = int(timestamp * self.fps) + 
frame_numbers.append(frame_number) - Yields: - Tuple[np.ndarray, int]: (关键帧, 帧索引) - """ - shot_frames = [] - current_shot_start = 0 + # 格式化时间戳字符串 (HHMMSSmmm) + hours = int(timestamp // 3600) + minutes = int((timestamp % 3600) // 60) + seconds = int(timestamp % 60) + milliseconds = int((timestamp % 1) * 1000) + time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}" + + output_path = os.path.join(output_dir, f"keyframe_{frame_number:06d}_{time_str}.jpg") + + # 使用ffmpeg提取单帧 + cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", "error", + ] + + # 添加硬件加速参数 + cmd.extend(hw_accel) + + cmd.extend([ + "-ss", str(timestamp), + "-i", self.video_path, + "-vframes", "1", + "-q:v", "1", # 最高质量 + "-y", + output_path + ]) + + try: + subprocess.run(cmd, check=True, capture_output=True) + except subprocess.CalledProcessError as e: + logger.warning(f"提取帧 {frame_number} 失败: {e.stderr}") - for frame_idx, frame in self.preprocess_video(): - if frame_idx in shot_boundaries: - if shot_frames: - keyframe, keyframe_idx = self.process_shot(shot_frames) - if keyframe is not None: - yield keyframe, keyframe_idx - - # 清理内存 - shot_frames.clear() - gc.collect() + logger.info(f"成功提取了 {len(frame_numbers)} 个视频帧") + return frame_numbers + + def _detect_hw_accelerator(self) -> List[str]: + """ + 检测系统可用的硬件加速器 + + Returns: + List[str]: 硬件加速器ffmpeg命令参数 + """ + # 检测操作系统 + import platform + system = platform.system().lower() + + # 测试不同的硬件加速器 + accelerators = [] + + if system == 'darwin': # macOS + # 测试 videotoolbox (Apple 硬件加速) + test_cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", "error", + "-hwaccel", "videotoolbox", + "-i", self.video_path, + "-t", "0.1", + "-f", "null", + "-" + ] + try: + subprocess.run(test_cmd, capture_output=True, check=True) + return ["-hwaccel", "videotoolbox"] + except subprocess.CalledProcessError: + pass - current_shot_start = frame_idx + elif system == 'linux': + # 测试 VAAPI + test_cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", "error", + "-hwaccel", 
"vaapi", + "-i", self.video_path, + "-t", "0.1", + "-f", "null", + "-" + ] + try: + subprocess.run(test_cmd, capture_output=True, check=True) + return ["-hwaccel", "vaapi"] + except subprocess.CalledProcessError: + pass - shot_frames.append((frame_idx, frame)) - - # 控制单个镜头的最大帧数 - if len(shot_frames) > self.batch_size: - keyframe, keyframe_idx = self.process_shot(shot_frames) - if keyframe is not None: - yield keyframe, keyframe_idx - shot_frames.clear() - gc.collect() + # 尝试 CUDA + test_cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", "error", + "-hwaccel", "cuda", + "-i", self.video_path, + "-t", "0.1", + "-f", "null", + "-" + ] + try: + subprocess.run(test_cmd, capture_output=True, check=True) + return ["-hwaccel", "cuda"] + except subprocess.CalledProcessError: + pass + + elif system == 'windows': + # 测试 CUDA + test_cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", "error", + "-hwaccel", "cuda", + "-i", self.video_path, + "-t", "0.1", + "-f", "null", + "-" + ] + try: + subprocess.run(test_cmd, capture_output=True, check=True) + return ["-hwaccel", "cuda"] + except subprocess.CalledProcessError: + pass + + # 测试 D3D11VA + test_cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", "error", + "-hwaccel", "d3d11va", + "-i", self.video_path, + "-t", "0.1", + "-f", "null", + "-" + ] + try: + subprocess.run(test_cmd, capture_output=True, check=True) + return ["-hwaccel", "d3d11va"] + except subprocess.CalledProcessError: + pass + + # 测试 DXVA2 + test_cmd = [ + "ffmpeg", + "-hide_banner", + "-loglevel", "error", + "-hwaccel", "dxva2", + "-i", self.video_path, + "-t", "0.1", + "-f", "null", + "-" + ] + try: + subprocess.run(test_cmd, capture_output=True, check=True) + return ["-hwaccel", "dxva2"] + except subprocess.CalledProcessError: + pass - # 处理最后一个镜头 - if shot_frames: - keyframe, keyframe_idx = self.process_shot(shot_frames) - if keyframe is not None: - yield keyframe, keyframe_idx + # 如果没有找到可用的硬件加速器 + return [] - def process_video(self, output_dir: str, 
skip_seconds: float = 0) -> None: + def process_video_pipeline(self, + output_dir: str, + interval_seconds: float = 5.0, # 帧提取间隔(秒) + use_hw_accel: bool = True) -> None: """ - 处理视频并提取关键帧,使用分批处理方式 + 执行简化的视频处理流程,直接从原视频按固定时间间隔提取帧 Args: output_dir: 输出目录 - skip_seconds: 跳过视频开头的秒数 + interval_seconds: 帧提取间隔(秒) + use_hw_accel: 是否使用硬件加速 """ + # 创建输出目录 + os.makedirs(output_dir, exist_ok=True) + try: - # 创建输出目录 - os.makedirs(output_dir, exist_ok=True) - - # 计算要跳过的帧数 - skip_frames = int(skip_seconds * self.fps) - self.cap.set(cv2.CAP_PROP_POS_FRAMES, skip_frames) - - # 检测镜头边界 - logger.info("开始检测镜头边界...") - shot_boundaries = self.detect_shot_boundaries() - - # 提取关键帧 - logger.info("开始提取关键帧...") - frame_count = 0 - - pbar = tqdm(self.extract_keyframes(shot_boundaries), - desc="提取关键帧", - unit="帧") - - for keyframe, frame_idx in pbar: - if frame_idx < skip_frames: - continue - - # 计算时间戳 - timestamp = frame_idx / self.fps - hours = int(timestamp // 3600) - minutes = int((timestamp % 3600) // 60) - seconds = int(timestamp % 60) - time_str = f"{hours:02d}{minutes:02d}{seconds:02d}" - - # 保存关键帧 - output_path = os.path.join(output_dir, - f'keyframe_{frame_idx:06d}_{time_str}.jpg') - cv2.imwrite(output_path, keyframe) - frame_count += 1 - - pbar.set_postfix({"已保存": frame_count}) - - if frame_count % 10 == 0: - gc.collect() - - logger.info(f"关键帧提取完成,共保存 {frame_count} 帧到 {output_dir}") + # 直接从原视频提取关键帧 + logger.info(f"从视频间隔 {interval_seconds} 秒提取关键帧...") + self.extract_frames_by_interval( + output_dir, + interval_seconds=interval_seconds, + use_hw_accel=use_hw_accel + ) + logger.info(f"处理完成!视频帧已保存在: {output_dir}") + except Exception as e: - logger.error(f"视频处理失败: {str(e)}") + import traceback + logger.error(f"视频处理失败: \n{traceback.format_exc()}") raise - finally: - # 确保资源被释放 - self.cap.release() - gc.collect() + + +if __name__ == "__main__": + import time + + start_time = time.time() + + # 使用示例 + processor = VideoProcessor("./resource/videos/test.mp4") + + # 设置间隔为3秒提取帧 + 
processor.process_video_pipeline( + output_dir="output", + interval_seconds=3.0, + use_hw_accel=True + ) + + end_time = time.time() + print(f"处理完成!总耗时: {end_time - start_time:.2f} 秒") diff --git a/app/utils/video_processor_v2.py b/app/utils/video_processor_v2.py deleted file mode 100644 index 825306b..0000000 --- a/app/utils/video_processor_v2.py +++ /dev/null @@ -1,382 +0,0 @@ -import cv2 -import numpy as np -from sklearn.cluster import KMeans -import os -import re -from typing import List, Tuple, Generator -from loguru import logger -import subprocess -from tqdm import tqdm - - -class VideoProcessor: - def __init__(self, video_path: str): - """ - 初始化视频处理器 - - Args: - video_path: 视频文件路径 - """ - if not os.path.exists(video_path): - raise FileNotFoundError(f"视频文件不存在: {video_path}") - - self.video_path = video_path - self.cap = cv2.VideoCapture(video_path) - - if not self.cap.isOpened(): - raise RuntimeError(f"无法打开视频文件: {video_path}") - - self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) - self.fps = int(self.cap.get(cv2.CAP_PROP_FPS)) - - def __del__(self): - """析构函数,确保视频资源被释放""" - if hasattr(self, 'cap'): - self.cap.release() - - def preprocess_video(self) -> Generator[np.ndarray, None, None]: - """ - 使用生成器方式读取视频帧 - - Yields: - np.ndarray: 视频帧 - """ - self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # 重置到视频开始 - while self.cap.isOpened(): - ret, frame = self.cap.read() - if not ret: - break - yield frame - - def detect_shot_boundaries(self, frames: List[np.ndarray], threshold: int = 30) -> List[int]: - """ - 使用帧差法检测镜头边界 - - Args: - frames: 视频帧列表 - threshold: 差异阈值,默认值调低为30 - - Returns: - List[int]: 镜头边界帧的索引列表 - """ - shot_boundaries = [] - if len(frames) < 2: # 添加帧数检查 - logger.warning("视频帧数过少,无法检测场景边界") - return [len(frames) - 1] # 返回最后一帧作为边界 - - for i in range(1, len(frames)): - prev_frame = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY) - curr_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY) - - # 计算帧差 - diff = np.mean(np.abs(curr_frame.astype(float) - 
prev_frame.astype(float))) - - if diff > threshold: - shot_boundaries.append(i) - - # 如果没有检测到任何边界,至少返回最后一帧 - if not shot_boundaries: - logger.warning("未检测到场景边界,将视频作为单个场景处理") - shot_boundaries.append(len(frames) - 1) - - return shot_boundaries - - def extract_keyframes(self, frames: List[np.ndarray], shot_boundaries: List[int]) -> Tuple[ - List[np.ndarray], List[int]]: - """ - 从每个镜头中提取关键帧 - - Args: - frames: 视频帧列表 - shot_boundaries: 镜头边界列表 - - Returns: - Tuple[List[np.ndarray], List[int]]: 关键帧列表和对应的帧索引 - """ - keyframes = [] - keyframe_indices = [] - - for i in tqdm(range(len(shot_boundaries)), desc="提取关键帧"): - start = shot_boundaries[i - 1] if i > 0 else 0 - end = shot_boundaries[i] - shot_frames = frames[start:end] - - if not shot_frames: - continue - - # 将每一帧转换为灰度图并展平为一维数组 - frame_features = np.array([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).flatten() - for frame in shot_frames]) - - try: - # 尝试使用 KMeans - kmeans = KMeans(n_clusters=1, random_state=0).fit(frame_features) - center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1)) - except Exception as e: - logger.warning(f"KMeans 聚类失败,使用备选方案: {str(e)}") - # 备选方案:选择镜头中间的帧作为关键帧 - center_idx = len(shot_frames) // 2 - - keyframes.append(shot_frames[center_idx]) - keyframe_indices.append(start + center_idx) - - return keyframes, keyframe_indices - - def save_keyframes(self, keyframes: List[np.ndarray], keyframe_indices: List[int], - output_dir: str, desc: str = "保存关键帧") -> None: - """ - 保存关键帧到指定目录,文件名格式为:keyframe_帧序号_时间戳.jpg - 时间戳精确到毫秒,格式为:HHMMSSmmm - """ - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - for keyframe, frame_idx in tqdm(zip(keyframes, keyframe_indices), - total=len(keyframes), - desc=desc): - # 计算精确到毫秒的时间戳 - timestamp = frame_idx / self.fps - hours = int(timestamp // 3600) - minutes = int((timestamp % 3600) // 60) - seconds = int(timestamp % 60) - milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分 - time_str = 
f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}" - - output_path = os.path.join(output_dir, - f'keyframe_{frame_idx:06d}_{time_str}.jpg') - cv2.imwrite(output_path, keyframe) - - def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str) -> None: - """ - 根据指定的帧号提取帧,如果多个帧在同一毫秒内,只保留一个 - """ - if not frame_numbers: - raise ValueError("未提供帧号列表") - - if any(fn >= self.total_frames or fn < 0 for fn in frame_numbers): - raise ValueError("存在无效的帧号") - - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - # 用于记录已处理的时间戳(毫秒) - processed_timestamps = set() - - for frame_number in tqdm(frame_numbers, desc="提取高清帧"): - # 计算精确到毫秒的时间戳 - timestamp = frame_number / self.fps - timestamp_ms = int(timestamp * 1000) # 转换为毫秒 - - # 如果这一毫秒已经处理过,跳过 - if timestamp_ms in processed_timestamps: - continue - - self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number) - ret, frame = self.cap.read() - - if ret: - # 记录这一毫秒已经处理 - processed_timestamps.add(timestamp_ms) - - # 计算时间戳字符串 - hours = int(timestamp // 3600) - minutes = int((timestamp % 3600) // 60) - seconds = int(timestamp % 60) - milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分 - time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}" - - output_path = os.path.join(output_folder, - f"keyframe_{frame_number:06d}_{time_str}.jpg") - cv2.imwrite(output_path, frame) - else: - logger.info(f"无法读取帧 {frame_number}") - - logger.info(f"共提取了 {len(processed_timestamps)} 个不同时间戳的帧") - - @staticmethod - def extract_numbers_from_folder(folder_path: str) -> List[int]: - """ - 从文件夹中提取帧号 - - Args: - folder_path: 关键帧文件夹路径 - - Returns: - List[int]: 排序后的帧号列表 - """ - files = [f for f in os.listdir(folder_path) if f.endswith('.jpg')] - # 更新正则表达式以匹配新的文件名格式:keyframe_000123_010534123.jpg - pattern = re.compile(r'keyframe_(\d+)_\d{9}\.jpg$') - numbers = [] - - for f in files: - match = pattern.search(f) - if match: - numbers.append(int(match.group(1))) - else: - logger.warning(f"文件名格式不匹配: {f}") - - if not 
numbers: - logger.error(f"在目录 {folder_path} 中未找到有效的关键帧文件") - - return sorted(numbers) - - def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int = 30) -> None: - """ - 处理视频并提取关键帧 - - Args: - output_dir: 输出目录 - skip_seconds: 跳过视频开头的秒数 - """ - skip_frames = int(skip_seconds * self.fps) - - logger.info("读取视频帧...") - frames = [] - for frame in tqdm(self.preprocess_video(), - total=self.total_frames, - desc="读取视频"): - frames.append(frame) - - frames = frames[skip_frames:] - - if not frames: - raise ValueError(f"跳过 {skip_seconds} 秒后没有剩余帧可以处理") - - logger.info("检测场景边界...") - shot_boundaries = self.detect_shot_boundaries(frames, threshold) - logger.info(f"检测到 {len(shot_boundaries)} 个场景边界") - - keyframes, keyframe_indices = self.extract_keyframes(frames, shot_boundaries) - - adjusted_indices = [idx + skip_frames for idx in keyframe_indices] - self.save_keyframes(keyframes, adjusted_indices, output_dir, desc="保存压缩关键帧") - - def process_video_pipeline(self, - output_dir: str, - skip_seconds: float = 0, - threshold: int = 20, # 降低默认阈值 - compressed_width: int = 320, - keep_temp: bool = False) -> None: - """ - 执行完整的视频处理流程 - - Args: - threshold: 降低默认阈值为20,使场景检测更敏感 - """ - os.makedirs(output_dir, exist_ok=True) - temp_dir = os.path.join(output_dir, 'temp') - compressed_dir = os.path.join(temp_dir, 'compressed') - mini_frames_dir = os.path.join(temp_dir, 'mini_frames') - hd_frames_dir = output_dir - - os.makedirs(temp_dir, exist_ok=True) - os.makedirs(compressed_dir, exist_ok=True) - os.makedirs(mini_frames_dir, exist_ok=True) - os.makedirs(hd_frames_dir, exist_ok=True) - - mini_processor = None - compressed_video = None - - try: - # 1. 
压缩视频 - video_name = os.path.splitext(os.path.basename(self.video_path))[0] - compressed_video = os.path.join(compressed_dir, f"{video_name}_compressed.mp4") - - # 获取原始视频的宽度和高度 - original_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - original_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - - logger.info("步骤1: 压缩视频...") - if original_width > original_height: - # 横版视频 - scale_filter = f'scale={compressed_width}:-1' - else: - # 竖版视频 - scale_filter = f'scale=-1:{compressed_width}' - - ffmpeg_cmd = [ - 'ffmpeg', '-i', self.video_path, - '-vf', scale_filter, - '-y', - compressed_video - ] - - try: - subprocess.run(ffmpeg_cmd, check=True, capture_output=True, text=True) - except subprocess.CalledProcessError as e: - logger.error(f"FFmpeg 错误输出: {e.stderr}") - raise - - # 2. 从压缩视频中提取关键帧 - logger.info("\n步骤2: 从压缩视频提取关键帧...") - mini_processor = VideoProcessor(compressed_video) - mini_processor.process_video(mini_frames_dir, skip_seconds, threshold) - - # 3. 从原始视频提取高清关键帧 - logger.info("\n步骤3: 提取高清关键帧...") - frame_numbers = self.extract_numbers_from_folder(mini_frames_dir) - - if not frame_numbers: - raise ValueError("未能从压缩视频中提取到有效的关键帧") - - self.extract_frames_by_numbers(frame_numbers, hd_frames_dir) - - logger.info(f"处理完成!高清关键帧保存在: {hd_frames_dir}") - - except Exception as e: - import traceback - logger.error(f"视频处理失败: \n{traceback.format_exc()}") - raise - - finally: - # 释放资源 - if mini_processor: - mini_processor.cap.release() - del mini_processor - - # 确保视频文件句柄被释放 - if hasattr(self, 'cap'): - self.cap.release() - - # 等待资源释放 - import time - time.sleep(0.5) - - if not keep_temp: - try: - # 先删除压缩视频文件 - if compressed_video and os.path.exists(compressed_video): - try: - os.remove(compressed_video) - except Exception as e: - logger.warning(f"删除压缩视频失败: {e}") - - # 再删除临时目录 - import shutil - if os.path.exists(temp_dir): - max_retries = 3 - for i in range(max_retries): - try: - shutil.rmtree(temp_dir) - break - except Exception as e: - if i == max_retries - 1: - 
logger.warning(f"清理临时文件失败: {e}") - else: - time.sleep(1) # 等待1秒后重试 - continue - - logger.info("临时文件已清理") - except Exception as e: - logger.warning(f"清理临时文件时出错: {e}") - - -if __name__ == "__main__": - import time - - start_time = time.time() - processor = VideoProcessor("E:\\projects\\NarratoAI\\resource\\videos\\test.mp4") - processor.process_video_pipeline(output_dir="output") - end_time = time.time() - print(f"处理完成!总耗时: {end_time - start_time:.2f} 秒") diff --git a/config.example.toml b/config.example.toml index 5620744..762651b 100644 --- a/config.example.toml +++ b/config.example.toml @@ -1,10 +1,9 @@ [app] - project_version="0.5.3" + project_version="0.6.0" # 支持视频理解的大模型提供商 # gemini # qwenvl vision_llm_provider="qwenvl" - vision_analysis_prompt = "你是资深视频内容分析专家,擅长分析视频画面信息,分析下面视频画面内容,只输出客观的画面描述不要给任何总结或评价" ########## Vision Gemini API Key vision_gemini_api_key = "" @@ -173,12 +172,7 @@ speech_region="" [frames] - skip_seconds = 0 - # threshold(差异阈值)用于判断两个连续帧之间是否发生了场景切换 - # 较小的阈值(如 20):更敏感,能捕捉到细微的场景变化,但可能会误判,关键帧图片更多 - # 较大的阈值(如 40):更保守,只捕捉明显的场景切换,但可能会漏掉渐变场景,关键帧图片更少 - # 默认值 30:在实践中是一个比较平衡的选择 - threshold = 30 - version = "v2" + # 提取关键帧的间隔时间 + frame_interval_input = 3 # 大模型单次处理的关键帧数量 - vision_batch_size = 5 + vision_batch_size = 10 diff --git a/requirements.txt b/requirements.txt index 55c7972..ffc3dc5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,38 +1,46 @@ -requests~=2.31.0 -moviepy==2.0.0.dev2 -faster-whisper~=1.0.1 -uvicorn~=0.27.1 -fastapi~=0.115.4 -tomli~=2.0.1 -streamlit~=1.40.0 -loguru~=0.7.2 -aiohttp~=3.10.10 -urllib3~=2.2.1 -pydantic~=2.6.3 -g4f~=0.3.0.4 -dashscope~=1.15.0 -google.generativeai>=0.8.3 -python-multipart~=0.0.9 -redis==5.0.3 -opencv-python~=4.10.0.84 -# for azure speech -# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471 -azure-cognitiveservices-speech~=1.37.0 -git-changelog~=2.5.2 -watchdog==5.0.2 -pydub==0.25.1 -psutil>=5.9.0 
-opencv-python~=4.10.0.84 -scikit-learn~=1.5.2 -google-generativeai~=0.8.3 -pillow==10.3.0 -python-dotenv~=1.0.1 -openai~=1.53.0 -tqdm>=4.66.6 -tenacity>=9.0.0 -tiktoken==0.8.0 -yt-dlp==2024.11.18 -pysrt==1.1.2 -httpx==0.27.2 -transformers==4.47.0 +# 必须项 +requests~=2.32.0 +moviepy==2.1.1 edge-tts==6.1.19 +streamlit~=1.45.0 +watchdog==6.0.0 +loguru~=0.7.3 +tomli~=2.2.1 +pydub==0.25.1 + +openai~=1.77.0 +google-generativeai>=0.8.5 + +# 待优化项 +# opencv-python==4.11.0.86 +# scikit-learn==1.6.1 + +# fastapi~=0.115.4 +# uvicorn~=0.27.1 +# pydantic~=2.11.4 + +# faster-whisper~=1.0.1 +# tomli~=2.0.1 +# aiohttp~=3.10.10 +# httpx==0.27.2 +# urllib3~=2.2.1 + +# python-multipart~=0.0.9 +# redis==5.0.3 +# opencv-python~=4.10.0.84 +# azure-cognitiveservices-speech~=1.37.0 +# git-changelog~=2.5.2 +# watchdog==5.0.2 +# pydub==0.25.1 +# psutil>=5.9.0 +# scikit-learn~=1.5.2 +# pillow==10.3.0 +# python-dotenv~=1.0.1 + +# tqdm>=4.66.6 +# tenacity>=9.0.0 +# tiktoken==0.8.0 +# pysrt==1.1.2 +# transformers==4.50.0 + +# yt-dlp==2025.4.30 \ No newline at end of file diff --git a/webui.py b/webui.py index 434cbb9..7c65df6 100644 --- a/webui.py +++ b/webui.py @@ -1,13 +1,14 @@ import streamlit as st import os import sys -from uuid import uuid4 +from loguru import logger from app.config import config -from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings, merge_settings, system_settings +from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \ + review_settings, merge_settings, system_settings from webui.utils import cache, file_utils from app.utils import utils from app.models.schema import VideoClipParams, VideoAspect -from webui.utils.performance import PerformanceMonitor + # 初始化配置 - 必须是第一个 Streamlit 命令 st.set_page_config( @@ -17,7 +18,7 @@ st.set_page_config( initial_sidebar_state="auto", menu_items={ "Report a bug": 
"https://github.com/linyqh/NarratoAI/issues", - 'About': f"# NarratoAI:sunglasses: 📽️ \n #### Version: v{config.project_version} \n " + 'About': f"# Narrato:blue[AI] :sunglasses: 📽️ \n #### Version: v{config.project_version} \n " f"自动化影视解说视频详情请移步:https://github.com/linyqh/NarratoAI" }, ) @@ -28,6 +29,7 @@ hide_streamlit_style = """ """ st.markdown(hide_streamlit_style, unsafe_allow_html=True) + def init_log(): """初始化日志配置""" from loguru import logger @@ -35,17 +37,7 @@ def init_log(): _lvl = "DEBUG" def format_record(record): - # 增加更多需要过滤的警告消息 - ignore_messages = [ - "Examining the path of torch.classes raised", - "torch.cuda.is_available()", - "CUDA initialization" - ] - - for msg in ignore_messages: - if msg in record["message"]: - return "" - + # 简化日志格式化处理,不尝试按特定字符串过滤torch相关内容 file_path = record["file"].path relative_path = os.path.relpath(file_path, config.root_dir) record["file"].path = f"./{relative_path}" @@ -57,23 +49,54 @@ def init_log(): '- {message}' + "\n" return _format - # 优化日志过滤器 - def log_filter(record): - ignore_messages = [ - "Examining the path of torch.classes raised", - "torch.cuda.is_available()", - "CUDA initialization" - ] - return not any(msg in record["message"] for msg in ignore_messages) - + # 替换为更简单的过滤方式,避免在过滤时访问message内容 + # 此处先不设置复杂的过滤器,等应用启动后再动态添加 logger.add( sys.stdout, level=_lvl, format=format_record, - colorize=True, - filter=log_filter + colorize=True ) + # 应用启动后,可以再添加更复杂的过滤器 + def setup_advanced_filters(): + """在应用完全启动后设置高级过滤器""" + try: + for handler_id in logger._core.handlers: + logger.remove(handler_id) + + # 重新添加带有高级过滤的处理器 + def advanced_filter(record): + """更复杂的过滤器,在应用启动后安全使用""" + ignore_messages = [ + "Examining the path of torch.classes raised", + "torch.cuda.is_available()", + "CUDA initialization" + ] + return not any(msg in record["message"] for msg in ignore_messages) + + logger.add( + sys.stdout, + level=_lvl, + format=format_record, + colorize=True, + filter=advanced_filter + ) + except Exception as e: + # 
如果过滤器设置失败,确保日志仍然可用 + logger.add( + sys.stdout, + level=_lvl, + format=format_record, + colorize=True + ) + logger.error(f"设置高级日志过滤器失败: {e}") + + # 将高级过滤器设置放到启动主逻辑后 + import threading + threading.Timer(5.0, setup_advanced_filters).start() + + def init_global_state(): """初始化全局状态""" if 'video_clip_json' not in st.session_state: @@ -85,6 +108,7 @@ def init_global_state(): if 'subclip_videos' not in st.session_state: st.session_state['subclip_videos'] = {} + def tr(key): """翻译函数""" i18n_dir = os.path.join(os.path.dirname(__file__), "webui", "i18n") @@ -92,90 +116,94 @@ def tr(key): loc = locales.get(st.session_state['ui_language'], {}) return loc.get("Translation", {}).get(key, key) + def render_generate_button(): """渲染生成按钮和处理逻辑""" if st.button(tr("Generate Video"), use_container_width=True, type="primary"): + from app.services import task as tm + + # 重置日志容器和记录 + log_container = st.empty() + log_records = [] + + def log_received(msg): + with log_container: + log_records.append(msg) + st.code("\n".join(log_records)) + + from loguru import logger + logger.add(log_received) + + config.save_config() + task_id = st.session_state.get('task_id') + + if not task_id: + st.error(tr("请先裁剪视频")) + return + if not st.session_state.get('video_clip_json_path'): + st.error(tr("脚本文件不能为空")) + return + if not st.session_state.get('video_origin_path'): + st.error(tr("视频文件不能为空")) + return + + st.toast(tr("生成视频")) + logger.info(tr("开始生成视频")) + + # 获取所有参数 + script_params = script_settings.get_script_params() + video_params = video_settings.get_video_params() + audio_params = audio_settings.get_audio_params() + subtitle_params = subtitle_settings.get_subtitle_params() + + # 合并所有参数 + all_params = { + **script_params, + **video_params, + **audio_params, + **subtitle_params + } + + # 创建参数对象 + params = VideoClipParams(**all_params) + + result = tm.start_subclip( + task_id=task_id, + params=params, + subclip_path_videos=st.session_state['subclip_videos'] + ) + + video_files = result.get("videos", 
[]) + st.success(tr("视生成完成")) + try: - from app.services import task as tm - import torch - - # 重置日志容器和记录 - log_container = st.empty() - log_records = [] + if video_files: + player_cols = st.columns(len(video_files) * 2 + 1) + for i, url in enumerate(video_files): + player_cols[i * 2 + 1].video(url) + except Exception as e: + logger.error(f"播放视频失败: {e}") - def log_received(msg): - with log_container: - log_records.append(msg) - st.code("\n".join(log_records)) + file_utils.open_task_folder(config.root_dir, task_id) + logger.info(tr("视频生成完成")) - from loguru import logger - logger.add(log_received) - - config.save_config() - task_id = st.session_state.get('task_id') - - if not task_id: - st.error(tr("请先裁剪视频")) - return - if not st.session_state.get('video_clip_json_path'): - st.error(tr("脚本文件不能为空")) - return - if not st.session_state.get('video_origin_path'): - st.error(tr("视频文件不能为空")) - return - - st.toast(tr("生成视频")) - logger.info(tr("开始生成视频")) - - # 获取所有参数 - script_params = script_settings.get_script_params() - video_params = video_settings.get_video_params() - audio_params = audio_settings.get_audio_params() - subtitle_params = subtitle_settings.get_subtitle_params() - - # 合并所有参数 - all_params = { - **script_params, - **video_params, - **audio_params, - **subtitle_params - } - - # 创建参数对象 - params = VideoClipParams(**all_params) - - result = tm.start_subclip( - task_id=task_id, - params=params, - subclip_path_videos=st.session_state['subclip_videos'] - ) - - video_files = result.get("videos", []) - st.success(tr("视生成完成")) - - try: - if video_files: - player_cols = st.columns(len(video_files) * 2 + 1) - for i, url in enumerate(video_files): - player_cols[i * 2 + 1].video(url) - except Exception as e: - logger.error(f"播放视频失败: {e}") - - file_utils.open_task_folder(config.root_dir, task_id) - logger.info(tr("视频生成完成")) - - finally: - PerformanceMonitor.cleanup_resources() def main(): """主函数""" init_log() init_global_state() - utils.init_resources() - st.title(f"NarratoAI 
:sunglasses:📽️") + # 仅初始化基本资源,避免过早地加载依赖PyTorch的资源 + # 检查是否能分解utils.init_resources()为基本资源和高级资源(如依赖PyTorch的资源) + try: + utils.init_resources() + except Exception as e: + logger.warning(f"资源初始化时出现警告: {e}") + + st.title(f"Narrato:blue[AI]:sunglasses: 📽️") st.write(tr("Get Help")) - + + # 首先渲染不依赖PyTorch的UI部分 # 渲染基础设置面板 basic_settings.render_basic_settings(tr) # 渲染合并设置 @@ -190,14 +218,18 @@ def main(): audio_settings.render_audio_panel(tr) with panel[2]: subtitle_settings.render_subtitle_panel(tr) - # 渲染系统设置面板 - system_settings.render_system_panel(tr) # 渲染视频审查面板 review_settings.render_review_panel(tr) - # 渲染生成按钮和处理逻辑 + # 放到最后渲染可能使用PyTorch的部分 + # 渲染系统设置面板 + with panel[2]: + system_settings.render_system_panel(tr) + + # 放到最后渲染生成按钮和处理逻辑 render_generate_button() + if __name__ == "__main__": main() diff --git a/webui/__init__.py b/webui/__init__.py index 3c0a334..4d5f92e 100644 --- a/webui/__init__.py +++ b/webui/__init__.py @@ -8,7 +8,7 @@ from webui.components import ( audio_settings, subtitle_settings ) -from webui.utils import cache, file_utils, performance +from webui.utils import cache, file_utils __all__ = [ 'config', @@ -17,6 +17,5 @@ __all__ = [ 'audio_settings', 'subtitle_settings', 'cache', - 'file_utils', - 'performance' + 'file_utils' ] \ No newline at end of file diff --git a/webui/components/basic_settings.py b/webui/components/basic_settings.py index e6165fe..cae4c16 100644 --- a/webui/components/basic_settings.py +++ b/webui/components/basic_settings.py @@ -1,7 +1,10 @@ +import traceback + import streamlit as st import os from app.config import config from app.utils import utils +from loguru import logger def render_basic_settings(tr): @@ -266,7 +269,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr): elif provider.lower() == 'moonshot': base_url = "https://api.moonshot.cn/v1" elif provider.lower() == 'deepseek': - base_url = "https://api.deepseek.com/v1" + base_url = "https://api.deepseek.com" # 构建测试URL test_url = 
f"{base_url.rstrip('/')}/chat/completions" @@ -288,7 +291,7 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr): "messages": [ {"role": "user", "content": "直接回复我文本'当前网络可用'"} ], - "max_tokens": 10 + "stream": False } # 发送测试请求 @@ -296,7 +299,6 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr): test_url, headers=headers, json=test_data, - timeout=10 ) if response.status_code == 200: @@ -313,7 +315,7 @@ def render_text_llm_settings(tr): st.subheader(tr("Text Generation Model Settings")) # 文案生成模型提供商选择 - text_providers = ['DeepSeek', 'OpenAI', 'Qwen', 'Moonshot', 'Gemini'] + text_providers = ['DeepSeek', 'OpenAI', 'Siliconflow', 'Qwen', 'Moonshot', 'Gemini'] saved_text_provider = config.app.get("text_llm_provider", "DeepSeek").lower() saved_provider_index = 0 @@ -331,9 +333,9 @@ def render_text_llm_settings(tr): config.app["text_llm_provider"] = text_provider # 获取已保存的文本模型配置 - text_api_key = config.app.get(f"text_{text_provider}_api_key", "") - text_base_url = config.app.get(f"text_{text_provider}_base_url", "") - text_model_name = config.app.get(f"text_{text_provider}_model_name", "") + text_api_key = config.app.get(f"text_{text_provider}_api_key") + text_base_url = config.app.get(f"text_{text_provider}_base_url") + text_model_name = config.app.get(f"text_{text_provider}_model_name") # 渲染文本模型配置输入框 st_text_api_key = st.text_input(tr("Text API Key"), value=text_api_key, type="password") @@ -342,6 +344,8 @@ def render_text_llm_settings(tr): # 添加测试按钮 if st.button(tr("Test Connection"), key="test_text_connection"): + logger.debug(st_text_base_url) + logger.debug(st_text_model_name) with st.spinner(tr("Testing connection...")): success, message = test_text_model_connection( api_key=st_text_api_key, @@ -364,11 +368,11 @@ def render_text_llm_settings(tr): if st_text_model_name: config.app[f"text_{text_provider}_model_name"] = st_text_model_name - # Cloudflare 特殊配置 - if text_provider == 'cloudflare': - st_account_id = 
st.text_input( - tr("Account ID"), - value=config.app.get(f"text_{text_provider}_account_id", "") - ) - if st_account_id: - config.app[f"text_{text_provider}_account_id"] = st_account_id + # # Cloudflare 特殊配置 + # if text_provider == 'cloudflare': + # st_account_id = st.text_input( + # tr("Account ID"), + # value=config.app.get(f"text_{text_provider}_account_id", "") + # ) + # if st_account_id: + # config.app[f"text_{text_provider}_account_id"] = st_account_id diff --git a/webui/components/merge_settings.py b/webui/components/merge_settings.py index 99b8b43..edaa183 100644 --- a/webui/components/merge_settings.py +++ b/webui/components/merge_settings.py @@ -285,8 +285,8 @@ def render_merge_settings(tr): error_message = str(e) if "moviepy" in error_message.lower(): st.error(tr("Error processing video files. Please check if the videos are valid MP4 files.")) - elif "pysrt" in error_message.lower(): - st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files.")) + # elif "pysrt" in error_message.lower(): + # st.error(tr("Error processing subtitle files. 
Please check if the subtitles are valid SRT files.")) else: st.error(f"{tr('Error during merge')}: {error_message}") diff --git a/webui/components/review_settings.py b/webui/components/review_settings.py index 932ec9b..c4f3bce 100644 --- a/webui/components/review_settings.py +++ b/webui/components/review_settings.py @@ -33,7 +33,7 @@ def render_video_item(tr, video_list, subclip_videos, index): video_script = video_list[index] # 显示时间戳 - timestamp = video_script.get('timestamp', '') + timestamp = video_script.get('_id', '') st.text_area( tr("Timestamp"), value=timestamp, diff --git a/webui/components/script_settings.py b/webui/components/script_settings.py index ac5c76e..5893dc9 100644 --- a/webui/components/script_settings.py +++ b/webui/components/script_settings.py @@ -47,7 +47,7 @@ def render_script_file(tr, params): (tr("None"), ""), (tr("Auto Generate"), "auto"), (tr("Short Generate"), "short"), - (tr("Upload Script"), "upload_script") # 新增上传脚本选项 + (tr("Upload Script"), "upload_script") ] # 获取已有脚本文件 @@ -214,38 +214,25 @@ def render_script_buttons(tr, params): # 根据脚本类型显示不同的设置 if script_path != "short": # 非短视频模式下显示原有的三个输入框 - input_cols = st.columns(3) + input_cols = st.columns(2) with input_cols[0]: - skip_seconds = st.number_input( - "skip_seconds", + st.number_input( + tr("Frame Interval (seconds)"), min_value=0, - value=st.session_state.get('skip_seconds', config.frames.get('skip_seconds', 0)), - help=tr("Skip the first few seconds"), - key="skip_seconds_input" + value=st.session_state.get('frame_interval_input', config.frames.get('frame_interval_input', 3)), + help=tr("Frame Interval (seconds) (More keyframes consume more tokens)"), + key="frame_interval_input" ) - st.session_state['skip_seconds'] = skip_seconds - + with input_cols[1]: - threshold = st.number_input( - "threshold", + st.number_input( + tr("Batch Size"), min_value=0, - value=st.session_state.get('threshold', config.frames.get('threshold', 30)), - help=tr("Difference threshold"), - 
key="threshold_input" + value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 10)), + help=tr("Batch Size (More keyframes consume more tokens)"), + key="vision_batch_size" ) - st.session_state['threshold'] = threshold - - with input_cols[2]: - vision_batch_size = st.number_input( - "vision_batch_size", - min_value=1, - max_value=20, - value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 5)), - help=tr("Vision processing batch size"), - key="vision_batch_size_input" - ) - st.session_state['vision_batch_size'] = vision_batch_size # 生成/加载按钮 if script_path == "auto": @@ -259,7 +246,8 @@ def render_script_buttons(tr, params): if st.button(button_name, key="script_action", disabled=not script_path): if script_path == "auto": - generate_script_docu(tr, params) + # 执行纪录片视频脚本生成(视频无字幕无配音) + generate_script_docu(params) elif script_path == "short": # 获取自定义片段数量参数 custom_clips = st.session_state.get('custom_clips', 5) @@ -366,12 +354,11 @@ def crop_video(tr, params): utils.cut_video(params, update_progress) time.sleep(0.5) progress_bar.progress(100) - status_text.text("剪完成!") st.success("视频剪辑成功完成!") except Exception as e: st.error(f"剪辑过程中发生错误: {str(e)}") finally: - time.sleep(2) + time.sleep(1) progress_bar.empty() status_text.empty() diff --git a/webui/components/subtitle_settings.py b/webui/components/subtitle_settings.py index cb624dc..ee27985 100644 --- a/webui/components/subtitle_settings.py +++ b/webui/components/subtitle_settings.py @@ -127,7 +127,7 @@ def get_subtitle_params(): 'font_name': st.session_state.get('font_name', ''), 'font_size': st.session_state.get('font_size', 60), 'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'), - 'position': st.session_state.get('subtitle_position', 'bottom'), + 'subtitle_position': st.session_state.get('subtitle_position', 'bottom'), 'custom_position': st.session_state.get('custom_position', 70.0), 'stroke_color': 
st.session_state.get('stroke_color', '#000000'), 'stroke_width': st.session_state.get('stroke_width', 1.5), diff --git a/webui/i18n/en.json b/webui/i18n/en.json index e0f2900..63a2c36 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -85,6 +85,7 @@ "TTS Provider": "TTS Provider", "Hide Log": "Hide Log", "Upload Local Files": "Upload Local Files", - "File Uploaded Successfully": "File Uploaded Successfully" + "File Uploaded Successfully": "File Uploaded Successfully", + "Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)" } } \ No newline at end of file diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index beb48ec..6aa7fbc 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -115,7 +115,6 @@ "Text Generation Model Settings": "文案生成模型设置", "LLM Model Name": "大语言模型名称", "LLM Model API Key": "大语言模型 API 密钥", - "Batch Size": "批处理大小", "Text Model Provider": "文案生成模型提供商", "Text API Key": "文案生成 API 密钥", "Text Base URL": "文案生成接口地址", @@ -192,6 +191,10 @@ "Generate Short Video Script": "AI生成短剧混剪脚本", "Adjust the volume of the original audio": "调整原始音频的音量", "Original Volume": "视频音量", - "Auto Generate": "纪录片解说 (画面解说)" + "Auto Generate": "纪录片解说 (画面解说)", + "Frame Interval (seconds)": "帧间隔 (秒)", + "Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)", + "Batch Size": "批处理大小", + "Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多" } -} +} \ No newline at end of file diff --git a/webui/tools/generate_script_docu.py b/webui/tools/generate_script_docu.py index 6552ebf..5f958ba 100644 --- a/webui/tools/generate_script_docu.py +++ b/webui/tools/generate_script_docu.py @@ -5,20 +5,23 @@ import time import asyncio import traceback import requests +from app.utils import video_processor import streamlit as st from loguru import logger from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry +from datetime import datetime from app.config import config from 
app.utils.script_generator import ScriptProcessor -from app.utils import utils, video_processor, video_processor_v2, qwenvl_analyzer +from app.utils import utils, video_processor, qwenvl_analyzer from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config -def generate_script_docu(tr, params): +def generate_script_docu(params): """ 生成 纪录片 视频脚本 + 要求: 原视频无字幕无配音 + 适合场景: 纪录片、动物搞笑解说、荒野建造等 """ progress_bar = st.progress(0) status_text = st.empty() @@ -35,8 +38,9 @@ def generate_script_docu(tr, params): if not params.video_origin_path: st.error("请先选择视频文件") return - - # ===================提取键帧=================== + """ + 1. 提取键帧 + """ update_progress(10, "正在提取关键帧...") # 创建临时目录用于存储关键帧 @@ -64,21 +68,12 @@ def generate_script_docu(tr, params): os.makedirs(video_keyframes_dir, exist_ok=True) # 初始化视频处理器 - if config.frames.get("version") == "v2": - processor = video_processor_v2.VideoProcessor(params.video_origin_path) - # 处理视频并提取关键帧 - processor.process_video_pipeline( - output_dir=video_keyframes_dir, - skip_seconds=st.session_state.get('skip_seconds'), - threshold=st.session_state.get('threshold') - ) - else: - processor = video_processor.VideoProcessor(params.video_origin_path) - # 处理视频并提取关键帧 - processor.process_video( - output_dir=video_keyframes_dir, - skip_seconds=0 - ) + processor = video_processor.VideoProcessor(params.video_origin_path) + # 处理视频并提取关键帧 + processor.process_video_pipeline( + output_dir=video_keyframes_dir, + interval_seconds=st.session_state.get('frame_interval_input'), + ) # 获取所有关键文件路径 for filename in sorted(os.listdir(video_keyframes_dir)): @@ -101,9 +96,11 @@ def generate_script_docu(tr, params): raise Exception(f"关键帧提取失败: {str(e)}") - # 根据不同的 LLM 提供商处理 + """ + 2. 
视觉分析(批量分析每一帧) + """ vision_llm_provider = st.session_state.get('vision_llm_providers').lower() - logger.debug(f"Vision LLM 提供商: {vision_llm_provider}") + logger.debug(f"VLM 视觉大模型提供商: {vision_llm_provider}") try: # ===================初始化视觉分析器=================== @@ -137,111 +134,240 @@ def generate_script_docu(tr, params): # 执行异步分析 vision_batch_size = st.session_state.get('vision_batch_size') or config.frames.get("vision_batch_size") + vision_analysis_prompt = """ +我提供了 %s 张视频帧,它们按时间顺序排列,代表一个连续的视频片段。请仔细分析每一帧的内容,并关注帧与帧之间的变化,以理解整个片段的活动。 + +首先,请详细描述每一帧的关键视觉信息(包含:主要内容、人物、动作和场景)。 +然后,基于所有帧的分析,请用**简洁的语言**总结整个视频片段中发生的主要活动或事件流程。 + +请务必使用 JSON 格式输出你的结果。JSON 结构应如下: +{ + "frame_observations": [ + { + "frame_number": 1, // 或其他标识帧的方式 + "observation": "描述每张视频帧中的主要内容、人物、动作和场景。" + }, + // ... 更多帧的观察 ... + ], + "overall_activity_summary": "在这里填写你总结的整个片段的主要活动,保持简洁。" +} + +请务必不要遗漏视频帧,我提供了 %s 张视频帧,frame_observations 必须包含 %s 个元素 + +请只返回 JSON 字符串,不要包含任何其他解释性文字。 + """ results = loop.run_until_complete( analyzer.analyze_images( images=keyframe_files, - prompt=config.app.get('vision_analysis_prompt'), + prompt=vision_analysis_prompt, batch_size=vision_batch_size ) ) loop.close() + """ + 3. 
处理分析结果(格式化为 json 数据) + """ # ===================处理分析结果=================== update_progress(60, "正在整理分析结果...") - # 合并所有批次的析结果 + # 合并所有批次的分析结果 frame_analysis = "" + merged_frame_observations = [] # 合并所有批次的帧观察 + overall_activity_summaries = [] # 合并所有批次的整体总结 prev_batch_files = None - + frame_counter = 1 # 初始化帧计数器,用于给所有帧分配连续的序号 + # logger.debug(json.dumps(results, indent=4, ensure_ascii=False)) + # 确保分析目录存在 + analysis_dir = os.path.join(utils.storage_dir(), "temp", "analysis") + os.makedirs(analysis_dir, exist_ok=True) + origin_res = os.path.join(analysis_dir, "frame_analysis.json") + with open(origin_res, 'w', encoding='utf-8') as f: + json.dump(results, f, ensure_ascii=False, indent=2) + + # 开始处理 for result in results: if 'error' in result: logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}") - - # 获取当前批次的文件列表 keyframe_001136_000045.jpg 将 000045 精度提升到 毫秒 + continue + + # 获取当前批次的文件列表 batch_files = get_batch_files(keyframe_files, result, vision_batch_size) logger.debug(f"批次 {result['batch_index']} 处理完成,共 {len(batch_files)} 张图片") - # logger.debug(batch_files) - - first_timestamp, last_timestamp, _ = get_batch_timestamps(batch_files, prev_batch_files) + + # 获取批次的时间戳范围 + first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files) logger.debug(f"处理时间戳: {first_timestamp}-{last_timestamp}") - - # 添加带时间戳的分析结果 - frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n" - frame_analysis += result['response'] - frame_analysis += "\n" - + + # 解析响应中的JSON数据 + response_text = result['response'] + try: + # 处理可能包含```json```格式的响应 + if "```json" in response_text: + json_content = response_text.split("```json")[1].split("```")[0].strip() + elif "```" in response_text: + json_content = response_text.split("```")[1].split("```")[0].strip() + else: + json_content = response_text.strip() + + response_data = json.loads(json_content) + + # 提取frame_observations和overall_activity_summary + if "frame_observations" in 
response_data: + frame_obs = response_data["frame_observations"] + overall_summary = response_data.get("overall_activity_summary", "") + + # 添加时间戳信息到每个帧观察 + for i, obs in enumerate(frame_obs): + if i < len(batch_files): + # 从文件名中提取时间戳 + file_path = batch_files[i] + file_name = os.path.basename(file_path) + # 提取时间戳字符串 (格式如: keyframe_000675_000027000.jpg) + # 格式解析: keyframe_帧序号_毫秒时间戳.jpg + timestamp_parts = file_name.split('_') + if len(timestamp_parts) >= 3: + timestamp_str = timestamp_parts[-1].split('.')[0] + try: + # 修正时间戳解析逻辑 + # 格式为000100000,表示00:01:00,000,即1分钟 + # 需要按照对应位数进行解析: + # 前两位是小时,中间两位是分钟,后面是秒和毫秒 + if len(timestamp_str) >= 9: # 确保格式正确 + hours = int(timestamp_str[0:2]) + minutes = int(timestamp_str[2:4]) + seconds = int(timestamp_str[4:6]) + milliseconds = int(timestamp_str[6:9]) + + # 计算总秒数 + timestamp_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000 + formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳 + else: + # 兼容旧的解析方式 + timestamp_seconds = int(timestamp_str) / 1000 # 转换为秒 + formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳 + except ValueError: + logger.warning(f"无法解析时间戳: {timestamp_str}") + timestamp_seconds = 0 + formatted_time = "00:00:00,000" + else: + logger.warning(f"文件名格式不符合预期: {file_name}") + timestamp_seconds = 0 + formatted_time = "00:00:00,000" + + # 添加额外信息到帧观察 + obs["frame_path"] = file_path + obs["timestamp"] = formatted_time + obs["timestamp_seconds"] = timestamp_seconds + obs["batch_index"] = result['batch_index'] + + # 使用全局递增的帧计数器替换原始的frame_number + if "frame_number" in obs: + obs["original_frame_number"] = obs["frame_number"] # 保留原始编号作为参考 + obs["frame_number"] = frame_counter # 赋值连续的帧编号 + frame_counter += 1 # 增加帧计数器 + + # 添加到合并列表 + merged_frame_observations.append(obs) + + # 添加批次整体总结信息 + if overall_summary: + # 从文件名中提取时间戳数值 + first_time_str = first_timestamp.split('_')[-1].split('.')[0] + last_time_str = last_timestamp.split('_')[-1].split('.')[0] + + # 转换为毫秒并计算持续时间(秒) + try: + # 
修正解析逻辑,与上面相同的方式解析时间戳 + if len(first_time_str) >= 9 and len(last_time_str) >= 9: + # 解析第一个时间戳 + first_hours = int(first_time_str[0:2]) + first_minutes = int(first_time_str[2:4]) + first_seconds = int(first_time_str[4:6]) + first_ms = int(first_time_str[6:9]) + first_time_seconds = first_hours * 3600 + first_minutes * 60 + first_seconds + first_ms / 1000 + + # 解析第二个时间戳 + last_hours = int(last_time_str[0:2]) + last_minutes = int(last_time_str[2:4]) + last_seconds = int(last_time_str[4:6]) + last_ms = int(last_time_str[6:9]) + last_time_seconds = last_hours * 3600 + last_minutes * 60 + last_seconds + last_ms / 1000 + + batch_duration = last_time_seconds - first_time_seconds + else: + # 兼容旧的解析方式 + first_time_ms = int(first_time_str) + last_time_ms = int(last_time_str) + batch_duration = (last_time_ms - first_time_ms) / 1000 + except ValueError: + # 使用 utils.time_to_seconds 函数处理格式化的时间戳 + first_time_seconds = utils.time_to_seconds(first_time_str.replace('_', ':').replace('-', ',')) + last_time_seconds = utils.time_to_seconds(last_time_str.replace('_', ':').replace('-', ',')) + batch_duration = last_time_seconds - first_time_seconds + + overall_activity_summaries.append({ + "batch_index": result['batch_index'], + "time_range": f"{first_timestamp}-{last_timestamp}", + "duration_seconds": batch_duration, + "summary": overall_summary + }) + except Exception as e: + logger.error(f"解析批次 {result['batch_index']} 的响应数据失败: {str(e)}") + # 添加原始响应作为回退 + frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n" + frame_analysis += response_text + frame_analysis += "\n" + # 更新上一个批次的文件 prev_batch_files = batch_files + + # 将合并后的结果转为JSON字符串 + merged_results = { + "frame_observations": merged_frame_observations, + "overall_activity_summaries": overall_activity_summaries + } + + # 使用当前时间创建文件名 + now = datetime.now() + timestamp_str = now.strftime("%Y%m%d_%H%M") + + # 保存完整的分析结果为JSON + analysis_filename = f"frame_analysis_{timestamp_str}.json" + analysis_json_path = 
os.path.join(analysis_dir, analysis_filename) + with open(analysis_json_path, 'w', encoding='utf-8') as f: + json.dump(merged_results, f, ensure_ascii=False, indent=2) + logger.info(f"分析结果已保存到: {analysis_json_path}") - if not frame_analysis.strip(): - raise Exception("未能生成有效的帧分析结果") - - # 保存分析结果 - analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt") - with open(analysis_path, 'w', encoding='utf-8') as f: - f.write(frame_analysis) - - update_progress(70, "正在生成脚本...") - + """ + 4. 生成文案 + """ + logger.info("开始准备生成解说文案") + update_progress(80, "正在生成文案...") + from app.services.generate_narration_script import parse_frame_analysis_to_markdown, generate_narration # 从配置中获取文本生成相关配置 text_provider = config.app.get('text_llm_provider', 'gemini').lower() text_api_key = config.app.get(f'text_{text_provider}_api_key') text_model = config.app.get(f'text_{text_provider}_model_name') text_base_url = config.app.get(f'text_{text_provider}_base_url') - # 构建帧内容列表 - frame_content_list = [] - prev_batch_files = None + # 整理帧分析数据 + markdown_output = parse_frame_analysis_to_markdown(analysis_json_path) - for i, result in enumerate(results): - if 'error' in result: - continue - - batch_files = get_batch_files(keyframe_files, result, vision_batch_size) - _, _, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files) - - frame_content = { - "timestamp": timestamp_range, - "picture": result['response'], - "narration": "", - "OST": 2 - } - frame_content_list.append(frame_content) - - logger.debug(f"添加帧内容: 时间范围={timestamp_range}, 分析结果长度={len(result['response'])}") - - # 更新上一个批次的文件 - prev_batch_files = batch_files - - if not frame_content_list: - raise Exception("没有有效的帧内容可以处理") - - # ===================开始生成文案=================== - update_progress(80, "正在生成文案...") - # 校验配置 - api_params = { - "vision_api_key": vision_api_key, - "vision_model_name": vision_model, - "vision_base_url": vision_base_url or "", - "text_api_key": text_api_key, - "text_model_name": text_model, - 
"text_base_url": text_base_url or "" - } - chekc_video_config(api_params) - custom_prompt = st.session_state.get('custom_prompt', '') - processor = ScriptProcessor( - model_name=text_model, - api_key=text_api_key, - prompt=custom_prompt, - base_url=text_base_url or "", - video_theme=st.session_state.get('video_theme', '') + # 生成文案 + # 生成解说文案 + narration = generate_narration( + markdown_output, + text_api_key, + base_url=text_base_url, + model=text_model ) - - # 处理帧内容生成脚本 - script_result = processor.process_frames(frame_content_list) - + narration_dict = json.loads(narration)['items'] + # 为 narration_dict 中每个 item 新增一个 OST: 2 的字段, 代表保留原声和配音 + narration_dict = [{**item, "OST": 2} for item in narration_dict] + logger.debug(f"解说文案创作完成:\n{"\n".join([item['narration'] for item in narration_dict])}") # 结果转换为JSON字符串 - script = json.dumps(script_result, ensure_ascii=False, indent=2) + script = json.dumps(narration_dict, ensure_ascii=False, indent=2) except Exception as e: logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}") @@ -250,7 +376,7 @@ def generate_script_docu(tr, params): if script is None: st.error("生成脚本失败,请检查日志") st.stop() - logger.info(f"脚本生成完成") + logger.success(f"剪辑脚本生成完成") if isinstance(script, list): st.session_state['video_clip_json'] = script elif isinstance(script, str): diff --git a/webui/utils/__init__.py b/webui/utils/__init__.py deleted file mode 100644 index 74dd09d..0000000 --- a/webui/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .performance import monitor_performance, PerformanceMonitor -from .cache import * -from .file_utils import * - -__all__ = [ - 'monitor_performance', - 'PerformanceMonitor' -] \ No newline at end of file diff --git a/webui/utils/merge_video.py b/webui/utils/merge_video.py index 65e13aa..9fa2b39 100644 --- a/webui/utils/merge_video.py +++ b/webui/utils/merge_video.py @@ -1,8 +1,8 @@ """ 合并视频和字幕文件 """ -from moviepy.editor import VideoFileClip, concatenate_videoclips -import pysrt +from moviepy import 
VideoFileClip, concatenate_videoclips +# import pysrt import os diff --git a/webui/utils/performance.py b/webui/utils/performance.py deleted file mode 100644 index 0eab5fa..0000000 --- a/webui/utils/performance.py +++ /dev/null @@ -1,37 +0,0 @@ -import psutil -import os -from loguru import logger -import torch - -class PerformanceMonitor: - @staticmethod - def monitor_memory(): - process = psutil.Process(os.getpid()) - memory_info = process.memory_info() - - logger.debug(f"Memory usage: {memory_info.rss / 1024 / 1024:.2f} MB") - - if torch.cuda.is_available(): - gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024 - logger.debug(f"GPU Memory usage: {gpu_memory:.2f} MB") - - @staticmethod - def cleanup_resources(): - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - import gc - gc.collect() - - PerformanceMonitor.monitor_memory() - -def monitor_performance(func): - """性能监控装饰器""" - def wrapper(*args, **kwargs): - try: - PerformanceMonitor.monitor_memory() - result = func(*args, **kwargs) - return result - finally: - PerformanceMonitor.cleanup_resources() - return wrapper \ No newline at end of file