diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py index c7edc77..73cab3b 100644 --- a/app/services/audio_merger.py +++ b/app/services/audio_merger.py @@ -46,7 +46,7 @@ def merge_audio_files(task_id: str, audio_files: list, total_duration: float, li tts_audio = AudioSegment.from_file(audio_file) # 获取片段的开始和结束时间 - start_time, end_time = segment['new_timestamp'].split('-') + start_time, end_time = segment['timestamp'].split('-') start_seconds = utils.time_to_seconds(start_time) end_seconds = utils.time_to_seconds(end_time) diff --git a/app/services/clip_video.py b/app/services/clip_video.py new file mode 100644 index 0000000..d5c591a --- /dev/null +++ b/app/services/clip_video.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : clip_video +@Author : 小林同学 +@Date : 2025/5/6 下午6:14 +''' + +import os +import subprocess +import json +import hashlib +import logging +from typing import Dict, List, Optional +from pathlib import Path + +# 配置日志 +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def parse_timestamp(timestamp: str) -> tuple: + """ + 解析时间戳字符串,返回开始和结束时间 + + Args: + timestamp: 格式为'HH:MM:SS-HH:MM:SS'的时间戳字符串 + + Returns: + tuple: (开始时间, 结束时间) 格式为'HH:MM:SS' + """ + start_time, end_time = timestamp.split('-') + return start_time, end_time + + +def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str: + """ + 根据开始时间和持续时间计算结束时间 + + Args: + start_time: 开始时间,格式为'HH:MM:SS' + duration: 持续时间,单位为秒 + extra_seconds: 额外添加的秒数,默认为1秒 + + Returns: + str: 计算后的结束时间,格式为'HH:MM:SS' + """ + h, m, s = map(int, start_time.split(':')) + total_seconds = h * 3600 + m * 60 + s + duration + extra_seconds + + h_new = int(total_seconds // 3600) + m_new = int((total_seconds % 3600) // 60) + s_new = int(total_seconds % 60) + + return f"{h_new:02d}:{m_new:02d}:{s_new:02d}" + + +def check_hardware_acceleration() -> Optional[str]: + """ + 检查系统支持的硬件加速选项 + + Returns: + Optional[str]: 硬件加速参数,如果不支持则返回None + """ + # 检查NVIDIA GPU支持 + try: + nvidia_check = subprocess.run( + ["ffmpeg", "-hwaccel", "cuda", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if nvidia_check.returncode == 0: + return "cuda" + except Exception: + pass + + # 检查MacOS videotoolbox支持 + try: + videotoolbox_check = subprocess.run( + ["ffmpeg", "-hwaccel", "videotoolbox", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if videotoolbox_check.returncode == 0: + return "videotoolbox" + except Exception: + pass + + # 检查Intel Quick Sync支持 + try: + qsv_check = subprocess.run( + ["ffmpeg", "-hwaccel", "qsv", "-i", "/dev/null", "-f", "null", "-"], + stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False + ) + if qsv_check.returncode == 0: + return "qsv" + except Exception: + pass + + return None + + +def clip_video( + video_origin_path: str, + tts_result: List[Dict], + output_dir: Optional[str] = None, + task_id: Optional[str] = None +) -> Dict[str, str]: + """ + 根据时间戳裁剪视频 + + Args: + video_origin_path: 原始视频的路径 + tts_result: 包含时间戳和持续时间信息的列表 + output_dir: 输出目录路径,默认为None时会自动生成 + task_id: 任务ID,用于生成唯一的输出目录,默认为None时会自动生成 + + Returns: + Dict[str, str]: 时间戳到裁剪后视频路径的映射 + """ + # 检查视频文件是否存在 + if not os.path.exists(video_origin_path): + raise FileNotFoundError(f"视频文件不存在: {video_origin_path}") + + # 如果未提供task_id,则根据输入生成一个唯一ID + if task_id is None: + content_for_hash = f"{video_origin_path}_{json.dumps(tts_result)}" + task_id = hashlib.md5(content_for_hash.encode()).hexdigest() + + # 设置输出目录 + if output_dir is None: + output_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "storage", "temp", "clip_video", task_id + ) + + # 确保输出目录存在 + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # 检查硬件加速支持 + hwaccel = check_hardware_acceleration() + hwaccel_args = [] + if hwaccel: + hwaccel_args = ["-hwaccel", hwaccel] + logger.info(f"使用硬件加速: {hwaccel}") + + # 存储裁剪结果 + result = {} + + for item in tts_result: + timestamp = item["timestamp"] + start_time, _ = parse_timestamp(timestamp) + + # 根据持续时间计算真正的结束时间(加上1秒余量) + duration = item["duration"] + calculated_end_time = calculate_end_time(start_time, duration) + + # 格式化输出文件名 + output_filename = f"vid-{start_time.replace(':', '-')}-{calculated_end_time.replace(':', '-')}.mp4" + output_path = os.path.join(output_dir, output_filename) + + # 构建FFmpeg命令 + ffmpeg_cmd = [ + "ffmpeg", "-y", *hwaccel_args, + "-i", video_origin_path, + "-ss", start_time, + "-to", calculated_end_time, + "-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264", + "-c:a", "aac", + "-strict", "experimental", + output_path + ] + + # 执行FFmpeg命令 + try: + logger.info(f"裁剪视频片段: {timestamp} -> {start_time}到{calculated_end_time}") + logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}") + + process = subprocess.run( + ffmpeg_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True + ) + + result[timestamp] = output_path + logger.info(f"成功裁剪视频片段: {timestamp} -> {output_path}") + + except subprocess.CalledProcessError as e: + logger.error(f"裁剪视频片段失败: {timestamp}") + logger.error(f"错误信息: {e.stderr}") + raise RuntimeError(f"视频裁剪失败: {e.stderr}") + + return result + + +if __name__ == "__main__": + video_origin_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4" + + tts_result = [{'timestamp': '00:00:00-00:01:15', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt', + 'duration': 25.55, + 'text': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!'}, + {'timestamp': '00:01:15-00:04:40', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt', + 'duration': 13.488, + 'text': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…'}, + {'timestamp': '00:04:58-00:05:45', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt', + 'duration': 21.363, + 'text': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!'}, + {'timestamp': '00:05:45-00:06:00', + 'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3', + 'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt', + 'duration': 7.675, 'text': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!'}] + + # 使用方法示例 + try: + result = clip_video(video_origin_path, tts_result) + print("裁剪结果:") + print(json.dumps(result, indent=4, ensure_ascii=False)) + except Exception as e: + print(f"发生错误: {e}") diff --git a/app/services/merger_video.py b/app/services/merger_video.py new file mode 100644 index 0000000..e900845 --- /dev/null +++ b/app/services/merger_video.py @@ -0,0 +1,543 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : merger_video +@Author : 小林同学 +@Date : 2025/5/6 下午7:38 +''' + +import os +import subprocess +import logging +from enum import Enum +from typing import List, Optional, Tuple, Dict, Any +import shutil + +# 设置日志 +logger = logging.getLogger(__name__) + + +class VideoAspect(Enum): + """视频宽高比枚举""" + portrait = "portrait" # 竖屏 9:16 + landscape = "landscape" # 横屏 16:9 + square = "square" # 方形 1:1 + + def to_resolution(self) -> Tuple[int, int]: + """根据宽高比返回标准分辨率""" + if self == VideoAspect.portrait: + return 1080, 1920 # 竖屏 9:16 + elif self == VideoAspect.landscape: + return 1920, 1080 # 横屏 16:9 + elif self == VideoAspect.square: + return 1080, 1080 # 方形 1:1 + else: + return 1080, 1920 # 默认竖屏 + + +def check_ffmpeg_installation() -> bool: + """ + 检查ffmpeg是否已安装 + + Returns: + bool: 如果安装则返回True,否则返回False + """ + try: + subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + return True + except (subprocess.SubprocessError, FileNotFoundError): + logger.error("ffmpeg未安装或不在系统PATH中,请安装ffmpeg") + return False + + +def get_hardware_acceleration_option() -> Optional[str]: + """ + 根据系统环境选择合适的硬件加速选项 + + Returns: + Optional[str]: 硬件加速参数,如果不支持则返回None + """ + try: + # 检查NVIDIA GPU支持 + nvidia_check = subprocess.run( + ['ffmpeg', '-hide_banner', '-hwaccels'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True + ) + output = nvidia_check.stdout.lower() + + if 'cuda' in output: + return 'cuda' + elif 'nvenc' in output: + return 'nvenc' + elif 'qsv' in output: # Intel Quick Sync + return 'qsv' + elif 'videotoolbox' in output: # macOS + return 'videotoolbox' + elif 'vaapi' in output: # Linux VA-API + return 'vaapi' + else: + logger.info("没有找到支持的硬件加速器,将使用软件编码") + return None + except Exception as e: + logger.warning(f"检测硬件加速器时出错: {str(e)},将使用软件编码") + return None + + +def check_video_has_audio(video_path: str) -> bool: + """ + 检查视频是否包含音频流 + + Args: + video_path: 视频文件路径 + + Returns: + bool: 如果视频包含音频流则返回True,否则返回False + """ + if not os.path.exists(video_path): + logger.warning(f"视频文件不存在: {video_path}") + return False + + probe_cmd = [ + 'ffprobe', '-v', 'error', + '-select_streams', 'a:0', + '-show_entries', 'stream=codec_type', + '-of', 'csv=p=0', + video_path + ] + + try: + result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False) + return result.stdout.strip() == 'audio' + except Exception as e: + logger.warning(f"检测视频音频流时出错: {str(e)}") + return False + + +def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> str: + """ + 创建ffmpeg合并所需的concat文件 + + Args: + video_paths: 需要合并的视频文件路径列表 + concat_file_path: concat文件的输出路径 + + Returns: + str: concat文件的路径 + """ + with open(concat_file_path, 'w', encoding='utf-8') as f: + for video_path in video_paths: + # 使用绝对路径并转义特殊字符 + abs_path = os.path.abspath(video_path).replace('\\', '\\\\').replace(':', '\\:') + f.write(f"file '{abs_path}'\n") + return concat_file_path + + +def process_single_video( + input_path: str, + output_path: str, + target_width: int, + target_height: int, + keep_audio: bool = True, + hwaccel: Optional[str] = None +) -> str: + """ + 处理单个视频:调整分辨率、帧率等 + + Args: + input_path: 输入视频路径 + output_path: 输出视频路径 + target_width: 目标宽度 + target_height: 目标高度 + keep_audio: 是否保留音频 + hwaccel: 硬件加速选项 + + Returns: + str: 处理后的视频路径 + """ + if not os.path.exists(input_path): + raise FileNotFoundError(f"找不到视频文件: {input_path}") + + # 构建基本命令 + command = ['ffmpeg', '-y'] + + # 添加硬件加速参数 + if hwaccel: + if hwaccel == 'cuda' or hwaccel == 'nvenc': + command.extend(['-hwaccel', 'cuda']) + elif hwaccel == 'qsv': + command.extend(['-hwaccel', 'qsv']) + elif hwaccel == 'videotoolbox': + command.extend(['-hwaccel', 'videotoolbox']) + elif hwaccel == 'vaapi': + command.extend(['-hwaccel', 'vaapi', '-vaapi_device', '/dev/dri/renderD128']) + + # 输入文件 + command.extend(['-i', input_path]) + + # 处理音频 + if not keep_audio: + command.extend(['-an']) # 移除音频 + else: + # 检查输入视频是否有音频流 + has_audio = check_video_has_audio(input_path) + if has_audio: + command.extend(['-c:a', 'aac', '-b:a', '128k']) # 音频编码为AAC + else: + logger.warning(f"视频 {input_path} 没有音频流,将会忽略音频设置") + command.extend(['-an']) # 没有音频流时移除音频设置 + + # 视频处理参数:缩放并添加填充以保持比例 + scale_filter = f"scale={target_width}:{target_height}:force_original_aspect_ratio=decrease" + pad_filter = f"pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2" + command.extend([ + '-vf', f"{scale_filter},{pad_filter}", + '-r', '30', # 设置帧率为30fps + ]) + + # 选择编码器 + if hwaccel == 'cuda' or hwaccel == 'nvenc': + command.extend(['-c:v', 'h264_nvenc', '-preset', 'p4', '-profile:v', 'high']) + elif hwaccel == 'qsv': + command.extend(['-c:v', 'h264_qsv', '-preset', 'medium']) + elif hwaccel == 'videotoolbox': + command.extend(['-c:v', 'h264_videotoolbox', '-profile:v', 'high']) + elif hwaccel == 'vaapi': + command.extend(['-c:v', 'h264_vaapi', '-profile', '100']) + else: + command.extend(['-c:v', 'libx264', '-preset', 'medium', '-profile:v', 'high']) + + # 设置视频比特率和其他参数 + command.extend([ + '-b:v', '5M', + '-maxrate', '8M', + '-bufsize', '10M', + '-pix_fmt', 'yuv420p', # 兼容性更好的颜色格式 + ]) + + # 输出文件 + command.append(output_path) + + # 执行命令 + try: + logger.info(f"处理视频 {input_path} -> {output_path}") + subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return output_path + except subprocess.CalledProcessError as e: + logger.error(f"处理视频失败: {e.stderr.decode() if e.stderr else str(e)}") + raise RuntimeError(f"处理视频失败: {str(e)}") + + +def combine_clip_videos( + output_video_path: str, + video_paths: List[str], + video_ost_list: List[int], + video_aspect: VideoAspect = VideoAspect.portrait, + threads: int = 4, +) -> str: + """ + 合并子视频 + Args: + output_video_path: 合并后的存储路径 + video_paths: 子视频路径列表 + video_ost_list: 原声播放列表 (0: 不保留原声, 1: 只保留原声, 2: 保留原声并保留解说) + video_aspect: 屏幕比例 + threads: 线程数 + + Returns: + str: 合并后的视频路径 + """ + # 检查ffmpeg是否安装 + if not check_ffmpeg_installation(): + raise RuntimeError("未找到ffmpeg,请先安装") + + # 准备输出目录 + output_dir = os.path.dirname(output_video_path) + os.makedirs(output_dir, exist_ok=True) + + # 获取目标分辨率 + aspect = VideoAspect(video_aspect) + video_width, video_height = aspect.to_resolution() + + # 检测可用的硬件加速选项 + hwaccel = get_hardware_acceleration_option() + if hwaccel: + logger.info(f"将使用 {hwaccel} 硬件加速") + + # 重组视频路径和原声设置为一个字典列表结构 + video_segments = [] + + # 检查视频路径和原声设置列表长度是否匹配 + if len(video_paths) != len(video_ost_list): + logger.warning(f"视频路径列表({len(video_paths)})和原声设置列表({len(video_ost_list)})长度不匹配") + # 调整长度以匹配较短的列表 + min_length = min(len(video_paths), len(video_ost_list)) + video_paths = video_paths[:min_length] + video_ost_list = video_ost_list[:min_length] + + # 创建视频处理配置字典列表 + for i, (video_path, video_ost) in enumerate(zip(video_paths, video_ost_list)): + if not os.path.exists(video_path): + logger.warning(f"视频不存在,跳过: {video_path}") + continue + + # 检查是否有音频流 + has_audio = check_video_has_audio(video_path) + + # 构建视频片段配置 + segment = { + "index": i, + "path": video_path, + "ost": video_ost, + "has_audio": has_audio, + "keep_audio": video_ost > 0 and has_audio # 只有当ost>0且实际有音频时才保留 + } + + # 记录日志 + if video_ost > 0 and not has_audio: + logger.warning(f"视频 {video_path} 设置为保留原声(ost={video_ost}),但该视频没有音频流") + + video_segments.append(segment) + + # 处理每个视频片段 + processed_videos = [] + temp_dir = os.path.join(output_dir, "temp_videos") + os.makedirs(temp_dir, exist_ok=True) + + try: + # 第一阶段:处理所有视频片段到中间文件 + for segment in video_segments: + # 处理单个视频,去除或保留音频 + temp_output = os.path.join(temp_dir, f"processed_{segment['index']}.mp4") + try: + process_single_video( + input_path=segment['path'], + output_path=temp_output, + target_width=video_width, + target_height=video_height, + keep_audio=segment['keep_audio'], + hwaccel=hwaccel + ) + processed_videos.append({ + "index": segment["index"], + "path": temp_output, + "keep_audio": segment["keep_audio"] + }) + logger.info(f"视频 {segment['index'] + 1}/{len(video_segments)} 处理完成") + except Exception as e: + logger.error(f"处理视频 {segment['path']} 时出错: {str(e)}") + continue + + if not processed_videos: + raise ValueError("没有有效的视频片段可以合并") + + # 按原始索引排序处理后的视频 + processed_videos.sort(key=lambda x: x["index"]) + + # 第二阶段:分步骤合并视频 - 避免复杂的filter_complex滤镜 + try: + # 1. 首先,将所有没有音频的视频或音频被禁用的视频合并到一个临时文件中 + video_paths_only = [video["path"] for video in processed_videos] + video_concat_path = os.path.join(temp_dir, "video_concat.mp4") + + # 创建concat文件,用于合并视频流 + concat_file = os.path.join(temp_dir, "concat_list.txt") + create_ffmpeg_concat_file(video_paths_only, concat_file) + + # 合并所有视频流,但不包含音频 + concat_cmd = [ + 'ffmpeg', '-y', + '-f', 'concat', + '-safe', '0', + '-i', concat_file, + '-c:v', 'libx264', + '-preset', 'medium', + '-profile:v', 'high', + '-an', # 不包含音频 + '-threads', str(threads), + video_concat_path + ] + + subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("视频流合并完成") + + # 2. 提取并合并有音频的片段 + audio_segments = [video for video in processed_videos if video["keep_audio"]] + + if not audio_segments: + # 如果没有音频片段,直接使用无音频的合并视频作为最终结果 + shutil.copy(video_concat_path, output_video_path) + logger.info("无音频视频合并完成") + return output_video_path + + # 创建音频中间文件 + audio_files = [] + for i, segment in enumerate(audio_segments): + # 提取音频 + audio_file = os.path.join(temp_dir, f"audio_{i}.aac") + extract_audio_cmd = [ + 'ffmpeg', '-y', + '-i', segment["path"], + '-vn', # 不包含视频 + '-c:a', 'aac', + '-b:a', '128k', + audio_file + ] + subprocess.run(extract_audio_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + audio_files.append({ + "index": segment["index"], + "path": audio_file + }) + logger.info(f"提取音频 {i+1}/{len(audio_segments)} 完成") + + # 3. 计算每个音频片段的时间位置 + audio_timings = [] + current_time = 0.0 + + # 获取每个视频片段的时长 + for i, video in enumerate(processed_videos): + duration_cmd = [ + 'ffprobe', '-v', 'error', + '-show_entries', 'format=duration', + '-of', 'csv=p=0', + video["path"] + ] + result = subprocess.run(duration_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + duration = float(result.stdout.strip()) + + # 如果当前片段需要保留音频,记录时间位置 + if video["keep_audio"]: + for audio in audio_files: + if audio["index"] == video["index"]: + audio_timings.append({ + "file": audio["path"], + "start": current_time, + "index": video["index"] + }) + break + + current_time += duration + + # 4. 创建静音音频轨道作为基础 + silence_audio = os.path.join(temp_dir, "silence.aac") + create_silence_cmd = [ + 'ffmpeg', '-y', + '-f', 'lavfi', + '-i', f'anullsrc=r=44100:cl=stereo', + '-t', str(current_time), # 总时长 + '-c:a', 'aac', + '-b:a', '128k', + silence_audio + ] + subprocess.run(create_silence_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # 5. 创建复杂滤镜命令以混合音频 + filter_script = os.path.join(temp_dir, "filter_script.txt") + with open(filter_script, 'w') as f: + f.write(f"[0:a]volume=0.0[silence];\n") # 首先静音背景轨道 + + # 添加每个音频文件 + for i, timing in enumerate(audio_timings): + f.write(f"[{i+1}:a]adelay={int(timing['start']*1000)}|{int(timing['start']*1000)}[a{i}];\n") + + # 混合所有音频 + mix_str = "[silence]" + for i in range(len(audio_timings)): + mix_str += f"[a{i}]" + mix_str += f"amix=inputs={len(audio_timings)+1}:duration=longest[aout]" + f.write(mix_str) + + # 6. 构建音频合并命令 + audio_inputs = ['-i', silence_audio] + for timing in audio_timings: + audio_inputs.extend(['-i', timing["file"]]) + + mixed_audio = os.path.join(temp_dir, "mixed_audio.aac") + audio_mix_cmd = [ + 'ffmpeg', '-y' + ] + audio_inputs + [ + '-filter_complex_script', filter_script, + '-map', '[aout]', + '-c:a', 'aac', + '-b:a', '128k', + mixed_audio + ] + + subprocess.run(audio_mix_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("音频混合完成") + + # 7. 将合并的视频和混合的音频组合在一起 + final_cmd = [ + 'ffmpeg', '-y', + '-i', video_concat_path, + '-i', mixed_audio, + '-c:v', 'copy', + '-c:a', 'aac', + '-map', '0:v:0', + '-map', '1:a:0', + '-shortest', + output_video_path + ] + + subprocess.run(final_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("视频最终合并完成") + + return output_video_path + + except subprocess.CalledProcessError as e: + logger.error(f"合并视频过程中出错: {e.stderr.decode() if e.stderr else str(e)}") + + # 尝试备用合并方法 - 最简单的无音频合并 + logger.info("尝试备用合并方法 - 无音频合并") + try: + concat_file = os.path.join(temp_dir, "concat_list.txt") + video_paths_only = [video["path"] for video in processed_videos] + create_ffmpeg_concat_file(video_paths_only, concat_file) + + backup_cmd = [ + 'ffmpeg', '-y', + '-f', 'concat', + '-safe', '0', + '-i', concat_file, + '-c:v', 'copy', + '-an', # 无音频 + output_video_path + ] + + subprocess.run(backup_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.warning("使用备用方法(无音频)成功合并视频") + return output_video_path + except Exception as backup_error: + logger.error(f"备用合并方法也失败: {str(backup_error)}") + raise RuntimeError(f"无法合并视频: {str(backup_error)}") + + except Exception as e: + logger.error(f"合并视频时出错: {str(e)}") + raise + finally: + # 清理临时文件 + try: + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + logger.info("已清理临时文件") + except Exception as e: + logger.warning(f"清理临时文件时出错: {str(e)}") + + +if __name__ == '__main__': + video_paths = [ + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-00-00-00-00-26.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-01-15-00-01-29.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-04-58-00-05-20.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/0ac14d474144b54d614c26a5c87cffe7/vid-00-05-45-00-05-53.mp4', + '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4' + ] + + combine_clip_videos( + output_video_path="/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/merged_123.mp4", + video_paths=video_paths, + video_ost_list=[1, 0, 1, 0, 0, 1], + video_aspect=VideoAspect.portrait + ) diff --git a/app/services/subtitle_merger.py b/app/services/subtitle_merger.py new file mode 100644 index 0000000..1388b76 --- /dev/null +++ b/app/services/subtitle_merger.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +''' +@Project: NarratoAI +@File : subtitle_merger +@Author : viccy +@Date : 2025/5/6 下午4:00 +''' + +import re +import os +from datetime import datetime, timedelta + + +def parse_time(time_str): + """解析时间字符串为timedelta对象""" + hours, minutes, seconds_ms = time_str.split(':') + seconds, milliseconds = seconds_ms.split(',') + + td = timedelta( + hours=int(hours), + minutes=int(minutes), + seconds=int(seconds), + milliseconds=int(milliseconds) + ) + return td + + +def format_time(td): + """将timedelta对象格式化为SRT时间字符串""" + total_seconds = int(td.total_seconds()) + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + seconds = total_seconds % 60 + milliseconds = td.microseconds // 1000 + + return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}" + + +def extract_time_range_from_filename(filename): + """从文件名中提取时间范围""" + pattern = r'subtitle_(\d{2})_(\d{2})_(\d{2})-(\d{2})_(\d{2})_(\d{2})' + match = re.search(pattern, filename) + + if not match: + return None, None + + start_h, start_m, start_s, end_h, end_m, end_s = map(int, match.groups()) + + start_time = timedelta(hours=start_h, minutes=start_m, seconds=start_s) + end_time = timedelta(hours=end_h, minutes=end_m, seconds=end_s) + + return start_time, end_time + + +def merge_subtitle_files(subtitle_files, output_file=None): + """ + 合并多个SRT字幕文件 + + 参数: + subtitle_files: 包含SRT文件路径的列表 + output_file: 输出文件的路径,如果为None则自动生成 + + 返回: + 合并后的字幕文件路径 + """ + # 按文件名中的开始时间排序 + sorted_files = sorted(subtitle_files, + key=lambda x: extract_time_range_from_filename(x)[0]) + + merged_subtitles = [] + subtitle_index = 1 + + for file_path in sorted_files: + # 从文件名获取起始时间偏移 + offset_time, _ = extract_time_range_from_filename(file_path) + + if offset_time is None: + print(f"警告: 无法从文件名 {os.path.basename(file_path)} 中提取时间范围,跳过该文件") + continue + + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + + # 解析字幕文件 + subtitle_blocks = re.split(r'\n\s*\n', content.strip()) + + for block in subtitle_blocks: + lines = block.strip().split('\n') + if len(lines) < 3: # 确保块有足够的行数 + continue + + # 解析时间轴行 + time_line = lines[1] + time_parts = time_line.split(' --> ') + if len(time_parts) != 2: + continue + + start_time = parse_time(time_parts[0]) + end_time = parse_time(time_parts[1]) + + # 应用时间偏移 + adjusted_start_time = start_time + offset_time + adjusted_end_time = end_time + offset_time + + # 重建字幕块 + adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}" + text_lines = lines[2:] + + new_block = [ + str(subtitle_index), + adjusted_time_line, + *text_lines + ] + + merged_subtitles.append('\n'.join(new_block)) + subtitle_index += 1 + + # 合并所有字幕块 + merged_content = '\n\n'.join(merged_subtitles) + + # 确定输出文件路径 + if output_file is None: + # 自动生成输出文件名 + first_file_path = sorted_files[0] + last_file_path = sorted_files[-1] + _, first_end = extract_time_range_from_filename(first_file_path) + _, last_end = extract_time_range_from_filename(last_file_path) + + dir_path = os.path.dirname(first_file_path) + first_start_str = os.path.basename(first_file_path).split('-')[0].replace('subtitle_', '') + last_end_h, last_end_m, last_end_s = int(last_end.seconds // 3600), int((last_end.seconds % 3600) // 60), int(last_end.seconds % 60) + last_end_str = f"{last_end_h:02d}_{last_end_m:02d}_{last_end_s:02d}" + + output_file = os.path.join(dir_path, f"merged_subtitle_{first_start_str}-{last_end_str}.srt") + + # 写入合并后的内容 + with open(output_file, 'w', encoding='utf-8') as file: + file.write(merged_content) + + return output_file + + +if __name__ == '__main__': + subtitle_files = [ + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt", + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt", + ] + + output_file = merge_subtitle_files(subtitle_files) + print(f"字幕文件已合并至: {output_file}") diff --git a/app/services/task.py b/app/services/task.py index 6704f0d..1f3e797 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -9,7 +9,7 @@ from loguru import logger from app.config import config from app.models import const from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams -from app.services import llm, material, subtitle, video, voice, audio_merger +from app.services import llm, material, subtitle, video, voice, audio_merger, subtitle_merger, clip_video from app.services import state as sm from app.utils import utils @@ -158,18 +158,25 @@ def get_video_materials(task_id, params, video_terms, audio_duration): def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict): - """后台任务(自动剪辑视频进行剪辑)""" + """ + 后台任务(自动剪辑视频进行剪辑) + Args: + task_id: 任务ID + params: 视频参数 + subclip_path_videos: 视频片段路径 + """ logger.info(f"\n\n## 开始任务: {task_id}") - - # 初始化 ImageMagick - if not utils.init_imagemagick(): - logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示") - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5) + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0) - # tts 角色名称 - voice_name = voice.parse_voice_name(params.voice_name) + # # 初始化 ImageMagick + # if not utils.init_imagemagick(): + # logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示") + # # tts 角色名称 + # voice_name = voice.parse_voice_name(params.voice_name) + """ + 1. 加载剪辑脚本 + """ logger.info("\n\n## 1. 加载视频脚本") video_script_path = path.join(params.video_clip_json_path) @@ -187,111 +194,102 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di logger.debug(f"解说时间戳列表: \n{time_list}") # 获取视频总时长(单位 s) - last_timestamp = list_script[-1]['new_timestamp'] - end_time = last_timestamp.split("-")[1] - total_duration = utils.time_to_seconds(end_time) - + last_timestamp = list_script[-1]['timestamp'].split("-")[1] + total_duration = utils.time_to_seconds(last_timestamp) + except Exception as e: - logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}") - raise ValueError("无法读取视频json脚本,请检查配置是否正确") + logger.error(f"无法读取视频json脚本,请检查脚本格式是否正确") + raise ValueError("无法读取视频json脚本,请检查脚本格式是否正确") else: logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc()) raise ValueError("解说脚本不存在!请检查配置是否正确。") + """ + 2. 使用 TTS 生成音频素材 + """ logger.info("\n\n## 2. 根据OST设置生成音频列表") - # 只为OST=0或2的片段生成TTS音频 + # 只为OST=0 or 2的判断生成音频, OST=0 仅保留解说 OST=2 保留解说和原声 tts_segments = [ segment for segment in list_script if segment['OST'] in [0, 2] ] logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}") - - # 初始化音频文件路径 - audio_files = [] - final_audio = "" - - if tts_segments: - audio_files, sub_maker_list = voice.tts_multiple( - task_id=task_id, - list_script=tts_segments, # 只传入需要TTS的片段 - voice_name=voice_name, - voice_rate=params.voice_rate, - voice_pitch=params.voice_pitch, - force_regenerate=True - ) - if audio_files is None: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.") - return - if audio_files: - logger.info(f"合并音频文件: {audio_files}") - try: - # 传入OST信息以便正确处理音频 - final_audio = audio_merger.merge_audio_files( - task_id=task_id, - audio_files=audio_files, - total_duration=total_duration, - list_script=list_script # 传入完整脚本以便处理OST - ) - logger.info("音频文件合并成功") - except Exception as e: - logger.error(f"合并音频文件失败: {str(e)}") - final_audio = "" - else: - # 如果没有需要生成TTS的片段,创建一个空白音频文件 - # 这样可以确保后续的音频处理能正确进行 - logger.info("没有需要生成TTS的片段,将保留原声和背景音乐") - final_audio = path.join(utils.task_dir(task_id), "empty.mp3") + tts_results = voice.tts_multiple( + task_id=task_id, + list_script=tts_segments, # 只传入需要TTS的片段 + voice_name=params.voice_name, + voice_rate=params.voice_rate, + voice_pitch=params.voice_pitch, + force_regenerate=True + ) + audio_files = [ + tts_result["audio_file"] for tts_result in tts_results + ] + subtitle_files = [ + tts_result["subtitle_file"] for tts_result in tts_results + ] + if tts_results: + logger.info(f"合并音频/字幕文件") try: - from moviepy.editor import AudioClip - # 创建一个与视频等长的空白音频 - empty_audio = AudioClip(make_frame=lambda t: 0, duration=total_duration) - empty_audio.write_audiofile(final_audio, fps=44100) - logger.info(f"已创建空白音频文件: {final_audio}") - except Exception as e: - logger.error(f"创建空白音频文件失败: {str(e)}") - final_audio = "" - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30) - - subtitle_path = "" - if params.subtitle_enabled: - if audio_files: - subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") - subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() - logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") - - subtitle.create( - audio_file=final_audio, - subtitle_file=subtitle_path, + # 合并音频文件 + merged_audio_path = audio_merger.merge_audio_files( + task_id=task_id, + audio_files=audio_files, + total_duration=total_duration, + list_script=list_script # 传入完整脚本以便处理OST ) + logger.info(f"音频文件合并成功->{merged_audio_path}") + # 合并字幕文件 + merged_subtitle_path = subtitle_merger.merge_subtitle_files( + subtitle_files=subtitle_files, + ) + logger.info(f"字幕文件合并成功->{merged_subtitle_path}") + except Exception as e: + logger.error(f"合并音频文件失败: {str(e)}") + merged_audio_path = "" + merged_subtitle_path = "" + else: + logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.") + return + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20) - subtitle_lines = subtitle.file_to_subtitles(subtitle_path) + """ + 3. (可选) 使用 whisper 生成字幕 + """ + if merged_subtitle_path is None: + if audio_files: + merged_subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") + subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() + logger.info(f"\n\n使用 {subtitle_provider} 生成字幕") + + subtitle.create( + audio_file=merged_audio_path, + subtitle_file=merged_subtitle_path, + ) + subtitle_lines = subtitle.file_to_subtitles(merged_subtitle_path) if not subtitle_lines: - logger.warning(f"字幕文件无效: {subtitle_path}") - subtitle_path = "" + logger.warning(f"字幕文件无效: {merged_subtitle_path}") sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40) + """ + 4. 裁剪视频 - 将超出音频长度的视频进行裁剪 + """ logger.info("\n\n## 4. 裁剪视频") + result = clip_video.clip_video(params.video_origin_path, tts_results) + subclip_path_videos.update(result) subclip_videos = [x for x in subclip_path_videos.values()] - # logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}") - if not subclip_videos: - sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) - logger.error( - "裁剪视频失败,可能是 ImageMagick 不可用") - return - - sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50) + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60) + """ + 5. 合并视频 + """ final_video_paths = [] combined_video_paths = [] - _progress = 50 - index = 1 - combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") + combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4") logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}") video.combine_clip_videos( @@ -302,14 +300,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di video_aspect=params.video_aspect, threads=params.n_threads # 多线程 ) + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80) - _progress += 50 / 2 - sm.state.update_task(task_id, progress=_progress) - final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") + """ + 6. 合并字幕/BGM/配音/视频 + """ + final_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") + logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {final_video_path}") - logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}") - # 获取背景音乐 bgm_path = None if params.bgm_type or params.bgm_file: @@ -340,18 +339,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di font_path = utils.font_dir(params.font_name) video.generate_video_v3( video_path=combined_video_path, - subtitle_path=subtitle_path, + subtitle_path=merged_subtitle_path, bgm_path=bgm_path, - narration_path=final_audio, + narration_path=merged_audio_path, output_path=final_video_path, volume_config=volume_config, # 添加音量配置 subtitle_style=subtitle_style, font_path=font_path ) - _progress += 50 / 2 - sm.state.update_task(task_id, progress=_progress) - final_video_paths.append(final_video_path) combined_video_paths.append(combined_video_path) @@ -400,35 +396,20 @@ def validate_params(video_path, audio_path, output_file, params): if __name__ == "__main__": - # task_id = "test123" - # subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4', - # '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4', - # '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4', - # '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4', - # '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4', - # '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4', - # '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4', - # '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'} - # - # params = VideoClipParams( - # video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json", - # video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4", - # ) - # start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) + task_id = "qyn2-2-demo" - task_id = "test456" - subclip_path_videos = {'01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4', - '01:58-02:04': './storage/cache_videos/vid-01_58-02_04.mp4', - '02:25-02:31': './storage/cache_videos/vid-02_25-02_31.mp4', - '01:28-01:33': './storage/cache_videos/vid-01_28-01_33.mp4', - '03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4', - '00:24-00:28': './storage/cache_videos/vid-00_24-00_28.mp4', - '03:02-03:08': './storage/cache_videos/vid-03_02-03_08.mp4', - '00:41-00:44': './storage/cache_videos/vid-00_41-00_44.mp4', - '02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4'} + # 提前裁剪是为了方便检查视频 + subclip_path_videos = { + '00:00:00-00:01:15': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-00-00-00-01-15.mp4', + '00:01:15-00:04:40': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-01-15-00-04-40.mp4', + '00:04:41-00:04:58': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4', + '00:04:58-00:05:45': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-58-00-05-45.mp4', + '00:05:45-00:06:00': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-05-45-00-06-00.mp4', + '00:06:00-00:06:03': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4', + } params = VideoClipParams( - video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json", - video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4", + video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/demo.json", + video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4", ) - start_subclip(task_id, params, subclip_path_videos=subclip_path_videos) + start_subclip(task_id, params, subclip_path_videos) diff --git a/app/services/video.py b/app/services/video.py index f840c66..dbe7986 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -443,4 +443,3 @@ def generate_video_v3( bgm.close() if narration_path: narration.close() - diff --git a/app/services/voice.py b/app/services/voice.py index eba3c6d..f5570e4 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -5,10 +5,11 @@ import traceback import edge_tts import asyncio from loguru import logger -from typing import List +from typing import List, Union from datetime import datetime from xml.sax.saxutils import unescape from edge_tts import submaker, SubMaker +from edge_tts.submaker import mktimestamp from moviepy.video.tools import subtitles import time @@ -1036,7 +1037,7 @@ def is_azure_v2_voice(voice_name: str): def tts( text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str -) -> [SubMaker, None]: +) -> Union[SubMaker, None]: if is_azure_v2_voice(voice_name): return azure_tts_v2(text, voice_name, voice_file) return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file) @@ -1064,7 +1065,7 @@ def convert_pitch_to_percent(rate: float) -> str: def azure_tts_v1( text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str -) -> [SubMaker, None]: +) -> Union[SubMaker, None]: voice_name = parse_voice_name(voice_name) text = text.strip() rate_str = convert_rate_to_percent(voice_rate) @@ -1087,11 +1088,6 @@ def azure_tts_v1( ) return sub_maker, audio_data - # 判断音频文件是否已存在 - if os.path.exists(voice_file): - logger.info(f"voice file exists, skip tts: {voice_file}") - continue - # 获取音频数据和字幕信息 sub_maker, audio_data = asyncio.run(_do()) @@ -1105,8 +1101,6 @@ def azure_tts_v1( # 数据有效,写入文件 with open(voice_file, "wb") as file: file.write(audio_data) - - logger.info(f"completed, output file: {voice_file}") return sub_maker except Exception as e: logger.error(f"生成音频文件时出错: {str(e)}") @@ -1115,7 +1109,7 @@ def azure_tts_v1( return None -def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]: +def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]: voice_name = is_azure_v2_voice(voice_name) if not voice_name: logger.error(f"invalid voice name: {voice_name}") @@ -1240,7 +1234,7 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis if script_item['OST']: continue - start_time, end_time = script_item['new_timestamp'].split('-') + start_time, end_time = script_item['timestamp'].split('-') if sub_maker_index >= len(sub_maker_list): logger.error(f"Sub maker list index out of range: {sub_maker_index}") break @@ -1317,6 +1311,97 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis traceback.print_exc() +def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str): + """ + 优化字幕文件 + 1. 将字幕文件按照标点符号分割成多行 + 2. 逐行匹配字幕文件中的文本 + 3. 生成新的字幕文件 + """ + + text = _format_text(text) + + def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str: + """ + 1 + 00:00:00,000 --> 00:00:02,360 + 跑步是一项简单易行的运动 + """ + start_t = mktimestamp(start_time).replace(".", ",") + end_t = mktimestamp(end_time).replace(".", ",") + return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n" + + start_time = -1.0 + sub_items = [] + sub_index = 0 + + script_lines = utils.split_string_by_punctuations(text) + + def match_line(_sub_line: str, _sub_index: int): + if len(script_lines) <= _sub_index: + return "" + + _line = script_lines[_sub_index] + if _sub_line == _line: + return script_lines[_sub_index].strip() + + _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line) + _line_ = re.sub(r"[^\w\s]", "", _line) + if _sub_line_ == _line_: + return _line_.strip() + + _sub_line_ = re.sub(r"\W+", "", _sub_line) + _line_ = re.sub(r"\W+", "", _line) + if _sub_line_ == _line_: + return _line.strip() + + return "" + + sub_line = "" + + try: + for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)): + _start_time, end_time = offset + if start_time < 0: + start_time = _start_time + + sub = unescape(sub) + sub_line += sub + sub_text = match_line(sub_line, sub_index) + if sub_text: + sub_index += 1 + line = formatter( + idx=sub_index, + start_time=start_time, + end_time=end_time, + sub_text=sub_text, + ) + sub_items.append(line) + start_time = -1.0 + sub_line = "" + + if len(sub_items) == len(script_lines): + with open(subtitle_file, "w", encoding="utf-8") as file: + file.write("\n".join(sub_items) + "\n") + try: + sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") + duration = max([tb for ((ta, tb), txt) in sbs]) + logger.info( + f"已创建字幕文件: {subtitle_file}, duration: {duration}" + ) + return subtitle_file, duration + except Exception as e: + logger.error(f"failed, error: {str(e)}") + os.remove(subtitle_file) + else: + logger.warning( + f"字幕创建失败, 字幕长度: {len(sub_items)}, script_lines len: {len(script_lines)}" + ) + + except Exception as e: + logger.error(f"failed, error: {str(e)}") + + def get_audio_duration(sub_maker: submaker.SubMaker): """ 获取音频时长 @@ -1339,20 +1424,25 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f """ voice_name = parse_voice_name(voice_name) output_dir = utils.task_dir(task_id) - audio_files = [] - sub_maker_list = [] + tts_results = [] for item in list_script: + tts_item = { + "audio_file": "", + "subtitle_file": "", + "duration": 0, + } if item['OST'] != 1: # 将时间戳中的冒号替换为下划线 - timestamp = item['new_timestamp'].replace(':', '_') + timestamp = item['timestamp'].replace(':', '_') audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") + subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt") - # 检查文件是否已存在,如存在且不强制重新生成,则跳过 - if os.path.exists(audio_file) and not force_regenerate: - logger.info(f"音频文件已存在,跳过生成: {audio_file}") - audio_files.append(audio_file) - continue + # # 检查文件是否已存在,如存在且不强制重新生成,则跳过 + # if os.path.exists(audio_file) and not force_regenerate: + # logger.info(f"音频文件已存在,跳过生成: {audio_file}") + # tts_item["audio_file"] = audio_file + # continue text = item['narration'] @@ -1369,9 +1459,17 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f f"如果您在中国,请使用VPN; " f"或者使用其他 tts 引擎") continue + else: + # 为当前片段生成字幕文件 + _, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file) - audio_files.append(audio_file) - sub_maker_list.append(sub_maker) + tts_results.append({ + "timestamp": item['timestamp'], + "audio_file": audio_file, + "subtitle_file": subtitle_file, + "duration": duration, + "text": text, + }) logger.info(f"已生成音频文件: {audio_file}") - return audio_files, sub_maker_list + return tts_results diff --git a/webui.py b/webui.py index 434cbb9..1f605b5 100644 --- a/webui.py +++ b/webui.py @@ -3,7 +3,8 @@ import os import sys from uuid import uuid4 from app.config import config -from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings, merge_settings, system_settings +from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \ + review_settings, merge_settings, system_settings from webui.utils import cache, file_utils from app.utils import utils from app.models.schema import VideoClipParams, VideoAspect @@ -28,6 +29,7 @@ hide_streamlit_style = """ """ st.markdown(hide_streamlit_style, unsafe_allow_html=True) + def init_log(): """初始化日志配置""" from loguru import logger @@ -41,11 +43,11 @@ def init_log(): "torch.cuda.is_available()", "CUDA initialization" ] - + for msg in ignore_messages: if msg in record["message"]: return "" - + file_path = record["file"].path relative_path = os.path.relpath(file_path, config.root_dir) record["file"].path = f"./{relative_path}" @@ -74,6 +76,7 @@ def init_log(): filter=log_filter ) + def init_global_state(): """初始化全局状态""" if 'video_clip_json' not in st.session_state: @@ -85,6 +88,7 @@ def init_global_state(): if 'subclip_videos' not in st.session_state: st.session_state['subclip_videos'] = {} + def tr(key): """翻译函数""" i18n_dir = os.path.join(os.path.dirname(__file__), "webui", "i18n") @@ -92,13 +96,14 @@ def tr(key): loc = locales.get(st.session_state['ui_language'], {}) return loc.get("Translation", {}).get(key, key) + def render_generate_button(): """渲染生成按钮和处理逻辑""" if st.button(tr("Generate Video"), use_container_width=True, type="primary"): try: from app.services import task as tm import torch - + # 重置日志容器和记录 log_container = st.empty() log_records = [] @@ -152,7 +157,7 @@ def render_generate_button(): video_files = result.get("videos", []) st.success(tr("视生成完成")) - + try: if video_files: player_cols = st.columns(len(video_files) * 2 + 1) @@ -167,15 +172,16 @@ def render_generate_button(): finally: PerformanceMonitor.cleanup_resources() + def main(): """主函数""" init_log() init_global_state() utils.init_resources() - + st.title(f"NarratoAI :sunglasses:📽️") st.write(tr("Get Help")) - + # 渲染基础设置面板 basic_settings.render_basic_settings(tr) # 渲染合并设置 @@ -192,12 +198,13 @@ def main(): subtitle_settings.render_subtitle_panel(tr) # 渲染系统设置面板 system_settings.render_system_panel(tr) - + # 渲染视频审查面板 review_settings.render_review_panel(tr) - + # 渲染生成按钮和处理逻辑 render_generate_button() + if __name__ == "__main__": main()