linyq 401eb92fa3 feat(audio): improve audio merging, support OST settings, and increase timestamp precision
- Refactored the merge_audio_files function to add support for OST settings
- Added a time_to_seconds function that supports converting multiple time formats
- Updated the audio_merger module logic to process audio according to OST settings
- Updated the start_subclip function in the task module to pass OST information
- Adjusted the subtitle and video module logic to fit the new audio handling
2024-11-27 23:26:43 +08:00


import re
import os
import glob
import random
from typing import List, Union
import traceback
from loguru import logger
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from contextlib import contextmanager
from app.models import const
from app.models.schema import MaterialInfo, VideoAspect, VideoConcatMode, VideoParams, VideoClipParams
from app.utils import utils
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
if not bgm_type:
return ""
if bgm_file and os.path.exists(bgm_file):
return bgm_file
if bgm_type == "random":
song_dir = utils.song_dir()
# 检查目录是否存在
if not os.path.exists(song_dir):
logger.warning(f"背景音乐目录不存在: {song_dir}")
return ""
# 支持 mp3 和 flac 格式
mp3_files = glob.glob(os.path.join(song_dir, "*.mp3"))
flac_files = glob.glob(os.path.join(song_dir, "*.flac"))
files = mp3_files + flac_files
# 检查是否找到音乐文件
if not files:
logger.warning(f"在目录 {song_dir} 中没有找到 MP3 或 FLAC 文件")
return ""
return random.choice(files)
return ""
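# --- Illustrative example (not part of the original module) ---
# A minimal sketch of how the path returned by get_bgm_file is typically used:
# loop the background music to the video duration and mix it over the existing
# audio track, mirroring process_audio_tracks below. The function name
# _example_attach_bgm and its parameters are hypothetical.
def _example_attach_bgm(video_clip, bgm_type="random", bgm_file="", bgm_volume=0.3):
    bgm_path = get_bgm_file(bgm_type=bgm_type, bgm_file=bgm_file)
    if not bgm_path:
        return video_clip
    # loop/trim the BGM to the video length and fade it out at the end
    bgm_clip = AudioFileClip(bgm_path).volumex(bgm_volume).audio_fadeout(3)
    bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration)
    tracks = [bgm_clip] if video_clip.audio is None else [video_clip.audio, bgm_clip]
    return video_clip.set_audio(CompositeAudioClip(tracks))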
def combine_videos(
combined_video_path: str,
video_paths: List[str],
audio_file: str,
video_aspect: VideoAspect = VideoAspect.portrait,
video_concat_mode: VideoConcatMode = VideoConcatMode.random,
max_clip_duration: int = 5,
threads: int = 2,
) -> str:
audio_clip = AudioFileClip(audio_file)
audio_duration = audio_clip.duration
logger.info(f"max duration of audio: {audio_duration} seconds")
# Maximum duration of each clip
req_dur = max_clip_duration
logger.info(f"each clip will be maximum {req_dur} seconds long")
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
clips = []
video_duration = 0
raw_clips = []
for video_path in video_paths:
clip = VideoFileClip(video_path).without_audio()
clip_duration = clip.duration
start_time = 0
while start_time < clip_duration:
end_time = min(start_time + max_clip_duration, clip_duration)
split_clip = clip.subclip(start_time, end_time)
raw_clips.append(split_clip)
# logger.info(f"splitting from {start_time:.2f} to {end_time:.2f}, clip duration {clip_duration:.2f}, split_clip duration {split_clip.duration:.2f}")
start_time = end_time
if video_concat_mode.value == VideoConcatMode.sequential.value:
break
# shuffle the split clips when using random concat mode
if video_concat_mode.value == VideoConcatMode.random.value:
random.shuffle(raw_clips)
# Add downloaded clips over and over until the duration of the audio (max_duration) has been reached
while video_duration < audio_duration:
for clip in raw_clips:
# Check if clip is longer than the remaining audio
if (audio_duration - video_duration) < clip.duration:
clip = clip.subclip(0, (audio_duration - video_duration))
# Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image
elif req_dur < clip.duration:
clip = clip.subclip(0, req_dur)
clip = clip.set_fps(30)
# Not all videos are same size, so we need to resize them
clip_w, clip_h = clip.size
if clip_w != video_width or clip_h != video_height:
clip_ratio = clip.w / clip.h
video_ratio = video_width / video_height
if clip_ratio == video_ratio:
# 等比例缩放
clip = clip.resize((video_width, video_height))
else:
# 等比缩放视频
if clip_ratio > video_ratio:
# 按照目标宽度等比缩放
scale_factor = video_width / clip_w
else:
# 按照目标高度等比缩放
scale_factor = video_height / clip_h
new_width = int(clip_w * scale_factor)
new_height = int(clip_h * scale_factor)
clip_resized = clip.resize(newsize=(new_width, new_height))
background = ColorClip(
size=(video_width, video_height), color=(0, 0, 0)
)
clip = CompositeVideoClip(
[
background.set_duration(clip.duration),
clip_resized.set_position("center"),
]
)
logger.info(
f"resizing video to {video_width} x {video_height}, clip size: {clip_w} x {clip_h}"
)
if clip.duration > max_clip_duration:
clip = clip.subclip(0, max_clip_duration)
clips.append(clip)
video_duration += clip.duration
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info("writing")
# https://github.com/harry0703/NarratoAI/issues/111#issuecomment-2032354030
video_clip.write_videofile(
filename=combined_video_path,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec="aac",
fps=30,
)
video_clip.close()
logger.success("completed")
return combined_video_path
def wrap_text(text, max_width, font, fontsize=60):
# 创建字体对象
font = ImageFont.truetype(font, fontsize)
def get_text_size(inner_text):
inner_text = inner_text.strip()
left, top, right, bottom = font.getbbox(inner_text)
return right - left, bottom - top
width, height = get_text_size(text)
if width <= max_width:
return text, height
logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 文本: {text}")
processed = True
_wrapped_lines_ = []
words = text.split(" ")
_txt_ = ""
for word in words:
_before = _txt_
_txt_ += f"{word} "
_width, _height = get_text_size(_txt_)
if _width <= max_width:
continue
else:
if _txt_.strip() == word.strip():
processed = False
break
_wrapped_lines_.append(_before)
_txt_ = f"{word} "
_wrapped_lines_.append(_txt_)
if processed:
_wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
# logger.warning(f"wrapped text: {result}")
return result, height
_wrapped_lines_ = []
chars = list(text)
_txt_ = ""
for word in chars:
_txt_ += word
_width, _height = get_text_size(_txt_)
if _width <= max_width:
continue
else:
_wrapped_lines_.append(_txt_)
_txt_ = ""
if _txt_:
    _wrapped_lines_.append(_txt_)
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
logger.debug(f"换行文本: {result}")
return result, height
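# Illustrative note (not part of the original module): wrap_text first tries to
# wrap on spaces (suitable for English); if a single "word" is itself wider than
# max_width (e.g. Chinese text without spaces), it falls back to wrapping
# character by character. The returned height is the single-line height multiplied
# by the number of wrapped lines, e.g. a caption wrapped into 3 lines with a
# per-line height of 70 px yields a total height of 210 px.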
@contextmanager
def manage_clip(clip):
try:
yield clip
finally:
clip.close()
del clip
def generate_video_v2(
video_path: str,
audio_path: str,
subtitle_path: str,
output_file: str,
params: VideoClipParams,
list_script: list = None
):
"""
生成最终视频,处理音频和字幕
Args:
video_path: 视频文件路径
audio_path: 音频文件路径
subtitle_path: 字幕文件路径
output_file: 输出文件路径
params: 视频参数
list_script: 视频脚本列表,包含 OST 设置
"""
try:
video_clip = VideoFileClip(video_path)
# 处理音频
if audio_path and os.path.exists(audio_path):
audio_clip = AudioFileClip(audio_path)
if list_script:
# 根据OST设置处理音频
# OST=0: 只使用TTS音频
# OST=1: 只使用视频原声
# OST=2: 混合TTS音频和视频原声
original_audio = video_clip.audio
# 设置音频音量
tts_volume = params.tts_volume if hasattr(params, 'tts_volume') else 1.0
video_volume = params.video_volume if hasattr(params, 'video_volume') else 0.1
# 创建最终音频
if original_audio:
# 有些片段需要原声,有些需要 TTS
final_audio = CompositeAudioClip([
audio_clip.volumex(tts_volume), # TTS音频
original_audio.volumex(video_volume) # 原声音频
])
else:
final_audio = audio_clip.volumex(tts_volume)
else:
# 如果没有 OST 设置,使用默认行为
final_audio = audio_clip
video_clip = video_clip.set_audio(final_audio)
# 处理字幕
if subtitle_path and os.path.exists(subtitle_path):
# 添加字幕
video_clip = add_subtitles(
video_clip,
subtitle_path,
params.font_size,
params.font_name,
params.text_fore_color,
params.subtitle_position,
params.stroke_color,
params.stroke_width
)
# 写入最终视频文件
video_clip.write_videofile(
output_file,
codec="libx264",
audio_codec="aac",
temp_audiofile="temp-audio.m4a",
remove_temp=True,
threads=params.n_threads
)
except Exception as e:
logger.error(f"生成视频时发生错误: {str(e)}")
raise e
finally:
# 清理资源
if 'video_clip' in locals():
video_clip.close()
if 'audio_clip' in locals():
audio_clip.close()
if 'final_audio' in locals():
final_audio.close()
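# --- Illustrative usage (not part of the original module) ---
# A minimal sketch of calling generate_video_v2 with a list_script carrying OST
# flags (0: TTS narration only, 1: original audio only, 2: mix both), matching
# the commented example at the bottom of this file. _example_generate_with_ost
# is a hypothetical helper; params must be a VideoClipParams built by the caller.
def _example_generate_with_ost(params, video_path, audio_path, subtitle_path, output_file):
    list_script = [
        {"timestamp": "00:00-00:03", "new_timestamp": "00:00-00:03", "OST": 0, "narration": "解说片段"},
        {"timestamp": "00:03-00:07", "new_timestamp": "00:03-00:07", "OST": 1, "narration": "原声片段"},
    ]
    generate_video_v2(
        video_path=video_path,
        audio_path=audio_path,
        subtitle_path=subtitle_path,
        output_file=output_file,
        params=params,
        list_script=list_script,
    )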
def process_audio_tracks(original_audio, new_audio, params, video_duration):
"""处理所有音轨"""
audio_tracks = []
if original_audio is not None:
audio_tracks.append(original_audio)
new_audio = new_audio.volumex(params.voice_volume)
audio_tracks.append(new_audio)
# 处理背景音乐
bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_file:
try:
bgm_clip = AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3)
bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration)
audio_tracks.append(bgm_clip)
except Exception as e:
logger.error(f"添加背景音乐失败: {str(e)}")
return CompositeAudioClip(audio_tracks) if audio_tracks else new_audio
def process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip):
"""处理字幕"""
if not (subtitle_path and os.path.exists(subtitle_path)):
return video_clip
sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8")
text_clips = []
for item in sub.subtitles:
clip = create_text_clip(subtitle_item=item)
# 限制字幕时间范围在视频时长内
start_time = max(clip.start, 0)
if start_time >= video_duration:
continue
end_time = min(clip.end, video_duration)
clip = clip.set_start(start_time).set_end(end_time)
text_clips.append(clip)
logger.info(f"处理了 {len(text_clips)} 段字幕")
return CompositeVideoClip([video_clip, *text_clips])
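# --- Illustrative example (not part of the original module) ---
# process_subtitles expects the caller to pass in a create_text_clip factory. The
# hypothetical implementation below only shows the expected contract: subtitle_item
# comes from SubtitlesClip.subtitles and has the shape ((start, end), text), and
# the returned TextClip must carry start/end times.
def _example_create_text_clip(subtitle_item, font="Arial", fontsize=60, color="white"):
    (start, end), text = subtitle_item
    return (
        TextClip(text, font=font, fontsize=fontsize, color=color)
        .set_start(start)
        .set_end(end)
        .set_position(("center", "bottom"))
    )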
def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
for material in materials:
if not material.url:
continue
ext = utils.parse_extension(material.url)
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"video is too small, width: {width}, height: {height}")
continue
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
# 创建一个图片剪辑,并设置持续时间为 clip_duration 秒
clip = (
ImageClip(material.url)
.set_duration(clip_duration)
.set_position("center")
)
# 使用 resize 方法添加缩放效果,这里使用 lambda 函数使缩放比例随时间变化。
# 从原始大小逐渐放大:t 代表当前时间,clip.duration 为剪辑总时长。
# 注意:1 表示 100% 的大小,结束时的缩放比例约为 1 + clip_duration * 0.03。
zoom_clip = clip.resize(
lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
)
# 如果需要,可以创建一个包含缩放剪辑的复合视频剪辑
# (这在您想要在视频中添加其他元素时非常有用)
final_clip = CompositeVideoClip([zoom_clip])
# 输出视频
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
final_clip.close()
del final_clip
material.url = video_file
logger.success(f"completed: {video_file}")
return materials
def combine_clip_videos(combined_video_path: str,
video_paths: List[str],
video_ost_list: List[int],
list_script: list,
video_aspect: VideoAspect = VideoAspect.portrait,
threads: int = 2,
) -> str:
"""
合并子视频
Args:
combined_video_path: 合并后的存储路径
video_paths: 子视频路径列表
video_ost_list: 原声播放列表 (0: 不保留原声, 1: 只保留原声, 2: 保留原声并保留解说)
list_script: 剪辑脚本
video_aspect: 屏幕比例
threads: 线程数
Returns:
str: 合并后的视频路径
"""
# 计算总时长时需要考虑毫秒精度
total_duration = 0.0
for item in list_script:
timestamp = item.get('new_timestamp', '')
if timestamp:
start_str, end_str = timestamp.split('-')
start_time = utils.time_to_seconds(start_str)
end_time = utils.time_to_seconds(end_str)
duration = end_time - start_time
total_duration += duration
logger.info(f"音频的最大持续时间: {total_duration:.3f} s")
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
clips = []
for video_path, video_ost in zip(video_paths, video_ost_list):
try:
# 加载视频片段
clip = VideoFileClip(video_path)
# 根据OST设置处理音频
if video_ost == 0: # 不保留原声
clip = clip.without_audio()
elif video_ost == 1: # 只保留原声
# 保持原声,但可能需要调整音量
if clip.audio:
clip = clip.set_audio(clip.audio.volumex(1.0)) # 可以调整音量系数
# OST == 2 的情况会在后续处理中混合音频
clip = clip.set_fps(30)
# 处理视频尺寸
clip_w, clip_h = clip.size
if clip_w != video_width or clip_h != video_height:
clip = resize_video_with_padding(
clip,
target_width=video_width,
target_height=video_height
)
logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")
# 精确控制视频时长
filename = os.path.basename(video_path)
timestamp = extract_timestamp_from_filename(filename)
if timestamp:
start_time, end_time = timestamp
clip_duration = end_time - start_time
if abs(clip.duration - clip_duration) > 0.1: # 允许0.1秒的误差
logger.warning(f"视频 {video_path} 时长与时间戳不匹配,进行调整")
clip = clip.set_duration(clip_duration)
clips.append(clip)
except Exception as e:
logger.error(f"处理视频 {video_path} 时出错: {str(e)}")
continue
if not clips:
raise ValueError("没有有效的视频片段可以合并")
try:
# 合并所有视频片段
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info("开始合并视频...")
video_clip.write_videofile(
filename=combined_video_path,
threads=threads,
logger=None,
audio_codec="aac",
fps=30,
temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
)
finally:
# 确保资源被正确释放
video_clip.close()
for clip in clips:
clip.close()
logger.success("视频合并完成")
return combined_video_path
def extract_timestamp_from_filename(filename: str) -> tuple:
"""
从文件名中提取时间戳,支持多种格式:
- "vid-00_06,500-00_24,800.mp4" -> (6.5, 24.8)
- "vid-00_00_00-020-00_00_10-400.mp4" -> (0.02, 10.4)
"""
try:
# 提取时间戳部分
match = re.search(r'vid-(.+?)\.mp4$', filename)
if not match:
logger.warning(f"文件名格式不正确: {filename}")
return None
timestamp = match.group(1)
# 处理包含毫秒的格式 (00_00_00-020-00_00_10-400)
if timestamp.count('-') == 3:
parts = timestamp.split('-')
start_time = f"{parts[0]}-{parts[1]}" # 组合开始时间和毫秒
end_time = f"{parts[2]}-{parts[3]}" # 组合结束时间和毫秒
# 转换开始时间
start_time_str = start_time.replace('_', ':')
if start_time_str.count(':') == 2: # 如果是 00:00:00-020 格式
start_base = utils.time_to_seconds(start_time_str.split('-')[0])
start_ms = int(start_time_str.split('-')[1]) / 1000
start_seconds = start_base + start_ms
else:
start_seconds = utils.time_to_seconds(start_time_str)
# 转换结束时间
end_time_str = end_time.replace('_', ':')
if end_time_str.count(':') == 2: # 如果是 00:00:10-400 格式
end_base = utils.time_to_seconds(end_time_str.split('-')[0])
end_ms = int(end_time_str.split('-')[1]) / 1000
end_seconds = end_base + end_ms
else:
end_seconds = utils.time_to_seconds(end_time_str)
# 处理简单格式 (00_06-00_24)
else:
start_str, end_str = timestamp.split('-')
start_seconds = utils.time_to_seconds(start_str.replace('_', ':'))
end_seconds = utils.time_to_seconds(end_str.replace('_', ':'))
logger.debug(f"从文件名 {filename} 提取时间戳: {start_seconds:.3f} - {end_seconds:.3f}")
return start_seconds, end_seconds
except Exception as e:
logger.error(f"从文件名提取时间戳失败 {filename}: {str(e)}\n{traceback.format_exc()}")
return None
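# --- Illustrative reference (not part of the original module) ---
# utils.time_to_seconds, used throughout this file, lives in app.utils.utils and
# is not shown here. The hypothetical equivalent below only illustrates the time
# formats it is expected to accept ("SS", "MM:SS", "HH:MM:SS", with milliseconds
# separated by "," or "."); the actual behaviour is defined in the utils module.
def _example_time_to_seconds(time_str: str) -> float:
    time_str = time_str.strip().replace(",", ".")
    seconds = 0.0
    for part in time_str.split(":"):
        seconds = seconds * 60 + float(part)
    return seconds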
def resize_video_with_padding(clip, target_width: int, target_height: int):
"""辅助函数:调整视频尺寸并添加黑边"""
clip_ratio = clip.w / clip.h
target_ratio = target_width / target_height
if clip_ratio == target_ratio:
return clip.resize((target_width, target_height))
if clip_ratio > target_ratio:
scale_factor = target_width / clip.w
else:
scale_factor = target_height / clip.h
new_width = int(clip.w * scale_factor)
new_height = int(clip.h * scale_factor)
clip_resized = clip.resize(newsize=(new_width, new_height))
background = ColorClip(
size=(target_width, target_height),
color=(0, 0, 0)
).set_duration(clip.duration)
return CompositeVideoClip([
background,
clip_resized.set_position("center")
])
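# Worked example (comment only, not part of the original module): for a
# 1920x1080 landscape clip padded into a 1080x1920 portrait frame,
# clip_ratio = 1920/1080 ≈ 1.78 > target_ratio = 1080/1920 ≈ 0.56, so the clip
# is scaled by target_width / clip.w = 1080/1920 = 0.5625 to 1080x607 and
# centered over a black 1080x1920 background, leaving bars at the top and bottom.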
def validate_params(video_path, audio_path, output_file, params):
"""验证输入参数"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
if not os.path.exists(audio_path):
raise FileNotFoundError(f"音频文件不存在: {audio_path}")
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
raise FileNotFoundError(f"输出目录不存在: {output_dir}")
if not hasattr(params, 'video_aspect'):
raise ValueError("params 缺少必要参数 video_aspect")
def add_subtitles(video_clip, subtitle_path, font_size, font_name, font_color, position, shadow_color, shadow_offset):
"""
为视频添加字幕
Args:
video_clip: 视频剪辑对象
subtitle_path: 字幕文件路径
font_size: 字体大小
font_name: 字体名称
font_color: 字体颜色
position: 字幕位置 ('top', 'center', 'bottom')
shadow_color: 阴影颜色
shadow_offset: 阴影偏移
Returns:
带有字幕的视频剪辑对象
"""
try:
# 确保字体文件存在
font_path = os.path.join(utils.font_dir(), font_name)
if not os.path.exists(font_path):
logger.error(f"字体文件不存在: {font_path}")
# 尝试使用系统默认字体
font_path = "Arial" if os.name == 'nt' else "/System/Library/Fonts/STHeiti Light.ttc"
logger.info(f"使用默认字体: {font_path}")
# 设置字幕位置
if position == "top":
pos = ("center", 50)
elif position == "center":
pos = "center"
else: # bottom
pos = ("center", -50)
def subtitle_generator(txt):
return TextClip(
txt,
fontsize=font_size,
font=font_path,
color=font_color,
stroke_color=shadow_color,
stroke_width=shadow_offset,
method='caption', # 使用 caption 方法可能更稳定
size=(video_clip.w * 0.9, None) # 限制字幕宽度
)
subtitles = SubtitlesClip(
subtitle_path,
subtitle_generator
)
# 添加字幕到视频
video_with_subtitles = CompositeVideoClip([
video_clip,
subtitles.set_position(pos)
])
return video_with_subtitles
except Exception as e:
logger.error(f"添加字幕时出错: {str(e)}\n{traceback.format_exc()}")
# 如果添加字幕失败,返回原始视频
return video_clip
if __name__ == "__main__":
# combined_video_path = "../../storage/tasks/12312312/com123.mp4"
#
# video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4',
# '../../storage/cache_videos/vid-00_03-00_07.mp4',
# '../../storage/cache_videos/vid-00_12-00_17.mp4',
# '../../storage/cache_videos/vid-00_26-00_31.mp4']
# video_ost_list = [False, True, False, True]
# list_script = [
# {
# "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶",
# "timestamp": "00:00-00:03",
# "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍",
# "OST": False,
# "new_timestamp": "00:00-00:03"
# },
# {
# "picture": "追赶的人命令抓住小孩",
# "timestamp": "00:03-00:07",
# "narration": "原声播放1",
# "OST": True,
# "new_timestamp": "00:03-00:07"
# },
# {
# "picture": "小孩躲在草丛里,黑衣人用脚踢了踢他",
# "timestamp": "00:12-00:17",
# "narration": "小孩脱下外套,跑进树林, 一路奔跑,直到第二天清晨",
# "OST": False,
# "new_timestamp": "00:07-00:12"
# },
# {
# "picture": "小孩跑到车前,慌慌张张地对女人说有人要杀他",
# "timestamp": "00:26-00:31",
# "narration": "原声播放2",
# "OST": True,
# "new_timestamp": "00:12-00:17"
# }
# ]
# combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script)
# cfg = VideoClipParams()
# cfg.video_aspect = VideoAspect.portrait
# cfg.font_name = "STHeitiMedium.ttc"
# cfg.font_size = 60
# cfg.stroke_color = "#000000"
# cfg.stroke_width = 1.5
# cfg.text_fore_color = "#FFFFFF"
# cfg.text_background_color = "transparent"
# cfg.bgm_type = "random"
# cfg.bgm_file = ""
# cfg.bgm_volume = 1.0
# cfg.subtitle_enabled = True
# cfg.subtitle_position = "bottom"
# cfg.n_threads = 2
# cfg.paragraph_number = 1
#
# cfg.voice_volume = 1.0
# generate_video(video_path=video_file,
# audio_path=audio_file,
# subtitle_path=subtitle_file,
# output_file=output_file,
# params=cfg
# )
#
# video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4"
#
# audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3"
#
# subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/subtitle.srt"
#
# output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4"
#
# generate_video_v2(video_path=video_path,
# audio_path=audio_path,
# subtitle_path=subtitle_path,
# output_file=output_file,
# params=cfg
# )
# 合并视频
video_list = [
'./storage/cache_videos/vid-01_03-01_50.mp4',
'./storage/cache_videos/vid-01_55-02_29.mp4',
'./storage/cache_videos/vid-03_24-04_04.mp4',
'./storage/cache_videos/vid-04_50-05_28.mp4'
]