linyqh 0bb811ea79 refactor(voice): 优化 Edge TTS 音频生成逻辑- 重构了 Edge TTS音频生成函数,提高了代码可读性和错误处理能力
-增加了重试机制,提高了生成音频的可靠性
-优化了日志输出,提供了更详细的错误信息和生成进度
- 删除了不必要的测试代码和注释,精简了代码结构
2024-12-03 23:24:20 +08:00

775 lines
27 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import os
import glob
import random
from typing import List
from typing import Union
import traceback
from loguru import logger
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from contextlib import contextmanager
from app.models import const
from app.models.schema import MaterialInfo, VideoAspect, VideoConcatMode, VideoParams, VideoClipParams
from app.utils import utils
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
if not bgm_type:
return ""
if bgm_file and os.path.exists(bgm_file):
return bgm_file
if bgm_type == "random":
song_dir = utils.song_dir()
# 检查目录是否存在
if not os.path.exists(song_dir):
logger.warning(f"背景音乐目录不存在: {song_dir}")
return ""
# 支持 mp3 和 flac 格式
mp3_files = glob.glob(os.path.join(song_dir, "*.mp3"))
flac_files = glob.glob(os.path.join(song_dir, "*.flac"))
files = mp3_files + flac_files
# 检查是否找到音乐文件
if not files:
logger.warning(f"在目录 {song_dir} 中没有找到 MP3 或 FLAC 文件")
return ""
return random.choice(files)
return ""
def combine_videos(
combined_video_path: str,
video_paths: List[str],
audio_file: str,
video_aspect: VideoAspect = VideoAspect.portrait,
video_concat_mode: VideoConcatMode = VideoConcatMode.random,
max_clip_duration: int = 5,
threads: int = 2,
) -> str:
audio_clip = AudioFileClip(audio_file)
audio_duration = audio_clip.duration
logger.info(f"max duration of audio: {audio_duration} seconds")
# Required duration of each clip
req_dur = audio_duration / len(video_paths)
req_dur = max_clip_duration
logger.info(f"each clip will be maximum {req_dur} seconds long")
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
clips = []
video_duration = 0
raw_clips = []
for video_path in video_paths:
clip = VideoFileClip(video_path).without_audio()
clip_duration = clip.duration
start_time = 0
while start_time < clip_duration:
end_time = min(start_time + max_clip_duration, clip_duration)
split_clip = clip.subclip(start_time, end_time)
raw_clips.append(split_clip)
# logger.info(f"splitting from {start_time:.2f} to {end_time:.2f}, clip duration {clip_duration:.2f}, split_clip duration {split_clip.duration:.2f}")
start_time = end_time
if video_concat_mode.value == VideoConcatMode.sequential.value:
break
# random video_paths order
if video_concat_mode.value == VideoConcatMode.random.value:
random.shuffle(raw_clips)
# Add downloaded clips over and over until the duration of the audio (max_duration) has been reached
while video_duration < audio_duration:
for clip in raw_clips:
# Check if clip is longer than the remaining audio
if (audio_duration - video_duration) < clip.duration:
clip = clip.subclip(0, (audio_duration - video_duration))
# Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image
elif req_dur < clip.duration:
clip = clip.subclip(0, req_dur)
clip = clip.set_fps(30)
# Not all videos are same size, so we need to resize them
clip_w, clip_h = clip.size
if clip_w != video_width or clip_h != video_height:
clip_ratio = clip.w / clip.h
video_ratio = video_width / video_height
if clip_ratio == video_ratio:
# 等比例缩放
clip = clip.resize((video_width, video_height))
else:
# 等比缩放视频
if clip_ratio > video_ratio:
# 按照目标宽度等比缩放
scale_factor = video_width / clip_w
else:
# 按照目标高度等比缩放
scale_factor = video_height / clip_h
new_width = int(clip_w * scale_factor)
new_height = int(clip_h * scale_factor)
clip_resized = clip.resize(newsize=(new_width, new_height))
background = ColorClip(
size=(video_width, video_height), color=(0, 0, 0)
)
clip = CompositeVideoClip(
[
background.set_duration(clip.duration),
clip_resized.set_position("center"),
]
)
logger.info(
f"resizing video to {video_width} x {video_height}, clip size: {clip_w} x {clip_h}"
)
if clip.duration > max_clip_duration:
clip = clip.subclip(0, max_clip_duration)
clips.append(clip)
video_duration += clip.duration
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info("writing")
# https://github.com/harry0703/NarratoAI/issues/111#issuecomment-2032354030
video_clip.write_videofile(
filename=combined_video_path,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec="aac",
fps=30,
)
video_clip.close()
logger.success("completed")
return combined_video_path
def wrap_text(text, max_width, font, fontsize=60):
# 创建字体对象
font = ImageFont.truetype(font, fontsize)
def get_text_size(inner_text):
inner_text = inner_text.strip()
left, top, right, bottom = font.getbbox(inner_text)
return right - left, bottom - top
width, height = get_text_size(text)
if width <= max_width:
return text, height
logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 本: {text}")
processed = True
_wrapped_lines_ = []
words = text.split(" ")
_txt_ = ""
for word in words:
_before = _txt_
_txt_ += f"{word} "
_width, _height = get_text_size(_txt_)
if _width <= max_width:
continue
else:
if _txt_.strip() == word.strip():
processed = False
break
_wrapped_lines_.append(_before)
_txt_ = f"{word} "
_wrapped_lines_.append(_txt_)
if processed:
_wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
# logger.warning(f"wrapped text: {result}")
return result, height
_wrapped_lines_ = []
chars = list(text)
_txt_ = ""
for word in chars:
_txt_ += word
_width, _height = get_text_size(_txt_)
if _width <= max_width:
continue
else:
_wrapped_lines_.append(_txt_)
_txt_ = ""
_wrapped_lines_.append(_txt_)
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
logger.debug(f"换行文本: {result}")
return result, height
@contextmanager
def manage_clip(clip):
try:
yield clip
finally:
clip.close()
del clip
def generate_video_v2(
video_path: str,
audio_path: str,
subtitle_path: str,
output_file: str,
params: VideoClipParams,
list_script: list = None
):
"""
生成最终视频,处理音频和字幕
Args:
video_path: 视频文件路径
audio_path: 音频文件路径
subtitle_path: 字幕文件路径
output_file: 输出文件路径
params: 视频参数
list_script: 视频脚本列表包含OST设置
"""
try:
video_clip = VideoFileClip(video_path)
# 处理音频
if audio_path and os.path.exists(audio_path):
audio_clip = AudioFileClip(audio_path)
if list_script:
# 根据OST设置处理音频
# OST=0: 只使用TTS音频
# OST=1: 只使用视频原声
# OST=2: 混合TTS音频和视频原声
original_audio = video_clip.audio
# 设置音频音量
tts_volume = params.tts_volume if hasattr(params, 'tts_volume') else 1.0
video_volume = params.video_volume if hasattr(params, 'video_volume') else 0.1
# 创建最终音频
if original_audio:
# 有些片段需要原声有些需要TTS
final_audio = CompositeAudioClip([
audio_clip.volumex(tts_volume), # TTS音频
original_audio.volumex(video_volume) # 原声音频
])
else:
final_audio = audio_clip.volumex(tts_volume)
else:
# 如果没有OST设置使用默认行为
final_audio = audio_clip
video_clip = video_clip.set_audio(final_audio)
# 处理字幕
if subtitle_path and os.path.exists(subtitle_path):
# 添加字幕
video_clip = add_subtitles(
video_clip,
subtitle_path,
params.font_size,
params.font_name,
params.text_fore_color,
params.subtitle_position,
params.stroke_color,
params.stroke_width
)
# 写入最终视频文件
video_clip.write_videofile(
output_file,
codec="libx264",
audio_codec="aac",
temp_audiofile="temp-audio.m4a",
remove_temp=True,
threads=params.n_threads
)
except Exception as e:
logger.error(f"生成视频时发生错误: {str(e)}")
raise e
finally:
# 清理资源
if 'video_clip' in locals():
video_clip.close()
if 'audio_clip' in locals():
audio_clip.close()
if 'final_audio' in locals():
final_audio.close()
def process_audio_tracks(original_audio, new_audio, params, video_duration):
"""处理所有音轨"""
audio_tracks = []
if original_audio is not None:
audio_tracks.append(original_audio)
new_audio = new_audio.volumex(params.voice_volume)
audio_tracks.append(new_audio)
# 处理背景音乐
bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_file:
try:
bgm_clip = AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3)
bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration)
audio_tracks.append(bgm_clip)
except Exception as e:
logger.error(f"添加背景音乐失败: {str(e)}")
return CompositeAudioClip(audio_tracks) if audio_tracks else new_audio
def process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip):
"""处理字幕"""
if not (subtitle_path and os.path.exists(subtitle_path)):
return video_clip
sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8")
text_clips = []
for item in sub.subtitles:
clip = create_text_clip(subtitle_item=item)
# 时间范围整
start_time = max(clip.start, 0)
if start_time >= video_duration:
continue
end_time = min(clip.end, video_duration)
clip = clip.set_start(start_time).set_end(end_time)
text_clips.append(clip)
logger.info(f"处理了 {len(text_clips)} 段字幕")
return CompositeVideoClip([video_clip, *text_clips])
def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
for material in materials:
if not material.url:
continue
ext = utils.parse_extension(material.url)
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"video is too small, width: {width}, height: {height}")
continue
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
# 创建一个图片剪辑并设置持续时间为3秒钟
clip = (
ImageClip(material.url)
.set_duration(clip_duration)
.set_position("center")
)
# 使用resize方法来添加缩放效果。这里使用了lambda函数来使得缩放效果随时间变化。
# 假设我们想要从原始大小逐渐放大到120%的大小。
# t代表当前时间clip.duration为视频总时长这里是3秒。
# 注意1 表示100%的大小所以1.2表示120%的大小
zoom_clip = clip.resize(
lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
)
# 如果需要,可以创建一个包含缩放剪辑的复合视频剪辑
# (这在您想要在视频中添加其他元素时非常有用)
final_clip = CompositeVideoClip([zoom_clip])
# 输出视频
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
final_clip.close()
del final_clip
material.url = video_file
logger.success(f"completed: {video_file}")
return materials
def combine_clip_videos(combined_video_path: str,
video_paths: List[str],
video_ost_list: List[int],
list_script: list,
video_aspect: VideoAspect = VideoAspect.portrait,
threads: int = 2,
) -> str:
"""
合并子视频
Args:
combined_video_path: 合并后的存储路径
video_paths: 子视频路径列表
video_ost_list: 原声播放列表 (0: 不保留原声, 1: 只保留原声, 2: 保留原声并保留解说)
list_script: 剪辑脚本
video_aspect: 屏幕比例
threads: 线程数
Returns:
str: 合并后的视频路径
"""
# 计算总时长时需要考虑毫秒精度
total_duration = 0.0
for item in list_script:
timestamp = item.get('new_timestamp', '')
if timestamp:
start_str, end_str = timestamp.split('-')
start_time = utils.time_to_seconds(start_str)
end_time = utils.time_to_seconds(end_str)
duration = end_time - start_time
total_duration += duration
logger.info(f"音频的最大持续时间: {total_duration:.3f} s")
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
clips = []
for video_path, video_ost in zip(video_paths, video_ost_list):
try:
# 加载视频片段
clip = VideoFileClip(video_path)
# 根据OST设置处理音频
if video_ost == 0: # 不保留原声
clip = clip.without_audio()
elif video_ost == 1: # 只保留原声
# 保持原声,但可能需要调整音量
if clip.audio:
clip = clip.set_audio(clip.audio.volumex(1.0)) # 可以调整音量系数
# OST == 2 的情况会在后续处理中混合音频
clip = clip.set_fps(30)
# 处理视频尺寸
clip_w, clip_h = clip.size
if clip_w != video_width or clip_h != video_height:
clip = resize_video_with_padding(
clip,
target_width=video_width,
target_height=video_height
)
logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")
# 精确控制视频时长
filename = os.path.basename(video_path)
timestamp = extract_timestamp_from_filename(filename)
if timestamp:
start_time, end_time = timestamp
clip_duration = end_time - start_time
if abs(clip.duration - clip_duration) > 0.1: # 允许0.1秒的误差
logger.warning(f"视频 {video_path} 时长与时间戳不匹配,进行调整")
clip = clip.set_duration(clip_duration)
clips.append(clip)
except Exception as e:
logger.error(f"处理视频 {video_path} 时出错: {str(e)}")
continue
if not clips:
raise ValueError("没有有效的视频片段可以合并")
try:
# 合并所有视频片段
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info("开始合并视频...")
video_clip.write_videofile(
filename=combined_video_path,
threads=threads,
logger=None,
audio_codec="aac",
fps=30,
temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
)
finally:
# 确保资源被正确释放
video_clip.close()
for clip in clips:
clip.close()
logger.success("视频合并完成")
return combined_video_path
def extract_timestamp_from_filename(filename: str) -> tuple:
"""
从文件名中提取时间戳,支持格式:
- "vid-00-00-10_000-00-00-43_039.mp4" -> (10.0, 43.039)
表示 00时00分10秒000毫秒 到 00时00分43秒039毫秒
"""
try:
# 提取时间戳部分
match = re.search(r'vid-(.+?)\.mp4$', filename)
if not match:
logger.warning(f"文件名格式不正确: {filename}")
return None
timestamp = match.group(1)
def parse_timestamp(time_str: str) -> float:
"""解析单个时间戳字符串为秒数"""
try:
# 处理 "00-00-10_000" 格式
main_time, milliseconds = time_str.rsplit('_', 1) # 从右边分割,处理可能存在的多个下划线
time_components = main_time.split('-')
if len(time_components) != 3:
raise ValueError(f"时间格式错误: {main_time}")
hours = int(time_components[0])
minutes = int(time_components[1])
seconds = int(time_components[2])
ms = int(milliseconds)
# 转换为秒数
total_seconds = hours * 3600 + minutes * 60 + seconds + ms / 1000
return total_seconds
except Exception as e:
raise ValueError(f"解析时间戳失败 {time_str}: {str(e)}")
# 分割起始和结束时间戳
timestamps = timestamp.split('-', 5) # 最多分割5次处理 00-00-10_000-00-00-43_039 格式
if len(timestamps) != 6: # 应该得到 ['00', '00', '10_000', '00', '00', '43_039']
raise ValueError(f"时间戳格式错误,无法分割: {timestamp}")
start_str = '-'.join(timestamps[0:3]) # 组合开始时间 "00-00-10_000"
end_str = '-'.join(timestamps[3:6]) # 组合结束时间 "00-00-43_039"
start_seconds = parse_timestamp(start_str)
end_seconds = parse_timestamp(end_str)
logger.debug(f"从文件名 {filename} 提取时间戳: {start_seconds:.3f} - {end_seconds:.3f}")
return start_seconds, end_seconds
except Exception as e:
logger.error(f"从文件名提取时间戳失败 {filename}: {str(e)}\n{traceback.format_exc()}")
return None
def resize_video_with_padding(clip, target_width: int, target_height: int):
"""辅助函数:调整视频尺寸并添加黑边"""
clip_ratio = clip.w / clip.h
target_ratio = target_width / target_height
if clip_ratio == target_ratio:
return clip.resize((target_width, target_height))
if clip_ratio > target_ratio:
scale_factor = target_width / clip.w
else:
scale_factor = target_height / clip.h
new_width = int(clip.w * scale_factor)
new_height = int(clip.h * scale_factor)
clip_resized = clip.resize(newsize=(new_width, new_height))
background = ColorClip(
size=(target_width, target_height),
color=(0, 0, 0)
).set_duration(clip.duration)
return CompositeVideoClip([
background,
clip_resized.set_position("center")
])
def validate_params(video_path, audio_path, output_file, params):
"""验证输入参数"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
if not os.path.exists(audio_path):
raise FileNotFoundError(f"音频文件不存在: {audio_path}")
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
raise FileNotFoundError(f"输出目录不存在: {output_dir}")
if not hasattr(params, 'video_aspect'):
raise ValueError("params 缺少必要参数 video_aspect")
def add_subtitles(video_clip, subtitle_path, font_size, font_name, font_color, position, shadow_color, shadow_offset):
"""
为视频添加字幕
Args:
video_clip: 视频剪辑对象
subtitle_path: 字幕文件路径
font_size: 字体大小
font_name: 字体名称
font_color: 字体颜色
position: 字幕位置 ('top', 'center', 'bottom')
shadow_color: 阴影颜色
shadow_offset: 阴影偏移
Returns:
带有字幕的视频剪辑对象
"""
try:
# 确保字体文件存在
font_path = os.path.join(utils.font_dir(), font_name)
if not os.path.exists(font_path):
logger.error(f"字体文件不存在: {font_path}")
# 尝试使用系统默认字体
font_path = "Arial" if os.name == 'nt' else "/System/Library/Fonts/STHeiti Light.ttc"
logger.info(f"使用默认字体: {font_path}")
# 设置字幕位置
if position == "top":
pos = ("center", 50)
elif position == "center":
pos = "center"
else: # bottom
pos = ("center", -50)
def subtitle_generator(txt):
return TextClip(
txt,
fontsize=font_size,
font=font_path,
color=font_color,
stroke_color=shadow_color,
stroke_width=shadow_offset,
method='caption', # 使用 caption 方法可能更稳定
size=(video_clip.w * 0.9, None) # 限制字幕宽度
)
# 使用 SubtitlesClip但明确指定 UTF-8 编码
subtitles = SubtitlesClip(
subtitle_path,
subtitle_generator,
encoding='utf-8' # 明确指定使用 UTF-8 编码
)
# 添加字幕到视频
video_with_subtitles = CompositeVideoClip([
video_clip,
subtitles.set_position(pos)
])
return video_with_subtitles
except Exception as e:
logger.error(f"添加字幕时出错: {str(e)}\n{traceback.format_exc()}")
# 如果添加字幕失败,返回原始视频
return video_clip
if __name__ == "__main__":
# combined_video_path = "../../storage/tasks/12312312/com123.mp4"
#
# video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4',
# '../../storage/cache_videos/vid-00_03-00_07.mp4',
# '../../storage/cache_videos/vid-00_12-00_17.mp4',
# '../../storage/cache_videos/vid-00_26-00_31.mp4']
# video_ost_list = [False, True, False, True]
# list_script = [
# {
# "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶",
# "timestamp": "00:00-00:03",
# "narration": "夜风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!",
# "OST": False,
# "new_timestamp": "00:00-00:03"
# },
# {
# "picture": "追赶的人命令抓住小孩",
# "timestamp": "00:03-00:07",
# "narration": "原声播放1",
# "OST": True,
# "new_timestamp": "00:03-00:07"
# },
# {
# "picture": "小孩躲在草丛里,黑衣人用脚踢了踢他",
# "timestamp": "00:12-00:17",
# "narration": "小孩脱下外套,跑进树林, 一路奔跑,直到第二天清晨",
# "OST": False,
# "new_timestamp": "00:07-00:12"
# },
# {
# "picture": "小孩跑到车前,慌慌张张地对女人说有人要杀他",
# "timestamp": "00:26-00:31",
# "narration": "原声播放2",
# "OST": True,
# "new_timestamp": "00:12-00:17"
# }
# ]
# combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script)
# cfg = VideoClipParams()
# cfg.video_aspect = VideoAspect.portrait
# cfg.font_name = "STHeitiMedium.ttc"
# cfg.font_size = 60
# cfg.stroke_color = "#000000"
# cfg.stroke_width = 1.5
# cfg.text_fore_color = "#FFFFFF"
# cfg.text_background_color = "transparent"
# cfg.bgm_type = "random"
# cfg.bgm_file = ""
# cfg.bgm_volume = 1.0
# cfg.subtitle_enabled = True
# cfg.subtitle_position = "bottom"
# cfg.n_threads = 2
# cfg.paragraph_number = 1
#
# cfg.voice_volume = 1.0
# generate_video(video_path=video_file,
# audio_path=audio_file,
# subtitle_path=subtitle_file,
# output_file=output_file,
# params=cfg
# )
#
# video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4"
#
# audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3"
#
# subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt"
#
# output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4"
#
# generate_video_v2(video_path=video_path,
# audio_path=audio_path,
# subtitle_path=subtitle_path,
# output_file=output_file,
# params=cfg
# )
# 合并视频
video_list = [
'./storage/cache_videos/vid-01_03-01_50.mp4',
'./storage/cache_videos/vid-01_55-02_29.mp4',
'./storage/cache_videos/vid-03_24-04_04.mp4',
'./storage/cache_videos/vid-04_50-05_28.mp4'
]