feat(audio): 改进音频合并功能,支持 OST 设置,提升时间戳精度

- 重构了 merge_audio_files 函数,增加了对 OST 设置的支持
- 新增 time_to_seconds 函数,支持多种时间格式的转换
- 修改了 audio_merger 模块的逻辑,根据 OST 设置处理音频
- 更新了 task 模块中的 start_subclip 函数,传入 OST 信息
- 优化了 subtitle 和 video 模块的逻辑,适应新的音频处理方式
This commit is contained in:
linyq 2024-11-20 18:12:45 +08:00
parent c03a13db13
commit 401eb92fa3
10 changed files with 566 additions and 412 deletions

View File

@ -366,6 +366,8 @@ class VideoClipParams(BaseModel):
custom_position: float = Field(default=70.0, description="自定义位置")
n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度
tts_volume: float = 1.0 # TTS音频音量
video_volume: float = 0.1 # 视频原声音量
class VideoTranscriptionRequest(BaseModel):
video_name: str

View File

@ -18,95 +18,119 @@ def check_ffmpeg():
return False
def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list):
    """
    Merge TTS clips into one audio track, honouring each segment's OST setting.

    OST values handled here:
        0 - narration only: the TTS clip is laid onto the track at the
            segment's start time.
        1 - original sound only: nothing is added for the segment.
        2 - narration plus original sound: the TTS clip is first laid over a
            silent placeholder as long as the segment, then onto the track.

    Args:
        task_id: Task ID; the merged file is written to this task's directory.
        audio_files: TTS audio file paths in script order. Either one per
            script segment, or one per segment whose OST is 0 or 2.
        total_duration: Length of the merged track, in seconds.
        list_script: Full script; each segment provides 'OST' and
            'new_timestamp' (formatted as "start-end").

    Returns:
        str: Path of the merged MP3 file, or None when FFmpeg is not installed.
    """
    # Exporting relies on FFmpeg, so give up early when it is missing
    if not check_ffmpeg():
        logger.error("FFmpeg未安装无法合并音频文件")
        return None

    # Silent base track covering the whole timeline (pydub works in milliseconds)
    final_audio = AudioSegment.silent(duration=total_duration * 1000)

    audio_files = audio_files or []
    if len(audio_files) == len(list_script):
        # One clip per segment: pair them up directly
        segments = list_script
    else:
        # Clips exist only for narrated segments (OST 0 or 2). Zipping against
        # the full script would shift every clip that follows an OST=1 segment
        # onto the wrong timestamp, so pair clips with narrated segments only.
        segments = [segment for segment in list_script if segment.get('OST') in (0, 2)]

    for segment, audio_file in zip(segments, audio_files):
        try:
            ost = segment['OST']
            if ost == 1:
                # Original sound only (assumed to already be in the video)
                continue

            tts_audio = AudioSegment.from_file(audio_file)

            # Segment boundaries on the final timeline, in seconds
            start_time, end_time = segment['new_timestamp'].split('-')
            start_seconds = utils.time_to_seconds(start_time)
            end_seconds = utils.time_to_seconds(end_time)
            position_ms = int(round(start_seconds * 1000))

            if ost == 0:
                final_audio = final_audio.overlay(tts_audio, position=position_ms)
            elif ost == 2:
                # Overlaying onto a segment-length silent base keeps the
                # narration within the segment
                placeholder = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000)
                mixed_audio = placeholder.overlay(tts_audio)
                final_audio = final_audio.overlay(mixed_audio, position=position_ms)
        except Exception as e:
            # Best effort: one bad clip must not abort the whole merge
            logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}")
            continue

    # Write the merged track into the task directory
    output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3")
    final_audio.export(output_audio_path, format="mp3")
    logger.info(f"合并后的音频文件已保存: {output_audio_path}")

    return output_audio_path
def time_to_seconds(time_str):
""""00:06""00_06" 格式转换为总秒数"""
# 确保使用冒号作为分隔符
time_str = time_str.replace('_', ':')
"""
将时间字符串转换为秒数支持多种格式
1. 'HH:MM:SS,mmm' (::,毫秒)
2. 'MM:SS,mmm' (:,毫秒)
3. 'SS,mmm' (,毫秒)
"""
try:
parts = time_str.split(':')
if len(parts) != 2:
logger.error(f"Invalid time format: {time_str}")
return 0
return int(parts[0]) * 60 + int(parts[1])
# 处理毫秒部分
if ',' in time_str:
time_part, ms_part = time_str.split(',')
ms = float(ms_part) / 1000
else:
time_part = time_str
ms = 0
# 分割时间部分
parts = time_part.split(':')
if len(parts) == 3: # HH:MM:SS
h, m, s = map(int, parts)
seconds = h * 3600 + m * 60 + s
elif len(parts) == 2: # MM:SS
m, s = map(int, parts)
seconds = m * 60 + s
else: # SS
seconds = int(parts[0])
return seconds + ms
except (ValueError, IndexError) as e:
logger.error(f"Error parsing time {time_str}: {str(e)}")
return 0
return 0.0
def extract_timestamp(filename):
"""
从文件名中提取开始和结束时间戳
例如: "audio_00_06,500-00_24,800.mp3" -> (6.5, 24.8)
"""
try:
# 从文件名中提取时间部分
time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06,500-00_24,800" 部分
start_time, end_time = time_part.split('-') # 分割成开始和结束时间
# 将下划线格式转换回冒号格式
start_time = start_time.replace('_', ':')
end_time = end_time.replace('_', ':')
# 将时间戳转换为秒
start_seconds = time_to_seconds(start_time)
end_seconds = time_to_seconds(end_time)
return start_seconds, end_seconds
except Exception as e:
logger.error(f"Error extracting timestamp from {filename}: {str(e)}")
return 0.0, 0.0
if __name__ == "__main__":

View File

@ -3,6 +3,7 @@ import subprocess
import random
import traceback
from urllib.parse import urlencode
from datetime import datetime
import requests
from typing import List
@ -253,34 +254,58 @@ def download_videos(
def time_to_seconds(time_str: str) -> float:
    """
    Convert a time string to seconds.

    Supported formats (the ",mmm" millisecond suffix is optional on each):
        1. 'HH:MM:SS,mmm'
        2. 'MM:SS'
        3. 'SS'

    Hours and minutes are not range-limited, so values such as '75:30'
    (75 minutes 30 seconds) are accepted.

    Args:
        time_str: The time string to parse.

    Returns:
        The total number of seconds as a float.

    Raises:
        ValueError: If the string does not match a supported format.
    """
    try:
        # Split off the optional millisecond suffix
        time_part, comma, ms_part = time_str.partition(',')
        ms = int(ms_part) / 1000 if comma else 0.0

        fields = time_part.split(':')
        if len(fields) > 3:
            raise ValueError(f"too many ':' separated fields in {time_part!r}")

        # Last field is seconds; the ones before it are minutes, then hours.
        # Plain arithmetic is used instead of strptime, which would reject
        # minutes above 59 and hours above 23.
        *leading, last = fields
        seconds = float(last)
        for unit, field in zip((60, 3600), reversed(leading)):
            seconds += int(field) * unit

        return seconds + ms
    except ValueError as e:
        logger.error(f"时间格式错误: {time_str}")
        raise ValueError("时间格式错误支持的格式HH:MM:SS,mmm 或 MM:SS 或 SS") from e
def format_timestamp(seconds: float) -> str:
    """
    Format a number of seconds as a readable 'HH:MM:SS,mmm' string.

    Args:
        seconds: Time in seconds; the fractional part becomes milliseconds.

    Returns:
        The formatted time string, e.g. 83.456 -> '00:01:23,456'.
    """
    # Round once to whole milliseconds. Truncating the fractional part instead
    # loses a millisecond to float error (1.001 would render as ',000').
    total_ms = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
"""
保存剪辑后的视频
Args:
timestamp: 需要裁剪的单个时间戳支持两种格式
1. '00:36-00:40' (:-:)
2. 'SS-SS' (-)
timestamp: 需要裁剪的单个时间戳支持格式
1. 'HH:MM:SS,mmm-HH:MM:SS,mmm' (::,毫秒)
2. 'MM:SS-MM:SS' (:-:)
3. 'SS-SS' (-)
origin_video: 原视频路径
save_dir: 存储目录
@ -293,7 +318,7 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
if not os.path.exists(save_dir):
os.makedirs(save_dir)
video_id = f"vid-{timestamp.replace(':', '_')}"
video_id = f"vid-{timestamp.replace(':', '_').replace(',', '-')}"
video_path = f"{save_dir}/{video_id}.mp4"
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
@ -312,12 +337,12 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
# 验证时间段是否有效
if start >= total_duration:
logger.warning(f"起始时间 {format_timestamp(start)} ({start:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒)")
logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
video.close()
return {}
if end > total_duration:
logger.warning(f"结束时间 {format_timestamp(end)} ({end:.2f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.2f}秒),将自动调整为视频结尾")
logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒),将自动调整为视频结尾")
end = total_duration
if end <= start:
@ -332,7 +357,15 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
try:
# 检查视频是否有音频轨道并写入文件
subclip.write_videofile(video_path, audio=(subclip.audio is not None), logger=None)
subclip.write_videofile(
video_path,
codec='libx264',
audio_codec='aac',
temp_audiofile='temp-audio.m4a',
remove_temp=True,
audio=(subclip.audio is not None),
logger=None
)
# 验证生成的视频文件
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:

View File

@ -206,134 +206,14 @@ def generate_final_videos(
return final_video_paths, combined_video_paths
def start(task_id, params: VideoParams, stop_at: str = "video"):
    """
    Run the video generation pipeline for one task, optionally stopping early.

    The stages run in order: script, terms, audio, subtitle, materials, final
    videos. Task state and progress are reported through ``sm.state`` after
    each stage.

    Args:
        task_id: ID of the task whose state is updated.
        params: Video parameters passed to every stage.
        stop_at: Stage after which to stop: "script", "terms", "audio",
            "subtitle" or "materials"; any other value runs to the end.

    Returns:
        A dict with the results produced up to the stop point, or None when a
        stage fails (the task is then marked as failed).
    """

    def mark_failed():
        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)

    def mark_progress(percent):
        sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=percent)

    def mark_complete(**results):
        sm.state.update_task(
            task_id, state=const.TASK_STATE_COMPLETE, progress=100, **results
        )

    logger.info(f"start task: {task_id}, stop_at: {stop_at}")
    mark_progress(5)

    # The concat mode may arrive as a plain string; convert it to the enum
    if type(params.video_concat_mode) is str:
        params.video_concat_mode = VideoConcatMode(params.video_concat_mode)

    # Stage 1: script
    video_script = generate_script(task_id, params)
    if not video_script:
        mark_failed()
        return

    mark_progress(10)

    if stop_at == "script":
        mark_complete(script=video_script)
        return {"script": video_script}

    # Stage 2: terms (skipped for local video sources)
    video_terms = ""
    if params.video_source != "local":
        video_terms = generate_terms(task_id, params, video_script)
        if not video_terms:
            mark_failed()
            return

    save_script_data(task_id, video_script, video_terms, params)

    if stop_at == "terms":
        mark_complete(terms=video_terms)
        return {"script": video_script, "terms": video_terms}

    mark_progress(20)

    # Stage 3: audio
    audio_file, audio_duration, sub_maker = generate_audio(task_id, params, video_script)
    if not audio_file:
        mark_failed()
        return

    mark_progress(30)

    if stop_at == "audio":
        mark_complete(audio_file=audio_file)
        return {"audio_file": audio_file, "audio_duration": audio_duration}

    # Stage 4: subtitle
    subtitle_path = generate_subtitle(task_id, params, video_script, sub_maker, audio_file)

    if stop_at == "subtitle":
        mark_complete(subtitle_path=subtitle_path)
        return {"subtitle_path": subtitle_path}

    mark_progress(40)

    # Stage 5: video materials
    downloaded_videos = get_video_materials(
        task_id, params, video_terms, audio_duration
    )
    if not downloaded_videos:
        mark_failed()
        return

    if stop_at == "materials":
        mark_complete(materials=downloaded_videos)
        return {"materials": downloaded_videos}

    mark_progress(50)

    # Stage 6: final videos
    final_video_paths, combined_video_paths = generate_final_videos(
        task_id, params, downloaded_videos, audio_file, subtitle_path
    )
    if not final_video_paths:
        mark_failed()
        return

    logger.success(
        f"task {task_id} finished, generated {len(final_video_paths)} videos."
    )

    kwargs = {
        "videos": final_video_paths,
        "combined_videos": combined_video_paths,
        "script": video_script,
        "terms": video_terms,
        "audio_file": audio_file,
        "audio_duration": audio_duration,
        "subtitle_path": subtitle_path,
        "materials": downloaded_videos,
    }
    mark_complete(**kwargs)
    return kwargs
def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
"""
后台任务自动剪辑视频进行剪辑
task_id: 任务ID
params: 剪辑参数
subclip_path_videos: 视频文件路径
"""
"""后台任务(自动剪辑视频进行剪辑)"""
logger.info(f"\n\n## 开始任务: {task_id}")
# 初始化 ImageMagick
if not utils.init_imagemagick():
logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
# tts 角色名称
@ -341,8 +221,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.info("\n\n## 1. 加载视频脚本")
video_script_path = path.join(params.video_clip_json_path)
# video_script_path = video_clip_json_path
# 判断json文件是否存在
if path.exists(video_script_path):
try:
with open(video_script_path, "r", encoding="utf-8") as f:
@ -355,10 +234,12 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.debug(f"解说完整脚本: \n{video_script}")
logger.debug(f"解说 OST 列表: \n{video_ost}")
logger.debug(f"解说时间戳列表: \n{time_list}")
# 获取视频总时长(单位 s)
total_duration = list_script[-1]['new_timestamp']
total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int(
total_duration.split("-")[1].split(":")[1])
last_timestamp = list_script[-1]['new_timestamp']
end_time = last_timestamp.split("-")[1]
total_duration = utils.time_to_seconds(end_time)
except Exception as e:
logger.error(f"无法读取视频json脚本请检查配置是否正确。{e}")
raise ValueError("无法读取视频json脚本请检查配置是否正确")
@ -366,32 +247,51 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
raise ValueError("解说脚本不存在!请检查配置是否正确。")
logger.info("\n\n## 2. 生成音频列表")
audio_files, sub_maker_list = voice.tts_multiple(
task_id=task_id,
list_script=list_script,
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
force_regenerate=True
logger.info("\n\n## 2. 根据OST设置生成音频列表")
# 只为OST=0或2的片段生成TTS音频
tts_segments = [
segment for segment in list_script
if segment['OST'] in [0, 2]
]
logger.debug(f"tts_segments: {tts_segments}")
if tts_segments:
audio_files, sub_maker_list = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
force_regenerate=True
)
if audio_files is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
return
else:
audio_files = []
logger.info(f"合并音频文件:\n{audio_files}")
# 传入OST信息以便正确处理音频
final_audio = audio_merger.merge_audio_files(
task_id=task_id,
audio_files=audio_files,
total_duration=total_duration,
list_script=list_script # 传入完整脚本以便处理OST
)
if audio_files is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
return
logger.info(f"合并音频:\n\n {audio_files}")
audio_file = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
# 只为OST=0或2的片段生成字幕
subtitle_path = ""
if params.subtitle_enabled:
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
# 使用 faster-whisper-large-v2 模型生成字幕
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
subtitle.create(
audio_file=final_audio,
subtitle_file=subtitle_path,
)
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
@ -434,14 +334,15 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}")
# 把所有东西合到在一起
logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
# 传入OST信息以便正确处理音频和视频
video.generate_video_v2(
video_path=combined_video_path,
audio_path=audio_file,
audio_path=final_audio,
subtitle_path=subtitle_path,
output_file=final_video_path,
params=params,
list_script=list_script # 传入完整脚本以便处理OST
)
_progress += 50 / 2

View File

@ -173,7 +173,7 @@ def wrap_text(text, max_width, font, fontsize=60):
if width <= max_width:
return text, height
logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 本: {text}")
logger.debug(f"换行文本, 最大宽度: {max_width}, 文本宽度: {width}, 本: {text}")
processed = True
@ -228,131 +228,93 @@ def manage_clip(clip):
def generate_video_v2(
video_path: str,
audio_path: str,
subtitle_path: str,
output_file: str,
params: Union[VideoParams, VideoClipParams],
progress_callback=None,
video_path: str,
audio_path: str,
subtitle_path: str,
output_file: str,
params: VideoClipParams,
list_script: list = None
):
"""
合并所有素材
生成最终视频处理音频和字幕
Args:
video_path: 视频路径
audio_path: 单个音频文件路径
video_path: 视频文件路径
audio_path: 音频文件路径
subtitle_path: 字幕文件路径
output_file: 输出文件路径
params: 视频参数
progress_callback: 进度回调函数接收 0-100 的进度值
Returns:
list_script: 视频脚本列表包含OST设置
"""
total_steps = 4
current_step = 0
def update_progress(step_name):
nonlocal current_step
current_step += 1
if progress_callback:
progress_callback(int(current_step * 100 / total_steps))
logger.info(f"完成步骤: {step_name}")
try:
validate_params(video_path, audio_path, output_file, params)
video_clip = VideoFileClip(video_path)
with manage_clip(VideoFileClip(video_path)) as video_clip:
aspect = VideoAspect(params.video_aspect)
video_width, video_height = aspect.to_resolution()
logger.info(f"开始,视频尺寸: {video_width} x {video_height}")
logger.info(f" ① 视频: {video_path}")
logger.info(f" ② 音频: {audio_path}")
logger.info(f" ③ 字幕: {subtitle_path}")
logger.info(f" ④ 输出: {output_file}")
output_dir = os.path.dirname(output_file)
update_progress("初始化完成")
# 字体设置
font_path = ""
if params.subtitle_enabled:
if not params.font_name:
params.font_name = "STHeitiMedium.ttc"
font_path = os.path.join(utils.font_dir(), params.font_name)
if os.name == "nt":
font_path = font_path.replace("\\", "/")
logger.info(f"使用字体: {font_path}")
def create_text_clip(subtitle_item):
phrase = subtitle_item[1]
max_width = video_width * 0.9
wrapped_txt, txt_height = wrap_text(
phrase, max_width=max_width, font=font_path, fontsize=params.font_size
)
_clip = TextClip(
wrapped_txt,
font=font_path,
fontsize=params.font_size,
color=params.text_fore_color,
bg_color=params.text_background_color,
stroke_color=params.stroke_color,
stroke_width=params.stroke_width,
print_cmd=False,
)
duration = subtitle_item[0][1] - subtitle_item[0][0]
_clip = _clip.set_start(subtitle_item[0][0])
_clip = _clip.set_end(subtitle_item[0][1])
_clip = _clip.set_duration(duration)
if params.subtitle_position == "bottom":
_clip = _clip.set_position(("center", video_height * 0.95 - _clip.h))
elif params.subtitle_position == "top":
_clip = _clip.set_position(("center", video_height * 0.05))
elif params.subtitle_position == "custom":
margin = 10
max_y = video_height - _clip.h - margin
min_y = margin
custom_y = (video_height - _clip.h) * (params.custom_position / 100)
custom_y = max(min_y, min(custom_y, max_y))
_clip = _clip.set_position(("center", custom_y))
else: # center
_clip = _clip.set_position(("center", "center"))
return _clip
update_progress("字体设置完成")
# 处理音频
original_audio = video_clip.audio
video_duration = video_clip.duration
new_audio = AudioFileClip(audio_path)
final_audio = process_audio_tracks(original_audio, new_audio, params, video_duration)
update_progress("音频处理完成")
# 处理字幕
if subtitle_path and os.path.exists(subtitle_path):
video_clip = process_subtitles(subtitle_path, video_clip, video_duration, create_text_clip)
update_progress("字幕处理完成")
# 合并音频和导出
video_clip = video_clip.set_audio(final_audio)
video_clip.write_videofile(
output_file,
audio_codec="aac",
temp_audiofile=os.path.join(output_dir, "temp-audio.m4a"),
threads=params.n_threads,
logger=None,
fps=30,
)
# 处理音频
if audio_path and os.path.exists(audio_path):
audio_clip = AudioFileClip(audio_path)
except FileNotFoundError as e:
logger.error(f"文件不存在: {str(e)}")
raise
if list_script:
# 根据OST设置处理音频
# OST=0: 只使用TTS音频
# OST=1: 只使用视频原声
# OST=2: 混合TTS音频和视频原声
original_audio = video_clip.audio
# 设置音频音量
tts_volume = params.tts_volume if hasattr(params, 'tts_volume') else 1.0
video_volume = params.video_volume if hasattr(params, 'video_volume') else 0.1
# 创建最终音频
if original_audio:
# 有些片段需要原声有些需要TTS
final_audio = CompositeAudioClip([
audio_clip.volumex(tts_volume), # TTS音频
original_audio.volumex(video_volume) # 原声音频
])
else:
final_audio = audio_clip.volumex(tts_volume)
else:
# 如果没有OST设置使用默认行为
final_audio = audio_clip
video_clip = video_clip.set_audio(final_audio)
# 处理字幕
if subtitle_path and os.path.exists(subtitle_path):
# 添加字幕
video_clip = add_subtitles(
video_clip,
subtitle_path,
params.font_size,
params.font_name,
params.text_fore_color,
params.subtitle_position,
params.stroke_color,
params.stroke_width
)
# 写入最终视频文件
video_clip.write_videofile(
output_file,
codec="libx264",
audio_codec="aac",
temp_audiofile="temp-audio.m4a",
remove_temp=True,
threads=params.n_threads
)
except Exception as e:
logger.error(f"视频生成失败: {str(e)}")
raise
logger.error(f"生成视频时发生错误: {str(e)}")
raise e
finally:
logger.success("完成")
# 清理资源
if 'video_clip' in locals():
video_clip.close()
if 'audio_clip' in locals():
audio_clip.close()
if 'final_audio' in locals():
final_audio.close()
def process_audio_tracks(original_audio, new_audio, params, video_duration):
@ -389,7 +351,7 @@ def process_subtitles(subtitle_path, video_clip, video_duration, create_text_cli
for item in sub.subtitles:
clip = create_text_clip(subtitle_item=item)
# 时间范围调整
# 时间范围调整
start_time = max(clip.start, 0)
if start_time >= video_duration:
continue
@ -450,12 +412,12 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
def combine_clip_videos(combined_video_path: str,
video_paths: List[str],
video_ost_list: List[int],
list_script: list,
video_aspect: VideoAspect = VideoAspect.portrait,
threads: int = 2,
) -> str:
video_paths: List[str],
video_ost_list: List[int],
list_script: list,
video_aspect: VideoAspect = VideoAspect.portrait,
threads: int = 2,
) -> str:
"""
合并子视频
Args:
@ -469,9 +431,18 @@ def combine_clip_videos(combined_video_path: str,
Returns:
str: 合并后的视频路径
"""
from app.utils.utils import calculate_total_duration
audio_duration = calculate_total_duration(list_script)
logger.info(f"音频的最大持续时间: {audio_duration} s")
# 计算总时长时需要考虑毫秒精度
total_duration = 0.0
for item in list_script:
timestamp = item.get('new_timestamp', '')
if timestamp:
start_str, end_str = timestamp.split('-')
start_time = utils.time_to_seconds(start_str)
end_time = utils.time_to_seconds(end_str)
duration = end_time - start_time
total_duration += duration
logger.info(f"音频的最大持续时间: {total_duration:.3f} s")
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
@ -480,11 +451,17 @@ def combine_clip_videos(combined_video_path: str,
clips = []
for video_path, video_ost in zip(video_paths, video_ost_list):
try:
# 加载视频片段
clip = VideoFileClip(video_path)
# 根据OST设置处理音频
if video_ost == 0: # 不保留原声
clip = clip.without_audio()
# video_ost 为 1 或 2 时都保留原声,不需要特殊处理
elif video_ost == 1: # 只保留原声
# 保持原声,但可能需要调整音量
if clip.audio:
clip = clip.set_audio(clip.audio.volumex(1.0)) # 可以调整音量系数
# OST == 2 的情况会在后续处理中混合音频
clip = clip.set_fps(30)
@ -498,6 +475,16 @@ def combine_clip_videos(combined_video_path: str,
)
logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")
# 精确控制视频时长
filename = os.path.basename(video_path)
timestamp = extract_timestamp_from_filename(filename)
if timestamp:
start_time, end_time = timestamp
clip_duration = end_time - start_time
if abs(clip.duration - clip_duration) > 0.1: # 允许0.1秒的误差
logger.warning(f"视频 {video_path} 时长与时间戳不匹配,进行调整")
clip = clip.set_duration(clip_duration)
clips.append(clip)
except Exception as e:
@ -508,6 +495,7 @@ def combine_clip_videos(combined_video_path: str,
raise ValueError("没有有效的视频片段可以合并")
try:
# 合并所有视频片段
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
@ -521,7 +509,7 @@ def combine_clip_videos(combined_video_path: str,
temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
)
finally:
# 确保资源被正确释放
# 确保资源被正确释放
video_clip.close()
for clip in clips:
clip.close()
@ -530,6 +518,59 @@ def combine_clip_videos(combined_video_path: str,
return combined_video_path
def extract_timestamp_from_filename(filename: str) -> tuple:
    """
    Extract the start and end time (in seconds) encoded in a clip file name.

    The name must look like "vid-<start>-<end>.mp4", where '_' stands in for
    ':' inside each time. Supported layouts:
        - "vid-00_06,500-00_24,800.mp4"       (milliseconds after a comma)
        - "vid-00_00_00-020-00_00_10-400.mp4" -> (0.02, 10.4)
        - "vid-00_06-500-00_24-800.mp4"       -> (6.5, 24.8)
        - "vid-00_06-00_24.mp4"               (no milliseconds)

    Args:
        filename: Base name of the clip file.

    Returns:
        A (start_seconds, end_seconds) tuple, or None when the name does not
        match or cannot be parsed.
    """

    def _to_seconds(time_token: str, ms_token: str = None) -> float:
        # Convert one "<time>" token, plus optional milliseconds, to seconds
        seconds = utils.time_to_seconds(time_token.replace('_', ':'))
        if ms_token is not None:
            seconds += int(ms_token) / 1000
        return seconds

    try:
        # Pull out the part between "vid-" and ".mp4"
        match = re.search(r'vid-(.+?)\.mp4$', filename)
        if not match:
            logger.warning(f"文件名格式不正确: {filename}")
            return None

        timestamp = match.group(1)

        if timestamp.count('-') == 3:
            # <start>-<ms>-<end>-<ms>: the milliseconds are always split off
            # here, whether the time part is HH_MM_SS or MM_SS. Passing
            # "MM:SS-mmm" on to time_to_seconds would fail to parse.
            start_time, start_ms, end_time, end_ms = timestamp.split('-')
            start_seconds = _to_seconds(start_time, start_ms)
            end_seconds = _to_seconds(end_time, end_ms)
        else:
            # Simple layout: <start>-<end>
            start_str, end_str = timestamp.split('-')
            start_seconds = _to_seconds(start_str)
            end_seconds = _to_seconds(end_str)

        logger.debug(f"从文件名 {filename} 提取时间戳: {start_seconds:.3f} - {end_seconds:.3f}")
        return start_seconds, end_seconds

    except Exception as e:
        logger.error(f"从文件名提取时间戳失败 {filename}: {str(e)}\n{traceback.format_exc()}")
        return None
def resize_video_with_padding(clip, target_width: int, target_height: int):
"""辅助函数:调整视频尺寸并添加黑边"""
clip_ratio = clip.w / clip.h
@ -574,6 +615,71 @@ def validate_params(video_path, audio_path, output_file, params):
raise ValueError("params 缺少必要参数 video_aspect")
def add_subtitles(video_clip, subtitle_path, font_size, font_name, font_color, position, shadow_color, shadow_offset):
    """
    Overlay the subtitles from a subtitle file onto a video clip.

    Args:
        video_clip: The video clip to add subtitles to.
        subtitle_path: Path of the subtitle file.
        font_size: Font size.
        font_name: Font file name, looked up in the project's font directory;
            falls back to "STHeitiMedium.ttc" when empty.
        font_color: Text colour.
        position: Subtitle position: 'top', 'center', or anything else for
            the bottom.
        shadow_color: Passed to TextClip as the stroke colour.
        shadow_offset: Passed to TextClip as the stroke width.

    Returns:
        The clip with subtitles composited on top, or the original clip
        unchanged when adding subtitles fails.
    """
    try:
        # Resolve the font file; an empty name would make the path join fail
        font_path = os.path.join(utils.font_dir(), font_name or "STHeitiMedium.ttc")
        if not os.path.exists(font_path):
            logger.error(f"字体文件不存在: {font_path}")
            # Fall back to a system default font
            font_path = "Arial" if os.name == 'nt' else "/System/Library/Fonts/STHeiti Light.ttc"
            logger.info(f"使用默认字体: {font_path}")

        # Choose where the subtitles sit in the frame
        if position == "top":
            pos = ("center", 50)
        elif position == "center":
            pos = "center"
        else:  # bottom
            # A numeric y is an absolute offset of the text's top edge, so a
            # negative value would push it above the frame. Anchor to the
            # bottom edge instead.
            pos = ("center", "bottom")

        # Wrap subtitle text at 90% of the frame width, in whole pixels
        max_text_width = int(video_clip.w * 0.9)

        def subtitle_generator(txt):
            return TextClip(
                txt,
                fontsize=font_size,
                font=font_path,
                color=font_color,
                stroke_color=shadow_color,
                stroke_width=shadow_offset,
                method='caption',
                size=(max_text_width, None)
            )

        subtitles = SubtitlesClip(
            subtitle_path,
            subtitle_generator
        )

        # Composite the subtitles over the video
        video_with_subtitles = CompositeVideoClip([
            video_clip,
            subtitles.set_position(pos)
        ])

        return video_with_subtitles

    except Exception as e:
        logger.error(f"添加字幕时出错: {str(e)}\n{traceback.format_exc()}")
        # Best effort: keep the video even when subtitles cannot be added
        return video_clip
if __name__ == "__main__":
# combined_video_path = "../../storage/tasks/12312312/com123.mp4"
#
@ -586,7 +692,7 @@ if __name__ == "__main__":
# {
# "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶",
# "timestamp": "00:00-00:03",
# "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!",
# "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!",
# "OST": False,
# "new_timestamp": "00:00-00:03"
# },

View File

@ -1,5 +1,5 @@
"""
使用 moviepy 库剪辑指定时间戳视频
使用 moviepy 库剪辑指定时间戳视频支持时分秒毫秒精度
"""
from moviepy.editor import VideoFileClip
@ -11,12 +11,22 @@ def time_str_to_seconds(time_str: str) -> float:
"""
将时间字符串转换为秒数
参数:
time_str: 格式为"MM:SS"的时间字符串
time_str: 格式为"HH:MM:SS,mmm"的时间字符串例如"00:01:23,456"
返回:
转换后的秒数
转换后的秒数(float)
"""
time_obj = datetime.strptime(time_str, "%M:%S")
return time_obj.minute * 60 + time_obj.second
try:
# 分离时间和毫秒
time_part, ms_part = time_str.split(',')
# 转换时分秒
time_obj = datetime.strptime(time_part, "%H:%M:%S")
# 计算总秒数
total_seconds = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second
# 添加毫秒部分
total_seconds += int(ms_part) / 1000
return total_seconds
except ValueError as e:
raise ValueError("时间格式错误,请使用 HH:MM:SS,mmm 格式,例如 00:01:23,456") from e
def format_duration(seconds: float) -> str:
@ -25,11 +35,15 @@ def format_duration(seconds: float) -> str:
参数:
seconds: 秒数
返回:
格式化的时间字符串 (MM:SS)
格式化的时间字符串 (HH:MM:SS,mmm)
"""
minutes = int(seconds // 60)
remaining_seconds = int(seconds % 60)
return f"{minutes:02d}:{remaining_seconds:02d}"
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
seconds_remain = seconds % 60
whole_seconds = int(seconds_remain)
milliseconds = int((seconds_remain - whole_seconds) * 1000)
return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
def cut_video(video_path: str, start_time: str, end_time: str, output_path: str) -> None:
@ -37,8 +51,8 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
剪辑视频
参数:
video_path: 视频文件路径
start_time: 开始时间 (格式: "MM:SS")
end_time: 结束时间 (格式: "MM:SS")
start_time: 开始时间 (格式: "HH:MM:SS,mmm")
end_time: 结束时间 (格式: "HH:MM:SS,mmm")
output_path: 输出文件路径
"""
try:
@ -62,10 +76,18 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
# 加载视频文件
video = VideoFileClip(video_path)
# 验证时间范围
if start_seconds >= video.duration or end_seconds > video.duration:
raise ValueError(f"剪辑时间超出视频长度!视频总长度为: {format_duration(video.duration)}")
if start_seconds >= end_seconds:
raise ValueError("结束时间必须大于开始时间!")
# 计算剪辑时长
clip_duration = end_seconds - start_seconds
print(f"原视频总长度: {format_duration(video.duration)}")
print(f"剪辑时长: {format_duration(clip_duration)}")
print(f"剪辑区间: {start_time} -> {end_time}")
# 剪辑视频
video = video.subclip(start_seconds, end_seconds)
@ -92,6 +114,9 @@ def cut_video(video_path: str, start_time: str, end_time: str, output_path: str)
if __name__ == "__main__":
# cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "00:00", "07:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-1")
# cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "07:00", "14:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-2")
cut_video("E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\ca4fee22-350b-47f9-bb2f-802ad96774f7\\final-2.mp4", "14:00", "22:00", "E:\\NarratoAI_v0.3.5_cuda\\NarratoAI\storage\\tasks\\yyjx2-3")
cut_video(
video_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp.mp4",
start_time="00:00:00,789",
end_time="00:02:00,123",
output_path="/Users/apple/Desktop/NarratoAI/resource/videos/duanju_yuansp_cut3.mp4"
)

View File

@ -2,11 +2,23 @@ import os
import traceback
import json
from openai import OpenAI
from test_moviepy import cut_video
from pydantic import BaseModel
from typing import List
from app.utils import utils
from app.services.subtitle import extract_audio_and_create_subtitle
class Step(BaseModel):
    """One entry of the clip script, as a pydantic model.

    The field names mirror the JSON sample embedded in this script's
    system prompt.
    """

    # Time range in the source video, e.g. "00:00:50,020-00:01:44,000"
    # (assumed from the prompt sample -- TODO confirm against callers).
    timestamp: str
    # Description of what is on screen for this segment.
    picture: str
    # Narration text for this segment.
    narration: str
    # Original-soundtrack flag; the meaning of each value is not defined in
    # this file -- verify against the audio merging code.
    OST: int
    # Time range of the segment in the edited output, same format as
    # `timestamp` (assumed from the prompt sample).
    new_timestamp: str
class MathReasoning(BaseModel):
    """Top-level response container: an ordered list of script steps."""

    # Parsed script segments, one Step per clip.
    result: List[Step]
def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
"""
与通义千问AI模型进行对话
@ -23,7 +35,7 @@ def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
"""
try:
client = OpenAI(
api_key="sk-",
api_key="sk-a1acd853d88d41d3ae92777d7bfa2612",
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
@ -50,25 +62,25 @@ def chat_with_qwen(prompt: str, system_message: str, subtitle_path: str) -> str:
# 使用示例
if __name__ == "__main__":
try:
# video_path = utils.video_dir("duanju_yuansp.mp4")
video_path = utils.video_dir("duanju_yuansp.mp4")
# # 判断视频是否存在
# if not os.path.exists(video_path):
# print(f"视频文件不存在:{video_path}")
# exit(1)
# 提取字幕
subtitle_path = os.path.join(utils.video_dir(""), f"duanju_yuan.srt")
# extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
extract_audio_and_create_subtitle(video_file=video_path, subtitle_file=subtitle_path)
# 分析字幕
system_message = """
你是一个视频srt字幕分析剪辑器, 输入视频的srt字幕, 分析其中的精彩且尽可能连续的片段并裁剪出来, 注意确保文字与时间戳的正确匹配
输出需严格按照如下 json 格式:
输出需严格按照如下 json 格式:
[
{
"timestamp": "00:50-01:44",
"timestamp": "00:00:50,020-00,01:44,000",
"picture": "画面1",
"narration": "播放原声",
"OST": 0,
"new_timestamp": "00:00-00:54"
"new_timestamp": "00:00:00,000-00:00:54,020"
},
{
"timestamp": "01:49-02:30",

View File

@ -40,7 +40,7 @@ def to_json(obj):
# 如果对象是二进制数据转换为base64编码的字符串
elif isinstance(o, bytes):
return "*** binary data ***"
# 如果象是字典,递归处理每个键值对
# 如果对象是字典,递归处理每个键值对
elif isinstance(o, dict):
return {k: serialize(v) for k, v in o.items()}
# 如果对象是列表或元组,递归处理每个元素
@ -302,15 +302,49 @@ def get_current_country():
def time_to_seconds(time_str: str) -> float:
    """
    Convert a timestamp string to a number of seconds.

    Supported formats (the millisecond suffix is optional):
        - "HH:MM:SS,mmm" -> hours:minutes:seconds,milliseconds
        - "MM:SS,mmm"    -> minutes:seconds,milliseconds
        - "SS,mmm"       -> seconds,milliseconds
        - "SS-mmm"       -> seconds-milliseconds

    Because '-' is treated as a millisecond separator, a time *range* such
    as "00:00-00:54" must be split by the caller before calling this.

    Args:
        time_str: Timestamp string in one of the formats above.

    Returns:
        float: The time in seconds (including the millisecond fraction).
        Returns 0.0 and logs an error if the string cannot be parsed.
    """
    try:
        # Split off the millisecond suffix; '-' takes precedence over ','.
        # Unpacking into two names rejects strings with repeated separators.
        if '-' in time_str:
            time_part, ms_part = time_str.split('-')
            ms = float(ms_part) / 1000
        elif ',' in time_str:
            time_part, ms_part = time_str.split(',')
            ms = float(ms_part) / 1000
        else:
            time_part = time_str
            ms = 0

        parts = time_part.split(':')
        if len(parts) > 3:
            # More than HH:MM:SS is not a valid timestamp; do not silently
            # interpret only the first field as seconds.
            raise ValueError(f"Invalid time format: {time_str}")

        if len(parts) == 3:  # HH:MM:SS
            h, m, s = map(float, parts)
            seconds = h * 3600 + m * 60 + s
        elif len(parts) == 2:  # MM:SS
            m, s = map(float, parts)
            seconds = m * 60 + s
        else:  # SS
            seconds = float(parts[0])

        return seconds + ms

    except (ValueError, IndexError) as e:
        # Best-effort contract: report the problem and fall back to 0.0.
        logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
        return 0.0
def seconds_to_time(seconds: float) -> str:
@ -520,3 +554,21 @@ def download_font(url: str, font_path: str):
except Exception as e:
logger.error(f"下载字体文件失败: {e}")
raise
def init_imagemagick():
    """Probe for the ImageMagick CLI and register its binary name.

    Runs ``magick -version``; on a zero exit code the environment variable
    ``IMAGEMAGICK_BINARY`` is set to ``'magick'``.

    Returns:
        bool: True when the probe succeeded and the variable was set,
        False when the command exited non-zero or any exception occurred
        (both cases are logged).
    """
    try:
        import subprocess

        # Ask the CLI for its version to find out whether it is callable.
        probe = subprocess.run(['magick', '-version'], capture_output=True, text=True)
        if probe.returncode == 0:
            # Presumably read later by the video library -- verify against
            # the consumer of IMAGEMAGICK_BINARY.
            os.environ['IMAGEMAGICK_BINARY'] = 'magick'
            return True

        logger.error("ImageMagick 未安装或配置不正确")
        return False
    except Exception as e:
        logger.error(f"初始化 ImageMagick 失败: {str(e)}")
        return False

View File

@ -93,10 +93,8 @@ class VideoPipeline:
response.raise_for_status()
return response.json()
def save_script_to_json(self, script: list, script_name: str) -> str:
"""保存脚本到json文件"""
script_path = f"E:\\projects\\NarratoAI\\resource\\scripts\\{script_name}.json"
def save_script_to_json(self, script: list, script_path: str) -> str:
"""保存脚本到json文件"""
try:
with open(script_path, 'w', encoding='utf-8') as f:
json.dump(script, f, ensure_ascii=False, indent=2)
@ -133,8 +131,7 @@ class VideoPipeline:
# 2.2 保存脚本到json文件
print("保存脚本到json文件...")
script_path = self.save_script_to_json(script, script_name)
script_result["script_path"] = script_path
self.save_script_to_json(script=script, script_path=script_path)
# 3. 剪辑视频
print("开始剪辑视频...")
@ -143,7 +140,7 @@ class VideoPipeline:
# 4. 生成最终视频
print("开始生成最终视频...")
final_result = self.generate_final_video(
self.generate_final_video(
task_id=task_id,
video_path=video_path,
script_path=script_path,

View File

@ -369,4 +369,6 @@ output_path和script参数需要传递给请求3
}
}
subclip_videos和 output_path和script参数需要传递给请求4
最后完成工作流
最后完成工作流
0 代表只播放文案音频,禁用视频原声;1 代表只播放视频原声,不需要播放文案音频和字幕;2 代表既播放文案音频,也要播放视频原声。