剪辑逻辑进度60%;

待优化点:
1. 生成字幕逻辑优化
2. 文案解说的时间和脚本时间的优化
This commit is contained in:
linyq 2024-09-28 17:10:43 +08:00
parent e440dc619f
commit 7b3014ad42
6 changed files with 109 additions and 81 deletions

View File

@ -353,7 +353,7 @@ class VideoClipParams(BaseModel):
bgm_file: Optional[str] = Field(default="", description="背景音乐文件") bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量") bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量")
subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕") subtitle_enabled: Optional[bool] = Field(default=False, description="是否启用字幕")
subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center
font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称") font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称")
text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色") text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色")
@ -364,5 +364,5 @@ class VideoClipParams(BaseModel):
stroke_width: float = Field(default=1.5, description="文字描边宽度") stroke_width: float = Field(default=1.5, description="文字描边宽度")
custom_position: float = Field(default=70.0, description="自定义位置") custom_position: float = Field(default=70.0, description="自定义位置")
# n_threads: Optional[int] = 2 # 线程数 n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度
# paragraph_number: Optional[int] = 1 # 段落数量 # paragraph_number: Optional[int] = 1 # 段落数量

View File

@ -1,9 +1,10 @@
import os import os
import json
import subprocess import subprocess
import edge_tts import edge_tts
from edge_tts import submaker from edge_tts import submaker
from pydub import AudioSegment from pydub import AudioSegment
from typing import List from typing import List, Dict
from loguru import logger from loguru import logger
from app.utils import utils from app.utils import utils
@ -17,12 +18,13 @@ def check_ffmpeg():
return False return False
def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int): def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list):
""" """
合并多个音频文件到一个指定总时长的音频文件中 合并多个音频文件到一个指定总时长的音频文件中并生成相应的字幕
:param task_id: 任务ID
:param audio_file_paths: 音频文件路径列表 :param audio_file_paths: 音频文件路径列表
:param total_duration: 最终音频文件的总时长 :param total_duration: 最终音频文件的总时长
:param video_script: JSON格式的视频脚本
""" """
output_dir = utils.task_dir(task_id) output_dir = utils.task_dir(task_id)
@ -35,6 +37,17 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration:
# 创建SubMaker对象 # 创建SubMaker对象
sub_maker = edge_tts.SubMaker() sub_maker = edge_tts.SubMaker()
# 解析JSON格式的video_script
script_data = video_script
for segment in script_data:
start_time, end_time = parse_timestamp(segment['new_timestamp'])
duration = (end_time - start_time) * 1000 # 转换为毫秒
if not segment['OST']:
# 如果不是原声则添加narration作为字幕
sub_maker.create_sub((start_time * 1000, duration), segment['narration'])
for audio_path in audio_file_paths: for audio_path in audio_file_paths:
if not os.path.exists(audio_path): if not os.path.exists(audio_path):
logger.info(f"警告:文件 {audio_path} 不存在,已跳过。") logger.info(f"警告:文件 {audio_path} 不存在,已跳过。")
@ -50,14 +63,10 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration:
except Exception as e: except Exception as e:
logger.error(f"错误:无法读取文件 {audio_path}。错误信息:{str(e)}") logger.error(f"错误:无法读取文件 {audio_path}。错误信息:{str(e)}")
continue continue
# 将音频插入到空白音频的指定位置 # 将音频插入到空白音频的指定位置
blank_audio = blank_audio.overlay(audio, position=start_time * 1000) blank_audio = blank_audio.overlay(audio, position=start_time * 1000)
# 添加字幕信息
duration = (end_time - start_time) * 1000 # 转换为毫秒
# TODO 不是 filename 需要考虑怎么把字幕文本弄过来
sub_maker.create_sub((start_time * 1000, duration), filename)
# 尝试导出为WAV格式 # 尝试导出为WAV格式
try: try:
output_file = os.path.join(output_dir, "audio.wav") output_file = os.path.join(output_dir, "audio.wav")
@ -66,7 +75,7 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration:
except Exception as e: except Exception as e:
logger.info(f"导出为WAV格式失败尝试使用MP3格式{str(e)}") logger.info(f"导出为WAV格式失败尝试使用MP3格式{str(e)}")
try: try:
output_file = "merged_audio.mp3" output_file = os.path.join(output_dir, "audio.mp3")
blank_audio.export(output_file, format="mp3", codec="libmp3lame") blank_audio.export(output_file, format="mp3", codec="libmp3lame")
logger.info(f"音频合并完成,已保存为 {output_file}") logger.info(f"音频合并完成,已保存为 {output_file}")
except Exception as e: except Exception as e:
@ -75,6 +84,10 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration:
return output_file, sub_maker return output_file, sub_maker
def parse_timestamp(timestamp: str) -> tuple:
    """Split a ``start-end`` timestamp range and convert both ends.

    The string is cut at the ``-`` separator; each half is then split on
    ``:`` and its pieces are passed positionally to ``time_to_seconds``.

    :param timestamp: range string, presumably ``"MM:SS-MM:SS"`` (the helper
        takes ``minutes, seconds``) -- TODO confirm against the script format
    :return: ``(start, end)`` as returned by ``time_to_seconds`` for each half
    """
    begin_text, finish_text = timestamp.split('-')
    begin_value = time_to_seconds(*begin_text.split(':'))
    finish_value = time_to_seconds(*finish_text.split(':'))
    return begin_value, finish_value
def extract_timestamp(filename): def extract_timestamp(filename):
"""从文件名中提取开始和结束时间戳""" """从文件名中提取开始和结束时间戳"""
@ -95,14 +108,17 @@ def time_to_seconds(minutes, seconds):
if __name__ == "__main__": if __name__ == "__main__":
# 示例用法 # 示例用法
audio_files = [ audio_files =[
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-06-00-24.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-06-00-24.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-32-00-38.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-32-00-38.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-43-00-52.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-43-00-52.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-52-01-09.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-52-01-09.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3" "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3",
] ]
total_duration = 75 total_duration = 38
video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json"
with open(video_script_path, "r", encoding="utf-8") as f:
video_script = json.load(f)
a, b = merge_audio_files("test456", audio_files, total_duration) output_file, sub_maker = merge_audio_files("test456", audio_files, total_duration, video_script)
print(a, b) print(output_file, sub_maker)

View File

@ -1,10 +1,12 @@
import json import json
import os.path import os.path
import re import re
from typing import Optional
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
from timeit import default_timer as timer from timeit import default_timer as timer
from loguru import logger from loguru import logger
import google.generativeai as genai
from app.config import config from app.config import config
from app.utils import utils from app.utils import utils
@ -278,8 +280,40 @@ def correct(subtitle_file, video_script):
logger.success("Subtitle is correct") logger.success("Subtitle is correct")
def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Optional[str] = None) -> Optional[str]:
    """Transcribe an audio file into SRT-formatted subtitles using Gemini.

    Args:
        audio_file: Path of the audio file to transcribe.
        subtitle_file: Destination path for the generated subtitle text.
            When empty, ``"<audio_file>.srt"`` is used.
        api_key: Gemini API key. No request is made when it is missing.

    Returns:
        The path of the written subtitle file, or ``None`` when the API key
        is missing or any step fails (the error is logged, not raised).
    """
    if not api_key:
        logger.error("Gemini API key is not provided")
        return None

    genai.configure(api_key=api_key)

    logger.info(f"开始使用Gemini模型处理音频文件: {audio_file}")

    model = genai.GenerativeModel(model_name="gemini-1.5-flash")
    prompt = "生成这段语音的转录文本。请以SRT格式输出包含时间戳。"

    # MIME types for inline audio, keyed by lower-cased file extension.
    audio_mime_types = {
        ".mp3": "audio/mp3",
        ".wav": "audio/wav",
        ".aiff": "audio/aiff",
        ".aac": "audio/aac",
        ".ogg": "audio/ogg",
        ".flac": "audio/flac",
    }

    try:
        with open(audio_file, "rb") as f:
            audio_data = f.read()

        # Bare bytes are not a valid content part for the SDK; the audio has
        # to be wrapped in an inline blob that declares its MIME type.
        # Unknown extensions fall back to mp3, the format this project's TTS
        # step writes.
        # NOTE(review): inline payloads are size-limited by the API; very
        # large audio files may need the File API instead -- confirm.
        extension = os.path.splitext(audio_file)[1].lower()
        audio_part = {
            "mime_type": audio_mime_types.get(extension, "audio/mp3"),
            "data": audio_data,
        }

        response = model.generate_content([prompt, audio_part])
        transcript = response.text

        if not subtitle_file:
            subtitle_file = f"{audio_file}.srt"

        with open(subtitle_file, "w", encoding="utf-8") as f:
            f.write(transcript)

        logger.info(f"Gemini生成的字幕文件已保存: {subtitle_file}")
        return subtitle_file
    except Exception as e:
        # Best-effort API: callers rely on a None result instead of a raise.
        logger.error(f"使用Gemini处理音频时出错: {e}")
        return None
if __name__ == "__main__": if __name__ == "__main__":
task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072" task_id = "task456"
task_dir = utils.task_dir(task_id) task_dir = utils.task_dir(task_id)
subtitle_file = f"{task_dir}/subtitle.srt" subtitle_file = f"{task_dir}/subtitle.srt"
audio_file = f"{task_dir}/audio.mp3" audio_file = f"{task_dir}/audio.mp3"
@ -297,3 +331,10 @@ if __name__ == "__main__":
subtitle_file = f"{task_dir}/subtitle-test.srt" subtitle_file = f"{task_dir}/subtitle-test.srt"
create(audio_file, subtitle_file) create(audio_file, subtitle_file)
# 使用Gemini模型处理音频
gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥
gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
if gemini_subtitle_file:
print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")

View File

@ -338,7 +338,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos):
# tts 角色名称 # tts 角色名称
voice_name = voice.parse_voice_name(params.voice_name) voice_name = voice.parse_voice_name(params.voice_name)
logger.info("\n\n## 1. 读取视频json脚本") logger.info("\n\n## 1. 加载视频脚本")
video_script_path = path.join(params.video_clip_json_path) video_script_path = path.join(params.video_clip_json_path)
# 判断json文件是否存在 # 判断json文件是否存在
if path.exists(video_script_path): if path.exists(video_script_path):
@ -376,7 +376,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos):
"音频文件为空可能是网络不可用。如果您在中国请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") "音频文件为空可能是网络不可用。如果您在中国请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频")
return return
logger.info("合并音频") logger.info("合并音频")
audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration) audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script)
# audio_duration = voice.get_audio_duration(sub_maker) # audio_duration = voice.get_audio_duration(sub_maker)
# audio_duration = math.ceil(audio_duration) # audio_duration = math.ceil(audio_duration)
@ -387,7 +387,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos):
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt") subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
# subtitle_fallback = False subtitle_fallback = False
if subtitle_provider == "edge": if subtitle_provider == "edge":
voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path) voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path)
# voice.create_subtitle( # voice.create_subtitle(
@ -401,7 +401,8 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos):
# logger.warning("找不到字幕文件回退到whisper") # logger.warning("找不到字幕文件回退到whisper")
# #
# if subtitle_provider == "whisper" or subtitle_fallback: # if subtitle_provider == "whisper" or subtitle_fallback:
# subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) # # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
# subtitle.create_with_gemini(audio_file=audio_file, subtitle_file=subtitle_path, api_key=config.app.get("gemini_api_key", ""))
# logger.info("\n\n## 更正字幕") # logger.info("\n\n## 更正字幕")
# subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) # subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
@ -449,7 +450,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos):
video_ost_list=video_ost, video_ost_list=video_ost,
list_script=list_script, list_script=list_script,
video_aspect=params.video_aspect, video_aspect=params.video_aspect,
threads=1 # 暂时只支持单线程 threads=params.n_threads # 多线程
) )
_progress += 50 / 2 _progress += 50 / 2
@ -461,7 +462,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos):
# 把所有东西合到在一起 # 把所有东西合到在一起
video.generate_video_v2( video.generate_video_v2(
video_path=combined_video_path, video_path=combined_video_path,
audio_paths=audio_files, audio_path=audio_file,
subtitle_path=subtitle_path, subtitle_path=subtitle_path,
output_file=final_video_path, output_file=final_video_path,
params=params, params=params,

View File

@ -294,7 +294,7 @@ def generate_video(
output_file, output_file,
audio_codec="aac", audio_codec="aac",
temp_audiofile_path=output_dir, temp_audiofile_path=output_dir,
threads=params.n_threads or 2, threads=params.n_threads,
logger=None, logger=None,
fps=30, fps=30,
) )
@ -306,7 +306,7 @@ def generate_video(
def generate_video_v2( def generate_video_v2(
video_path: str, video_path: str,
audio_paths: List[str], audio_path: str,
subtitle_path: str, subtitle_path: str,
output_file: str, output_file: str,
params: Union[VideoParams, VideoClipParams], params: Union[VideoParams, VideoClipParams],
@ -314,11 +314,11 @@ def generate_video_v2(
""" """
合并所有素材 合并所有素材
Args: Args:
video_path: video_path: 视频路径
audio_paths: audio_path: 单个音频文件路径
subtitle_path: subtitle_path: 字幕文件路径
output_file: output_file: 输出文件路径
params: params: 视频参数
Returns: Returns:
@ -328,7 +328,7 @@ def generate_video_v2(
logger.info(f"开始,视频尺寸: {video_width} x {video_height}") logger.info(f"开始,视频尺寸: {video_width} x {video_height}")
logger.info(f" ① 视频: {video_path}") logger.info(f" ① 视频: {video_path}")
logger.info(f" ② 音频文件数量: {len(audio_paths)}") logger.info(f" ② 音频: {audio_path}")
logger.info(f" ③ 字幕: {subtitle_path}") logger.info(f" ③ 字幕: {subtitle_path}")
logger.info(f" ④ 输出: {output_file}") logger.info(f" ④ 输出: {output_file}")
@ -386,40 +386,8 @@ def generate_video_v2(
original_audio = video_clip.audio # 保存原始视频的音轨 original_audio = video_clip.audio # 保存原始视频的音轨
video_duration = video_clip.duration video_duration = video_clip.duration
# 处理多个音频文件 # 处理新的音频文件
audio_clips = [] new_audio = AudioFileClip(audio_path).volumex(params.voice_volume)
for audio_path in audio_paths:
# 确保每个音频文件路径是正确的
if not os.path.exists(audio_path):
logger.warning(f"音频文件不存在: {audio_path}")
continue
# 从文件名中提取时间信息
match = re.search(r'audio_(\d{2}-\d{2}-\d{2}-\d{2})\.mp3', os.path.basename(audio_path))
if match:
time_str = match.group(1)
start, end = time_str.split('-')[:2], time_str.split('-')[2:]
start_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(start)))
end_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(end)))
audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume)
# 确保结束时间不超过音频实际长度
actual_end_time = min(end_time - start_time, audio_clip.duration)
audio_clip = audio_clip.subclip(0, actual_end_time)
audio_clip = audio_clip.set_start(start_time).set_end(start_time + actual_end_time)
audio_clips.append(audio_clip)
else:
logger.warning(f"无法从文件名解析时间信息: {audio_path}")
# 合并所有音频剪辑,包括原始音轨
if audio_clips:
audio_clips.insert(0, original_audio) # 将原始音轨添加到音频剪辑列表的开头
audio_clip = CompositeAudioClip(audio_clips)
else:
logger.warning("没有有效的音频文件,使用原始音轨")
audio_clip = original_audio
# 字幕处理部分 # 字幕处理部分
if subtitle_path and os.path.exists(subtitle_path): if subtitle_path and os.path.exists(subtitle_path):
@ -451,22 +419,29 @@ def generate_video_v2(
# 背景音乐处理部分 # 背景音乐处理部分
bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
# 合并音频轨道
audio_tracks = [original_audio, new_audio]
if bgm_file: if bgm_file:
try: try:
bgm_clip = ( bgm_clip = (
AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3) AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3)
) )
bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration) bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration)
audio_clip = CompositeAudioClip([audio_clip, bgm_clip]) audio_tracks.append(bgm_clip)
except Exception as e: except Exception as e:
logger.error(f"添加背景音乐失败: {str(e)}") logger.error(f"添加背景音乐失败: {str(e)}")
video_clip = video_clip.set_audio(audio_clip) # 合并所有音频轨道
final_audio = CompositeAudioClip(audio_tracks)
video_clip = video_clip.set_audio(final_audio)
video_clip.write_videofile( video_clip.write_videofile(
output_file, output_file,
audio_codec="aac", audio_codec="aac",
temp_audiofile_path=output_dir, temp_audiofile_path=output_dir,
threads=params.n_threads or 2, threads=params.n_threads,
logger=None, logger=None,
fps=30, fps=30,
) )
@ -607,7 +582,7 @@ def combine_clip_videos(combined_video_path: str,
video_clip = concatenate_videoclips(clips) video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30) video_clip = video_clip.set_fps(30)
logger.info(f"合并中...") logger.info(f"合并视频中...")
video_clip.write_videofile(filename=combined_video_path, video_clip.write_videofile(filename=combined_video_path,
threads=threads, threads=threads,
logger=None, logger=None,
@ -687,19 +662,14 @@ if __name__ == "__main__":
video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4" video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4"
audio_paths = ['../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3', audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3"
'../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-14-00-17.mp3',
'../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-17-00-22.mp3',
'../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-34-00-45.mp3',
'../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-59-01-09.mp3',
]
subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt" subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt"
output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4" output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4"
generate_video_v2(video_path=video_path, generate_video_v2(video_path=video_path,
audio_paths=audio_paths, audio_path=audio_path,
subtitle_path=subtitle_path, subtitle_path=subtitle_path,
output_file=output_file, output_file=output_file,
params=cfg params=cfg

View File

@ -1034,8 +1034,8 @@ def is_azure_v2_voice(voice_name: str):
def tts( def tts(
text: str, voice_name: str, voice_rate: float, voice_file: str text: str, voice_name: str, voice_rate: float, voice_file: str
) -> [SubMaker, None]: ) -> [SubMaker, None]:
if is_azure_v2_voice(voice_name): # if is_azure_v2_voice(voice_name):
return azure_tts_v2(text, voice_name, voice_file) # return azure_tts_v2(text, voice_name, voice_file)
return azure_tts_v1(text, voice_name, voice_rate, voice_file) return azure_tts_v1(text, voice_name, voice_rate, voice_file)
@ -1414,7 +1414,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")
# 检查文件是否已存在,如存在且不强制重新生成,则跳过 # 检查文件是否已存在,如存在且不强制重新生成,则跳过
if os.path.exists(audio_file) and not force_regenerate: if os.path.exists(audio_file):
logger.info(f"音频文件已存在,跳过生成: {audio_file}") logger.info(f"音频文件已存在,跳过生成: {audio_file}")
audio_files.append(audio_file) audio_files.append(audio_file)
continue continue