diff --git a/app/models/schema.py b/app/models/schema.py index f20657a..bf39e2b 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -353,7 +353,7 @@ class VideoClipParams(BaseModel): bgm_file: Optional[str] = Field(default="", description="背景音乐文件") bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量") - subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕") + subtitle_enabled: Optional[bool] = Field(default=False, description="是否启用字幕") subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称") text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色") @@ -364,5 +364,5 @@ class VideoClipParams(BaseModel): stroke_width: float = Field(default=1.5, description="文字描边宽度") custom_position: float = Field(default=70.0, description="自定义位置") - # n_threads: Optional[int] = 2 # 线程数 + n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度 # paragraph_number: Optional[int] = 1 # 段落数量 diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py index cf2a204..e35a22c 100644 --- a/app/services/audio_merger.py +++ b/app/services/audio_merger.py @@ -1,9 +1,10 @@ import os +import json import subprocess import edge_tts from edge_tts import submaker from pydub import AudioSegment -from typing import List +from typing import List, Dict from loguru import logger from app.utils import utils @@ -17,12 +18,13 @@ def check_ffmpeg(): return False -def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int): +def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list): """ - 合并多个音频文件到一个指定总时长的音频文件中 - + 合并多个音频文件到一个指定总时长的音频文件中,并生成相应的字幕 + :param task_id: 任务ID :param audio_file_paths: 音频文件路径列表 :param total_duration: 最终音频文件的总时长(秒) + :param video_script: JSON格式的视频脚本 """ output_dir = utils.task_dir(task_id) @@ -35,6 +37,17 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: # 创建SubMaker对象 sub_maker = edge_tts.SubMaker() + # 解析JSON格式的video_script + script_data = video_script + + for segment in script_data: + start_time, end_time = parse_timestamp(segment['new_timestamp']) + duration = (end_time - start_time) * 1000 # 转换为毫秒 + + if not segment['OST']: + # 如果不是原声,则添加narration作为字幕 + sub_maker.create_sub((start_time * 1000, duration), segment['narration']) + for audio_path in audio_file_paths: if not os.path.exists(audio_path): logger.info(f"警告:文件 {audio_path} 不存在,已跳过。") @@ -50,14 +63,10 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: except Exception as e: logger.error(f"错误:无法读取文件 {audio_path}。错误信息:{str(e)}") continue + # 将音频插入到空白音频的指定位置 blank_audio = blank_audio.overlay(audio, position=start_time * 1000) - # 添加字幕信息 - duration = (end_time - start_time) * 1000 # 转换为毫秒 - # TODO 不是 filename 需要考虑怎么把字幕文本弄过来 - sub_maker.create_sub((start_time * 1000, duration), filename) - # 尝试导出为WAV格式 try: output_file = os.path.join(output_dir, "audio.wav") @@ -66,7 +75,7 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: except Exception as e: logger.info(f"导出为WAV格式失败,尝试使用MP3格式:{str(e)}") try: - output_file = "merged_audio.mp3" + output_file = os.path.join(output_dir, "audio.mp3") blank_audio.export(output_file, format="mp3", codec="libmp3lame") logger.info(f"音频合并完成,已保存为 {output_file}") except Exception as e: @@ -75,6 +84,10 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: return output_file, sub_maker +def parse_timestamp(timestamp: str) -> tuple: + """解析时间戳字符串为秒数""" + start, end = timestamp.split('-') + return time_to_seconds(*start.split(':')), time_to_seconds(*end.split(':')) def extract_timestamp(filename): """从文件名中提取开始和结束时间戳""" @@ -95,14 +108,17 @@ def time_to_seconds(minutes, seconds): if __name__ == "__main__": # 示例用法 - audio_files = [ + audio_files =[ "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-06-00-24.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-32-00-38.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-43-00-52.mp3", "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00-52-01-09.mp3", - "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3" + "/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01-13-01-15.mp3", ] - total_duration = 75 + total_duration = 38 + video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json" + with open(video_script_path, "r", encoding="utf-8") as f: + video_script = json.load(f) - a, b = merge_audio_files("test456", audio_files, total_duration) - print(a, b) \ No newline at end of file + output_file, sub_maker = merge_audio_files("test456", audio_files, total_duration, video_script) + print(output_file, sub_maker) \ No newline at end of file diff --git a/app/services/subtitle.py b/app/services/subtitle.py index ba6e224..b915c6c 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -1,10 +1,12 @@ import json import os.path import re +from typing import Optional from faster_whisper import WhisperModel from timeit import default_timer as timer from loguru import logger +import google.generativeai as genai from app.config import config from app.utils import utils @@ -278,8 +280,40 @@ def correct(subtitle_file, video_script): logger.success("Subtitle is correct") +def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Optional[str] = None) -> Optional[str]: + if not api_key: + logger.error("Gemini API key is not provided") + return None + + genai.configure(api_key=api_key) + + logger.info(f"开始使用Gemini模型处理音频文件: {audio_file}") + + model = genai.GenerativeModel(model_name="gemini-1.5-flash") + prompt = "生成这段语音的转录文本。请以SRT格式输出,包含时间戳。" + + try: + with open(audio_file, "rb") as f: + audio_data = f.read() + + response = model.generate_content([prompt, audio_data]) + transcript = response.text + + if not subtitle_file: + subtitle_file = f"{audio_file}.srt" + + with open(subtitle_file, "w", encoding="utf-8") as f: + f.write(transcript) + + logger.info(f"Gemini生成的字幕文件已保存: {subtitle_file}") + return subtitle_file + except Exception as e: + logger.error(f"使用Gemini处理音频时出错: {e}") + return None + + if __name__ == "__main__": - task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072" + task_id = "task456" task_dir = utils.task_dir(task_id) subtitle_file = f"{task_dir}/subtitle.srt" audio_file = f"{task_dir}/audio.mp3" @@ -297,3 +331,10 @@ if __name__ == "__main__": subtitle_file = f"{task_dir}/subtitle-test.srt" create(audio_file, subtitle_file) + + # 使用Gemini模型处理音频 + gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 + gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) + + if gemini_subtitle_file: + print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") diff --git a/app/services/task.py b/app/services/task.py index 7de5ac4..fd53d1d 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -338,7 +338,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): # tts 角色名称 voice_name = voice.parse_voice_name(params.voice_name) - logger.info("\n\n## 1. 读取视频json脚本") + logger.info("\n\n## 1. 加载视频脚本") video_script_path = path.join(params.video_clip_json_path) # 判断json文件是否存在 if path.exists(video_script_path): @@ -376,7 +376,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): "音频文件为空,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频") return logger.info("合并音频") - audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration) + audio_file, sub_maker = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script) # audio_duration = voice.get_audio_duration(sub_maker) # audio_duration = math.ceil(audio_duration) @@ -387,7 +387,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") - # subtitle_fallback = False + subtitle_fallback = False if subtitle_provider == "edge": voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path) # voice.create_subtitle( @@ -401,7 +401,8 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): # logger.warning("找不到字幕文件,回退到whisper") # # if subtitle_provider == "whisper" or subtitle_fallback: - # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + # # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) + # subtitle.create_with_gemini(audio_file=audio_file, subtitle_file=subtitle_path, api_key=config.app.get("gemini_api_key", "")) # logger.info("\n\n## 更正字幕") # subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) @@ -449,7 +450,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): video_ost_list=video_ost, list_script=list_script, video_aspect=params.video_aspect, - threads=1 # 暂时只支持单线程 + threads=params.n_threads # 多线程 ) _progress += 50 / 2 @@ -461,7 +462,7 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): # 把所有东西合到在一起 video.generate_video_v2( video_path=combined_video_path, - audio_paths=audio_files, + audio_path=audio_file, subtitle_path=subtitle_path, output_file=final_video_path, params=params, diff --git a/app/services/video.py b/app/services/video.py index 864634c..6bfb9bf 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -294,7 +294,7 @@ def generate_video( output_file, audio_codec="aac", temp_audiofile_path=output_dir, - threads=params.n_threads or 2, + threads=params.n_threads, logger=None, fps=30, ) @@ -306,7 +306,7 @@ def generate_video( def generate_video_v2( video_path: str, - audio_paths: List[str], + audio_path: str, subtitle_path: str, output_file: str, params: Union[VideoParams, VideoClipParams], @@ -314,11 +314,11 @@ def generate_video_v2( """ 合并所有素材 Args: - video_path: - audio_paths: - subtitle_path: - output_file: - params: + video_path: 视频路径 + audio_path: 单个音频文件路径 + subtitle_path: 字幕文件路径 + output_file: 输出文件路径 + params: 视频参数 Returns: @@ -328,7 +328,7 @@ def generate_video_v2( logger.info(f"开始,视频尺寸: {video_width} x {video_height}") logger.info(f" ① 视频: {video_path}") - logger.info(f" ② 音频文件数量: {len(audio_paths)}") + logger.info(f" ② 音频: {audio_path}") logger.info(f" ③ 字幕: {subtitle_path}") logger.info(f" ④ 输出: {output_file}") @@ -386,40 +386,8 @@ def generate_video_v2( original_audio = video_clip.audio # 保存原始视频的音轨 video_duration = video_clip.duration - # 处理多个音频文件 - audio_clips = [] - for audio_path in audio_paths: - # 确保每个音频文件路径是正确的 - if not os.path.exists(audio_path): - logger.warning(f"音频文件不存在: {audio_path}") - continue - - # 从文件名中提取时间信息 - match = re.search(r'audio_(\d{2}-\d{2}-\d{2}-\d{2})\.mp3', os.path.basename(audio_path)) - if match: - time_str = match.group(1) - start, end = time_str.split('-')[:2], time_str.split('-')[2:] - start_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(start))) - end_time = sum(int(x) * 60 ** i for i, x in enumerate(reversed(end))) - - audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume) - - # 确保结束时间不超过音频实际长度 - actual_end_time = min(end_time - start_time, audio_clip.duration) - - audio_clip = audio_clip.subclip(0, actual_end_time) - audio_clip = audio_clip.set_start(start_time).set_end(start_time + actual_end_time) - audio_clips.append(audio_clip) - else: - logger.warning(f"无法从文件名解析时间信息: {audio_path}") - - # 合并所有音频剪辑,包括原始音轨 - if audio_clips: - audio_clips.insert(0, original_audio) # 将原始音轨添加到音频剪辑列表的开头 - audio_clip = CompositeAudioClip(audio_clips) - else: - logger.warning("没有有效的音频文件,使用原始音轨") - audio_clip = original_audio + # 处理新的音频文件 + new_audio = AudioFileClip(audio_path).volumex(params.voice_volume) # 字幕处理部分 if subtitle_path and os.path.exists(subtitle_path): @@ -451,22 +419,29 @@ def generate_video_v2( # 背景音乐处理部分 bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) + + # 合并音频轨道 + audio_tracks = [original_audio, new_audio] + if bgm_file: try: bgm_clip = ( AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3) ) - bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration) - audio_clip = CompositeAudioClip([audio_clip, bgm_clip]) + bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration) + audio_tracks.append(bgm_clip) except Exception as e: logger.error(f"添加背景音乐失败: {str(e)}") - video_clip = video_clip.set_audio(audio_clip) + # 合并所有音频轨道 + final_audio = CompositeAudioClip(audio_tracks) + + video_clip = video_clip.set_audio(final_audio) video_clip.write_videofile( output_file, audio_codec="aac", temp_audiofile_path=output_dir, - threads=params.n_threads or 2, + threads=params.n_threads, logger=None, fps=30, ) @@ -607,7 +582,7 @@ def combine_clip_videos(combined_video_path: str, video_clip = concatenate_videoclips(clips) video_clip = video_clip.set_fps(30) - logger.info(f"合并中...") + logger.info(f"合并视频中...") video_clip.write_videofile(filename=combined_video_path, threads=threads, logger=None, @@ -687,19 +662,14 @@ if __name__ == "__main__": video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4" - audio_paths = ['../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-14-00-17.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-17-00-22.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-34-00-45.mp3', - '../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-59-01-09.mp3', - ] + audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3" subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt" output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4" generate_video_v2(video_path=video_path, - audio_paths=audio_paths, + audio_path=audio_path, subtitle_path=subtitle_path, output_file=output_file, params=cfg diff --git a/app/services/voice.py b/app/services/voice.py index 4464140..cf5c24d 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1034,8 +1034,8 @@ def is_azure_v2_voice(voice_name: str): def tts( text: str, voice_name: str, voice_rate: float, voice_file: str ) -> [SubMaker, None]: - if is_azure_v2_voice(voice_name): - return azure_tts_v2(text, voice_name, voice_file) + # if is_azure_v2_voice(voice_name): + # return azure_tts_v2(text, voice_name, voice_file) return azure_tts_v1(text, voice_name, voice_rate, voice_file) @@ -1414,7 +1414,7 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") # 检查文件是否已存在,如存在且不强制重新生成,则跳过 - if os.path.exists(audio_file) and not force_regenerate: + if os.path.exists(audio_file): logger.info(f"音频文件已存在,跳过生成: {audio_file}") audio_files.append(audio_file) continue