From 6c341388f7f8670bbc5a48ba0b859fda5d295bb3 Mon Sep 17 00:00:00 2001 From: linyqh Date: Mon, 4 Nov 2024 01:05:06 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96tts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/controllers/v1/llm.py | 51 +++++++++++++++++++++++++++++++++++- app/models/schema.py | 10 +++++++ app/services/audio_merger.py | 35 ++++++++++++++++++------- app/services/video.py | 51 ++++++++++++++++++++++-------------- app/services/voice.py | 8 +++--- 5 files changed, 120 insertions(+), 35 deletions(-) diff --git a/app/controllers/v1/llm.py b/app/controllers/v1/llm.py index e841d68..b5da6ae 100644 --- a/app/controllers/v1/llm.py +++ b/app/controllers/v1/llm.py @@ -1,18 +1,24 @@ -from fastapi import Request +from fastapi import Request, File, UploadFile +import os from app.controllers.v1.base import new_router from app.models.schema import ( VideoScriptResponse, VideoScriptRequest, VideoTermsResponse, VideoTermsRequest, + VideoTranscriptionRequest, + VideoTranscriptionResponse, ) from app.services import llm from app.utils import utils +from app.config import config # 认证依赖项 # router = new_router(dependencies=[Depends(base.verify_token)]) router = new_router() +# 定义上传目录 +UPLOAD_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "uploads") @router.post( "/scripts", @@ -42,3 +48,46 @@ def generate_video_terms(request: Request, body: VideoTermsRequest): ) response = {"video_terms": video_terms} return utils.get_response(200, response) + + +@router.post( + "/transcription", + response_model=VideoTranscriptionResponse, + summary="Transcribe video content using Gemini" +) +async def transcribe_video( + request: Request, + video_name: str, + language: str = "zh-CN", + video_file: UploadFile = File(...) 
+): + """ + 使用 Gemini 转录视频内容,包括时间戳、画面描述和语音内容 + + Args: + video_name: 视频名称 + language: 语言代码,默认zh-CN + video_file: 上传的视频文件 + """ + # 创建临时目录用于存储上传的视频 + os.makedirs(UPLOAD_DIR, exist_ok=True) + + # 保存上传的视频文件 + video_path = os.path.join(UPLOAD_DIR, video_file.filename) + with open(video_path, "wb") as buffer: + content = await video_file.read() + buffer.write(content) + + try: + transcription = llm.gemini_video_transcription( + video_name=video_name, + video_path=video_path, + language=language, + llm_provider_video=config.app.get("video_llm_provider", "gemini") + ) + response = {"transcription": transcription} + return utils.get_response(200, response) + finally: + # 处理完成后删除临时文件 + if os.path.exists(video_path): + os.remove(video_path) diff --git a/app/models/schema.py b/app/models/schema.py index 682cd94..64e0cb6 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -365,3 +365,13 @@ class VideoClipParams(BaseModel): custom_position: float = Field(default=70.0, description="自定义位置") n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度 + +class VideoTranscriptionRequest(BaseModel): + video_name: str + language: str = "zh-CN" + + class Config: + arbitrary_types_allowed = True + +class VideoTranscriptionResponse(BaseModel): + transcription: str diff --git a/app/services/audio_merger.py b/app/services/audio_merger.py index 80c9aff..f0face0 100644 --- a/app/services/audio_merger.py +++ b/app/services/audio_merger.py @@ -73,25 +73,40 @@ def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: def parse_timestamp(timestamp: str): """解析时间戳字符串为秒数""" - # start, end = timestamp.split('-') + # 确保使用冒号作为分隔符 + timestamp = timestamp.replace('_', ':') return time_to_seconds(timestamp) def extract_timestamp(filename): """从文件名中提取开始和结束时间戳""" - time_part = filename.split('_')[1].split('.')[0] - times = time_part.split('-') - + # 从 "audio_00_06-00_24.mp3" 这样的格式中提取时间 + time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06-00_24" 部分 + start_time, end_time 
= time_part.split('-') # 分割成 "00_06" 和 "00_24" + + # 将下划线格式转换回冒号格式 + start_time = start_time.replace('_', ':') + end_time = end_time.replace('_', ':') + # 将时间戳转换为秒 - start_seconds = time_to_seconds(times[0]) - end_seconds = time_to_seconds(times[1]) + start_seconds = time_to_seconds(start_time) + end_seconds = time_to_seconds(end_time) return start_seconds, end_seconds -def time_to_seconds(times): - """将 “00:06” 转换为总秒数 """ - times = times.split(':') - return int(times[0]) * 60 + int(times[1]) +def time_to_seconds(time_str): + """将 "00:06" 或 "00_06" 格式转换为总秒数""" + # 确保使用冒号作为分隔符 + time_str = time_str.replace('_', ':') + try: + parts = time_str.split(':') + if len(parts) != 2: + logger.error(f"Invalid time format: {time_str}") + return 0 + return int(parts[0]) * 60 + int(parts[1]) + except (ValueError, IndexError) as e: + logger.error(f"Error parsing time {time_str}: {str(e)}") + return 0 if __name__ == "__main__": diff --git a/app/services/video.py b/app/services/video.py index 6bfb9bf..76689eb 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -332,7 +332,7 @@ def generate_video_v2( logger.info(f" ③ 字幕: {subtitle_path}") logger.info(f" ④ 输出: {output_file}") - # 写入与输出文件相同的目录 + # 写入与输出文件相同的目录 output_dir = os.path.dirname(output_file) # 字体设置部分保持不变 @@ -389,6 +389,36 @@ def generate_video_v2( # 处理新的音频文件 new_audio = AudioFileClip(audio_path).volumex(params.voice_volume) + # 合并音频轨道 + audio_tracks = [] + + # 检查原始视频音轨 + if original_audio is not None: + audio_tracks.append(original_audio) + + # 添加新的音频 + audio_tracks.append(new_audio) + + # 背景音乐处理部分 + bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) + if bgm_file: + try: + bgm_clip = ( + AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3) + ) + bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration) + audio_tracks.append(bgm_clip) + except Exception as e: + logger.error(f"添加背景音乐失败: {str(e)}") + + # 确保至少有一个有效的音轨 + if not audio_tracks: + logger.warning("没有有效的音轨可用") + 
final_audio = new_audio + else: + # 合并所有音频轨道 + final_audio = CompositeAudioClip(audio_tracks) + # 字幕处理部分 if subtitle_path and os.path.exists(subtitle_path): sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8") @@ -417,25 +447,6 @@ def generate_video_v2( # 创建一个新的视频剪辑,包含所有字幕 video_clip = CompositeVideoClip([video_clip, *text_clips]) - # 背景音乐处理部分 - bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) - - # 合并音频轨道 - audio_tracks = [original_audio, new_audio] - - if bgm_file: - try: - bgm_clip = ( - AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3) - ) - bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration) - audio_tracks.append(bgm_clip) - except Exception as e: - logger.error(f"添加背景音乐失败: {str(e)}") - - # 合并所有音频轨道 - final_audio = CompositeAudioClip(audio_tracks) - video_clip = video_clip.set_audio(final_audio) video_clip.write_videofile( output_file, diff --git a/app/services/voice.py b/app/services/voice.py index e4776bf..fff3353 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1060,7 +1060,7 @@ def azure_tts_v1( logger.info(f"start, voice name: {voice_name}, try: {i + 1}") async def _do() -> SubMaker: - communicate = edge_tts.Communicate(text, voice_name, rate=rate_str) + communicate = edge_tts.Communicate(text, voice_name, rate=rate_str, proxy="http://127.0.0.1:7890") sub_maker = edge_tts.SubMaker() with open(voice_file, "wb") as file: async for chunk in communicate.stream(): @@ -1410,12 +1410,12 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f for item in list_script: if not item['OST']: - # timestamp = item['new_timestamp'].replace(':', '@') - timestamp = item['new_timestamp'] + # 将时间戳中的冒号替换为下划线 + timestamp = item['new_timestamp'].replace(':', '_') audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3") # 检查文件是否已存在,如存在且不强制重新生成,则跳过 - if os.path.exists(audio_file): + if os.path.exists(audio_file) and not force_regenerate: logger.info(f"音频文件已存在,跳过生成: 
{audio_file}") audio_files.append(audio_file) continue