From dc4ce80ea5d0a44346e448a69deceb678f7c1767 Mon Sep 17 00:00:00 2001 From: linyq Date: Sun, 29 Sep 2024 14:39:20 +0800 Subject: [PATCH] =?UTF-8?q?=E5=89=AA=E8=BE=91=E9=80=BB=E8=BE=91=E8=BF=9B?= =?UTF-8?q?=E5=BA=A680%=EF=BC=9B=20=E5=BE=85=E4=BC=98=E5=8C=96=E7=82=B9?= =?UTF-8?q?=EF=BC=9A=201.=20=E4=BC=98=E5=8C=96=E8=84=9A=E6=9C=AC-=E8=A7=A3?= =?UTF-8?q?=E8=AF=B4=E8=B4=A8=E9=87=8F=202.=20=E4=BC=98=E5=8C=96webui?= =?UTF-8?q?=E4=BD=93=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/models/schema.py | 3 +- app/services/subtitle.py | 80 +++++++++++++++++++++++++++------------- app/services/task.py | 22 ++--------- config.example.toml | 7 ++-- 4 files changed, 62 insertions(+), 50 deletions(-) diff --git a/app/models/schema.py b/app/models/schema.py index bf39e2b..682cd94 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -353,7 +353,7 @@ class VideoClipParams(BaseModel): bgm_file: Optional[str] = Field(default="", description="背景音乐文件") bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量") - subtitle_enabled: Optional[bool] = Field(default=False, description="是否启用字幕") + subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕") subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称") text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色") @@ -365,4 +365,3 @@ class VideoClipParams(BaseModel): custom_position: float = Field(default=70.0, description="自定义位置") n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度 - # paragraph_number: Optional[int] = 1 # 段落数量 diff --git a/app/services/subtitle.py b/app/services/subtitle.py index b915c6c..b9894b0 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -1,6 +1,7 @@ import json import os.path import re +import traceback from typing import Optional from faster_whisper import WhisperModel @@ -11,35 +12,53 @@ import google.generativeai as genai from app.config import config from app.utils import utils -model_size = config.whisper.get("model_size", "large-v3") +model_size = config.whisper.get("model_size", "faster-whisper-large-v2") device = config.whisper.get("device", "cpu") compute_type = config.whisper.get("compute_type", "int8") model = None def create(audio_file, subtitle_file: str = ""): + """ + 为给定的音频文件创建字幕文件。 + + 参数: + - audio_file: 音频文件的路径。 + - subtitle_file: 字幕文件的输出路径(可选)。如果未提供,将根据音频文件的路径生成字幕文件。 + + 返回: + 无返回值,但会在指定路径生成字幕文件。 + """ global model if not model: - model_path = f"{utils.root_dir()}/models/whisper-{model_size}" + model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2" model_bin_file = f"{model_path}/model.bin" if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file): - model_path = model_size + logger.error( + "请先下载 whisper 模型\n\n" + "********************************************\n" + "下载地址:https://huggingface.co/guillaumekln/faster-whisper-large-v2\n" + "存放路径:app/models \n" + "********************************************\n" + ) + return None logger.info( - f"loading model: {model_path}, device: {device}, compute_type: {compute_type}" + f"加载模型: {model_path}, 设备: {device}, 计算类型: {compute_type}" ) try: model = WhisperModel( - model_size_or_path=model_path, device=device, compute_type=compute_type + model_size_or_path=model_path, device=device, compute_type=compute_type, local_files_only=True ) except Exception as e: logger.error( - f"failed to load model: {e} \n\n" + f"加载模型失败: {e} \n\n" f"********************************************\n" - f"this may be caused by network issue. \n" - f"please download the model manually and put it in the 'models' folder. \n" - f"see [README.md FAQ](https://github.com/harry0703/NarratoAI) for more details.\n" + f"这可能是由网络问题引起的. \n" + f"请手动下载模型并将其放入 'app/models' 文件夹中。 \n" + f"see [README.md FAQ](https://github.com/linyqh/NarratoAI) for more details.\n" f"********************************************\n\n" + f"{traceback.format_exc()}" ) return None @@ -56,7 +75,7 @@ def create(audio_file, subtitle_file: str = ""): ) logger.info( - f"detected language: '{info.language}', probability: {info.language_probability:.2f}" + f"检测到的语言: '{info.language}', probability: {info.language_probability:.2f}" ) start = timer() @@ -139,6 +158,15 @@ def create(audio_file, subtitle_file: str = ""): def file_to_subtitles(filename): + """ + 将字幕文件转换为字幕列表。 + + 参数: + filename (str): 字幕文件的路径。 + + 返回: + list: 包含字幕序号、出现时间、和字幕文本的元组列表。 + """ if not filename or not os.path.isfile(filename): return [] @@ -313,28 +341,28 @@ def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Option if __name__ == "__main__": - task_id = "task456" + task_id = "test456" task_dir = utils.task_dir(task_id) subtitle_file = f"{task_dir}/subtitle.srt" - audio_file = f"{task_dir}/audio.mp3" + audio_file = f"{task_dir}/audio.wav" subtitles = file_to_subtitles(subtitle_file) print(subtitles) - script_file = f"{task_dir}/script.json" - with open(script_file, "r") as f: - script_content = f.read() - s = json.loads(script_content) - script = s.get("script") + # script_file = f"{task_dir}/script.json" + # with open(script_file, "r") as f: + # script_content = f.read() + # s = json.loads(script_content) + # script = s.get("script") + # + # correct(subtitle_file, script) - correct(subtitle_file, script) - - subtitle_file = f"{task_dir}/subtitle-test.srt" + subtitle_file = f"{task_dir}/subtitle111.srt" create(audio_file, subtitle_file) - # 使用Gemini模型处理音频 - gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 - gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) - - if gemini_subtitle_file: - print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") + # # 使用Gemini模型处理音频 + # gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥 + # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key) + # + # if gemini_subtitle_file: + # print(f"Gemini生成的字幕文件: {gemini_subtitle_file}") diff --git a/app/services/task.py b/app/services/task.py index b6bc504..946b4cd 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -383,27 +383,11 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos): subtitle_path = "" if params.subtitle_enabled: - subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt") + subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}") - subtitle_fallback = False - if subtitle_provider == "edge": - voice.create_subtitle(text=video_script, sub_maker="sub_maker", subtitle_file=subtitle_path) - # voice.create_subtitle( - # text=video_script, - # sub_maker_list=sub_maker_list, - # list_script=list_script, - # subtitle_file=subtitle_path - # ) - # if not os.path.exists(subtitle_path): - # subtitle_fallback = True - # logger.warning("找不到字幕文件,回退到whisper") - # - # if subtitle_provider == "whisper" or subtitle_fallback: - # # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) - # subtitle.create_with_gemini(audio_file=audio_file, subtitle_file=subtitle_path, api_key=config.app.get("gemini_api_key", "")) - # logger.info("\n\n## 更正字幕") - # subtitle.correct(subtitle_file=subtitle_path, video_script=video_script) + # 使用 faster-whisper-large-v2 模型生成字幕 + subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path) subtitle_lines = subtitle.file_to_subtitles(subtitle_path) if not subtitle_lines: diff --git a/config.example.toml b/config.example.toml index 50b2531..7b4e09e 100644 --- a/config.example.toml +++ b/config.example.toml @@ -73,9 +73,10 @@ deepseek_base_url = "https://api.deepseek.com" deepseek_model_name = "deepseek-chat" - # Subtitle Provider, "edge" or "whisper" + # Subtitle Provider, "whisper" # If empty, the subtitle will not be generated - subtitle_provider = "edge" + subtitle_provider = "faster-whisper-large-v2" + subtitle_enabled = true # # ImageMagick @@ -159,7 +160,7 @@ # model = WhisperModel(model_size, device="cpu", compute_type="int8") # recommended model_size: "large-v3" - model_size="large-v3" + model_size="faster-whisper-large-v2" # if you want to use GPU, set device="cuda" device="CPU" compute_type="int8"