From dc4ce80ea5d0a44346e448a69deceb678f7c1767 Mon Sep 17 00:00:00 2001
From: linyq <linyqemail@163.com>
Date: Sun, 29 Sep 2024 14:39:20 +0800
Subject: [PATCH] =?UTF-8?q?=E5=89=AA=E8=BE=91=E9=80=BB=E8=BE=91=E8=BF=9B?=
 =?UTF-8?q?=E5=BA=A680%=EF=BC=9B=20=E5=BE=85=E4=BC=98=E5=8C=96=E7=82=B9?=
 =?UTF-8?q?=EF=BC=9A=201.=20=E4=BC=98=E5=8C=96=E8=84=9A=E6=9C=AC-=E8=A7=A3?=
 =?UTF-8?q?=E8=AF=B4=E8=B4=A8=E9=87=8F=202.=20=E4=BC=98=E5=8C=96webui?=
 =?UTF-8?q?=E4=BD=93=E9=AA=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/models/schema.py     |  3 +-
 app/services/subtitle.py | 80 +++++++++++++++++++++++++++-------------
 app/services/task.py     | 22 ++---------
 config.example.toml      |  7 ++--
 4 files changed, 62 insertions(+), 50 deletions(-)

diff --git a/app/models/schema.py b/app/models/schema.py
index bf39e2b..682cd94 100644
--- a/app/models/schema.py
+++ b/app/models/schema.py
@@ -353,7 +353,7 @@ class VideoClipParams(BaseModel):
     bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
     bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量")
 
-    subtitle_enabled: Optional[bool] = Field(default=False, description="是否启用字幕")
+    subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕")
     subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置")  # top, bottom, center
     font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称")
     text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色")
@@ -365,4 +365,3 @@ class VideoClipParams(BaseModel):
     custom_position: float = Field(default=70.0, description="自定义位置")
 
     n_threads: Optional[int] = 8    # 线程数，有助于提升视频处理速度
-    # paragraph_number: Optional[int] = 1     # 段落数量
diff --git a/app/services/subtitle.py b/app/services/subtitle.py
index b915c6c..b9894b0 100644
--- a/app/services/subtitle.py
+++ b/app/services/subtitle.py
@@ -1,6 +1,7 @@
 import json
 import os.path
 import re
+import traceback
 from typing import Optional
 
 from faster_whisper import WhisperModel
@@ -11,35 +12,53 @@ import google.generativeai as genai
 from app.config import config
 from app.utils import utils
 
-model_size = config.whisper.get("model_size", "large-v3")
+model_size = config.whisper.get("model_size", "faster-whisper-large-v2")
 device = config.whisper.get("device", "cpu")
 compute_type = config.whisper.get("compute_type", "int8")
 model = None
 
 
 def create(audio_file, subtitle_file: str = ""):
+    """
+    为给定的音频文件创建字幕文件。
+
+    参数:
+    - audio_file: 音频文件的路径。
+    - subtitle_file: 字幕文件的输出路径（可选）。如果未提供，将根据音频文件的路径生成字幕文件。
+
+    返回:
+    无返回值，但会在指定路径生成字幕文件。
+    """
     global model
     if not model:
-        model_path = f"{utils.root_dir()}/models/whisper-{model_size}"
+        model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2"
         model_bin_file = f"{model_path}/model.bin"
         if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
-            model_path = model_size
+            logger.error(
+                "请先下载 whisper 模型\n\n"
+                "********************************************\n"
+                "下载地址：https://huggingface.co/guillaumekln/faster-whisper-large-v2\n"
+                "存放路径：app/models \n"
+                "********************************************\n"
+            )
+            return None
 
         logger.info(
-            f"loading model: {model_path}, device: {device}, compute_type: {compute_type}"
+            f"加载模型: {model_path}, 设备: {device}, 计算类型: {compute_type}"
         )
         try:
             model = WhisperModel(
-                model_size_or_path=model_path, device=device, compute_type=compute_type
+                model_size_or_path=model_path, device=device, compute_type=compute_type, local_files_only=True
             )
         except Exception as e:
             logger.error(
-                f"failed to load model: {e} \n\n"
+                f"加载模型失败: {e} \n\n"
                 f"********************************************\n"
-                f"this may be caused by network issue. \n"
-                f"please download the model manually and put it in the 'models' folder. \n"
-                f"see [README.md FAQ](https://github.com/harry0703/NarratoAI) for more details.\n"
+                f"这可能是由网络问题引起的. \n"
+                f"请手动下载模型并将其放入 'app/models' 文件夹中。 \n"
+                f"see [README.md FAQ](https://github.com/linyqh/NarratoAI) for more details.\n"
                 f"********************************************\n\n"
+                f"{traceback.format_exc()}"
             )
             return None
 
@@ -56,7 +75,7 @@ def create(audio_file, subtitle_file: str = ""):
     )
 
     logger.info(
-        f"detected language: '{info.language}', probability: {info.language_probability:.2f}"
+        f"检测到的语言: '{info.language}', probability: {info.language_probability:.2f}"
     )
 
     start = timer()
@@ -139,6 +158,15 @@ def create(audio_file, subtitle_file: str = ""):
 
 
 def file_to_subtitles(filename):
+    """
+    将字幕文件转换为字幕列表。
+
+    参数:
+    filename (str): 字幕文件的路径。
+
+    返回:
+    list: 包含字幕序号、出现时间、和字幕文本的元组列表。
+    """
     if not filename or not os.path.isfile(filename):
         return []
 
@@ -313,28 +341,28 @@ def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Option
 
 
 if __name__ == "__main__":
-    task_id = "task456"
+    task_id = "test456"
     task_dir = utils.task_dir(task_id)
     subtitle_file = f"{task_dir}/subtitle.srt"
-    audio_file = f"{task_dir}/audio.mp3"
+    audio_file = f"{task_dir}/audio.wav"
 
     subtitles = file_to_subtitles(subtitle_file)
     print(subtitles)
 
-    script_file = f"{task_dir}/script.json"
-    with open(script_file, "r") as f:
-        script_content = f.read()
-    s = json.loads(script_content)
-    script = s.get("script")
+    # script_file = f"{task_dir}/script.json"
+    # with open(script_file, "r") as f:
+    #     script_content = f.read()
+    # s = json.loads(script_content)
+    # script = s.get("script")
+    #
+    # correct(subtitle_file, script)
 
-    correct(subtitle_file, script)
-
-    subtitle_file = f"{task_dir}/subtitle-test.srt"
+    subtitle_file = f"{task_dir}/subtitle111.srt"
     create(audio_file, subtitle_file)
 
-    # 使用Gemini模型处理音频
-    gemini_api_key = config.app.get("gemini_api_key")  # 请替换为实际的API密钥
-    gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
-
-    if gemini_subtitle_file:
-        print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
+    # # 使用Gemini模型处理音频
+    # gemini_api_key = config.app.get("gemini_api_key")  # 请替换为实际的API密钥
+    # gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
+    #
+    # if gemini_subtitle_file:
+    #     print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
diff --git a/app/services/task.py b/app/services/task.py
index b6bc504..946b4cd 100644
--- a/app/services/task.py
+++ b/app/services/task.py
@@ -383,27 +383,11 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos):
 
     subtitle_path = ""
     if params.subtitle_enabled:
-        subtitle_path = path.join(utils.task_dir(task_id), f"subtitle111.srt")
+        subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
         subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
         logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
-        subtitle_fallback = False
-        if subtitle_provider == "edge":
-            voice.create_subtitle(text=video_script, sub_maker="sub_maker", subtitle_file=subtitle_path)
-            # voice.create_subtitle(
-            #     text=video_script,
-            #     sub_maker_list=sub_maker_list,
-            #     list_script=list_script,
-            #     subtitle_file=subtitle_path
-            # )
-        #     if not os.path.exists(subtitle_path):
-        #         subtitle_fallback = True
-        #         logger.warning("找不到字幕文件，回退到whisper")
-        #
-        # if subtitle_provider == "whisper" or subtitle_fallback:
-        #     # subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
-        #     subtitle.create_with_gemini(audio_file=audio_file, subtitle_file=subtitle_path, api_key=config.app.get("gemini_api_key", ""))
-        #     logger.info("\n\n## 更正字幕")
-        #     subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
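+        # Always generate subtitles with the local faster-whisper-large-v2 model; subtitle_provider is only logged above and no longer selects a backend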
+        subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
 
         subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
         if not subtitle_lines:
diff --git a/config.example.toml b/config.example.toml
index 50b2531..7b4e09e 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -73,9 +73,10 @@
     deepseek_base_url = "https://api.deepseek.com"
     deepseek_model_name = "deepseek-chat"
 
-    # Subtitle Provider, "edge" or "whisper"
+    # Subtitle Provider: subtitles are generated locally with the Whisper model "faster-whisper-large-v2"
     # If empty, the subtitle will not be generated
-    subtitle_provider = "edge"
+    subtitle_provider = "faster-whisper-large-v2"
+    subtitle_enabled = true
 
     #
     # ImageMagick
@@ -159,7 +160,7 @@
     # model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
     # recommended model_size: "large-v3"
-    model_size="large-v3"
+    model_size="faster-whisper-large-v2"
     # if you want to use GPU, set device="cuda"
     device="CPU"
     compute_type="int8"