Mirror of https://github.com/linyqh/NarratoAI.git (synced 2025-12-11 18:42:49 +00:00)
Commit e45f2b83ec
@@ -14,14 +14,22 @@ jobs:

- name: Set up QEMU
uses: docker/setup-qemu-action@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Extract project version
id: extract_version
run: |
project_version=$(grep 'project_version' config.example.toml | cut -d '"' -f 2)
echo "PROJECT_VERSION=$project_version" >> $GITHUB_ENV

- name: Build and push
id: docker_build
uses: docker/build-push-action@v6
@@ -31,5 +39,5 @@ jobs:
push: true
platforms: linux/amd64,linux/arm64
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:${{ github.ref_name }}
${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:${{ env.PROJECT_VERSION }}
${{ secrets.DOCKERHUB_USERNAME }}/${{ GITHUB_REPOSITORY_NAME_PART }}:latest
1 .gitignore vendored
@@ -27,3 +27,4 @@ resource/scripts/*
resource/videos/*
resource/songs/*
resource/fonts/*
app/models/faster-whisper-large-v2/*
16 Dockerfile
@@ -1,4 +1,3 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim-bullseye

# Set the working directory in the container
@@ -12,6 +11,7 @@ ENV PYTHONPATH="/NarratoAI"
# Install system dependencies
RUN apt-get update && apt-get install -y \
git \
git-lfs \
imagemagick \
ffmpeg \
wget \
@@ -29,17 +29,11 @@ RUN pip install --no-cache-dir -r requirements.txt
# Now copy the rest of the codebase into the image
COPY . .

# 安装 git lfs 并下载模型到指定目录
RUN git lfs install

# Expose the port the app runs on
EXPOSE 8501

# Command to run the application
CMD ["streamlit", "run", "./webui/Main.py","--browser.serverAddress=127.0.0.1","--server.enableCORS=True","--browser.gatherUsageStats=False"]

# 1. Build the Docker image using the following command
# docker build -t moneyprinterturbo .

# 2. Run the Docker container using the following command
## For Linux or MacOS:
# docker run -v $(pwd)/config.toml:/NarratoAI/config.toml -v $(pwd)/storage:/NarratoAI/storage -p 8501:8501 moneyprinterturbo
## For Windows:
# docker run -v %cd%/config.toml:/NarratoAI/config.toml -v %cd%/storage:/NarratoAI/storage -p 8501:8501 moneyprinterturbo
CMD ["streamlit", "run", "webui.py","--browser.serverAddress=127.0.0.1","--server.enableCORS=True","--browser.gatherUsageStats=False"]

app/models/schema.py
@@ -3,7 +3,7 @@ from enum import Enum
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import pydantic
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# 忽略 Pydantic 的特定警告
|
||||
warnings.filterwarnings(
|
||||
@@ -330,41 +330,38 @@ class BgmUploadResponse(BaseResponse):
|
||||
|
||||
|
||||
class VideoClipParams(BaseModel):
|
||||
video_subject: Optional[str] = "春天的花海让人心旷神怡"
|
||||
"""
|
||||
NarratoAI 数据模型
|
||||
"""
|
||||
video_clip_json: Optional[list] = Field(default=[], description="LLM 生成的视频剪辑脚本内容")
|
||||
video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径")
|
||||
video_origin_path: Optional[str] = Field(default="", description="原视频路径")
|
||||
video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例")
|
||||
video_language: Optional[str] = Field(default="zh-CN", description="视频语言")
|
||||
|
||||
video_clip_json: Optional[str] = "" # 视频剪辑脚本
|
||||
video_origin_path: Optional[str] = "" # 原视频路径
|
||||
video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value # 视频比例
|
||||
video_clip_duration: Optional[int] = 5 # 视频片段时长
|
||||
video_count: Optional[int] = 1 # 视频片段数量
|
||||
video_source: Optional[str] = "local"
|
||||
video_language: Optional[str] = "" # 自动检测
|
||||
video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
|
||||
# video_clip_duration: Optional[int] = 5 # 视频片段时长
|
||||
# video_count: Optional[int] = 1 # 视频片段数量
|
||||
# video_source: Optional[str] = "local"
|
||||
# video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
|
||||
|
||||
# # 女性
|
||||
# "zh-CN-XiaoxiaoNeural",
|
||||
# "zh-CN-XiaoyiNeural",
|
||||
# # 男性
|
||||
# "zh-CN-YunjianNeural" 男声
|
||||
# "zh-CN-YunyangNeural",
|
||||
# "zh-CN-YunxiNeural",
|
||||
voice_name: Optional[str] = "zh-CN-YunjianNeural" # 语音名称 指定选择:
|
||||
voice_volume: Optional[float] = 1.0 # 语音音量
|
||||
voice_rate: Optional[float] = 1.0 # 语速
|
||||
voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
|
||||
voice_volume: Optional[float] = Field(default=1.0, description="语音音量")
|
||||
voice_rate: Optional[float] = Field(default=1.0, description="语速")
|
||||
|
||||
bgm_name: Optional[str] = "random" # 背景音乐名称
|
||||
bgm_type: Optional[str] = "random" # 背景音乐类型
|
||||
bgm_file: Optional[str] = "" # 背景音乐文件
|
||||
bgm_volume: Optional[float] = 0.2
|
||||
bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
|
||||
bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
|
||||
bgm_file: Optional[str] = Field(default="", description="背景音乐文件")
|
||||
bgm_volume: Optional[float] = Field(default=0.2, description="背景音乐音量")
|
||||
|
||||
subtitle_enabled: Optional[bool] = True # 是否启用字幕
|
||||
subtitle_position: Optional[str] = "bottom" # top, bottom, center
|
||||
font_name: Optional[str] = "STHeitiMedium.ttc" # 字体名称
|
||||
text_fore_color: Optional[str] = "#FFFFFF" # 文字前景色
|
||||
text_background_color: Optional[str] = "transparent" # 文字背景色
|
||||
subtitle_enabled: Optional[bool] = Field(default=True, description="是否启用字幕")
|
||||
subtitle_position: Optional[str] = Field(default="bottom", description="字幕位置") # top, bottom, center
|
||||
font_name: Optional[str] = Field(default="STHeitiMedium.ttc", description="字体名称")
|
||||
text_fore_color: Optional[str] = Field(default="#FFFFFF", description="文字前景色")
|
||||
text_background_color: Optional[str] = Field(default="transparent", description="文字背景色")
|
||||
|
||||
font_size: int = 60 # 文字大小
|
||||
stroke_color: Optional[str] = "#000000" # 文字描边颜色
|
||||
stroke_width: float = 1.5 # 文字描边宽度
|
||||
n_threads: Optional[int] = 2 # 线程数
|
||||
paragraph_number: Optional[int] = 1 # 段落数量
|
||||
font_size: int = Field(default=60, description="文字大小")
|
||||
stroke_color: Optional[str] = Field(default="#000000", description="文字描边颜色")
|
||||
stroke_width: float = Field(default=1.5, description="文字描边宽度")
|
||||
custom_position: float = Field(default=70.0, description="自定义位置")
|
||||
|
||||
n_threads: Optional[int] = 8 # 线程数,有助于提升视频处理速度
|
||||
|
||||
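# Illustrative usage of the new VideoClipParams model (sketch added for clarity; the file
# paths below are hypothetical, the import path mirrors the one used in app/services/task.py):
from app.models.schema import VideoClipParams

params = VideoClipParams(
    video_clip_json_path="resource/scripts/test003.json",
    video_origin_path="resource/videos/1.mp4",
    voice_name="zh-CN-YunjianNeural",
    voice_rate=1.0,
    subtitle_enabled=True,
)
# Unset fields fall back to their declared defaults, e.g. params.font_size == 60.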
112 app/services/audio_merger.py Normal file
@@ -0,0 +1,112 @@
|
||||
import os
|
||||
import json
|
||||
import subprocess
|
||||
import edge_tts
|
||||
from edge_tts import submaker
|
||||
from pydub import AudioSegment
|
||||
from typing import List, Dict
|
||||
from loguru import logger
|
||||
from app.utils import utils
|
||||
|
||||
|
||||
def check_ffmpeg():
|
||||
"""检查FFmpeg是否已安装"""
|
||||
try:
|
||||
subprocess.run(['ffmpeg', '-version'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
return True
|
||||
except FileNotFoundError:
|
||||
return False
|
||||
|
||||
|
||||
def merge_audio_files(task_id: str, audio_file_paths: List[str], total_duration: int, video_script: list):
|
||||
"""
|
||||
合并多个音频文件到一个指定总时长的音频文件中,并生成相应的字幕
|
||||
:param task_id: 任务ID
|
||||
:param audio_file_paths: 音频文件路径列表
|
||||
:param total_duration: 最终音频文件的总时长(秒)
|
||||
:param video_script: JSON格式的视频脚本
|
||||
"""
|
||||
output_dir = utils.task_dir(task_id)
|
||||
|
||||
if not check_ffmpeg():
|
||||
logger.error("错误:FFmpeg未安装。请安装FFmpeg后再运行此脚本。")
|
||||
return None, None
|
||||
|
||||
# 创建一个总时长为total_duration的空白音频
|
||||
blank_audio = AudioSegment.silent(duration=total_duration * 1000) # pydub使用毫秒
|
||||
|
||||
for audio_path in audio_file_paths:
|
||||
if not os.path.exists(audio_path):
|
||||
logger.info(f"警告:文件 {audio_path} 不存在,已跳过。")
|
||||
continue
|
||||
|
||||
# 从文件名中提取时间戳
|
||||
filename = os.path.basename(audio_path)
|
||||
start_time, end_time = extract_timestamp(filename)
|
||||
|
||||
# 读取音频文件
|
||||
try:
|
||||
audio = AudioSegment.from_mp3(audio_path)
|
||||
except Exception as e:
|
||||
logger.error(f"错误:无法读取文件 {audio_path}。错误信息:{str(e)}")
|
||||
continue
|
||||
|
||||
# 将音频插入到空白音频的指定位置
|
||||
blank_audio = blank_audio.overlay(audio, position=start_time * 1000)
|
||||
|
||||
# 尝试导出为WAV格式
|
||||
try:
|
||||
output_file = os.path.join(output_dir, "audio.wav")
|
||||
blank_audio.export(output_file, format="wav")
|
||||
logger.info(f"音频合并完成,已保存为 {output_file}")
|
||||
except Exception as e:
|
||||
logger.info(f"导出为WAV格式失败,尝试使用MP3格式:{str(e)}")
|
||||
try:
|
||||
output_file = os.path.join(output_dir, "audio.mp3")
|
||||
blank_audio.export(output_file, format="mp3", codec="libmp3lame")
|
||||
logger.info(f"音频合并完成,已保存为 {output_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"导出音频失败:{str(e)}")
|
||||
return None, None
|
||||
|
||||
return output_file
|
||||
|
||||
def parse_timestamp(timestamp: str):
|
||||
"""解析时间戳字符串为秒数"""
|
||||
# start, end = timestamp.split('-')
|
||||
return time_to_seconds(timestamp)
|
||||
|
||||
def extract_timestamp(filename):
|
||||
"""从文件名中提取开始和结束时间戳"""
|
||||
time_part = filename.split('_')[1].split('.')[0]
|
||||
times = time_part.split('-')
|
||||
|
||||
# 将时间戳转换为秒
|
||||
start_seconds = time_to_seconds(times[0])
|
||||
end_seconds = time_to_seconds(times[1])
|
||||
|
||||
return start_seconds, end_seconds
|
||||
|
||||
|
||||
def time_to_seconds(times):
|
||||
"""将 “00:06” 转换为总秒数 """
|
||||
times = times.split(':')
|
||||
return int(times[0]) * 60 + int(times[1])
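# Example of the helpers above (filenames follow the "audio_MM:SS-MM:SS.mp3" pattern used below):
#   extract_timestamp("audio_00:06-00:24.mp3")  -> (6, 24)
#   time_to_seconds("01:15")                    -> 75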
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 示例用法
|
||||
audio_files =[
|
||||
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:06-00:24.mp3",
|
||||
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:32-00:38.mp3",
|
||||
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:43-00:52.mp3",
|
||||
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:52-01:09.mp3",
|
||||
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01:13-01:15.mp3",
|
||||
]
|
||||
total_duration = 38
|
||||
video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json"
|
||||
with open(video_script_path, "r", encoding="utf-8") as f:
|
||||
video_script = json.load(f)
|
||||
|
||||
output_file = merge_audio_files("test456", audio_files, total_duration, video_script)
|
||||
print(output_file)
|
||||
app/services/llm.py
@@ -1,22 +1,144 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import traceback
|
||||
import streamlit as st
|
||||
from typing import List
|
||||
from loguru import logger
|
||||
from openai import OpenAI
|
||||
from openai import AzureOpenAI
|
||||
from moviepy.editor import VideoFileClip
|
||||
from openai.types.chat import ChatCompletion
|
||||
import google.generativeai as gemini
|
||||
from googleapiclient.errors import ResumableUploadError
|
||||
from google.api_core.exceptions import *
|
||||
from google.generativeai.types import *
|
||||
import subprocess
|
||||
|
||||
from app.config import config
|
||||
from app.utils.utils import clean_model_output
|
||||
|
||||
_max_retries = 5
|
||||
|
||||
Method = """
|
||||
重要提示:每一部剧的文案,前几句必须吸引人
|
||||
首先我们在看完看懂电影后,大脑里面要先有一个大概的轮廓,也就是一个类似于作文的大纲,电影主题线在哪里,首先要找到。
|
||||
一般将文案分为开头、内容、结尾
|
||||
## 开头部分
|
||||
文案开头三句话,是留住用户的关键!
|
||||
|
||||
def _generate_response(prompt: str) -> str:
|
||||
### 方式一:开头概括总结
|
||||
文案的前三句,是整部电影的概括总结,2-3句介绍后,开始叙述故事剧情!
|
||||
推荐新手(新号)做:(盘点型)
|
||||
盘点全球最恐怖的10部电影
|
||||
盘点全球最科幻的10部电影
|
||||
盘点全球最悲惨的10部电影
|
||||
盘点全球最值得看的10部灾难电影
|
||||
盘点全球最值得看的10部励志电影
|
||||
|
||||
下面的示例就是最简单的解说文案开头:
|
||||
1.这是XXX国20年来最大尺度的一部剧,极度烧脑,却让99%的人看得心潮澎湃、无法自拔,故事开始……
|
||||
2.这是有史以来电影院唯一一部全程开灯放完的电影,期间无数人尖叫昏厥,他被称为勇敢者的专属,因为99%的人都不敢看到结局,许多人看完它从此不愿再碰手机,他就是大名鼎鼎的暗黑神作《XXX》……
|
||||
3.这到底是一部什么样的电影,能被55个国家公开抵制,它甚至为了上映,不惜删减掉整整47分钟的剧情……
|
||||
4.是什么样的一个人被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒……
|
||||
5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/……
|
||||
6.这到底是一部什么样的影片,他一个人就拿了4个顶级奖项,第一季8.7分,第二季直接干到9.5分,11万人给出5星好评,一共也就6集,却斩获26项国际大奖,看过的人都说,他是近年来最好的xxx剧,几乎成为了近年来xxx剧的标杆。故事发生在……
|
||||
7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《时代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》
|
||||
8.这是一部让所有人看得荷尔蒙飙升的爽片……
|
||||
9.他被称为世界上最虐心绝望的电影,至今无人敢看第二遍,很难想象,他是根据真实事件改编而来……
|
||||
10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他是顶级神作《xxxx》……
|
||||
11.这是X国有史以来最高赞的一部悬疑电影,然而却因为某些原因,国内90%的人,没能看过这部片子,他就是《xxx》……
|
||||
12.有这样一部电影,这辈子,你绝对不想再看第二遍,并不是它剧情烂俗,而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场,更让许多同行都不想解说这部电影,他就是大名鼎鼎的暗黑神作《xxx》…
|
||||
13.它被誉为史上最牛悬疑片无数人在看完它时候,一个月不敢照镜子,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》….
|
||||
14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影,能够在某瓣上被70万人打出9.3分的高分……
|
||||
15.这是一部细思极恐的科幻大片,整部电影颠覆你的三观,它的名字叫……
|
||||
16.史上最震撼的灾难片,每一点都不舍得快进的电影,他叫……
|
||||
17.今天给大家带来一部基于真实事件改编的(主题介绍一句……)的故事片,这是一部连环悬疑剧,如果不看到最后绝对想不到结局竟然是这样的反转……
|
||||
|
||||
### 方式二:情景式、假设性开头
|
||||
1.他叫……你以为他是……的吗?不。他是来……然后开始叙述
|
||||
2.你知道……吗?原来……然后开始叙述
|
||||
3.如果给你….,你会怎么样?
|
||||
4.如果你是….,你会怎么样?
|
||||
|
||||
### 方式三:以国家为开头!简单明了。话语不需要多,但是需要讲解透彻!
|
||||
1.这是一部韩国最新灾难片,你一定没有看过……
|
||||
2.这是一部印度高分悬疑片,
|
||||
3.这部电影原在日本因为……而被下架,
|
||||
4.这是韩国最恐怖的犯罪片,
|
||||
5.这是最近国产片评分最高的悬疑片
|
||||
以上均按照影片国家来区分,然后简单介绍下主题。就可以开始直接叙述作品。也是一个很不错的方法!
|
||||
|
||||
### 方式四:如何自由发挥
|
||||
正常情况下,每一部电影都有非常关键的一个大纲,这部电影的主题其实是可以用一句话、两句话概括的。只要看懂电影,就能找到这个主题大纲。
|
||||
我们提前把这个主题大纲给放到影视最前面,作为我们的前三句的文案,将会非常吸引人!
|
||||
|
||||
例如:
|
||||
1.这不是电影,这是真实故事。两个女人和一个男人被关在可桑拿室。喊破喉咙也没有一丝回音。窒息感和热度让人抓狂,故事就是从这里开始!
|
||||
2.如果你男朋友出轨了,他不爱你了,还你家暴,怎么办?接下来这部电影就会教你如何让老公服服帖帖的呆在你身边!女主是一个……开始叙述了。
|
||||
3.他力大无穷,双眼放光,这不是拯救地球的超人吗?然而不是。今天给大家推荐的这部电影叫……
|
||||
|
||||
以上是需要看完影片,看懂影片,然后从里面提炼出精彩的几句话,当然是比较难的,当你不会自己去总结前三句的经典的话。可以用前面方式一二三!
|
||||
实在想不出来如何去提炼,可以去搜索这部剧,对这部电影的影评,也会给你带过来很多灵感的!
|
||||
|
||||
|
||||
## 内容部分
|
||||
开头有了,剩下的就是开始叙述正文了。主题介绍是根据影片内容来介绍,如果实在自己想不出来。可以参考其他平台中对这部电影的精彩介绍,提取2-3句也可以!
|
||||
正常情况下,我们叙述的时候其实是非常简单的,把整部电影主题线,叙述下来,其实文案就是加些修饰词把电影重点内容叙述下来。加上一些修饰词。
|
||||
|
||||
以悬疑剧为例:
|
||||
竟然,突然,原来,但是,但,可是,结果,直到,如果,而,果然,发现,只是,出奇,之后,没错,不止,更是,当然,因为,所以……等!
|
||||
以上是比较常用的,当然还有很多,需要靠平时思考和阅读的积累!因悬疑剧会有多处反转剧情。所以需要用到反转的修饰词比较多,只有用到这些词。才能体现出各种反转剧情!
|
||||
建议大家在刚开始做的时候,做8分钟内的,不要太长,分成三段。每段也是不超过三分钟,这样时间刚好。可以比较好的完成完播率!
|
||||
|
||||
|
||||
## 结尾部分
|
||||
最后故事的结局,除了反转,可以来点人生的道理!如果刚开始不会,可以不写。
|
||||
后面水平越来越高的时候,可以进行人生道理的讲评。
|
||||
|
||||
比如:这部电影告诉我们……
|
||||
类似于哲理性质的,作为一个总结!
|
||||
也可以把最后的影视反转,原生放出来,留下悬念。
|
||||
|
||||
比如:也可以总结下这部短片如何的好,推荐/值得大家去观看之类的话语。
|
||||
其实就是给我们的作品来一个总结,总结我们所做的三个视频,有开始就要有结束。这个结束不一定是固定的模版。但是视频一定要有结尾。让人感觉有头有尾才最舒服!
|
||||
做解说第一次,可能会做两天。第二次可能就需要一天了。慢慢的,时间缩短到8个小时之内是我们平时的制作全部时间!
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def handle_exception(err):
|
||||
if isinstance(err, PermissionDenied):
|
||||
raise Exception("403 用户没有权限访问该资源")
|
||||
elif isinstance(err, ResourceExhausted):
|
||||
raise Exception("429 您的配额已用尽。请稍后重试。请考虑设置自动重试来处理这些错误")
|
||||
elif isinstance(err, InvalidArgument):
|
||||
raise Exception("400 参数无效。例如,文件过大,超出了载荷大小限制。另一个事件提供了无效的 API 密钥。")
|
||||
elif isinstance(err, AlreadyExists):
|
||||
raise Exception("409 已存在具有相同 ID 的已调参模型。对新模型进行调参时,请指定唯一的模型 ID。")
|
||||
elif isinstance(err, RetryError):
|
||||
raise Exception("使用不支持 gRPC 的代理时可能会引起此错误。请尝试将 REST 传输与 genai.configure(..., transport=rest) 搭配使用。")
|
||||
elif isinstance(err, BlockedPromptException):
|
||||
raise Exception("400 出于安全原因,该提示已被屏蔽。")
|
||||
elif isinstance(err, BrokenResponseError):
|
||||
raise Exception("500 流式传输响应已损坏。在访问需要完整响应的内容(例如聊天记录)时引发。查看堆栈轨迹中提供的错误详情。")
|
||||
elif isinstance(err, IncompleteIterationError):
|
||||
raise Exception("500 访问需要完整 API 响应但流式响应尚未完全迭代的内容时引发。对响应对象调用 resolve() 以使用迭代器。")
|
||||
elif isinstance(err, ConnectionError):
|
||||
raise Exception("网络连接错误, 请检查您的网络连接(建议使用 NarratoAI 官方提供的 url)")
|
||||
else:
|
||||
raise Exception(f"大模型请求失败, 下面是具体报错信息: \n\n{traceback.format_exc()}")
|
||||
|
||||
|
||||
def _generate_response(prompt: str, llm_provider: str = None) -> str:
|
||||
"""
|
||||
调用大模型通用方法
|
||||
prompt:
|
||||
llm_provider:
|
||||
"""
|
||||
content = ""
|
||||
llm_provider = config.app.get("llm_provider", "openai")
|
||||
if not llm_provider:
|
||||
llm_provider = config.app.get("llm_provider", "openai")
|
||||
logger.info(f"llm provider: {llm_provider}")
|
||||
if llm_provider == "g4f":
|
||||
model_name = config.app.get("g4f_model_name", "")
|
||||
@@ -132,46 +254,23 @@ def _generate_response(prompt: str) -> str:
|
||||
|
||||
genai.configure(api_key=api_key, transport="rest")
|
||||
|
||||
generation_config = {
|
||||
"temperature": 0.5,
|
||||
"top_p": 1,
|
||||
"top_k": 1,
|
||||
"max_output_tokens": 2048,
|
||||
safety_settings = {
|
||||
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
|
||||
}
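# Note: HarmCategory and HarmBlockThreshold are expected to come from the wildcard
# import "from google.generativeai.types import *" at the top of this file.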
|
||||
|
||||
safety_settings = [
|
||||
{
|
||||
"category": "HARM_CATEGORY_HARASSMENT",
|
||||
"threshold": "BLOCK_ONLY_HIGH",
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_HATE_SPEECH",
|
||||
"threshold": "BLOCK_ONLY_HIGH",
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
||||
"threshold": "BLOCK_ONLY_HIGH",
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
||||
"threshold": "BLOCK_ONLY_HIGH",
|
||||
},
|
||||
]
|
||||
|
||||
model = genai.GenerativeModel(
|
||||
model_name=model_name,
|
||||
generation_config=generation_config,
|
||||
safety_settings=safety_settings,
|
||||
)
|
||||
|
||||
try:
|
||||
response = model.generate_content(prompt)
|
||||
candidates = response.candidates
|
||||
generated_text = candidates[0].content.parts[0].text
|
||||
except (AttributeError, IndexError) as e:
|
||||
print("Gemini Error:", e)
|
||||
|
||||
return generated_text
|
||||
return response.text
|
||||
except Exception as err:
|
||||
return handle_exception(err)
|
||||
|
||||
if llm_provider == "cloudflare":
|
||||
import requests
|
||||
@@ -254,76 +353,171 @@ def _generate_response(prompt: str) -> str:
|
||||
return content.replace("\n", "")
|
||||
|
||||
|
||||
def generate_script(
|
||||
video_subject: str, language: str = "", paragraph_number: int = 1
|
||||
) -> str:
|
||||
prompt = f"""
|
||||
# Role: Video Script Generator
|
||||
def _generate_response_video(prompt: str, llm_provider_video: str, video_file: str | File) -> str:
|
||||
"""
|
||||
多模态能力大模型
|
||||
"""
|
||||
if llm_provider_video == "gemini":
|
||||
api_key = config.app.get("gemini_api_key")
|
||||
model_name = config.app.get("gemini_model_name")
|
||||
base_url = "***"
|
||||
else:
|
||||
raise ValueError(
|
||||
"llm_provider 未设置,请在 config.toml 文件中进行设置。"
|
||||
)
|
||||
|
||||
## Goals:
|
||||
Generate a script for a video, depending on the subject of the video.
|
||||
if llm_provider_video == "gemini":
|
||||
import google.generativeai as genai
|
||||
|
||||
## Constrains:
|
||||
1. the script is to be returned as a string with the specified number of paragraphs.
|
||||
2. do not under any circumstance reference this prompt in your response.
|
||||
3. get straight to the point, don't start with unnecessary things like, "welcome to this video".
|
||||
4. you must not include any type of markdown or formatting in the script, never use a title.
|
||||
5. only return the raw content of the script.
|
||||
6. do not include "voiceover", "narrator" or similar indicators of what should be spoken at the beginning of each paragraph or line.
|
||||
7. you must not mention the prompt, or anything about the script itself. also, never talk about the amount of paragraphs or lines. just write the script.
|
||||
8. respond in the same language as the video subject.
|
||||
genai.configure(api_key=api_key, transport="rest")
|
||||
|
||||
# Initialization:
|
||||
- video subject: {video_subject}
|
||||
- number of paragraphs: {paragraph_number}
|
||||
""".strip()
|
||||
if language:
|
||||
prompt += f"\n- language: {language}"
|
||||
safety_settings = {
|
||||
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
|
||||
}
|
||||
|
||||
final_script = ""
|
||||
logger.info(f"subject: {video_subject}")
|
||||
model = genai.GenerativeModel(
|
||||
model_name=model_name,
|
||||
safety_settings=safety_settings,
|
||||
)
|
||||
|
||||
def format_response(response):
|
||||
# Clean the script
|
||||
# Remove asterisks, hashes
|
||||
response = response.replace("*", "")
|
||||
response = response.replace("#", "")
|
||||
|
||||
# Remove markdown syntax
|
||||
response = re.sub(r"\[.*\]", "", response)
|
||||
response = re.sub(r"\(.*\)", "", response)
|
||||
|
||||
# Split the script into paragraphs
|
||||
paragraphs = response.split("\n\n")
|
||||
|
||||
# Select the specified number of paragraphs
|
||||
selected_paragraphs = paragraphs[:paragraph_number]
|
||||
|
||||
# Join the selected paragraphs into a single string
|
||||
return "\n\n".join(paragraphs)
|
||||
|
||||
for i in range(_max_retries):
|
||||
try:
|
||||
response = _generate_response(prompt=prompt)
|
||||
if response:
|
||||
final_script = format_response(response)
|
||||
else:
|
||||
logging.error("gpt returned an empty response")
|
||||
response = model.generate_content([prompt, video_file])
|
||||
return response.text
|
||||
except Exception as err:
|
||||
return handle_exception(err)
|
||||
|
||||
# g4f may return an error message
|
||||
if final_script and "当日额度已消耗完" in final_script:
|
||||
raise ValueError(final_script)
|
||||
|
||||
if final_script:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"failed to generate script: {e}")
|
||||
def compress_video(input_path: str, output_path: str):
|
||||
"""
|
||||
压缩视频文件
|
||||
Args:
|
||||
input_path: 输入视频文件路径
|
||||
output_path: 输出压缩后的视频文件路径
|
||||
"""
|
||||
# 如果压缩后的视频文件已经存在,则直接使用
|
||||
if os.path.exists(output_path):
|
||||
logger.info(f"压缩视频文件已存在: {output_path}")
|
||||
return
|
||||
|
||||
if i < _max_retries:
|
||||
logger.warning(f"failed to generate video script, trying again... {i + 1}")
|
||||
try:
|
||||
clip = VideoFileClip(input_path)
|
||||
clip.write_videofile(output_path, codec='libx264', audio_codec='aac', bitrate="500k", audio_bitrate="128k")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"视频压缩失败: {e}")
|
||||
raise
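# Usage sketch (hypothetical paths), mirroring how generate_script() below calls it:
#   compress_video("resource/videos/demo.mp4", "resource/videos/demo_compressed.mp4")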
|
||||
|
||||
logger.success(f"completed: \n{final_script}")
|
||||
return final_script.strip()
|
||||
|
||||
def generate_script(
|
||||
video_path: str, video_plot: str, video_name: str, language: str = "zh-CN", progress_callback=None
|
||||
) -> str:
|
||||
"""
|
||||
生成视频剪辑脚本
|
||||
Args:
|
||||
video_path: 视频文件路径
|
||||
video_plot: 视频剧情内容
|
||||
video_name: 视频名称
|
||||
language: 语言
|
||||
progress_callback: 进度回调函数
|
||||
|
||||
Returns:
|
||||
str: 生成的脚本
|
||||
"""
|
||||
try:
|
||||
# 1. 压缩视频
|
||||
compressed_video_path = f"{os.path.splitext(video_path)[0]}_compressed.mp4"
|
||||
compress_video(video_path, compressed_video_path)
|
||||
|
||||
# 在关键步骤更新进度
|
||||
if progress_callback:
|
||||
progress_callback(15, "压缩完成") # 例如,在压缩视频后
|
||||
|
||||
# 2. 转录视频
|
||||
transcription = gemini_video_transcription(
|
||||
video_name=video_name,
|
||||
video_path=compressed_video_path,
|
||||
language=language,
|
||||
llm_provider_video=config.app["video_llm_provider"],
|
||||
progress_callback=progress_callback
|
||||
)
|
||||
if progress_callback:
|
||||
progress_callback(60, "生成解说文案...") # 例如,在转录视频后
|
||||
|
||||
# 3. 编写解说文案
|
||||
script = writing_short_play(video_plot, video_name, config.app["llm_provider"], count=300)
|
||||
|
||||
# 在关键步骤更新进度
|
||||
if progress_callback:
|
||||
progress_callback(70, "匹配画面...") # 例如,在生成脚本后
|
||||
|
||||
# 4. 文案匹配画面
|
||||
if transcription != "":
|
||||
matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider=config.app["video_llm_provider"])
|
||||
# 在关键步骤更新进度
|
||||
if progress_callback:
|
||||
progress_callback(80, "匹配成功")
|
||||
return matched_script
|
||||
else:
|
||||
return ""
|
||||
except Exception as e:
|
||||
handle_exception(e)
|
||||
raise
|
||||
|
||||
|
||||
def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider_video: str, progress_callback=None):
|
||||
'''
|
||||
使用 gemini-1.5-xxx 进行视频画面转录
|
||||
'''
|
||||
api_key = config.app.get("gemini_api_key")
|
||||
gemini.configure(api_key=api_key)
|
||||
|
||||
prompt = """
|
||||
请转录音频,包括时间戳,并提供视觉描述,然后以 JSON 格式输出,当前视频中使用的语言为 %s。
|
||||
|
||||
在转录视频时,请通过确保以下条件来完成转录:
|
||||
1. 画面描述使用语言: %s 进行输出。
|
||||
2. 同一个画面合并为一个转录记录。
|
||||
3. 使用以下 JSON schema:
|
||||
Graphics = {"timestamp": "MM:SS-MM:SS"(时间戳格式), "picture": "str"(画面描述), "speech": "str"(台词,如果没有人说话,则使用空字符串。)}
|
||||
Return: list[Graphics]
|
||||
4. 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。
|
||||
""" % (language, language)
|
||||
|
||||
logger.debug(f"视频名称: {video_name}")
|
||||
try:
|
||||
if progress_callback:
|
||||
progress_callback(20, "上传视频至 Google cloud")
|
||||
gemini_video_file = gemini.upload_file(video_path)
|
||||
logger.debug(f"视频 {gemini_video_file.name} 上传至 Google cloud 成功, 开始解析...")
|
||||
while gemini_video_file.state.name == "PROCESSING":
|
||||
gemini_video_file = gemini.get_file(gemini_video_file.name)
|
||||
if progress_callback:
|
||||
progress_callback(30, "上传成功, 开始解析") # 更新进度为20%
|
||||
if gemini_video_file.state.name == "FAILED":
|
||||
raise ValueError(gemini_video_file.state.name)
|
||||
elif gemini_video_file.state.name == "ACTIVE":
|
||||
if progress_callback:
|
||||
progress_callback(40, "解析完成, 开始转录...") # 更新进度为30%
|
||||
logger.debug("解析完成, 开始转录...")
|
||||
except ResumableUploadError as err:
|
||||
logger.error(f"上传视频至 Google cloud 失败, 用户的位置信息不支持用于该API; \n{traceback.format_exc()}")
|
||||
return False
|
||||
except FailedPrecondition as err:
|
||||
logger.error(f"400 用户位置不支持 Google API 使用。\n{traceback.format_exc()}")
|
||||
return False
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(50, "开始转录")
|
||||
try:
|
||||
response = _generate_response_video(prompt=prompt, llm_provider_video=llm_provider_video, video_file=gemini_video_file)
|
||||
logger.success("视频转录成功")
|
||||
logger.debug(response)
|
||||
print(type(response))
|
||||
return response
|
||||
except Exception as err:
|
||||
return handle_exception(err)
|
||||
|
||||
|
||||
def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
|
||||
@@ -405,60 +599,65 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot
|
||||
model = gemini.GenerativeModel(model_name=model_name)
|
||||
|
||||
prompt = """
|
||||
# 角色设定:
|
||||
你是一位影视解说专家,擅长根据剧情描述视频的画面和故事生成一段有趣且吸引人的解说文案。你特别熟悉 tiktok/抖音 风格的影视解说文案创作。
|
||||
**角色设定:**
|
||||
你是一位影视解说专家,擅长根据剧情生成引人入胜的短视频解说文案,特别熟悉适用于TikTok/抖音风格的快速、抓人视频解说。
|
||||
|
||||
# 任务目标:
|
||||
1. 根据给定的剧情描述,详细描述视频画面并展开叙述,尤其是对重要画面进行细致刻画。
|
||||
2. 生成风格符合 tiktok/抖音 的影视解说文案,使其节奏快、内容抓人。
|
||||
3. 最终结果以 JSON 格式输出,字段包含:
|
||||
• "picture":画面描述
|
||||
• "timestamp":时间戳(表示画面出现的时间-画面结束的时间)
|
||||
• "narration":对应的解说文案
|
||||
**任务目标:**
|
||||
1. 根据给定剧情,详细描述画面,重点突出重要场景和情节。
|
||||
2. 生成符合TikTok/抖音风格的解说,节奏紧凑,语言简洁,吸引观众。
|
||||
3. 解说的时候需要解说一段播放一段原视频,原视频一般为有台词的片段,原视频的控制有 OST 字段控制。
|
||||
4. 结果输出为JSON格式,包含字段:
|
||||
- "picture":画面描述
|
||||
- "timestamp":画面出现的时间范围
|
||||
- "narration":解说内容
|
||||
- "OST": 是否开启原声(true / false)
|
||||
|
||||
# 输入示例:
|
||||
```text
|
||||
在一个黑暗的小巷中,主角缓慢走进,四周静谧无声,只有远处隐隐传来猫的叫声。突然,背后出现一个神秘的身影。
|
||||
```
|
||||
**输入示例:**
|
||||
```text
|
||||
在一个黑暗的小巷中,主角缓慢走进,四周静谧无声,只有远处隐隐传来猫的叫声。突然,背后出现一个神秘的身影。
|
||||
```
|
||||
|
||||
**输出格式:**
|
||||
```json
|
||||
[
|
||||
{
|
||||
"picture": "黑暗的小巷,主角缓慢走入,四周安静,远处传来猫叫声。",
|
||||
"timestamp": "00:00-00:17",
|
||||
"narration": "静谧的小巷里,主角步步前行,气氛渐渐变得压抑。"
|
||||
"OST": False
|
||||
},
|
||||
{
|
||||
"picture": "神秘身影突然出现,紧张气氛加剧。",
|
||||
"timestamp": "00:17-00:39",
|
||||
"narration": "原声播放"
|
||||
"OST": True
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
**提示:**
|
||||
- 文案要简短有力,契合短视频平台用户的观赏习惯。
|
||||
- 保持强烈的悬念和情感代入,吸引观众继续观看。
|
||||
- 解说一段后播放一段原声,原声内容尽量和解说匹配。
|
||||
- 文案语言为:%s
|
||||
- 剧情内容:%s (为空则忽略)
|
||||
|
||||
# 输出格式:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"picture": "黑暗的小巷中,主角缓慢走进,四周静谧无声,远处有模糊的猫叫声。",
|
||||
"timestamp": "00:00-00:17",
|
||||
"narration": "昏暗的小巷里,他独自前行,空气中透着一丝不安,隐约中能听到远处的猫叫声。 "
|
||||
},
|
||||
{
|
||||
"picture": "主角背后突然出现一个神秘的身影,气氛骤然紧张。",
|
||||
"timestamp": "00:17-00:39",
|
||||
"narration": "就在他以为安全时,一个身影悄无声息地出现在他身后,危险一步步逼近! "
|
||||
}
|
||||
...
|
||||
]
|
||||
```
|
||||
# 提示:
|
||||
- 生成的解说文案应简洁有力,符合短视频平台用户的偏好。
|
||||
- 叙述中应有强烈的代入感和悬念,以吸引观众持续观看。
|
||||
- 文案语言为:%s
|
||||
- 剧情内容如下:%s (若为空则忽略)
|
||||
|
||||
""" % (language, video_plot)
|
||||
|
||||
logger.debug(f"视频名称: {video_origin_name}")
|
||||
try:
|
||||
gemini_video_file = gemini.upload_file(video_origin_path)
|
||||
logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}")
|
||||
while gemini_video_file.state.name == "PROCESSING":
|
||||
import time
|
||||
time.sleep(1)
|
||||
gemini_video_file = gemini.get_file(gemini_video_file.name)
|
||||
logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}")
|
||||
if gemini_video_file.state.name == "FAILED":
|
||||
raise ValueError(gemini_video_file.state.name)
|
||||
except Exception as err:
|
||||
logger.error(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确 \n{traceback.format_exc()}")
|
||||
raise TimeoutError(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确; {err}")
|
||||
# try:
|
||||
gemini_video_file = gemini.upload_file(video_origin_path)
|
||||
logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}")
|
||||
while gemini_video_file.state.name == "PROCESSING":
|
||||
import time
|
||||
time.sleep(1)
|
||||
gemini_video_file = gemini.get_file(gemini_video_file.name)
|
||||
logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}")
|
||||
if gemini_video_file.state.name == "FAILED":
|
||||
raise ValueError(gemini_video_file.state.name)
|
||||
# except Exception as err:
|
||||
# logger.error(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确 \n{traceback.format_exc()}")
|
||||
# raise TimeoutError(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确; {err}")
|
||||
|
||||
streams = model.generate_content([prompt, gemini_video_file], stream=True)
|
||||
response = []
|
||||
@@ -471,60 +670,132 @@ def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot
|
||||
return response
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
video_subject = "摔跤吧!爸爸 Dangal"
|
||||
video_path = "/NarratoAI/resource/videos/test.mp4"
|
||||
video_plot = '''
|
||||
马哈维亚(阿米尔·汗 Aamir Khan 饰)曾经是一名前途无量的摔跤运动员,在放弃了职业生涯后,他最大的遗憾就是没有能够替国家赢得金牌。马哈维亚将这份希望寄托在了尚未出生的儿子身上,哪知道妻子接连给他生了两个女儿,取名吉塔(法缇玛·萨那·纱卡 Fatima Sana Shaikh 饰)和巴比塔(桑亚·玛荷塔 Sanya Malhotra 饰)。让马哈维亚没有想到的是,两个姑娘展现出了杰出的摔跤天赋,让他幡然醒悟,就算是女孩,也能够昂首挺胸的站在比赛场上,为了国家和她们自己赢得荣誉。
|
||||
就这样,在马哈维亚的指导下,吉塔和巴比塔开始了艰苦的训练,两人进步神速,很快就因为在比赛中连连获胜而成为了当地的名人。为了获得更多的机会,吉塔进入了国家体育学院学习,在那里,她将面对更大的诱惑和更多的选择。
|
||||
'''
|
||||
language = "zh-CN"
|
||||
res = gemini_video2json(video_subject, video_path, video_plot, language)
|
||||
print(res)
|
||||
def writing_movie(video_plot, video_name, llm_provider):
|
||||
"""
|
||||
影视解说(电影解说)
|
||||
"""
|
||||
prompt = f"""
|
||||
**角色设定:**
|
||||
你是一名有10年经验的影视解说文案的创作者,
|
||||
下面是关于如何写解说文案的方法 {Method},请认真阅读它,之后我会给你一部影视作品的名称,然后让你写一篇文案
|
||||
请根据方法撰写 《{video_name}》的影视解说文案,《{video_name}》的大致剧情如下: {video_plot}
|
||||
文案要符合以下要求:
|
||||
|
||||
**任务目标:**
|
||||
1. 文案字数在 1500字左右,严格要求字数,最低不得少于 1000字。
|
||||
2. 避免使用 markdown 格式输出文案。
|
||||
3. 仅输出解说文案,不输出任何其他内容。
|
||||
4. 不要包含小标题,每个段落以 \n 进行分隔。
|
||||
"""
|
||||
try:
|
||||
response = _generate_response(prompt, llm_provider)
|
||||
logger.success("解说文案生成成功")
|
||||
return response
|
||||
except Exception as err:
|
||||
return handle_exception(err)
|
||||
|
||||
# video_subject = "生命的意义是什么"
|
||||
# script = generate_script(
|
||||
# video_subject=video_subject, language="zh-CN", paragraph_number=1
|
||||
# )
|
||||
# print("######################")
|
||||
# print(script)
|
||||
# search_terms = generate_terms(
|
||||
# video_subject=video_subject, video_script=script, amount=5
|
||||
# )
|
||||
# print("######################")
|
||||
# print(search_terms)
|
||||
# prompt = """
|
||||
# # Role: 影视解说专家
|
||||
#
|
||||
# ## Background:
|
||||
# 擅长根据剧情描述视频的画面和故事,能够生成一段非常有趣的解说文案。
|
||||
#
|
||||
# ## Goals:
|
||||
# 1. 根据剧情描述视频的画面和故事,并对重要的画面进行展开叙述
|
||||
# 2. 根据剧情内容,生成符合 tiktok/抖音 风格的影视解说文案
|
||||
# 3. 将结果直接以json格式输出给用户,需要包含字段: picture 画面描述, timestamp 时间戳, narration 解说文案
|
||||
# 4. 剧情内容如下:{%s}
|
||||
#
|
||||
# ## Skills
|
||||
# - 精通 tiktok/抖音 等短视频影视解说文案撰写
|
||||
# - 能够理解视频中的故事和画面表现
|
||||
# - 能精准匹配视频中的画面和时间戳
|
||||
# - 能精准把控旁白和时长
|
||||
# - 精通中文
|
||||
# - 精通JSON数据格式
|
||||
#
|
||||
# ## Constrains
|
||||
# - 解说文案的时长要和时间戳的时长尽量匹配
|
||||
# - 忽略视频中关于广告的内容
|
||||
# - 忽略视频中片头和片尾
|
||||
# - 不得在脚本中包含任何类型的 Markdown 或格式
|
||||
#
|
||||
# ## Format
|
||||
# - 对应JSON的key为:picture, timestamp, narration
|
||||
#
|
||||
# # Initialization:
|
||||
# - video subject: {video_subject}
|
||||
# - number of paragraphs: {paragraph_number}
|
||||
# """.strip()
|
||||
# if language:
|
||||
# prompt += f"\n- language: {language}"
|
||||
|
||||
def writing_short_play(video_plot: str, video_name: str, llm_provider: str, count: int = 500):
|
||||
"""
|
||||
影视解说(短剧解说)
|
||||
"""
|
||||
if not video_plot:
|
||||
raise ValueError("短剧的简介不能为空")
|
||||
if not video_name:
|
||||
raise ValueError("短剧名称不能为空")
|
||||
|
||||
prompt = f"""
|
||||
**角色设定:**
|
||||
你是一名有10年经验的短剧解说文案的创作者,
|
||||
下面是关于如何写解说文案的方法 {Method},请认真阅读它,之后我会给你一部短剧作品的简介,然后让你写一篇解说文案
|
||||
请根据方法撰写 《{video_name}》的解说文案,《{video_name}》的大致剧情如下: {video_plot}
|
||||
文案要符合以下要求:
|
||||
|
||||
**任务目标:**
|
||||
1. 请严格要求文案字数, 字数控制在 {count} 字左右。
|
||||
2. 避免使用 markdown 格式输出文案。
|
||||
3. 仅输出解说文案,不输出任何其他内容。
|
||||
4. 不要包含小标题,每个段落以 \\n 进行分隔。
|
||||
"""
|
||||
try:
|
||||
response = _generate_response(prompt, llm_provider)
|
||||
logger.success("解说文案生成成功")
|
||||
logger.debug(response)
|
||||
return response
|
||||
except Exception as err:
|
||||
return handle_exception(err)
|
||||
|
||||
|
||||
def screen_matching(huamian: str, wenan: str, llm_provider: str):
|
||||
"""
|
||||
画面匹配(一次性匹配)
|
||||
"""
|
||||
if not huamian:
|
||||
raise ValueError("画面不能为空")
|
||||
if not wenan:
|
||||
raise ValueError("文案不能为空")
|
||||
|
||||
prompt = """
|
||||
你是一名有10年经验的影视解说创作者,
|
||||
你的任务是根据视频转录脚本和解说文案,匹配出每段解说文案对应的画面时间戳, 结果以 json 格式输出。
|
||||
|
||||
注意:
|
||||
转录脚本中
|
||||
- timestamp: 表示视频时间戳
|
||||
- picture: 表示当前画面描述
|
||||
- speech": 表示当前视频中人物的台词
|
||||
|
||||
转录脚本和文案(由 XML 标记<PICTURE></PICTURE>和 <COPYWRITER></COPYWRITER>分隔)如下所示:
|
||||
<PICTURE>
|
||||
%s
|
||||
</PICTURE>
|
||||
|
||||
<COPYWRITER>
|
||||
%s
|
||||
</COPYWRITER>
|
||||
|
||||
在匹配的过程中,请通过确保以下条件来完成匹配:
|
||||
- 使用以下 JSON schema:
|
||||
script = {'picture': str, 'timestamp': str(时间戳), "narration": str, "OST": bool(是否开启原声)}
|
||||
Return: list[script]
|
||||
- picture: 字段表示当前画面描述,与转录脚本保持一致
|
||||
- timestamp: 字段表示某一段文案对应的画面的时间戳,不必和转录脚本的时间戳一致,应该充分考虑文案内容,匹配出与其描述最匹配的时间戳
|
||||
- 请注意,请严格的执行已经出现的画面不能重复出现,即生成的脚本中 timestamp 不能有重叠的部分。
|
||||
- narration: 字段表示需要解说文案,每段解说文案尽量不要超过30字
|
||||
- OST: 字段表示是否开启原声,即当 OST 字段为 true 时,narration 字段为空字符串,当 OST 为 false 时,narration 字段为对应的解说文案
|
||||
- 注意,在画面匹配的过程中,需要适当的加入原声播放,使得解说和画面更加匹配,请按照 1:1 的比例,生成原声和解说的脚本内容。
|
||||
- 注意,在时间戳匹配上,一定不能原样照搬“转录脚本”,应当适当的合并或者删减一些片段。
|
||||
- 注意,第一个画面一定是原声播放并且时长不少于 20 s,为了吸引观众,第一段一定是整个转录脚本中最精彩的片段。
|
||||
- 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。
|
||||
""" % (huamian, wenan)
|
||||
|
||||
try:
|
||||
response = _generate_response(prompt, llm_provider)
|
||||
logger.success("匹配成功")
|
||||
logger.debug(response)
|
||||
return response
|
||||
except Exception as err:
|
||||
return handle_exception(err)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 1. 视频转录
|
||||
# video_subject = "第二十条之无罪释放"
|
||||
# video_path = "../../resource/videos/test01.mp4"
|
||||
# language = "zh-CN"
|
||||
# gemini_video_transcription(video_subject, video_path, language)
|
||||
|
||||
# 2. 解说文案
|
||||
video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4"
|
||||
# video_path = "E:\\projects\\NarratoAI\\resource\\videos\\1.mp4"
|
||||
video_plot = """
|
||||
李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明"你儿子是你儿子"。
|
||||
走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。
|
||||
苏醒后的李牧坚决为父亲做无罪辩护,面对银行的顶级律师团队,他一个法学院大一学生,能否力挽狂澜,创作奇迹?挥法律之利剑 ,持正义之天平!
|
||||
"""
|
||||
res = generate_script(video_path, video_plot, video_name="第二十条之无罪释放")
|
||||
# res = generate_script(video_path, video_plot, video_name="海岸")
|
||||
print("脚本生成成功:\n", res)
|
||||
res = clean_model_output(res)
|
||||
aaa = json.loads(res)
|
||||
print(json.dumps(aaa, indent=2, ensure_ascii=False))
|
||||
|
||||
app/services/material.py
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import subprocess
|
||||
import random
|
||||
from urllib.parse import urlencode
|
||||
|
||||
@@ -266,7 +267,6 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
|
||||
# url_hash = utils.md5(str(uuid.uuid4()))
|
||||
video_id = f"vid-{timestamp.replace(':', '_')}"
|
||||
video_path = f"{save_dir}/{video_id}.mp4"
|
||||
|
||||
@@ -277,7 +277,7 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
|
||||
# 剪辑视频
|
||||
start, end = utils.split_timestamp(timestamp)
|
||||
video = VideoFileClip(origin_video).subclip(start, end)
|
||||
video.write_videofile(video_path)
|
||||
video.write_videofile(video_path, logger=None) # 禁用 MoviePy 的内置日志
|
||||
|
||||
if os.path.getsize(video_path) > 0 and os.path.exists(video_path):
|
||||
try:
|
||||
@@ -296,20 +296,21 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
|
||||
return {}
|
||||
|
||||
|
||||
def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, ) -> dict:
|
||||
def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None):
|
||||
"""
|
||||
剪辑视频
|
||||
Args:
|
||||
task_id: 任务id
|
||||
timestamp_terms: 需要剪辑的时间戳列表,如:['00:00-00:20', '00:36-00:40', '07:07-07:22']
|
||||
origin_video: 原视频路径
|
||||
progress_callback: 进度回调函数
|
||||
|
||||
Returns:
|
||||
剪辑后的视频路径
|
||||
"""
|
||||
video_paths = {}
|
||||
for item in timestamp_terms:
|
||||
logger.info(f"需要裁剪 '{origin_video}' 为 {len(timestamp_terms)} 个视频")
|
||||
total_items = len(timestamp_terms)
|
||||
for index, item in enumerate(timestamp_terms):
|
||||
material_directory = config.app.get("material_directory", "").strip()
|
||||
if material_directory == "task":
|
||||
material_directory = utils.task_dir(task_id)
|
||||
@@ -317,11 +318,14 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, ) -
|
||||
material_directory = ""
|
||||
|
||||
try:
|
||||
logger.info(f"clip video: {item}")
|
||||
saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory)
|
||||
if saved_video_path:
|
||||
logger.info(f"video saved: {saved_video_path}")
|
||||
video_paths.update(saved_video_path)
|
||||
|
||||
# 更新进度
|
||||
if progress_callback:
|
||||
progress_callback(index + 1, total_items)
|
||||
except Exception as e:
|
||||
logger.error(f"视频裁剪失败: {utils.to_json(item)} => {str(e)}")
|
||||
return {}
|
||||
@@ -329,6 +333,85 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, ) -
|
||||
return video_paths
|
||||
|
||||
|
||||
def merge_videos(video_paths, ost_list):
|
||||
"""
|
||||
合并多个视频为一个视频,可选择是否保留每个视频的原声。
|
||||
|
||||
:param video_paths: 视频文件路径列表
|
||||
:param ost_list: 是否保留原声的布尔值列表
|
||||
:return: 合并后的视频文件路径
|
||||
"""
|
||||
if len(video_paths) != len(ost_list):
|
||||
raise ValueError("视频路径列表和保留原声列表长度必须相同")
|
||||
|
||||
if not video_paths:
|
||||
raise ValueError("视频路径列表不能为空")
|
||||
|
||||
# 准备临时文件列表
|
||||
temp_file = "temp_file_list.txt"
|
||||
with open(temp_file, "w") as f:
|
||||
for video_path, keep_ost in zip(video_paths, ost_list):
|
||||
if keep_ost:
|
||||
f.write(f"file '{video_path}'\n")
|
||||
else:
|
||||
# 如果不保留原声,创建一个无声的临时视频
|
||||
silent_video = f"silent_{os.path.basename(video_path)}"
|
||||
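# In the ffmpeg call below, "-c:v copy" keeps the video stream without re-encoding and
# "-an" drops the audio track, producing the silent intermediate clip.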
subprocess.run(["ffmpeg", "-i", video_path, "-c:v", "copy", "-an", silent_video], check=True)
|
||||
f.write(f"file '{silent_video}'\n")
|
||||
|
||||
# 合并视频
|
||||
output_file = "combined.mp4"
|
||||
ffmpeg_cmd = [
|
||||
"ffmpeg",
|
||||
"-f", "concat",
|
||||
"-safe", "0",
|
||||
"-i", temp_file,
|
||||
"-c:v", "copy",
|
||||
"-c:a", "aac",
|
||||
"-strict", "experimental",
|
||||
output_file
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(ffmpeg_cmd, check=True)
|
||||
print(f"视频合并成功:{output_file}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"视频合并失败:{e}")
|
||||
return None
|
||||
finally:
|
||||
# 清理临时文件
|
||||
os.remove(temp_file)
|
||||
for video_path, keep_ost in zip(video_paths, ost_list):
|
||||
if not keep_ost:
|
||||
silent_video = f"silent_{os.path.basename(video_path)}"
|
||||
if os.path.exists(silent_video):
|
||||
os.remove(silent_video)
|
||||
|
||||
return output_file
|
||||
|
||||
|
||||
# 使用示例
|
||||
# if __name__ == "__main__":
|
||||
# video_paths = ['/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_17-01_37.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_00-00_06.mp4',
|
||||
# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_06-00_09.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_03-01_10.mp4',
|
||||
# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_10-01_17.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_24-00_27.mp4',
|
||||
# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_28-01_36.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_32-00_41.mp4',
|
||||
# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_36-01_58.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_12-00_15.mp4',
|
||||
# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-00_09-00_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_12-02_25.mp4',
|
||||
# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-02_03-02_12.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-01_58-02_03.mp4',
|
||||
# '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_14-03_18.mp4', '/Users/apple/Desktop/home/NarratoAI/storage/cache_videos/vid-03_18-03_20.mp4']
|
||||
#
|
||||
# ost_list = [True, False, False, False, False, False, False, False, True, False, False, False, False, False, False,
|
||||
# False]
|
||||
#
|
||||
# result = merge_videos(video_paths, ost_list)
|
||||
# if result:
|
||||
# print(f"合并后的视频文件:{result}")
|
||||
# else:
|
||||
# print("视频合并失败")
|
||||
#
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
download_videos(
|
||||
"test123", ["Money Exchange Medium"], audio_duration=100, source="pixabay"
|
||||
|
||||
app/services/subtitle.py
@@ -1,43 +1,67 @@
|
||||
import json
|
||||
import os.path
|
||||
import re
|
||||
import traceback
|
||||
from typing import Optional
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
from timeit import default_timer as timer
|
||||
from loguru import logger
|
||||
import google.generativeai as genai
|
||||
|
||||
from app.config import config
|
||||
from app.utils import utils
|
||||
|
||||
model_size = config.whisper.get("model_size", "large-v3")
|
||||
model_size = config.whisper.get("model_size", "faster-whisper-large-v2")
|
||||
device = config.whisper.get("device", "cpu")
|
||||
compute_type = config.whisper.get("compute_type", "int8")
|
||||
model = None
|
||||
|
||||
|
||||
def create(audio_file, subtitle_file: str = ""):
|
||||
"""
|
||||
为给定的音频文件创建字幕文件。
|
||||
|
||||
参数:
|
||||
- audio_file: 音频文件的路径。
|
||||
- subtitle_file: 字幕文件的输出路径(可选)。如果未提供,将根据音频文件的路径生成字幕文件。
|
||||
|
||||
返回:
|
||||
无返回值,但会在指定路径生成字幕文件。
|
||||
"""
|
||||
global model
|
||||
if not model:
|
||||
model_path = f"{utils.root_dir()}/models/whisper-{model_size}"
|
||||
model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2"
|
||||
model_bin_file = f"{model_path}/model.bin"
|
||||
if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
|
||||
model_path = model_size
|
||||
logger.error(
|
||||
"请先下载 whisper 模型\n\n"
|
||||
"********************************************\n"
|
||||
"下载地址:https://huggingface.co/guillaumekln/faster-whisper-large-v2\n"
|
||||
"存放路径:app/models \n"
|
||||
"********************************************\n"
|
||||
)
|
||||
return None
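# One way to fetch the model manually (assumption, not part of this change):
#   git lfs install
#   git clone https://huggingface.co/guillaumekln/faster-whisper-large-v2 app/models/faster-whisper-large-v2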
|
||||
|
||||
logger.info(
|
||||
f"loading model: {model_path}, device: {device}, compute_type: {compute_type}"
|
||||
f"加载模型: {model_path}, 设备: {device}, 计算类型: {compute_type}"
|
||||
)
|
||||
try:
|
||||
model = WhisperModel(
|
||||
model_size_or_path=model_path, device=device, compute_type=compute_type
|
||||
model_size_or_path=model_path,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
local_files_only=True
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"failed to load model: {e} \n\n"
|
||||
f"加载模型失败: {e} \n\n"
|
||||
f"********************************************\n"
|
||||
f"this may be caused by network issue. \n"
|
||||
f"please download the model manually and put it in the 'models' folder. \n"
|
||||
f"see [README.md FAQ](https://github.com/harry0703/NarratoAI) for more details.\n"
|
||||
f"这可能是由网络问题引起的. \n"
|
||||
f"请手动下载模型并将其放入 'app/models' 文件夹中。 \n"
|
||||
f"see [README.md FAQ](https://github.com/linyqh/NarratoAI) for more details.\n"
|
||||
f"********************************************\n\n"
|
||||
f"{traceback.format_exc()}"
|
||||
)
|
||||
return None
|
||||
|
||||
@@ -51,10 +75,11 @@ def create(audio_file, subtitle_file: str = ""):
|
||||
word_timestamps=True,
|
||||
vad_filter=True,
|
||||
vad_parameters=dict(min_silence_duration_ms=500),
|
||||
initial_prompt="以下是普通话的句子"
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"detected language: '{info.language}', probability: {info.language_probability:.2f}"
|
||||
f"检测到的语言: '{info.language}', probability: {info.language_probability:.2f}"
|
||||
)
|
||||
|
||||
start = timer()
|
||||
@@ -137,6 +162,15 @@ def create(audio_file, subtitle_file: str = ""):
|
||||
|
||||
|
||||
def file_to_subtitles(filename):
|
||||
"""
|
||||
将字幕文件转换为字幕列表。
|
||||
|
||||
参数:
|
||||
filename (str): 字幕文件的路径。
|
||||
|
||||
返回:
|
||||
list: 包含字幕序号、出现时间、和字幕文本的元组列表。
|
||||
"""
|
||||
if not filename or not os.path.isfile(filename):
|
||||
return []
|
||||
|
||||
@@ -278,22 +312,61 @@ def correct(subtitle_file, video_script)
|
||||
logger.success("Subtitle is correct")
|
||||
|
||||
|
||||
def create_with_gemini(audio_file: str, subtitle_file: str = "", api_key: Optional[str] = None) -> Optional[str]:
|
||||
if not api_key:
|
||||
logger.error("Gemini API key is not provided")
|
||||
return None
|
||||
|
||||
genai.configure(api_key=api_key)
|
||||
|
||||
logger.info(f"开始使用Gemini模型处理音频文件: {audio_file}")
|
||||
|
||||
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
|
||||
prompt = "生成这段语音的转录文本。请以SRT格式输出,包含时间戳。"
|
||||
|
||||
try:
|
||||
with open(audio_file, "rb") as f:
|
||||
audio_data = f.read()
|
||||
|
||||
response = model.generate_content([prompt, audio_data])
|
||||
transcript = response.text
|
||||
|
||||
if not subtitle_file:
|
||||
subtitle_file = f"{audio_file}.srt"
|
||||
|
||||
with open(subtitle_file, "w", encoding="utf-8") as f:
|
||||
f.write(transcript)
|
||||
|
||||
logger.info(f"Gemini生成的字幕文件已保存: {subtitle_file}")
|
||||
return subtitle_file
|
||||
except Exception as e:
|
||||
logger.error(f"使用Gemini处理音频时出错: {e}")
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072"
|
||||
task_id = "test456"
|
||||
task_dir = utils.task_dir(task_id)
|
||||
subtitle_file = f"{task_dir}/subtitle.srt"
|
||||
audio_file = f"{task_dir}/audio.mp3"
|
||||
audio_file = f"{task_dir}/audio.wav"
|
||||
|
||||
subtitles = file_to_subtitles(subtitle_file)
|
||||
print(subtitles)
|
||||
|
||||
script_file = f"{task_dir}/script.json"
|
||||
with open(script_file, "r") as f:
|
||||
script_content = f.read()
|
||||
s = json.loads(script_content)
|
||||
script = s.get("script")
|
||||
# script_file = f"{task_dir}/script.json"
|
||||
# with open(script_file, "r") as f:
|
||||
# script_content = f.read()
|
||||
# s = json.loads(script_content)
|
||||
# script = s.get("script")
|
||||
#
|
||||
# correct(subtitle_file, script)
|
||||
|
||||
correct(subtitle_file, script)
|
||||
|
||||
subtitle_file = f"{task_dir}/subtitle-test.srt"
|
||||
subtitle_file = f"{task_dir}/subtitle111.srt"
|
||||
create(audio_file, subtitle_file)
|
||||
|
||||
# # 使用Gemini模型处理音频
|
||||
# gemini_api_key = config.app.get("gemini_api_key") # 请替换为实际的API密钥
|
||||
# gemini_subtitle_file = create_with_gemini(audio_file, api_key=gemini_api_key)
|
||||
#
|
||||
# if gemini_subtitle_file:
|
||||
# print(f"Gemini生成的字幕文件: {gemini_subtitle_file}")
|
||||
|
||||
app/services/task.py
@@ -2,15 +2,14 @@ import math
|
||||
import json
|
||||
import os.path
|
||||
import re
|
||||
import traceback
|
||||
from os import path
|
||||
|
||||
from edge_tts import SubMaker
|
||||
from loguru import logger
|
||||
|
||||
from app.config import config
|
||||
from app.models import const
|
||||
from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams
|
||||
from app.services import llm, material, subtitle, video, voice
|
||||
from app.services import llm, material, subtitle, video, voice, audio_merger
|
||||
from app.services import state as sm
|
||||
from app.utils import utils
|
||||
|
||||
@@ -99,7 +98,7 @@ def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
|
||||
if not params.subtitle_enabled:
|
||||
return ""
|
||||
|
||||
subtitle_path = path.join(utils.task_dir(task_id), "subtitle.srt")
|
||||
subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
|
||||
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
|
||||
logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
|
||||
|
||||
@@ -213,7 +212,7 @@ def start(task_id, params: VideoParams, stop_at: str = "video"):
|
||||
|
||||
if type(params.video_concat_mode) is str:
|
||||
params.video_concat_mode = VideoConcatMode(params.video_concat_mode)
|
||||
|
||||
|
||||
# 1. Generate script
|
||||
video_script = generate_script(task_id, params)
|
||||
if not video_script:
|
||||
@@ -325,51 +324,63 @@ def start(task_id, params: VideoParams, stop_at: str = "video"):
|
||||
return kwargs
|
||||
|
||||
|
||||
def start_subclip(task_id, params: VideoClipParams, subclip_path_videos):
|
||||
def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: list):
|
||||
"""
|
||||
后台任务(自动剪辑视频进行剪辑)
|
||||
|
||||
task_id: 任务ID
|
||||
params: 剪辑参数
|
||||
subclip_path_videos: 视频文件路径
|
||||
|
||||
"""
|
||||
logger.info(f"\n\n## 开始任务: {task_id}")
|
||||
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
|
||||
|
||||
# tts 角色名称
|
||||
voice_name = voice.parse_voice_name(params.voice_name)
|
||||
# voice_name = 'zh-CN-XiaoyiNeural'
|
||||
paragraph_number = params.paragraph_number
|
||||
n_threads = params.n_threads
|
||||
max_clip_duration = params.video_clip_duration
|
||||
|
||||
logger.info("\n\n## 1. 读取json")
|
||||
video_script_path = path.join(params.video_clip_json)
|
||||
logger.info("\n\n## 1. 加载视频脚本")
|
||||
video_script_path = path.join(params.video_clip_json_path)
|
||||
# video_script_path = video_clip_json_path
|
||||
# 判断json文件是否存在
|
||||
if path.exists(video_script_path):
|
||||
# 读取json文件内容,并转为dict
|
||||
with open(video_script_path, "r", encoding="utf-8") as f:
|
||||
list_script = json.load(f)
|
||||
video_list = [i['narration'] for i in list_script]
|
||||
time_list = [i['timestamp'] for i in list_script]
|
||||
|
||||
video_script = " ".join(video_list)
|
||||
logger.debug(f"原json脚本: \n{video_script}")
|
||||
logger.debug(f"原json时间戳: \n{time_list}")
|
||||
try:
|
||||
with open(video_script_path, "r", encoding="utf-8") as f:
|
||||
list_script = json.load(f)
|
||||
video_list = [i['narration'] for i in list_script]
|
||||
video_ost = [i['OST'] for i in list_script]
|
||||
time_list = [i['timestamp'] for i in list_script]
|
||||
|
||||
video_script = " ".join(video_list)
|
||||
logger.debug(f"解说完整脚本: \n{video_script}")
|
||||
logger.debug(f"解说 OST 列表: \n{video_ost}")
|
||||
logger.debug(f"解说时间戳列表: \n{time_list}")
|
||||
# 获取视频总时长(单位 s)
|
||||
total_duration = list_script[-1]['new_timestamp']
|
||||
total_duration = int(total_duration.split("-")[1].split(":")[0]) * 60 + int(
|
||||
total_duration.split("-")[1].split(":")[1])
|
||||
except Exception as e:
|
||||
logger.error(f"无法读取视频json脚本,请检查配置是否正确。{e}")
|
||||
raise ValueError("无法读取视频json脚本,请检查配置是否正确")
|
||||
else:
|
||||
raise ValueError("解说文案不存在!检查文案名称是否正确。")
|
||||
logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
|
||||
raise ValueError("解说脚本不存在!请检查配置是否正确。")
|
||||
|
||||
# video_script = llm.text_polishing(context=video_script, language=params.video_language)
|
||||
# logger.debug(f"润色后的视频脚本: \n{video_script}")
|
||||
# sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
|
||||
|
||||
logger.info("\n\n## 2. 生成音频")
|
||||
audio_file = path.join(utils.task_dir(task_id), f"audio.mp3")
|
||||
sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_file=audio_file, voice_rate=params.voice_rate)
|
||||
if sub_maker is None:
|
||||
logger.info("\n\n## 2. 生成音频列表")
|
||||
audio_files, sub_maker_list = voice.tts_multiple(
|
||||
task_id=task_id,
|
||||
list_script=list_script,
|
||||
voice_name=voice_name,
|
||||
voice_rate=params.voice_rate,
|
||||
force_regenerate=True
|
||||
)
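# tts_multiple presumably writes one clip per script segment, named with its timestamp
# (e.g. audio_00:06-00:24.mp3), so audio_merger can overlay each clip at the right offset below.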
|
||||
if audio_files is None:
|
||||
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
|
||||
logger.error(
|
||||
"无法生成音频,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频")
|
||||
"音频文件为空,可能是网络不可用。如果您在中国,请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频")
|
||||
return
|
||||
|
||||
audio_duration = voice.get_audio_duration(sub_maker)
|
||||
audio_duration = math.ceil(audio_duration)
|
||||
logger.info(f"合并音频:\n\n {audio_files}")
|
||||
audio_file = audio_merger.merge_audio_files(task_id, audio_files, total_duration, list_script)
|
||||
|
||||
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
|
||||
|
||||
@@ -378,17 +389,8 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos):
|
||||
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
|
||||
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
|
||||
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
|
||||
subtitle_fallback = False
|
||||
if subtitle_provider == "edge":
|
||||
voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path)
|
||||
if not os.path.exists(subtitle_path):
|
||||
subtitle_fallback = True
|
||||
logger.warning("找不到字幕文件,回退到whisper")
|
||||
|
||||
if subtitle_provider == "whisper" or subtitle_fallback:
|
||||
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
|
||||
logger.info("\n\n## 更正字幕")
|
||||
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
|
||||
# 使用 faster-whisper-large-v2 模型生成字幕
|
||||
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
|
||||
|
||||
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
|
||||
if not subtitle_lines:
|
||||
@@ -399,10 +401,6 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos):
|
||||
|
||||
logger.info("\n\n## 4. 裁剪视频")
|
||||
subclip_videos = [x for x in subclip_path_videos.values()]
|
||||
# subclip_videos = material.clip_videos(task_id=task_id,
|
||||
# timestamp_terms=time_list,
|
||||
# origin_video=params.video_origin_path
|
||||
# )
|
||||
logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}")
|
||||
|
||||
if not subclip_videos:
|
||||
@ -417,36 +415,39 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos):
|
||||
combined_video_paths = []
|
||||
|
||||
_progress = 50
|
||||
for i in range(params.video_count):
|
||||
index = i + 1
|
||||
combined_video_path = path.join(utils.task_dir(task_id), f"combined-{index}.mp4")
|
||||
logger.info(f"\n\n## 5. 合并视频: {index} => {combined_video_path}")
|
||||
video.combine_clip_videos(combined_video_path=combined_video_path,
|
||||
video_paths=subclip_videos,
|
||||
video_script_list=video_list,
|
||||
audio_file=audio_file,
|
||||
video_aspect=params.video_aspect,
|
||||
threads=n_threads)
|
||||
index = 1
|
||||
combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
|
||||
logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}")
|
||||
|
||||
_progress += 50 / params.video_count / 2
|
||||
sm.state.update_task(task_id, progress=_progress)
|
||||
video.combine_clip_videos(
|
||||
combined_video_path=combined_video_path,
|
||||
video_paths=subclip_videos,
|
||||
video_ost_list=video_ost,
|
||||
list_script=list_script,
|
||||
video_aspect=params.video_aspect,
|
||||
threads=params.n_threads # 多线程
|
||||
)
|
||||
|
||||
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
|
||||
_progress += 50 / 2
|
||||
sm.state.update_task(task_id, progress=_progress)
|
||||
|
||||
logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}")
|
||||
# 把所有素材合成到一起
|
||||
video.generate_video(video_path=combined_video_path,
|
||||
audio_path=audio_file,
|
||||
subtitle_path=subtitle_path,
|
||||
output_file=final_video_path,
|
||||
params=params,
|
||||
)
|
||||
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
|
||||
|
||||
_progress += 50 / params.video_count / 2
|
||||
sm.state.update_task(task_id, progress=_progress)
|
||||
logger.info(f"\n\n## 6. 最后一步: {index} => {final_video_path}")
|
||||
# 把所有素材合成到一起
|
||||
video.generate_video_v2(
|
||||
video_path=combined_video_path,
|
||||
audio_path=audio_file,
|
||||
subtitle_path=subtitle_path,
|
||||
output_file=final_video_path,
|
||||
params=params,
|
||||
)
|
||||
|
||||
final_video_paths.append(final_video_path)
|
||||
combined_video_paths.append(combined_video_path)
|
||||
_progress += 50 / 2
|
||||
sm.state.update_task(task_id, progress=_progress)
|
||||
|
||||
final_video_paths.append(final_video_path)
|
||||
combined_video_paths.append(combined_video_path)
|
||||
|
||||
logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.")
|
||||
|
||||
@ -459,11 +460,35 @@ def start_subclip(task_id, params: VideoClipParams, subclip_path_videos):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
task_id = "task_id"
|
||||
params = VideoParams(
|
||||
video_subject="金钱的作用",
|
||||
voice_name="zh-CN-XiaoyiNeural-Female",
|
||||
voice_rate=1.0,
|
||||
# task_id = "test123"
|
||||
# subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4',
|
||||
# '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4',
|
||||
# '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4',
|
||||
# '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4',
|
||||
# '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4',
|
||||
# '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4',
|
||||
# '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4',
|
||||
# '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'}
|
||||
#
|
||||
# params = VideoClipParams(
|
||||
# video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json",
|
||||
# video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4",
|
||||
# )
|
||||
# start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
|
||||
|
||||
task_id = "test456"
|
||||
subclip_path_videos = {'01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4',
|
||||
'01:58-02:04': './storage/cache_videos/vid-01_58-02_04.mp4',
|
||||
'02:25-02:31': './storage/cache_videos/vid-02_25-02_31.mp4',
|
||||
'01:28-01:33': './storage/cache_videos/vid-01_28-01_33.mp4',
|
||||
'03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4',
|
||||
'00:24-00:28': './storage/cache_videos/vid-00_24-00_28.mp4',
|
||||
'03:02-03:08': './storage/cache_videos/vid-03_02-03_08.mp4',
|
||||
'00:41-00:44': './storage/cache_videos/vid-00_41-00_44.mp4',
|
||||
'02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4'}
|
||||
|
||||
params = VideoClipParams(
|
||||
video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json",
|
||||
video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4",
|
||||
)
|
||||
start(task_id, params, stop_at="video")
|
||||
start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
import re
|
||||
import os
|
||||
import glob
|
||||
import random
|
||||
from typing import List
|
||||
@ -216,9 +218,7 @@ def generate_video(
|
||||
logger.info(f" ③ subtitle: {subtitle_path}")
|
||||
logger.info(f" ④ output: {output_file}")
|
||||
|
||||
# https://github.com/harry0703/NarratoAI/issues/217
|
||||
# PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'final-1.mp4.tempTEMP_MPY_wvf_snd.mp3'
|
||||
# write into the same directory as the output file
|
||||
# 写入与输出文件相同的目录
|
||||
output_dir = os.path.dirname(output_file)
|
||||
|
||||
font_path = ""
|
||||
@ -294,13 +294,160 @@ def generate_video(
|
||||
output_file,
|
||||
audio_codec="aac",
|
||||
temp_audiofile_path=output_dir,
|
||||
threads=params.n_threads or 2,
|
||||
threads=params.n_threads,
|
||||
logger=None,
|
||||
fps=30,
|
||||
)
|
||||
video_clip.close()
|
||||
del video_clip
|
||||
logger.success("completed")
|
||||
logger.success(""
|
||||
"completed")
|
||||
|
||||
|
||||
def generate_video_v2(
|
||||
video_path: str,
|
||||
audio_path: str,
|
||||
subtitle_path: str,
|
||||
output_file: str,
|
||||
params: Union[VideoParams, VideoClipParams],
|
||||
):
|
||||
"""
|
||||
合并所有素材
|
||||
Args:
|
||||
video_path: 视频路径
|
||||
audio_path: 单个音频文件路径
|
||||
subtitle_path: 字幕文件路径
|
||||
output_file: 输出文件路径
|
||||
params: 视频参数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
aspect = VideoAspect(params.video_aspect)
|
||||
video_width, video_height = aspect.to_resolution()
|
||||
|
||||
logger.info(f"开始,视频尺寸: {video_width} x {video_height}")
|
||||
logger.info(f" ① 视频: {video_path}")
|
||||
logger.info(f" ② 音频: {audio_path}")
|
||||
logger.info(f" ③ 字幕: {subtitle_path}")
|
||||
logger.info(f" ④ 输出: {output_file}")
|
||||
|
||||
# 写入与输出文件相同的目录
|
||||
output_dir = os.path.dirname(output_file)
|
||||
|
||||
# 字体设置部分保持不变
|
||||
font_path = ""
|
||||
if params.subtitle_enabled:
|
||||
if not params.font_name:
|
||||
params.font_name = "STHeitiMedium.ttc"
|
||||
font_path = os.path.join(utils.font_dir(), params.font_name)
|
||||
if os.name == "nt":
|
||||
font_path = font_path.replace("\\", "/")
|
||||
logger.info(f"使用字体: {font_path}")
|
||||
|
||||
# create_text_clip 函数保持不变
|
||||
def create_text_clip(subtitle_item):
|
||||
phrase = subtitle_item[1]
|
||||
max_width = video_width * 0.9
|
||||
wrapped_txt, txt_height = wrap_text(
|
||||
phrase, max_width=max_width, font=font_path, fontsize=params.font_size
|
||||
)
|
||||
_clip = TextClip(
|
||||
wrapped_txt,
|
||||
font=font_path,
|
||||
fontsize=params.font_size,
|
||||
color=params.text_fore_color,
|
||||
bg_color=params.text_background_color,
|
||||
stroke_color=params.stroke_color,
|
||||
stroke_width=params.stroke_width,
|
||||
print_cmd=False,
|
||||
)
|
||||
duration = subtitle_item[0][1] - subtitle_item[0][0]
|
||||
_clip = _clip.set_start(subtitle_item[0][0])
|
||||
_clip = _clip.set_end(subtitle_item[0][1])
|
||||
_clip = _clip.set_duration(duration)
|
||||
if params.subtitle_position == "bottom":
|
||||
_clip = _clip.set_position(("center", video_height * 0.95 - _clip.h))
|
||||
elif params.subtitle_position == "top":
|
||||
_clip = _clip.set_position(("center", video_height * 0.05))
|
||||
elif params.subtitle_position == "custom":
|
||||
# 确保字幕完全在屏幕内
|
||||
margin = 10 # 额外的边距,单位为像素
|
||||
max_y = video_height - _clip.h - margin
|
||||
min_y = margin
|
||||
custom_y = (video_height - _clip.h) * (params.custom_position / 100)
|
||||
custom_y = max(min_y, min(custom_y, max_y)) # 限制 y 值在有效范围内
|
||||
_clip = _clip.set_position(("center", custom_y))
|
||||
else: # center
|
||||
_clip = _clip.set_position(("center", "center"))
|
||||
return _clip
|
||||
|
||||
video_clip = VideoFileClip(video_path)
|
||||
original_audio = video_clip.audio # 保存原始视频的音轨
|
||||
video_duration = video_clip.duration
|
||||
|
||||
# 处理新的音频文件
|
||||
new_audio = AudioFileClip(audio_path).volumex(params.voice_volume)
|
||||
|
||||
# 字幕处理部分
|
||||
if subtitle_path and os.path.exists(subtitle_path):
|
||||
sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8")
|
||||
text_clips = []
|
||||
|
||||
for item in sub.subtitles:
|
||||
clip = create_text_clip(subtitle_item=item)
|
||||
|
||||
# 确保字幕的开始时间不早于视频开始
|
||||
start_time = max(clip.start, 0)
|
||||
|
||||
# 如果字幕的开始时间晚于视频结束时间,则跳过此字幕
|
||||
if start_time >= video_duration:
|
||||
continue
|
||||
|
||||
# 调整字幕的结束时间,但不要超过视频长度
|
||||
end_time = min(clip.end, video_duration)
|
||||
|
||||
# 调整字幕的时间范围
|
||||
clip = clip.set_start(start_time).set_end(end_time)
|
||||
|
||||
text_clips.append(clip)
|
||||
|
||||
logger.info(f"处理了 {len(text_clips)} 段字幕")
|
||||
|
||||
# 创建一个新的视频剪辑,包含所有字幕
|
||||
video_clip = CompositeVideoClip([video_clip, *text_clips])
|
||||
|
||||
# 背景音乐处理部分
|
||||
bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
|
||||
|
||||
# 合并音频轨道
|
||||
audio_tracks = [original_audio, new_audio]
|
||||
|
||||
if bgm_file:
|
||||
try:
|
||||
bgm_clip = (
|
||||
AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3)
|
||||
)
|
||||
bgm_clip = afx.audio_loop(bgm_clip, duration=video_duration)
|
||||
audio_tracks.append(bgm_clip)
|
||||
except Exception as e:
|
||||
logger.error(f"添加背景音乐失败: {str(e)}")
|
||||
|
||||
# 合并所有音频轨道
|
||||
final_audio = CompositeAudioClip(audio_tracks)
|
||||
|
||||
video_clip = video_clip.set_audio(final_audio)
|
||||
video_clip.write_videofile(
|
||||
output_file,
|
||||
audio_codec="aac",
|
||||
temp_audiofile_path=output_dir,
|
||||
threads=params.n_threads,
|
||||
logger=None,
|
||||
fps=30,
|
||||
)
|
||||
video_clip.close()
|
||||
del video_clip
|
||||
logger.success("完成")
|
||||
|
||||
|
||||
def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
|
||||
@ -352,8 +499,8 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
|
||||
|
||||
def combine_clip_videos(combined_video_path: str,
|
||||
video_paths: List[str],
|
||||
video_script_list: List[str],
|
||||
audio_file: str,
|
||||
video_ost_list: List[bool],
|
||||
list_script: list,
|
||||
video_aspect: VideoAspect = VideoAspect.portrait,
|
||||
threads: int = 2,
|
||||
) -> str:
|
||||
@ -362,15 +509,16 @@ def combine_clip_videos(combined_video_path: str,
|
||||
Args:
|
||||
combined_video_path: 合并后的存储路径
|
||||
video_paths: 子视频路径列表
|
||||
audio_file: mp3旁白
|
||||
video_ost_list: 原声播放列表
|
||||
list_script: 剪辑脚本
|
||||
video_aspect: 屏幕比例
|
||||
threads: 线程数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
audio_clip = AudioFileClip(audio_file)
|
||||
audio_duration = audio_clip.duration
|
||||
from app.utils.utils import calculate_total_duration
|
||||
audio_duration = calculate_total_duration(list_script)
|
||||
logger.info(f"音频的最大持续时间: {audio_duration} s")
|
||||
# 每个剪辑所需的持续时间
|
||||
req_dur = audio_duration / len(video_paths)
|
||||
@ -384,62 +532,57 @@ def combine_clip_videos(combined_video_path: str,
|
||||
clips = []
|
||||
video_duration = 0
|
||||
# 一遍又一遍地添加下载的剪辑,直到达到音频的持续时间 (max_duration)
|
||||
while video_duration < audio_duration:
|
||||
for video_path, video_script in zip(video_paths, video_script_list):
|
||||
clip = VideoFileClip(video_path).without_audio()
|
||||
# 检查剪辑是否比剩余音频长
|
||||
if (audio_duration - video_duration) < clip.duration:
|
||||
clip = clip.subclip(0, (audio_duration - video_duration))
|
||||
# 仅当计算出的剪辑长度 (req_dur) 短于实际剪辑时,才缩短剪辑以防止静止图像
|
||||
elif req_dur < clip.duration:
|
||||
clip = clip.subclip(0, req_dur)
|
||||
clip = clip.set_fps(30)
|
||||
# while video_duration < audio_duration:
|
||||
for video_path, video_ost in zip(video_paths, video_ost_list):
|
||||
cache_video_path = utils.root_dir()
|
||||
clip = VideoFileClip(os.path.join(cache_video_path, video_path))
|
||||
# 通过 ost 字段判断是否播放原声
|
||||
if not video_ost:
|
||||
clip = clip.without_audio()
|
||||
# # 检查剪辑是否比剩余音频长
|
||||
# if (audio_duration - video_duration) < clip.duration:
|
||||
# clip = clip.subclip(0, (audio_duration - video_duration))
|
||||
# # 仅当计算出的剪辑长度 (req_dur) 短于实际剪辑时,才缩短剪辑以防止静止图像
|
||||
# elif req_dur < clip.duration:
|
||||
# clip = clip.subclip(0, req_dur)
|
||||
clip = clip.set_fps(30)
|
||||
|
||||
# 并非所有视频的大小都相同,因此我们需要调整它们的大小
|
||||
clip_w, clip_h = clip.size
|
||||
if clip_w != video_width or clip_h != video_height:
|
||||
clip_ratio = clip.w / clip.h
|
||||
video_ratio = video_width / video_height
|
||||
# 并非所有视频的大小都相同,因此我们需要调整它们的大小
|
||||
clip_w, clip_h = clip.size
|
||||
if clip_w != video_width or clip_h != video_height:
|
||||
clip_ratio = clip.w / clip.h
|
||||
video_ratio = video_width / video_height
|
||||
|
||||
if clip_ratio == video_ratio:
|
||||
# 等比例缩放
|
||||
clip = clip.resize((video_width, video_height))
|
||||
if clip_ratio == video_ratio:
|
||||
# 等比例缩放
|
||||
clip = clip.resize((video_width, video_height))
|
||||
else:
|
||||
# 等比缩放视频
|
||||
if clip_ratio > video_ratio:
|
||||
# 按照目标宽度等比缩放
|
||||
scale_factor = video_width / clip_w
|
||||
else:
|
||||
# 等比缩放视频
|
||||
if clip_ratio > video_ratio:
|
||||
# 按照目标宽度等比缩放
|
||||
scale_factor = video_width / clip_w
|
||||
else:
|
||||
# 按照目标高度等比缩放
|
||||
scale_factor = video_height / clip_h
|
||||
# 按照目标高度等比缩放
|
||||
scale_factor = video_height / clip_h
|
||||
|
||||
new_width = int(clip_w * scale_factor)
|
||||
new_height = int(clip_h * scale_factor)
|
||||
clip_resized = clip.resize(newsize=(new_width, new_height))
|
||||
new_width = int(clip_w * scale_factor)
|
||||
new_height = int(clip_h * scale_factor)
|
||||
clip_resized = clip.resize(newsize=(new_width, new_height))
|
||||
|
||||
background = ColorClip(size=(video_width, video_height), color=(0, 0, 0))
|
||||
clip = CompositeVideoClip([
|
||||
background.set_duration(clip.duration),
|
||||
clip_resized.set_position("center")
|
||||
])
|
||||
background = ColorClip(size=(video_width, video_height), color=(0, 0, 0))
|
||||
clip = CompositeVideoClip([
|
||||
background.set_duration(clip.duration),
|
||||
clip_resized.set_position("center")
|
||||
])
|
||||
|
||||
logger.info(f"将视频 {video_path} 大小调整为 {video_width} x {video_height}, 剪辑尺寸: {clip_w} x {clip_h}")
|
||||
logger.info(f"将视频 {video_path} 大小调整为 {video_width} x {video_height}, 剪辑尺寸: {clip_w} x {clip_h}")
|
||||
|
||||
# TODO: 片段时长过长时,需要缩短,但暂时没有好的解决方案
|
||||
# if clip.duration > 5:
|
||||
# ctime = utils.reduce_video_time(txt=video_script)
|
||||
# if clip.duration > (2 * ctime):
|
||||
# clip = clip.subclip(ctime, 2*ctime)
|
||||
# else:
|
||||
# clip = clip.subclip(0, ctime)
|
||||
# logger.info(f"视频 {video_path} 片段时长较长,将剪辑时长缩短至 {ctime} 秒")
|
||||
|
||||
clips.append(clip)
|
||||
video_duration += clip.duration
|
||||
clips.append(clip)
|
||||
video_duration += clip.duration
|
||||
|
||||
video_clip = concatenate_videoclips(clips)
|
||||
video_clip = video_clip.set_fps(30)
|
||||
logger.info(f"合并中...")
|
||||
logger.info(f"合并视频中...")
|
||||
video_clip.write_videofile(filename=combined_video_path,
|
||||
threads=threads,
|
||||
logger=None,
|
||||
@ -453,68 +596,81 @@ def combine_clip_videos(combined_video_path: str,
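The resize branch above letterboxes clips whose aspect ratio differs from the target: scale proportionally, then center the clip on a black background. Pulled out on its own, the calculation is roughly the following sketch (moviepy 1.x assumed, not a line-for-line equivalent of the project code):

from moviepy.editor import ColorClip, CompositeVideoClip

def letterbox(clip, target_w: int, target_h: int):
    # Scale by the smaller ratio so the clip fits inside the target frame
    scale = min(target_w / clip.w, target_h / clip.h)
    resized = clip.resize(newsize=(int(clip.w * scale), int(clip.h * scale)))
    background = ColorClip(size=(target_w, target_h), color=(0, 0, 0)).set_duration(clip.duration)
    return CompositeVideoClip([background, resized.set_position("center")])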
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from app.utils import utils
|
||||
# combined_video_path = "../../storage/tasks/12312312/com123.mp4"
|
||||
#
|
||||
# video_paths = ['../../storage/cache_videos/vid-00_00-00_03.mp4',
|
||||
# '../../storage/cache_videos/vid-00_03-00_07.mp4',
|
||||
# '../../storage/cache_videos/vid-00_12-00_17.mp4',
|
||||
# '../../storage/cache_videos/vid-00_26-00_31.mp4']
|
||||
# video_ost_list = [False, True, False, True]
|
||||
# list_script = [
|
||||
# {
|
||||
# "picture": "夜晚,一个小孩在树林里奔跑,后面有人拿着火把在追赶",
|
||||
# "timestamp": "00:00-00:03",
|
||||
# "narration": "夜黑风高的树林,一个小孩在拼命奔跑,后面的人穷追不舍!",
|
||||
# "OST": False,
|
||||
# "new_timestamp": "00:00-00:03"
|
||||
# },
|
||||
# {
|
||||
# "picture": "追赶的人命令抓住小孩",
|
||||
# "timestamp": "00:03-00:07",
|
||||
# "narration": "原声播放1",
|
||||
# "OST": True,
|
||||
# "new_timestamp": "00:03-00:07"
|
||||
# },
|
||||
# {
|
||||
# "picture": "小孩躲在草丛里,黑衣人用脚踢了踢他",
|
||||
# "timestamp": "00:12-00:17",
|
||||
# "narration": "小孩脱下外套,跑进树林, 一路奔跑,直到第二天清晨",
|
||||
# "OST": False,
|
||||
# "new_timestamp": "00:07-00:12"
|
||||
# },
|
||||
# {
|
||||
# "picture": "小孩跑到车前,慌慌张张地对女人说有人要杀他",
|
||||
# "timestamp": "00:26-00:31",
|
||||
# "narration": "原声播放2",
|
||||
# "OST": True,
|
||||
# "new_timestamp": "00:12-00:17"
|
||||
# }
|
||||
# ]
|
||||
# combine_clip_videos(combined_video_path=combined_video_path, video_paths=video_paths, video_ost_list=video_ost_list, list_script=list_script)
|
||||
|
||||
suffix = "*.mp4"
|
||||
song_dir = utils.video_dir()
|
||||
files = glob.glob(os.path.join(song_dir, suffix))
|
||||
cfg = VideoClipParams()
|
||||
cfg.video_aspect = VideoAspect.portrait
|
||||
cfg.font_name = "STHeitiMedium.ttc"
|
||||
cfg.font_size = 60
|
||||
cfg.stroke_color = "#000000"
|
||||
cfg.stroke_width = 1.5
|
||||
cfg.text_fore_color = "#FFFFFF"
|
||||
cfg.text_background_color = "transparent"
|
||||
cfg.bgm_type = "random"
|
||||
cfg.bgm_file = ""
|
||||
cfg.bgm_volume = 1.0
|
||||
cfg.subtitle_enabled = True
|
||||
cfg.subtitle_position = "bottom"
|
||||
cfg.n_threads = 2
|
||||
cfg.paragraph_number = 1
|
||||
|
||||
print(files)
|
||||
cfg.voice_volume = 1.0
|
||||
|
||||
# m = MaterialInfo()
|
||||
# m.url = "/Users/harry/Downloads/IMG_2915.JPG"
|
||||
# m.provider = "local"
|
||||
# materials = preprocess_video([m], clip_duration=4)
|
||||
# print(materials)
|
||||
|
||||
# txt_en = "Here's your guide to travel hacks for budget-friendly adventures"
|
||||
# txt_zh = "测试长字段这是您的旅行技巧指南帮助您进行预算友好的冒险"
|
||||
# font = utils.resource_dir() + "/fonts/STHeitiMedium.ttc"
|
||||
# for txt in [txt_en, txt_zh]:
|
||||
# t, h = wrap_text(text=txt, max_width=1000, font=font, fontsize=60)
|
||||
# print(t)
|
||||
#
|
||||
# task_id = "aa563149-a7ea-49c2-b39f-8c32cc225baf"
|
||||
# task_dir = utils.task_dir(task_id)
|
||||
# video_file = f"{task_dir}/combined-1.mp4"
|
||||
# audio_file = f"{task_dir}/audio.mp3"
|
||||
# subtitle_file = f"{task_dir}/subtitle.srt"
|
||||
# output_file = f"{task_dir}/final.mp4"
|
||||
#
|
||||
# # video_paths = []
|
||||
# # for file in os.listdir(utils.storage_dir("test")):
|
||||
# # if file.endswith(".mp4"):
|
||||
# # video_paths.append(os.path.join(utils.storage_dir("test"), file))
|
||||
# #
|
||||
# # combine_videos(combined_video_path=video_file,
|
||||
# # audio_file=audio_file,
|
||||
# # video_paths=video_paths,
|
||||
# # video_aspect=VideoAspect.portrait,
|
||||
# # video_concat_mode=VideoConcatMode.random,
|
||||
# # max_clip_duration=5,
|
||||
# # threads=2)
|
||||
#
|
||||
# cfg = VideoParams()
|
||||
# cfg.video_aspect = VideoAspect.portrait
|
||||
# cfg.font_name = "STHeitiMedium.ttc"
|
||||
# cfg.font_size = 60
|
||||
# cfg.stroke_color = "#000000"
|
||||
# cfg.stroke_width = 1.5
|
||||
# cfg.text_fore_color = "#FFFFFF"
|
||||
# cfg.text_background_color = "transparent"
|
||||
# cfg.bgm_type = "random"
|
||||
# cfg.bgm_file = ""
|
||||
# cfg.bgm_volume = 1.0
|
||||
# cfg.subtitle_enabled = True
|
||||
# cfg.subtitle_position = "bottom"
|
||||
# cfg.n_threads = 2
|
||||
# cfg.paragraph_number = 1
|
||||
#
|
||||
# cfg.voice_volume = 1.0
|
||||
#
|
||||
# generate_video(video_path=video_file,
|
||||
# audio_path=audio_file,
|
||||
# subtitle_path=subtitle_file,
|
||||
# output_file=output_file,
|
||||
# params=cfg
|
||||
# )
|
||||
|
||||
video_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/combined-1.mp4"
|
||||
|
||||
audio_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/audio_00-00-00-07.mp3"
|
||||
|
||||
subtitle_path = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa\subtitle.srt"
|
||||
|
||||
output_file = "../../storage/tasks/7f5ae494-abce-43cf-8f4f-4be43320eafa/final-123.mp4"
|
||||
|
||||
generate_video_v2(video_path=video_path,
|
||||
audio_path=audio_path,
|
||||
subtitle_path=subtitle_path,
|
||||
output_file=output_file,
|
||||
params=cfg
|
||||
)
|
||||
|
||||
@ -1,12 +1,15 @@
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from xml.sax.saxutils import unescape
|
||||
from edge_tts.submaker import mktimestamp
|
||||
from loguru import logger
|
||||
from edge_tts import submaker, SubMaker
|
||||
import json
|
||||
import traceback
|
||||
import edge_tts
|
||||
import asyncio
|
||||
from loguru import logger
|
||||
from typing import List
|
||||
from datetime import datetime
|
||||
from edge_tts.submaker import mktimestamp
|
||||
from xml.sax.saxutils import unescape
|
||||
from edge_tts import submaker, SubMaker
|
||||
from moviepy.video.tools import subtitles
|
||||
|
||||
from app.config import config
|
||||
@ -1031,8 +1034,8 @@ def is_azure_v2_voice(voice_name: str):
|
||||
def tts(
|
||||
text: str, voice_name: str, voice_rate: float, voice_file: str
|
||||
) -> [SubMaker, None]:
|
||||
if is_azure_v2_voice(voice_name):
|
||||
return azure_tts_v2(text, voice_name, voice_file)
|
||||
# if is_azure_v2_voice(voice_name):
|
||||
# return azure_tts_v2(text, voice_name, voice_file)
|
||||
return azure_tts_v1(text, voice_name, voice_rate, voice_file)
|
||||
|
||||
|
||||
@ -1068,7 +1071,10 @@ def azure_tts_v1(
|
||||
(chunk["offset"], chunk["duration"]), chunk["text"]
|
||||
)
|
||||
return sub_maker
|
||||
|
||||
# 判断音频文件是否已经存在
|
||||
if os.path.exists(voice_file):
|
||||
logger.info(f"voice file exists, skip tts: {voice_file}")
|
||||
continue
|
||||
sub_maker = asyncio.run(_do())
|
||||
if not sub_maker or not sub_maker.subs:
|
||||
logger.warning(f"failed, sub_maker is None or sub_maker.subs is None")
|
||||
@ -1184,6 +1190,109 @@ def _format_text(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], list_script: List[dict],
|
||||
subtitle_file: str):
|
||||
"""
|
||||
根据多个 SubMaker 对象、完整文本和原始脚本创建优化的字幕文件
|
||||
1. 使用原始脚本中的时间戳
|
||||
2. 跳过 OST 为 true 的部分
|
||||
3. 将字幕文件按照标点符号分割成多行
|
||||
4. 根据完整文本分段,保持原文的语句结构
|
||||
5. 生成新的字幕文件,时间戳包含小时单位
|
||||
"""
|
||||
text = _format_text(text)
|
||||
sentences = utils.split_string_by_punctuations(text)
|
||||
|
||||
def formatter(idx: int, start_time: str, end_time: str, sub_text: str) -> str:
|
||||
return f"{idx}\n{start_time.replace('.', ',')} --> {end_time.replace('.', ',')}\n{sub_text}\n"
|
||||
|
||||
sub_items = []
|
||||
sub_index = 0
|
||||
sentence_index = 0
|
||||
|
||||
try:
|
||||
sub_maker_index = 0
|
||||
for script_item in list_script:
|
||||
if script_item['OST']:
|
||||
continue
|
||||
|
||||
start_time, end_time = script_item['new_timestamp'].split('-')
|
||||
if sub_maker_index >= len(sub_maker_list):
|
||||
logger.error(f"Sub maker list index out of range: {sub_maker_index}")
|
||||
break
|
||||
sub_maker = sub_maker_list[sub_maker_index]
|
||||
sub_maker_index += 1
|
||||
|
||||
script_duration = utils.time_to_seconds(end_time) - utils.time_to_seconds(start_time)
|
||||
audio_duration = get_audio_duration(sub_maker)
|
||||
time_ratio = script_duration / audio_duration if audio_duration > 0 else 1
|
||||
|
||||
current_sub = ""
|
||||
current_start = None
|
||||
current_end = None
|
||||
|
||||
for offset, sub in zip(sub_maker.offset, sub_maker.subs):
|
||||
sub = unescape(sub).strip()
|
||||
sub_start = utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[0] / 10000000 * time_ratio)
|
||||
sub_end = utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[1] / 10000000 * time_ratio)
|
||||
|
||||
if current_start is None:
|
||||
current_start = sub_start
|
||||
current_end = sub_end
|
||||
|
||||
current_sub += sub
|
||||
|
||||
# 检查当前累积的字幕是否匹配下一个句子
|
||||
while sentence_index < len(sentences) and sentences[sentence_index] in current_sub:
|
||||
sub_index += 1
|
||||
line = formatter(
|
||||
idx=sub_index,
|
||||
start_time=current_start,
|
||||
end_time=current_end,
|
||||
sub_text=sentences[sentence_index].strip(),
|
||||
)
|
||||
sub_items.append(line)
|
||||
current_sub = current_sub.replace(sentences[sentence_index], "", 1).strip()
|
||||
current_start = current_end
|
||||
sentence_index += 1
|
||||
|
||||
# 如果当前字幕长度超过15个字符,也生成一个新的字幕项
|
||||
if len(current_sub) > 15:
|
||||
sub_index += 1
|
||||
line = formatter(
|
||||
idx=sub_index,
|
||||
start_time=current_start,
|
||||
end_time=current_end,
|
||||
sub_text=current_sub.strip(),
|
||||
)
|
||||
sub_items.append(line)
|
||||
current_sub = ""
|
||||
current_start = current_end
|
||||
|
||||
# 处理剩余的文本
|
||||
if current_sub.strip():
|
||||
sub_index += 1
|
||||
line = formatter(
|
||||
idx=sub_index,
|
||||
start_time=current_start,
|
||||
end_time=current_end,
|
||||
sub_text=current_sub.strip(),
|
||||
)
|
||||
sub_items.append(line)
|
||||
|
||||
if len(sub_items) == 0:
|
||||
logger.error("No subtitle items generated")
|
||||
return
|
||||
|
||||
with open(subtitle_file, "w", encoding="utf-8") as file:
|
||||
file.write("\n".join(sub_items))
|
||||
|
||||
logger.info(f"completed, subtitle file created: {subtitle_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"failed, error: {str(e)}")
|
||||
traceback.print_exc()
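The time_ratio above maps edge-tts subtitle offsets (given in 100-nanosecond ticks) back onto the script timeline. A quick numeric check, assuming a segment whose script slot is 6 s but whose synthesized audio is 5 s:

time_ratio = 6 / 5                    # script_duration / audio_duration = 1.2
offset_start = 25_000_000             # a word starting 2.5 s into the TTS audio
start_in_script = 0 + offset_start / 10_000_000 * time_ratio   # = 3.0 s on the script timeline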
|
||||
|
||||
|
||||
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
|
||||
"""
|
||||
优化字幕文件
|
||||
@ -1283,72 +1392,73 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
|
||||
return sub_maker.offset[-1][1] / 10000000
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
|
||||
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, force_regenerate: bool = True):
    """
    根据JSON文件中的多段文本进行TTS转换

    :param task_id: 任务ID
    :param list_script: 脚本列表
    :param voice_name: 语音名称
    :param voice_rate: 语音速率
    :param force_regenerate: 是否强制重新生成已存在的音频文件
    :return: 生成的音频文件列表
    """
    voice_name = parse_voice_name(voice_name)
    output_dir = utils.task_dir(task_id)
    audio_files = []
    sub_maker_list = []

    for item in list_script:
        if not item['OST']:
            # timestamp = item['new_timestamp'].replace(':', '@')
            timestamp = item['new_timestamp']
            audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")

            # 检查文件是否已存在,如存在且不强制重新生成,则跳过
            if os.path.exists(audio_file) and not force_regenerate:
                logger.info(f"音频文件已存在,跳过生成: {audio_file}")
                audio_files.append(audio_file)
                continue

            text = item['narration']

            sub_maker = tts(
                text=text,
                voice_name=voice_name,
                voice_rate=voice_rate,
                voice_file=audio_file
            )

            if sub_maker is None:
                logger.error(f"无法为时间戳 {timestamp} 生成音频; "
                             f"如果您在中国,请使用VPN。或者手动选择 zh-CN-YunyangNeural 等角色;"
                             f"或者使用其他 tts 引擎")
                continue

            audio_files.append(audio_file)
            sub_maker_list.append(sub_maker)
            logger.info(f"已生成音频文件: {audio_file}")

    return audio_files, sub_maker_list

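A minimal usage sketch (requires access to the edge-tts service; the list_script entries are assumed to already carry new_timestamp and OST fields):

list_script = [
    {"narration": "夜黑风高的树林,一个小孩在拼命奔跑", "OST": False, "new_timestamp": "00:00-00:03"},
    {"narration": "原声播放_abc12345", "OST": True, "new_timestamp": "00:03-00:07"},
]
audio_files, sub_makers = tts_multiple(
    task_id="demo",
    list_script=list_script,
    voice_name="zh-CN-YunyangNeural",
    voice_rate=1.0,
)
# Only the OST=False segment produces audio (a file like audio_00:00-00:03.mp3)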
if __name__ == "__main__":
|
||||
voice_name = "zh-CN-YunyangNeural"
|
||||
# voice_name = "af-ZA-AdriNeural"
|
||||
voice_name = parse_voice_name(voice_name)
|
||||
voice_name = is_azure_v2_voice(voice_name)
|
||||
print(voice_name)
|
||||
|
||||
voices = get_all_azure_voices()
|
||||
print(len(voices))
|
||||
with open("../../resource/scripts/test.json", 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
async def _do():
|
||||
temp_dir = utils.storage_dir("temp")
|
||||
audio_files, sub_maker_list = tts_multiple(task_id="12312312", list_script=data, voice_name=voice_name, voice_rate=1)
|
||||
|
||||
voice_names = [
|
||||
"zh-CN-XiaoxiaoMultilingualNeural",
|
||||
# 女性
|
||||
"zh-CN-XiaoxiaoNeural",
|
||||
"zh-CN-XiaoyiNeural",
|
||||
# 男性
|
||||
"zh-CN-YunyangNeural",
|
||||
"zh-CN-YunxiNeural",
|
||||
]
|
||||
text = """
|
||||
静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人,表达了他对家乡和亲人的深深思念之情。全诗内容是:“床前明月光,疑是地上霜。举头望明月,低头思故乡。”在这短短的四句诗中,诗人通过“明月”和“思故乡”的意象,巧妙地表达了离乡背井人的孤独与哀愁。首句“床前明月光”设景立意,通过明亮的月光引出诗人的遐想;“疑是地上霜”增添了夜晚的寒冷感,加深了诗人的孤寂之情;“举头望明月”和“低头思故乡”则是情感的升华,展现了诗人内心深处的乡愁和对家的渴望。这首诗简洁明快,情感真挚,是中国古典诗歌中非常著名的一首,也深受后人喜爱和推崇。
|
||||
"""
|
||||
full_text = " ".join([item['narration'] for item in data if not item['OST']])
|
||||
subtitle_file = os.path.join(utils.task_dir("12312312"), "subtitle_multiple.srt")
|
||||
create_subtitle_from_multiple(full_text, sub_maker_list, data, subtitle_file)
|
||||
print(f"生成的音频文件列表: {audio_files}")
|
||||
print(f"生成的字幕文件: {subtitle_file}")
|
||||
|
||||
text = """
|
||||
What is the meaning of life? This question has puzzled philosophers, scientists, and thinkers of all kinds for centuries. Throughout history, various cultures and individuals have come up with their interpretations and beliefs around the purpose of life. Some say it's to seek happiness and self-fulfillment, while others believe it's about contributing to the welfare of others and making a positive impact in the world. Despite the myriad of perspectives, one thing remains clear: the meaning of life is a deeply personal concept that varies from one person to another. It's an existential inquiry that encourages us to reflect on our values, desires, and the essence of our existence.
|
||||
"""
|
||||
|
||||
text = """
|
||||
预计未来3天深圳冷空气活动频繁,未来两天持续阴天有小雨,出门带好雨具;
|
||||
10-11日持续阴天有小雨,日温差小,气温在13-17℃之间,体感阴凉;
|
||||
12日天气短暂好转,早晚清凉;
|
||||
"""
|
||||
|
||||
text = "[Opening scene: A sunny day in a suburban neighborhood. A young boy named Alex, around 8 years old, is playing in his front yard with his loyal dog, Buddy.]\n\n[Camera zooms in on Alex as he throws a ball for Buddy to fetch. Buddy excitedly runs after it and brings it back to Alex.]\n\nAlex: Good boy, Buddy! You're the best dog ever!\n\n[Buddy barks happily and wags his tail.]\n\n[As Alex and Buddy continue playing, a series of potential dangers loom nearby, such as a stray dog approaching, a ball rolling towards the street, and a suspicious-looking stranger walking by.]\n\nAlex: Uh oh, Buddy, look out!\n\n[Buddy senses the danger and immediately springs into action. He barks loudly at the stray dog, scaring it away. Then, he rushes to retrieve the ball before it reaches the street and gently nudges it back towards Alex. Finally, he stands protectively between Alex and the stranger, growling softly to warn them away.]\n\nAlex: Wow, Buddy, you're like my superhero!\n\n[Just as Alex and Buddy are about to head inside, they hear a loud crash from a nearby construction site. They rush over to investigate and find a pile of rubble blocking the path of a kitten trapped underneath.]\n\nAlex: Oh no, Buddy, we have to help!\n\n[Buddy barks in agreement and together they work to carefully move the rubble aside, allowing the kitten to escape unharmed. The kitten gratefully nuzzles against Buddy, who responds with a friendly lick.]\n\nAlex: We did it, Buddy! We saved the day again!\n\n[As Alex and Buddy walk home together, the sun begins to set, casting a warm glow over the neighborhood.]\n\nAlex: Thanks for always being there to watch over me, Buddy. You're not just my dog, you're my best friend.\n\n[Buddy barks happily and nuzzles against Alex as they disappear into the sunset, ready to face whatever adventures tomorrow may bring.]\n\n[End scene.]"
|
||||
|
||||
text = "大家好,我是乔哥,一个想帮你把信用卡全部还清的家伙!\n今天我们要聊的是信用卡的取现功能。\n你是不是也曾经因为一时的资金紧张,而拿着信用卡到ATM机取现?如果是,那你得好好看看这个视频了。\n现在都2024年了,我以为现在不会再有人用信用卡取现功能了。前几天一个粉丝发来一张图片,取现1万。\n信用卡取现有三个弊端。\n一,信用卡取现功能代价可不小。会先收取一个取现手续费,比如这个粉丝,取现1万,按2.5%收取手续费,收取了250元。\n二,信用卡正常消费有最长56天的免息期,但取现不享受免息期。从取现那一天开始,每天按照万5收取利息,这个粉丝用了11天,收取了55元利息。\n三,频繁的取现行为,银行会认为你资金紧张,会被标记为高风险用户,影响你的综合评分和额度。\n那么,如果你资金紧张了,该怎么办呢?\n乔哥给你支一招,用破思机摩擦信用卡,只需要少量的手续费,而且还可以享受最长56天的免息期。\n最后,如果你对玩卡感兴趣,可以找乔哥领取一本《卡神秘籍》,用卡过程中遇到任何疑惑,也欢迎找乔哥交流。\n别忘了,关注乔哥,回复用卡技巧,免费领取《2024用卡技巧》,让我们一起成为用卡高手!"
|
||||
|
||||
text = """
|
||||
2023全年业绩速览
|
||||
公司全年累计实现营业收入1476.94亿元,同比增长19.01%,归母净利润747.34亿元,同比增长19.16%。EPS达到59.49元。第四季度单季,营业收入444.25亿元,同比增长20.26%,环比增长31.86%;归母净利润218.58亿元,同比增长19.33%,环比增长29.37%。这一阶段
|
||||
的业绩表现不仅突显了公司的增长动力和盈利能力,也反映出公司在竞争激烈的市场环境中保持了良好的发展势头。
|
||||
2023年Q4业绩速览
|
||||
第四季度,营业收入贡献主要增长点;销售费用高增致盈利能力承压;税金同比上升27%,扰动净利率表现。
|
||||
业绩解读
|
||||
利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%)
|
||||
"""
|
||||
text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人"
|
||||
|
||||
text = _format_text(text)
|
||||
lines = utils.split_string_by_punctuations(text)
|
||||
print(lines)
|
||||
|
||||
for voice_name in voice_names:
|
||||
voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
|
||||
subtitle_file = f"{temp_dir}/tts.mp3.srt"
|
||||
sub_maker = azure_tts_v2(
|
||||
text=text, voice_name=voice_name, voice_file=voice_file
|
||||
)
|
||||
create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
|
||||
audio_duration = get_audio_duration(sub_maker)
|
||||
print(f"voice: {voice_name}, audio duration: {audio_duration}s")
|
||||
|
||||
loop = asyncio.get_event_loop_policy().get_event_loop()
|
||||
try:
|
||||
loop.run_until_complete(_do())
|
||||
finally:
|
||||
loop.close()
|
||||
# text = " ".join([item['narration'] for item in data])
|
||||
# sub_marks = tts(text=text, voice_name=voice_name, voice_rate=1, voice_file="../../storage/tasks/12312312/aaa.mp3")
|
||||
# create_subtitle(text=text, sub_maker=sub_marks, subtitle_file="../../storage/tasks/12312312/subtitle_123.srt")
|
||||
|
||||
115 app/utils/check_script.py (Normal file)
@@ -0,0 +1,115 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
import os
|
||||
from datetime import timedelta
|
||||
|
||||
def time_to_seconds(time_str):
|
||||
parts = list(map(int, time_str.split(':')))
|
||||
if len(parts) == 2:
|
||||
return timedelta(minutes=parts[0], seconds=parts[1]).total_seconds()
|
||||
elif len(parts) == 3:
|
||||
return timedelta(hours=parts[0], minutes=parts[1], seconds=parts[2]).total_seconds()
|
||||
raise ValueError(f"无法解析时间字符串: {time_str}")
|
||||
|
||||
def seconds_to_time_str(seconds):
|
||||
hours, remainder = divmod(int(seconds), 3600)
|
||||
minutes, seconds = divmod(remainder, 60)
|
||||
if hours > 0:
|
||||
return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
|
||||
else:
|
||||
return f"{minutes:02d}:{seconds:02d}"
|
||||
|
||||
def adjust_timestamp(start_time, duration):
|
||||
start_seconds = time_to_seconds(start_time)
|
||||
end_seconds = start_seconds + duration
|
||||
return f"{start_time}-{seconds_to_time_str(end_seconds)}"
|
||||
|
||||
def estimate_audio_duration(text):
|
||||
# 假设平均每个字符需要 0.2 秒
|
||||
return len(text) * 0.2
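With the 0.2 s/character assumption this is only a rough estimate, e.g.:

estimate_audio_duration("静夜思是唐代诗人李白创作的一首五言古诗")   # 19 characters * 0.2 ≈ 3.8 s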
|
||||
|
||||
def check_script(data, total_duration):
|
||||
errors = []
|
||||
time_ranges = []
|
||||
|
||||
logger.info("开始检查脚本")
|
||||
logger.info(f"视频总时长: {total_duration:.2f} 秒")
|
||||
logger.info("=" * 50)
|
||||
|
||||
for i, item in enumerate(data, 1):
|
||||
logger.info(f"\n检查第 {i} 项:")
|
||||
|
||||
# 检查所有必需字段
|
||||
required_fields = ['picture', 'timestamp', 'narration', 'OST']
|
||||
for field in required_fields:
|
||||
if field not in item:
|
||||
errors.append(f"第 {i} 项缺少 {field} 字段")
|
||||
logger.info(f" - 错误: 缺少 {field} 字段")
|
||||
else:
|
||||
logger.info(f" - {field}: {item[field]}")
|
||||
|
||||
# 检查 OST 相关规则
|
||||
if item.get('OST') == False:
|
||||
if not item.get('narration'):
|
||||
errors.append(f"第 {i} 项 OST 为 false,但 narration 为空")
|
||||
logger.info(" - 错误: OST 为 false,但 narration 为空")
|
||||
elif len(item['narration']) > 60:
|
||||
errors.append(f"第 {i} 项 OST 为 false,但 narration 超过 60 字")
|
||||
logger.info(f" - 错误: OST 为 false,但 narration 超过 60 字 (当前: {len(item['narration'])} 字)")
|
||||
else:
|
||||
logger.info(" - OST 为 false,narration 检查通过")
|
||||
elif item.get('OST') == True:
|
||||
if "原声播放_" not in item.get('narration'):
|
||||
errors.append(f"第 {i} 项 OST 为 true,但 narration 不为空")
|
||||
logger.info(" - 错误: OST 为 true,但 narration 不为空")
|
||||
else:
|
||||
logger.info(" - OST 为 true,narration 检查通过")
|
||||
|
||||
# 检查 timestamp
|
||||
if 'timestamp' in item:
|
||||
start, end = map(time_to_seconds, item['timestamp'].split('-'))
|
||||
if any((start < existing_end and end > existing_start) for existing_start, existing_end in time_ranges):
|
||||
errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 与其他时间段重叠")
|
||||
logger.info(f" - 错误: timestamp '{item['timestamp']}' 与其他时间段重叠")
|
||||
else:
|
||||
logger.info(f" - timestamp '{item['timestamp']}' 检查通过")
|
||||
time_ranges.append((start, end))
|
||||
|
||||
# if end > total_duration:
|
||||
# errors.append(f"第 {i} 项 timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒")
|
||||
# logger.info(f" - 错误: timestamp '{item['timestamp']}' 超过总时长 {total_duration:.2f} 秒")
|
||||
# else:
|
||||
# logger.info(f" - timestamp 在总时长范围内")
|
||||
|
||||
# 处理 narration 字段
|
||||
if item.get('OST') == False and item.get('narration'):
|
||||
estimated_duration = estimate_audio_duration(item['narration'])
|
||||
start_time = item['timestamp'].split('-')[0]
|
||||
item['timestamp'] = adjust_timestamp(start_time, estimated_duration)
|
||||
logger.info(f" - 已调整 timestamp 为 {item['timestamp']} (估算音频时长: {estimated_duration:.2f} 秒)")
|
||||
|
||||
if errors:
|
||||
logger.info("检查结果:不通过")
|
||||
logger.info("发现以下错误:")
|
||||
for error in errors:
|
||||
logger.info(f"- {error}")
|
||||
else:
|
||||
logger.info("检查结果:通过")
|
||||
logger.info("所有项目均符合规则要求。")
|
||||
|
||||
return errors, data
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
file_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json"
|
||||
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
total_duration = 280
|
||||
|
||||
# check_script(data, total_duration)
|
||||
|
||||
from app.utils.utils import add_new_timestamps
|
||||
res = add_new_timestamps(data)
|
||||
print(json.dumps(res, indent=4, ensure_ascii=False))
|
||||
@ -1,14 +1,20 @@
|
||||
import locale
|
||||
import os
|
||||
import platform
|
||||
import traceback
|
||||
|
||||
import requests
|
||||
import threading
|
||||
from typing import Any
|
||||
from loguru import logger
|
||||
import streamlit as st
|
||||
import json
|
||||
from uuid import uuid4
|
||||
import urllib3
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from app.models import const
|
||||
from app.utils import check_script
|
||||
from app.services import material
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
@ -269,3 +275,153 @@ def reduce_video_time(txt: str, duration: float = 0.21531):
|
||||
# 返回结果四舍五入为整数
|
||||
duration = len(txt) * duration
|
||||
return int(duration)
|
||||
|
||||
|
||||
def get_current_country():
|
||||
"""
|
||||
判断当前网络IP地址所在的国家
|
||||
"""
|
||||
try:
|
||||
# 使用ipapi.co的免费API获取IP地址信息
|
||||
response = requests.get('https://ipapi.co/json/')
|
||||
data = response.json()
|
||||
|
||||
# 获取国家名称
|
||||
country = data.get('country_name')
|
||||
|
||||
if country:
|
||||
logger.debug(f"当前网络IP地址位于:{country}")
|
||||
return country
|
||||
else:
|
||||
logger.debug("无法确定当前网络IP地址所在的国家")
|
||||
return None
|
||||
|
||||
except requests.RequestException:
|
||||
logger.error("获取IP地址信息时发生错误,请检查网络连接")
|
||||
return None
|
||||
|
||||
|
||||
def time_to_seconds(time_str: str) -> float:
    parts = time_str.split(':')
    if len(parts) == 2:
        m, s = map(float, parts)
        return m * 60 + s
    elif len(parts) == 3:
        h, m, s = map(float, parts)
        return h * 3600 + m * 60 + s
    else:
        raise ValueError(f"Invalid time format: {time_str}")


def seconds_to_time(seconds: float) -> str:
    h, remainder = divmod(seconds, 3600)
    m, s = divmod(remainder, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}"


def calculate_total_duration(scenes):
    total_seconds = 0

    for scene in scenes:
        start, end = scene['timestamp'].split('-')
        start_time = datetime.strptime(start, '%M:%S')
        end_time = datetime.strptime(end, '%M:%S')

        duration = end_time - start_time
        total_seconds += duration.total_seconds()

    return total_seconds

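A few quick examples of these helpers (pure arithmetic, handy for checking the formats):

time_to_seconds("01:30")       # 90.0
time_to_seconds("01:02:03")    # 3723.0
seconds_to_time(90)            # "00:01:30.000"
calculate_total_duration([
    {"timestamp": "00:00-00:03"},
    {"timestamp": "00:10-00:15"},
])                             # 8.0  (3 s + 5 s)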
def add_new_timestamps(scenes):
    """
    新增新视频的时间戳,并为"原声播放"的narration添加唯一标识符
    Args:
        scenes: 场景列表

    Returns:
        更新后的场景列表
    """
    current_time = timedelta()
    updated_scenes = []

    # 保存脚本前先检查脚本是否正确
    check_script.check_script(scenes, calculate_total_duration(scenes))

    for scene in scenes:
        new_scene = scene.copy()  # 创建场景的副本,以保留原始数据
        start, end = new_scene['timestamp'].split('-')
        start_time = datetime.strptime(start, '%M:%S')
        end_time = datetime.strptime(end, '%M:%S')
        duration = end_time - start_time

        new_start = current_time
        current_time += duration
        new_end = current_time

        # 将 timedelta 转换为分钟和秒
        new_start_str = f"{int(new_start.total_seconds() // 60):02d}:{int(new_start.total_seconds() % 60):02d}"
        new_end_str = f"{int(new_end.total_seconds() // 60):02d}:{int(new_end.total_seconds() % 60):02d}"

        new_scene['new_timestamp'] = f"{new_start_str}-{new_end_str}"

        # 为"原声播放"的narration添加唯一标识符
        if not new_scene.get('narration'):
            unique_id = str(uuid4())[:8]  # 使用UUID的前8个字符作为唯一标识符
            new_scene['narration'] = f"原声播放_{unique_id}"

        updated_scenes.append(new_scene)

    return updated_scenes

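A minimal illustration (note that check_script first re-estimates the end time of OST=False items at 0.2 s per character, so the narration below is deliberately 15 characters ≈ 3 s):

scenes = [
    {"picture": "树林", "timestamp": "00:00-00:03",
     "narration": "夜黑风高,小孩在树林里拼命奔跑", "OST": False},   # 15 characters ≈ 3 s
    {"picture": "对话", "timestamp": "00:10-00:15", "narration": "", "OST": True},
]
updated = add_new_timestamps(scenes)
# updated[0]["new_timestamp"] == "00:00-00:03"
# updated[1]["new_timestamp"] == "00:03-00:08"
# updated[1]["narration"] becomes something like "原声播放_1a2b3c4d"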
def clean_model_output(output):
    # 移除可能的代码块标记
    output = output.strip('```json').strip('```')
    # 移除开头和结尾的空白字符
    output = output.strip()
    return output

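Because str.strip treats its argument as a character set, strip('```json') can also eat leading j/s/o/n characters of the payload. A sketch that only peels off paired Markdown fences (an alternative, not the project's current implementation):

import re

def clean_model_output_safe(output: str) -> str:
    output = output.strip()
    output = re.sub(r"^```(?:json)?\s*", "", output)   # drop an opening ``` or ```json fence
    output = re.sub(r"\s*```$", "", output)            # drop a closing ``` fence
    return output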
def cut_video(params, progress_callback=None):
    try:
        task_id = str(uuid4())
        st.session_state['task_id'] = task_id

        if not st.session_state.get('video_clip_json'):
            raise ValueError("视频脚本不能为空")

        video_script_list = st.session_state['video_clip_json']
        time_list = [i['timestamp'] for i in video_script_list]

        total_clips = len(time_list)

        def clip_progress(current, total):
            progress = int((current / total) * 100)
            if progress_callback:
                progress_callback(progress)

        subclip_videos = material.clip_videos(
            task_id=task_id,
            timestamp_terms=time_list,
            origin_video=params.video_origin_path,
            progress_callback=clip_progress
        )

        if subclip_videos is None:
            raise ValueError("裁剪视频失败")

        st.session_state['subclip_videos'] = subclip_videos

        for i, video_script in enumerate(video_script_list):
            try:
                video_script['path'] = subclip_videos[video_script['timestamp']]
            except KeyError as err:
                logger.error(f"裁剪视频失败: {err}")
                raise ValueError(f"裁剪视频失败: {err}")

        return task_id, subclip_videos

    except Exception as e:
        logger.error(f"视频裁剪过程中发生错误: {traceback.format_exc()}")
        raise

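cut_video relies on Streamlit session_state and material.clip_videos; a simplified call from a page might look like this (assuming a progress bar widget already exists):

progress_bar = st.progress(0)

def on_progress(p: int):
    progress_bar.progress(p)

task_id, subclip_videos = cut_video(params, progress_callback=on_progress)
# subclip_videos looks like {"00:41-00:44": "./storage/cache_videos/vid-00_41-00_44.mp4", ...}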
@@ -1,22 +1,8 @@
[app]
project_version="0.1.2"
video_source = "pexels" # "pexels" or "pixabay"
# Pexels API Key
# Register at https://www.pexels.com/api/ to get your API key.
# You can use multiple keys to avoid rate limits.
# For example: pexels_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
# 特别注意格式,Key 用英文双引号括起来,多个Key用逗号隔开
pexels_api_keys = []

# Pixabay API Key
# Register at https://pixabay.com/api/docs/ to get your API key.
# You can use multiple keys to avoid rate limits.
# For example: pixabay_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
# 特别注意格式,Key 用英文双引号括起来,多个Key用逗号隔开
pixabay_api_keys = []

project_version="0.2.0"
# 如果你没有 OPENAI API Key,可以使用 g4f 代替,或者使用国内的 Moonshot API
# If you don't have an OPENAI API Key, you can use g4f instead
video_llm_provider="gemini"

# 支持的提供商 (Supported providers):
# openai
@@ -27,6 +13,7 @@
# qwen (通义千问)
# gemini
llm_provider="openai"
# 支持多模态视频理解能力的大模型

########## Ollama Settings
# No need to set it unless you want to use your own proxy
@@ -86,9 +73,10 @@
deepseek_base_url = "https://api.deepseek.com"
deepseek_model_name = "deepseek-chat"

# Subtitle Provider, "edge" or "whisper"
# Subtitle Provider, "whisper"
# If empty, the subtitle will not be generated
subtitle_provider = "edge"
subtitle_provider = "faster-whisper-large-v2"
subtitle_enabled = true

#
# ImageMagick
@@ -172,7 +160,7 @@
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

# recommended model_size: "large-v3"
model_size="large-v3"
model_size="faster-whisper-large-v2"
# if you want to use GPU, set device="cuda"
device="CPU"
compute_type="int8"
@@ -184,8 +172,8 @@
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies

# http = "http://10.10.1.10:3128"
# https = "http://10.10.1.10:1080"
http = "http://127.0.0.1:7890"
https = "http://127.0.0.1:7890"

[azure]
# Azure Speech API Key

@@ -6,6 +6,7 @@ services:
build:
context: .
dockerfile: Dockerfile
image: linyq1/narratoai:latest
container_name: "webui"
ports:
- "8501:8501"
@@ -18,9 +19,12 @@ services:
build:
context: .
dockerfile: Dockerfile
image: linyq1/narratoai:latest
container_name: "api"
ports:
- "8502:8080"
command: [ "python3", "main.py" ]
volumes: *common-volumes
environment:
- "VPN_PROXY_URL=http://host.docker.internal:7890"
restart: always

@@ -14,7 +14,7 @@ pillow~=10.3.0
pydantic~=2.6.3
g4f~=0.3.0.4
dashscope~=1.15.0
google.generativeai>=0.7.2
google.generativeai>=0.8.2
python-multipart~=0.0.9
redis==5.0.3
# if you use pillow~=10.3.0, you will get "PIL.Image' has no attribute 'ANTIALIAS'" error when resize video
@@ -24,3 +24,5 @@ opencv-python~=4.9.0.80
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0
git-changelog~=2.5.2
watchdog==5.0.2
pydub==0.25.1

@@ -3,6 +3,12 @@ set CURRENT_DIR=%CD%
echo ***** Current directory: %CURRENT_DIR% *****
set PYTHONPATH=%CURRENT_DIR%

set "vpn_proxy_url=http://127.0.0.1:7890"

:: 使用VPN代理进行一些操作,例如通过代理下载文件
set "http_proxy=%vpn_proxy_url%"
set "https_proxy=%vpn_proxy_url%"

@echo off
setlocal enabledelayedexpansion

@@ -40,4 +46,4 @@ pause


rem set HF_ENDPOINT=https://hf-mirror.com
streamlit run .\webui\Main.py --browser.gatherUsageStats=False --server.enableCORS=True
streamlit run webui.py --browser.serverAddress="127.0.0.1" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False

@ -1,26 +1,4 @@
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
import json
|
||||
import time
|
||||
import datetime
|
||||
import traceback
|
||||
|
||||
# 将项目的根目录添加到系统路径中,以允许从项目导入模块
|
||||
root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
|
||||
if root_dir not in sys.path:
|
||||
sys.path.append(root_dir)
|
||||
print("******** sys.path ********")
|
||||
print(sys.path)
|
||||
print("")
|
||||
|
||||
import streamlit as st
|
||||
|
||||
import os
|
||||
from uuid import uuid4
|
||||
import platform
|
||||
import streamlit.components.v1 as components
|
||||
from loguru import logger
|
||||
from app.config import config
|
||||
|
||||
st.set_page_config(
|
||||
@ -35,11 +13,31 @@ st.set_page_config(
|
||||
},
|
||||
)
|
||||
|
||||
from app.models.const import FILE_TYPE_IMAGES, FILE_TYPE_VIDEOS
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
import json
|
||||
import time
|
||||
import datetime
|
||||
import traceback
|
||||
from uuid import uuid4
|
||||
import platform
|
||||
import streamlit.components.v1 as components
|
||||
from loguru import logger
|
||||
|
||||
from app.models.const import FILE_TYPE_VIDEOS
|
||||
from app.models.schema import VideoClipParams, VideoAspect, VideoConcatMode
|
||||
from app.services import task as tm, llm, voice, material
|
||||
from app.utils import utils
|
||||
|
||||
# # 将项目的根目录添加到系统路径中,以允许从项目导入模块
|
||||
root_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
if root_dir not in sys.path:
|
||||
sys.path.append(root_dir)
|
||||
print("******** sys.path ********")
|
||||
print(sys.path)
|
||||
print("*" * 20)
|
||||
|
||||
proxy_url_http = config.proxy.get("http", "") or os.getenv("VPN_PROXY_URL", "")
|
||||
proxy_url_https = config.proxy.get("https", "") or os.getenv("VPN_PROXY_URL", "")
|
||||
os.environ["HTTP_PROXY"] = proxy_url_http
|
||||
@ -62,14 +60,14 @@ i18n_dir = os.path.join(root_dir, "webui", "i18n")
|
||||
config_file = os.path.join(root_dir, "webui", ".streamlit", "webui.toml")
|
||||
system_locale = utils.get_system_locale()
|
||||
|
||||
if 'video_subject' not in st.session_state:
|
||||
st.session_state['video_subject'] = ''
|
||||
if 'video_clip_json' not in st.session_state:
|
||||
st.session_state['video_clip_json'] = ''
|
||||
st.session_state['video_clip_json'] = []
|
||||
if 'video_plot' not in st.session_state:
|
||||
st.session_state['video_plot'] = ''
|
||||
if 'ui_language' not in st.session_state:
|
||||
st.session_state['ui_language'] = config.ui.get("language", system_locale)
|
||||
if 'subclip_videos' not in st.session_state:
|
||||
st.session_state['subclip_videos'] = {}
|
||||
|
||||
|
||||
def get_all_fonts():
|
||||
@ -126,7 +124,7 @@ def init_log():
|
||||
_lvl = "DEBUG"
|
||||
|
||||
def format_record(record):
|
||||
# 获取日志记录中的文件全路径
|
||||
# 获取日志记录中的文件全径
|
||||
file_path = record["file"].path
|
||||
# 将绝对路径转换为相对于项目根目录的路径
|
||||
relative_path = os.path.relpath(file_path, root_dir)
|
||||
@ -190,18 +188,37 @@ with st.expander(tr("Basic Settings"), expanded=False):
|
||||
if HTTPS_PROXY:
|
||||
config.proxy["https"] = HTTPS_PROXY
|
||||
|
||||
|
||||
# 视频转录大模型
|
||||
with middle_config_panel:
|
||||
# openai
|
||||
# moonshot (月之暗面)
|
||||
# oneapi
|
||||
# g4f
|
||||
# azure
|
||||
# qwen (通义千问)
|
||||
# gemini
|
||||
# ollama
|
||||
# llm_providers = ['Gemini', 'OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Ollama', 'G4f', 'OneAPI', "Cloudflare"]
|
||||
llm_providers = ['Gemini']
|
||||
video_llm_providers = ['Gemini']
|
||||
saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower()
|
||||
saved_llm_provider_index = 0
|
||||
for i, provider in enumerate(video_llm_providers):
|
||||
if provider.lower() == saved_llm_provider:
|
||||
saved_llm_provider_index = i
|
||||
break
|
||||
|
||||
video_llm_provider = st.selectbox(tr("Video LLM Provider"), options=video_llm_providers, index=saved_llm_provider_index)
|
||||
video_llm_provider = video_llm_provider.lower()
|
||||
config.app["video_llm_provider"] = video_llm_provider
|
||||
|
||||
video_llm_api_key = config.app.get(f"{video_llm_provider}_api_key", "")
|
||||
video_llm_base_url = config.app.get(f"{video_llm_provider}_base_url", "")
|
||||
video_llm_model_name = config.app.get(f"{video_llm_provider}_model_name", "")
|
||||
video_llm_account_id = config.app.get(f"{video_llm_provider}_account_id", "")
|
||||
st_llm_api_key = st.text_input(tr("Video API Key"), value=video_llm_api_key, type="password")
|
||||
st_llm_base_url = st.text_input(tr("Video Base Url"), value=video_llm_base_url)
|
||||
st_llm_model_name = st.text_input(tr("Video Model Name"), value=video_llm_model_name)
|
||||
if st_llm_api_key:
|
||||
config.app[f"{video_llm_provider}_api_key"] = st_llm_api_key
|
||||
if st_llm_base_url:
|
||||
config.app[f"{video_llm_provider}_base_url"] = st_llm_base_url
if st_llm_model_name:
config.app[f"{video_llm_provider}_model_name"] = st_llm_model_name

# 大语言模型
with right_config_panel:
llm_providers = ['Gemini', 'OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Ollama', 'G4f', 'OneAPI', "Cloudflare"]
saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower()
saved_llm_provider_index = 0
for i, provider in enumerate(llm_providers):
@ -232,17 +249,6 @@ with st.expander(tr("Basic Settings"), expanded=False):
if st_llm_account_id:
config.app[f"{llm_provider}_account_id"] = st_llm_account_id

with right_config_panel:
pexels_api_keys = config.app.get("pexels_api_keys", [])
if isinstance(pexels_api_keys, str):
pexels_api_keys = [pexels_api_keys]
pexels_api_key = ", ".join(pexels_api_keys)

pexels_api_key = st.text_input(tr("Pexels API Key"), value=pexels_api_key, type="password")
pexels_api_key = pexels_api_key.replace(" ", "")
if pexels_api_key:
config.app["pexels_api_keys"] = pexels_api_key.split(",")

panel = st.columns(3)
left_panel = panel[0]
middle_panel = panel[1]
@ -278,45 +284,56 @@ with left_panel:
"name": os.path.basename(file),
"size": os.path.getsize(file),
"file": file,
"ctime": os.path.getctime(file)  # 获取文件创建时间
})

script_path = [(tr("Auto Generate"), ""), ]
for code in [file['file'] for file in script_list]:
script_path.append((code, code))
# 按创建时间降序排序
script_list.sort(key=lambda x: x["ctime"], reverse=True)

selected_json2 = st.selectbox(tr("Script Files"),
index=0,
options=range(len(script_path)),  # 使用索引作为内部选项值
format_func=lambda x: script_path[x][0]  # 显示给用户的是标签
)
params.video_clip_json = script_path[selected_json2][1]
video_json_file = params.video_clip_json
# 本文件 下拉框
script_path = [(tr("Auto Generate"), ""), ]
for file in script_list:
display_name = file['file'].replace(root_dir, "")
script_path.append((display_name, file['file']))
selected_script_index = st.selectbox(tr("Script Files"),
index=0,
options=range(len(script_path)),  # 使用索引作为内部选项值
format_func=lambda x: script_path[x][0]  # 显示给用户的是标签
)
params.video_clip_json_path = script_path[selected_script_index][1]
config.app["video_clip_json_path"] = params.video_clip_json_path
st.session_state['video_clip_json_path'] = params.video_clip_json_path

# 视频文件处理
files = []
video_files = []
for suffix in ["*.mp4", "*.mov", "*.avi", "*.mkv"]:
files.extend(glob.glob(os.path.join(utils.video_dir(), suffix)))
files = files[::-1]
video_files.extend(glob.glob(os.path.join(utils.video_dir(), suffix)))
video_files = video_files[::-1]

video_list = []
for file in files:
for video_file in video_files:
video_list.append({
"name": os.path.basename(file),
"size": os.path.getsize(file),
"file": file,
"name": os.path.basename(video_file),
"size": os.path.getsize(video_file),
"file": video_file,
"ctime": os.path.getctime(video_file)  # 获取文件创建时间
})
# 按创建时间降序排序
video_list.sort(key=lambda x: x["ctime"], reverse=True)
video_path = [(tr("None"), ""), (tr("Upload Local Files"), "local")]
for file in video_list:
display_name = file['file'].replace(root_dir, "")
video_path.append((display_name, file['file']))

video_path = [("None", ""), (tr("Upload Local Files"), "local")]
for code in [file['file'] for file in video_list]:
video_path.append((code, code))

selected_index2 = st.selectbox(tr("Video File"),
index=0,
options=range(len(video_path)),  # 使用索引作为内部选项值
format_func=lambda x: video_path[x][0]  # 显示给用户的是标签
)
params.video_origin_path = video_path[selected_index2][1]
# 视频文件
selected_video_index = st.selectbox(tr("Video File"),
index=0,
options=range(len(video_path)),  # 使用索引作为内部选项值
format_func=lambda x: video_path[x][0]  # 显示给用户的是标签
)
params.video_origin_path = video_path[selected_video_index][1]
config.app["video_origin_path"] = params.video_origin_path
st.session_state['video_origin_path'] = params.video_origin_path

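All of the pickers above share the same Streamlit idiom: build a list of (label, value) tuples, hand st.selectbox the index range as options, and map indices back to labels with format_func. A minimal, self-contained sketch of that pattern (the option values below are illustrative, not taken from the repository):

```python
import streamlit as st

# (label shown to the user, value used internally) -- illustrative entries
options = [("Auto Generate", ""), ("demo.json", "/resource/scripts/demo.json")]

selected = st.selectbox(
    "Script Files",
    options=range(len(options)),          # the widget only stores an index
    format_func=lambda i: options[i][0],  # the user sees the label
    index=0,
)
chosen_value = options[selected][1]       # map the index back to the real value
```
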
# 从本地上传 mp4 文件
if params.video_origin_path == "local":
@ -341,9 +358,8 @@ with left_panel:
st.success(tr("File Uploaded Successfully"))
time.sleep(1)
st.rerun()
# params.video_origin_path = video_path[selected_index2][1]
# config.app["video_origin_path"] = params.video_origin_path

# 视频名称
video_name = st.text_input(tr("Video Name"))
# 剧情内容
video_plot = st.text_area(
tr("Plot Description"),
@ -351,31 +367,74 @@ with left_panel:
height=180
)

if st.button(tr("Video Script Generate"), key="auto_generate_script"):
with st.spinner(tr("Video Script Generate")):
if video_json_file == "" and params.video_origin_path != "":
script = llm.gemini_video2json(
video_origin_name=params.video_origin_path.split("\\")[-1],
video_origin_path=params.video_origin_path,
video_plot=video_plot,
language=params.video_language,
)
st.session_state['video_clip_json'] = script
cleaned_string = script.strip("```json").strip("```")
st.session_state['video_script_list'] = json.loads(cleaned_string)
else:
with open(video_json_file, 'r', encoding='utf-8') as f:
script = f.read()
st.session_state['video_clip_json'] = script
cleaned_string = script.strip("```json").strip("```")
st.session_state['video_script_list'] = json.loads(cleaned_string)
# 生成视频脚本
if st.session_state['video_clip_json_path']:
generate_button_name = tr("Video Script Load")
else:
generate_button_name = tr("Video Script Generate")
if st.button(generate_button_name, key="auto_generate_script"):
progress_bar = st.progress(0)
status_text = st.empty()

def update_progress(progress: float, message: str = ""):
progress_bar.progress(progress)
if message:
status_text.text(f"{progress}% - {message}")
else:
status_text.text(f"进度: {progress}%")

try:
with st.spinner("正在生成脚本..."):
if not video_plot:
st.warning("视频剧情为空; 会极大影响生成效果!")
if params.video_clip_json_path == "" and params.video_origin_path != "":
update_progress(10, "压缩视频中...")
# 使用大模型生成视频脚本
script = llm.generate_script(
video_path=params.video_origin_path,
video_plot=video_plot,
video_name=video_name,
language=params.video_language,
progress_callback=update_progress
)
if script is None:
st.error("生成脚本失败,请检查日志")
st.stop()
else:
update_progress(90)

script = utils.clean_model_output(script)
st.session_state['video_clip_json'] = json.loads(script)
else:
# 从本地加载
with open(params.video_clip_json_path, 'r', encoding='utf-8') as f:
update_progress(50)
status_text.text("从本地加载中...")
script = f.read()
script = utils.clean_model_output(script)
st.session_state['video_clip_json'] = json.loads(script)
update_progress(100)
status_text.text("从本地加载成功")

time.sleep(0.5)  # 给进度条一点时间到达100%
progress_bar.progress(100)
status_text.text("脚本生成完成!")
st.success("视频脚本生成成功!")
except Exception as err:
st.error(f"生成过程中发生错误: {str(err)}")
finally:
time.sleep(2)  # 给用户一些时间查看最终状态
progress_bar.empty()
status_text.empty()

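Both branches pass the raw model output through utils.clean_model_output before json.loads. That helper's implementation is not part of this diff; judging by the old inline code it replaces (strip("```json").strip("```")), it presumably removes Markdown code fences. A rough, illustrative sketch of such a cleaner:

```python
import re

def clean_model_output_sketch(text: str) -> str:
    """Illustrative only: strip ```json ... ``` fences so json.loads can parse the script."""
    text = text.strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)   # leading fence
    text = re.sub(r"\s*```$", "", text)            # trailing fence
    return text
```
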
# 视频脚本
video_clip_json_details = st.text_area(
tr("Video Script"),
value=st.session_state['video_clip_json'],
value=json.dumps(st.session_state.video_clip_json, indent=2, ensure_ascii=False),
height=180
)

# 保存脚本
button_columns = st.columns(2)
with button_columns[0]:
if st.button(tr("Save Script"), key="auto_generate_terms", use_container_width=True):
@ -389,117 +448,50 @@ with left_panel:
timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M%S")
save_path = os.path.join(script_dir, f"{timestamp}.json")

# 尝试解析输入的 JSON 数据
input_json = str(video_clip_json_details)
# 去掉json的头尾标识
input_json = input_json.strip('```json').strip('```')
try:
data = json.loads(input_json)
data = utils.add_new_timestamps(json.loads(video_clip_json_details))
except Exception as err:
raise ValueError(
f"视频脚本格式错误,请检查脚本是否符合 JSON 格式;{err} \n\n{traceback.format_exc()}")

# 检查是否是一个列表
if not isinstance(data, list):
raise ValueError("JSON is not a list")

# 检查列表中的每个元素是否包含所需的键
required_keys = {"picture", "timestamp", "narration"}
for item in data:
if not isinstance(item, dict):
raise ValueError("List 元素不是字典")
if not required_keys.issubset(item.keys()):
raise ValueError("Dict 元素不包含必需的键")
st.error(f"视频脚本格式错误,请检查脚本是否符合 JSON 格式;{err} \n\n{traceback.format_exc()}")
st.stop()

# 存储为新的 JSON 文件
with open(save_path, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
# 将data的值存储到 session_state 中,类似缓存
st.session_state['video_script_list'] = data
st.session_state['video_clip_json'] = data
st.session_state['video_clip_json_path'] = save_path
# 刷新页面
st.rerun()

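The save handler now delegates to utils.add_new_timestamps, but the data shape is still the one the removed inline checks enforced: a JSON list whose items each carry picture, timestamp, and narration (a path key is attached later by the clipping step). An illustrative example of a valid script plus the old validation rules as a helper; the field values and the timestamp format shown are assumptions, not confirmed by this diff:

```python
# Field values are made up; only the keys are required by the UI.
script_items = [
    {
        "picture": "A wide shot of the empty lobby",
        "timestamp": "00:00:05-00:00:12",  # assumed format
        "narration": "The story opens in an abandoned building...",
    },
]

def validate_script(items) -> None:
    """Mirrors the checks the previous inline code ran before saving."""
    if not isinstance(items, list):
        raise ValueError("JSON is not a list")
    required_keys = {"picture", "timestamp", "narration"}
    for item in items:
        if not isinstance(item, dict) or not required_keys.issubset(item):
            raise ValueError("every item must be a dict with picture/timestamp/narration")

validate_script(script_items)
```
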
def caijian():
with st.spinner(tr("裁剪视频中...")):
st.session_state['task_id'] = str(uuid4())

if st.session_state.get('video_script_list', None) is not None:
video_script_list = st.session_state.video_script_list
time_list = [i['timestamp'] for i in video_script_list]
subclip_videos = material.clip_videos(
task_id=st.session_state['task_id'],
timestamp_terms=time_list,
origin_video=params.video_origin_path
)
if subclip_videos is None:
st.error(tr("裁剪视频失败"))
st.stop()
st.session_state['subclip_videos'] = subclip_videos
for video_script in video_script_list:
try:
video_script['path'] = subclip_videos[video_script['timestamp']]
except KeyError as e:
st.error(f"裁剪视频失败")
# logger.debug(f"当前的脚本为:{st.session_state.video_script_list}")
else:
st.error(tr("请先生成视频脚本"))

# 裁剪视频
with button_columns[1]:
if st.button(tr("Crop Video"), key="auto_crop_video", use_container_width=True):
caijian()
progress_bar = st.progress(0)
status_text = st.empty()

def update_progress(progress):
progress_bar.progress(progress)
status_text.text(f"剪辑进度: {progress}%")

try:
utils.cut_video(params, update_progress)
time.sleep(0.5)  # 给进度条一点时间到达100%
progress_bar.progress(100)
status_text.text("剪辑完成!")
st.success("视频剪辑成功完成!")
except Exception as e:
st.error(f"剪辑过程中发生错误: {str(e)}")
finally:
time.sleep(2)  # 给用户一些时间查看最终状态
progress_bar.empty()
status_text.empty()

# 新中间面板
with middle_panel:
with st.container(border=True):
st.write(tr("Video Settings"))
video_concat_modes = [
(tr("Sequential"), "sequential"),
(tr("Random"), "random"),
]
# video_sources = [
#     (tr("Pexels"), "pexels"),
#     (tr("Pixabay"), "pixabay"),
#     (tr("Local file"), "local"),
#     (tr("TikTok"), "douyin"),
#     (tr("Bilibili"), "bilibili"),
#     (tr("Xiaohongshu"), "xiaohongshu"),
# ]
#
# saved_video_source_name = config.app.get("video_source", "pexels")
# saved_video_source_index = [v[1] for v in video_sources].index(
#     saved_video_source_name
# )
#
# selected_index = st.selectbox(
#     tr("Video Source"),
#     options=range(len(video_sources)),
#     format_func=lambda x: video_sources[x][0],
#     index=saved_video_source_index,
# )
# params.video_source = video_sources[selected_index][1]
# config.app["video_source"] = params.video_source
#
# if params.video_source == "local":
#     _supported_types = FILE_TYPE_VIDEOS + FILE_TYPE_IMAGES
#     uploaded_files = st.file_uploader(
#         "Upload Local Files",
#         type=["mp4", "mov", "avi", "flv", "mkv", "jpg", "jpeg", "png"],
#         accept_multiple_files=True,
#     )

selected_index = st.selectbox(
tr("Video Concat Mode"),
index=1,
options=range(len(video_concat_modes)),  # 使用索引作为内部选项值
format_func=lambda x: video_concat_modes[x][0],  # 显示给用户的是标签
)
params.video_concat_mode = VideoConcatMode(
video_concat_modes[selected_index][1]
)

# 视频比例
video_aspect_ratios = [
(tr("Portrait"), VideoAspect.portrait.value),
(tr("Landscape"), VideoAspect.landscape.value),
@ -511,14 +503,14 @@ with middle_panel:
)
params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1])

params.video_clip_duration = st.selectbox(
tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1
)
params.video_count = st.selectbox(
tr("Number of Videos Generated Simultaneously"),
options=[1, 2, 3, 4, 5],
index=0,
)
# params.video_clip_duration = st.selectbox(
#     tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1
# )
# params.video_count = st.selectbox(
#     tr("Number of Videos Generated Simultaneously"),
#     options=[1, 2, 3, 4, 5],
#     index=0,
# )
with st.container(border=True):
st.write(tr("Audio Settings"))

@ -557,8 +549,9 @@ with middle_panel:
params.voice_name = voice_name
config.ui["voice_name"] = voice_name

# 试听语言合成
if st.button(tr("Play Voice")):
play_content = params.video_subject
play_content = "这是一段试听语言"
if not play_content:
play_content = params.video_script
if not play_content:
@ -637,7 +630,7 @@ with middle_panel:
index=2,
)

# 新右侧面板
# 新侧面板
with right_panel:
with st.container(border=True):
st.write(tr("Subtitle Settings"))
@ -675,6 +668,7 @@ with right_panel:
if params.custom_position < 0 or params.custom_position > 100:
st.error(tr("Please enter a value between 0 and 100"))
except ValueError:
logger.error(f"输入的值无效: {traceback.format_exc()}")
st.error(tr("Please enter a valid number"))

font_cols = st.columns([0.3, 0.7])
@ -699,7 +693,7 @@ with right_panel:
# 视频编辑面板
with st.expander(tr("Video Check"), expanded=False):
try:
video_list = st.session_state['video_script_list']
video_list = st.session_state.video_clip_json
except KeyError as e:
video_list = []

@ -729,13 +723,16 @@ with st.expander(tr("Video Check"), expanded=False):
# 可编辑的输入框
text_panels = st.columns(2)
with text_panels[0]:
text1 = st.text_area(tr("timestamp"), value=initial_timestamp, height=20)
text1 = st.text_area(tr("timestamp"), value=initial_timestamp, height=20,
key=f"timestamp_{index}")
with text_panels[1]:
text2 = st.text_area(tr("Picture description"), value=initial_picture, height=20)
text3 = st.text_area(tr("Narration"), value=initial_narration, height=100)
text2 = st.text_area(tr("Picture description"), value=initial_picture, height=20,
key=f"picture_{index}")
text3 = st.text_area(tr("Narration"), value=initial_narration, height=100,
key=f"narration_{index}")

# 重新生成按钮
if st.button(tr("Rebuild"), key=f"button_{index}"):
if st.button(tr("Rebuild"), key=f"rebuild_{index}"):
# 更新video_list中的对应项
video_list[index]['timestamp'] = text1
video_list[index]['picture'] = text2
@ -744,28 +741,35 @@ with st.expander(tr("Video Check"), expanded=False):
for video in video_list:
if 'path' in video:
del video['path']
# 更新session_state以确保更改被保存
# 更新session_state以确保更改被保存
st.session_state['video_clip_json'] = utils.to_json(video_list)
# 替换原JSON 文件
with open(video_json_file, 'w', encoding='utf-8') as file:
with open(params.video_clip_json_path, 'w', encoding='utf-8') as file:
json.dump(video_list, file, ensure_ascii=False, indent=4)
caijian()
utils.cut_video(params, progress_callback=None)
st.rerun()

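The edit panel above now gives every widget an explicit key (timestamp_{index}, picture_{index}, narration_{index}, rebuild_{index}). Streamlit requires unique keys when identical widgets are created in a loop; without them it raises a DuplicateWidgetID error. A minimal sketch of the pattern:

```python
import streamlit as st

items = ["clip 1", "clip 2", "clip 3"]  # stand-ins for the script entries

for index, item in enumerate(items):
    # Identical widgets created in a loop must carry unique keys.
    st.text_area("Narration", value=item, key=f"narration_{index}")
    if st.button("Rebuild", key=f"rebuild_{index}"):
        st.write(f"rebuilding segment {index}")
```
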
# 开始按钮
start_button = st.button(tr("Generate Video"), use_container_width=True, type="primary")
if start_button:
# 重置日志容器和记录
log_container = st.empty()
log_records = []

config.save_config()
task_id = st.session_state.get('task_id')
if st.session_state.get('video_script_json_path') is not None:
params.video_clip_json = st.session_state.get('video_clip_json')

logger.debug(f"当前的脚本为:{params.video_clip_json}")
logger.debug(f"当前的脚本文件为:{st.session_state.video_clip_json_path}")
logger.debug(f"当前的视频文件为:{st.session_state.video_origin_path}")
logger.debug(f"裁剪后是视频列表:{st.session_state.subclip_videos}")

if not task_id:
st.error(tr("请先裁剪视频"))
scroll_to_bottom()
st.stop()
if not params.video_clip_json:
if not params.video_clip_json_path:
st.error(tr("脚本文件不能为空"))
scroll_to_bottom()
st.stop()
@ -773,21 +777,12 @@ if start_button:
st.error(tr("视频文件不能为空"))
scroll_to_bottom()
st.stop()
if llm_provider != 'g4f' and not config.app.get(f"{llm_provider}_api_key", ""):
st.error(tr("请输入 LLM API 密钥"))
scroll_to_bottom()
st.stop()

log_container = st.empty()
log_records = []

def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))

logger.add(log_received)

st.toast(tr("生成视频"))
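logger.add(log_received) works because loguru accepts any callable as a sink: each record is delivered as a formatted message string, which the callback above accumulates and re-renders with st.code. The mechanism in isolation, without the Streamlit parts:

```python
from loguru import logger

log_records = []

def log_received(msg):
    # loguru hands a callable sink the fully formatted log line
    log_records.append(str(msg).rstrip())

logger.add(log_received)
logger.info("generating video")  # appended to log_records
```
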
19 webui.sh
@ -44,7 +44,24 @@ for url in "${!urls_paths[@]}"; do
echo "下载失败: $url" >&2
}
done

# 安装 git lfs 并下载模型到指定目录
git lfs install
mkdir -p /NarratoAI/app/models
cd /NarratoAI/app/models
if [ ! -d "faster-whisper-large-v2" ] || [ -z "$(ls -A faster-whisper-large-v2)" ]; then
if git clone https://huggingface.co/guillaumekln/faster-whisper-large-v2; then
echo "下载faster-whisper-large-v2成功"
else
echo "下载faster-whisper-large-v2失败" >&2
exit 1
fi
else
echo "faster-whisper-large-v2 已存在,跳过下载"
fi

# 等待所有后台任务完成
wait
echo "所有文件已成功下载到指定目录"
streamlit run ./webui/Main.py --browser.serverAddress="0.0.0.0" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False
cd /NarratoAI/
streamlit run webui.py --browser.serverAddress="0.0.0.0" --server.enableCORS=True --server.maxUploadSize=2048 --browser.gatherUsageStats=False

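webui.sh now installs the git-lfs hooks and clones faster-whisper-large-v2 into /NarratoAI/app/models before launching the UI. How the model is consumed is not shown in this commit; assuming the project loads the local weights with the faster-whisper package, the call would look roughly like this (the input file path is hypothetical):

```python
from faster_whisper import WhisperModel

# Path matches the directory webui.sh clones into; device/compute_type are illustrative.
model = WhisperModel("app/models/faster-whisper-large-v2", device="cpu", compute_type="int8")

segments, info = model.transcribe("resource/videos/demo.mp4")  # hypothetical input
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
```
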
@ -73,7 +73,7 @@
"Please Enter the LLM API Key": "Please enter the **LLM API Key**",
"Please Enter the Pexels API Key": "Please enter the **Pexels API Key**",
"Please Enter the Pixabay API Key": "Please enter the **Pixabay API Key**",
"Get Help": "One-stop AI video commentary + automated editing tool\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\nFor any questions or suggestions, you can join the **community channel** for help or discussion: https://discord.gg/WBKChhmZ",
"Get Help": "One-stop AI video commentary + automated editing tool\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\nFor any questions or suggestions, you can join the **community channel** for help or discussion: https://github.com/linyqh/NarratoAI/wiki",
"Video Source": "Video Source",
"TikTok": "TikTok (Support is coming soon)",
"Bilibili": "Bilibili (Support is coming soon)",

@ -9,7 +9,8 @@
"Generate Video Script and Keywords": "点击使用AI根据**主题**生成 【视频文案】 和 【视频关键词】",
"Auto Detect": "自动检测",
"Auto Generate": "自动生成",
"Video Script": "视频脚本(:blue[①可不填,使用AI生成 ②合理使用标点断句,有助于生成字幕])",
"Video Name": "视频名称",
"Video Script": "视频脚本(:blue[①使用AI生成 ②从本机加载])",
"Save Script": "保存脚本",
"Crop Video": "裁剪视频",
"Video File": "视频文件(:blue[1️⃣支持上传视频文件(限制2G) 2️⃣大文件建议直接导入 ./resource/videos 目录])",
@ -63,17 +64,16 @@
"You can download the generated video from the following links": "你可以从以下链接下载生成的视频",
"Basic Settings": "**基础设置** (:blue[点击展开])",
"Language": "界面语言",
"Pexels API Key": "Pexels API Key ([点击获取](https://www.pexels.com/api/)) :red[推荐使用]",
"Pixabay API Key": "Pixabay API Key ([点击获取](https://pixabay.com/api/docs/#api_search_videos)) :red[可以不用配置,如果 Pexels 无法使用,再选择Pixabay]",
"LLM Provider": "大模型提供商",
"Video LLM Provider": "视频转录大模型",
"LLM Provider": "大语言模型",
"API Key": "API Key (:red[必填,需要到大模型提供商的后台申请])",
"Base Url": "Base Url (可选)",
"Account ID": "账户ID (Cloudflare的dash面板url中获取)",
"Model Name": "模型名称 (:blue[需要到大模型提供商的后台确认被授权的模型名称])",
"Please Enter the LLM API Key": "请先填写大模型 **API Key**",
"Please Enter the Pexels API Key": "请先填写 **Pexels API Key**",
"Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**",
"Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议,可以加入 **社区频道** 求助或讨论:https://discord.gg/WBKChhmZ",
"Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议,可以加入 **社区频道** 求助或讨论:https://github.com/linyqh/NarratoAI/wiki",
"Video Source": "视频来源",
"TikTok": "抖音 (TikTok 支持中,敬请期待)",
"Bilibili": "哔哩哔哩 (Bilibili 支持中,敬请期待)",
@ -90,6 +90,7 @@
"timestamp": "时间戳",
"Picture description": "图片描述",
"Narration": "视频文案",
"Rebuild": "重新生成"
"Rebuild": "重新生成",
"Video Script Load": "加载视频脚本"
}
}