Merge branch 'main' of github.com:linyqh/NarratoAI

This commit is contained in:
linyqh 2025-07-06 23:03:33 +08:00
commit db2696f1b6
83 changed files with 6440 additions and 2180 deletions

View File

@ -0,0 +1,157 @@
# Auto-creates a GitHub Release whenever the `project_version` file changes on main.
# Release notes are drafted by an LLM from the recent commit subjects.
name: Auto Release Generator

on:
  push:
    branches:
      - main
    paths:
      - 'project_version'  # 确保路径准确,不使用通配符

jobs:
  check-version-and-release:
    runs-on: ubuntu-latest
    permissions:
      contents: write       # 用于创建 releases
      pull-requests: write  # 可能需要的额外权限
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # 获取完整历史以检查变更

      - name: Debug Environment
        run: |
          echo "工作目录内容:"
          ls -la
          echo "project_version 文件内容:"
          cat project_version || echo "文件不存在"

      # Compares project_version against the previous commit; emits
      # version_changed / current_version outputs for the later steps.
      - name: Check if version changed
        id: check-version
        run: |
          # 获取当前版本号
          if [ -f "project_version" ]; then
            CURRENT_VERSION=$(cat project_version)
            echo "Current version: $CURRENT_VERSION"
            # 获取上一个提交中的版本号
            git fetch origin main
            if git show HEAD~1:project_version &>/dev/null; then
              PREVIOUS_VERSION=$(git show HEAD~1:project_version)
              echo "Previous version from commit: $PREVIOUS_VERSION"
              if [[ "$CURRENT_VERSION" != "$PREVIOUS_VERSION" ]]; then
                echo "Version changed from $PREVIOUS_VERSION to $CURRENT_VERSION"
                echo "version_changed=true" >> $GITHUB_OUTPUT
                echo "current_version=$CURRENT_VERSION" >> $GITHUB_OUTPUT
              else
                echo "Version unchanged"
                echo "version_changed=false" >> $GITHUB_OUTPUT
              fi
            else
              echo "Cannot find previous version, assuming first release"
              echo "version_changed=true" >> $GITHUB_OUTPUT
              echo "current_version=$CURRENT_VERSION" >> $GITHUB_OUTPUT
            fi
          else
            echo "project_version file not found"
            echo "version_changed=false" >> $GITHUB_OUTPUT
          fi

      - name: Set up Python
        if: steps.check-version.outputs.version_changed == 'true'
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install OpenAI SDK
        if: steps.check-version.outputs.version_changed == 'true'
        run: pip install openai

      - name: Get commits since last release
        if: steps.check-version.outputs.version_changed == 'true'
        id: get-commits
        run: |
          # 直接获取最近 13 个提交(注释与下面的 git log -13 保持一致)
          echo "Getting last 13 commits"
          COMMITS=$(git log -13 --pretty=format:"%s")
          echo "Commits to be included in release notes:"
          echo "$COMMITS"
          echo "commits<<EOF" >> $GITHUB_OUTPUT
          echo "$COMMITS" >> $GITHUB_OUTPUT
          echo "EOF" >> $GITHUB_OUTPUT

      - name: Generate release notes with AI
        if: steps.check-version.outputs.version_changed == 'true'
        id: generate-notes
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_BASE_URL: https://api.siliconflow.cn/v1
          CURRENT_VERSION: ${{ steps.check-version.outputs.current_version }}
          # Pass commit subjects through an env var instead of expanding the
          # expression inside the shell script (prevents script injection via
          # crafted commit messages).
          COMMITS: ${{ steps.get-commits.outputs.commits }}
        run: |
          cat > generate_release_notes.py << 'EOF'
          import os
          import sys
          from openai import OpenAI

          # 设置OpenAI客户端
          client = OpenAI(
              api_key=os.environ.get("OPENAI_API_KEY"),
              base_url=os.environ.get("OPENAI_BASE_URL")
          )

          # 获取提交信息和版本号
          commits = sys.stdin.read()
          version = os.environ.get("CURRENT_VERSION")

          # 调用API生成发布说明
          try:
              response = client.chat.completions.create(
                  model="deepseek-ai/DeepSeek-V3",
                  messages=[
                      {"role": "system", "content": "你是一个专业的软件发布说明生成助手。请根据提供的git提交信息生成一个结构化的发布说明包括新功能、改进、修复的bug等类别。使用中文回复。"},
                      {"role": "user", "content": f"请根据以下git提交信息,生成一个版本{version}的发布说明,内容详细且完整,相似的提交信息不要重复出现: \n\n{commits}"}
                  ],
                  temperature=0.7,
              )
              release_notes = response.choices[0].message.content
              print(f"commits: \n{commits}")
              print(f"大模型总结的发布说明: \n{release_notes}")
          except Exception as e:
              print(f"Error calling OpenAI API: {e}")
              release_notes = f"# 版本 {version} 发布\n\n## 更新内容\n\n"
              # 简单处理提交信息
              for line in commits.strip().split("\n"):
                  if line:
                      release_notes += f"- {line}\n"

          # 输出生成的发布说明
          print(release_notes)

          # 保存到GitHub输出
          with open(os.environ.get("GITHUB_OUTPUT"), "a") as f:
              f.write("release_notes<<RELEASE_NOTES_EOF\n")
              f.write(release_notes)
              f.write("\nRELEASE_NOTES_EOF\n")
          EOF
          # Feed commits via stdin from the env var (no expression expansion in shell).
          printf '%s\n' "$COMMITS" | python generate_release_notes.py

      - name: Debug release notes
        if: steps.check-version.outputs.version_changed == 'true'
        run: |
          echo "Generated release notes:"
          echo "${{ steps.generate-notes.outputs.release_notes }}"

      - name: Create GitHub Release
        if: steps.check-version.outputs.version_changed == 'true'
        uses: softprops/action-gh-release@v1
        with:
          tag_name: v${{ steps.check-version.outputs.current_version }}
          name: v${{ steps.check-version.outputs.current_version }}
          body: ${{ steps.generate-notes.outputs.release_notes }}
          draft: false
          prerelease: false
          token: ${{ secrets.GIT_TOKEN }}

View File

@ -19,6 +19,6 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GIT_TOKEN }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_API_ENDPOINT: https://api.groq.com/openai/v1
MODEL: llama-3.1-70b-versatile
OPENAI_API_ENDPOINT: https://api.siliconflow.cn/v1
MODEL: deepseek-ai/DeepSeek-V3
LANGUAGE: Chinese

View File

@ -0,0 +1,197 @@
# On each published release: polish the release notes with an LLM and post
# them to a Discord channel as a webhook embed.
name: Discord Release Notification

on:
  release:
    types: [published]

jobs:
  notify-discord:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install openai discord-webhook requests

      - name: Enhance release notes and send to Discord
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_BASE_URL: https://api.siliconflow.cn/v1
          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cat > send_discord_notification.py << 'EOF'
          import os
          import sys
          import json
          from openai import OpenAI
          import requests
          from datetime import datetime
          from discord_webhook import DiscordWebhook, DiscordEmbed

          # 设置OpenAI客户端
          client = OpenAI(
              api_key=os.environ.get("OPENAI_API_KEY"),
              base_url=os.environ.get("OPENAI_BASE_URL")
          )

          # 获取GitHub release信息
          github_token = os.environ.get("GITHUB_TOKEN")
          repo = os.environ.get("GITHUB_REPOSITORY")

          # 直接从GitHub API获取最新release
          headers = {"Authorization": f"token {github_token}"}
          response = requests.get(f"https://api.github.com/repos/{repo}/releases/latest", headers=headers)
          if response.status_code != 200:
              print(f"Error fetching release info: {response.status_code}")
              print(response.text)
              sys.exit(1)
          release_info = response.json()

          # 提取需要的信息
          release_notes = release_info.get("body", "无发布说明")
          version = release_info.get("tag_name", "未知版本")

          # 安全地解析发布日期
          published_at = release_info.get("published_at")
          if published_at:
              try:
                  release_date = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y年%m月%d日")
              except ValueError:
                  release_date = "未知日期"
          else:
              release_date = "未知日期"

          # 使用大模型润色发布说明
          try:
              response = client.chat.completions.create(
                  model="deepseek-ai/DeepSeek-V3",
                  messages=[
                      {"role": "system", "content": "你是一个专业的软件发布公告优化助手。请优化以下发布说明,使其更加生动、专业,并明确区分新功能、优化内容、修复内容和移除内容等类别。保持原有信息的完整性,同时增强可读性和专业性。使用中文回复。\n\n重要Discord不支持复杂的Markdown格式因此请使用简单的格式化\n1. 使用**粗体**和*斜体*而不是Markdown标题\n2. 使用简单的列表符号而不是Markdown列表\n3. 避免使用#、##等标题格式\n4. 不要使用表格、代码块等复杂格式\n5. 确保段落之间有空行\n6. 使用简单的分隔符(如 ------)来分隔不同部分"},
                      {"role": "user", "content": f"请优化以下版本{version}的发布说明使其更适合在Discord社区发布。请记住Discord不支持复杂的Markdown格式所以使用简单的格式化方式\n\n{release_notes}"}
                  ],
                  temperature=0.7,
              )
              enhanced_notes = response.choices[0].message.content
              print(f"大模型润色后的发布说明: \n{enhanced_notes}")
          except Exception as e:
              print(f"Error calling OpenAI API: {e}")
              enhanced_notes = release_notes  # 如果API调用失败使用原始发布说明

          # 创建Discord消息
          webhook_url = os.environ.get("DISCORD_WEBHOOK_URL")
          if not webhook_url:
              print("Error: DISCORD_WEBHOOK_URL not set")
              sys.exit(1)
          webhook = DiscordWebhook(url=webhook_url)

          # 创建嵌入式消息
          embed = DiscordEmbed(
              title=f"🚀 NarratoAI {version} 发布公告",
              description=f"发布日期: {release_date}",
              color="5865F2"  # Discord蓝色
          )

          # 处理发布说明确保不超过Discord的字段限制
          # Discord字段值限制为1024个字符
          MAX_FIELD_LENGTH = 1024

          # 如果内容很短,直接添加
          if enhanced_notes and len(enhanced_notes) <= MAX_FIELD_LENGTH:
              embed.add_embed_field(name="📋 更新内容", value=enhanced_notes)
          elif enhanced_notes:
              # 尝试按段落或明显的分隔符分割内容
              sections = []
              # 检查是否有明显的新功能、优化、修复等部分
              if "**新增功能**" in enhanced_notes or "**新功能**" in enhanced_notes:
                  parts = enhanced_notes.split("**新增功能**", 1)
                  if len(parts) > 1:
                      intro = parts[0].strip()
                      if intro:
                          sections.append(("📋 更新概述", intro))
                      rest = "**新增功能**" + parts[1]
                      # 进一步分割剩余部分
                      feature_end = -1
                      for marker in ["**优化内容**", "**性能优化**", "**修复内容**", "**bug修复**", "**问题修复**"]:
                          pos = rest.lower().find(marker.lower())
                          if pos != -1 and (feature_end == -1 or pos < feature_end):
                              feature_end = pos
                      if feature_end != -1:
                          sections.append(("✨ 新增功能", rest[:feature_end].strip()))
                          rest = rest[feature_end:]
                      else:
                          sections.append(("✨ 新增功能", rest.strip()))
                          rest = ""
                      # 继续分割剩余部分
                      if rest:
                          optimize_end = -1
                          for marker in ["**修复内容**", "**bug修复**", "**问题修复**"]:
                              pos = rest.lower().find(marker.lower())
                              if pos != -1 and (optimize_end == -1 or pos < optimize_end):
                                  optimize_end = pos
                          if optimize_end != -1:
                              sections.append(("⚡ 优化内容", rest[:optimize_end].strip()))
                              sections.append(("🔧 修复内容", rest[optimize_end:].strip()))
                          else:
                              sections.append(("⚡ 优化内容", rest.strip()))
              else:
                  # 如果没有明显的结构,按长度分割
                  chunks = [enhanced_notes[i:i+MAX_FIELD_LENGTH] for i in range(0, len(enhanced_notes), MAX_FIELD_LENGTH)]
                  for i, chunk in enumerate(chunks):
                      if i == 0:
                          sections.append(("📋 更新内容", chunk))
                      else:
                          # Fix: close the full-width parenthesis in continuation titles.
                          sections.append((f"📋 更新内容(续{i})", chunk))

              # 添加所有部分到embed
              # NOTE(review): Discord caps an embed at 25 fields / 6000 chars total;
              # very long notes may still be rejected — confirm whether an overall
              # truncation guard is needed.
              for name, content in sections:
                  if len(content) > MAX_FIELD_LENGTH:
                      # 如果单个部分仍然过长,进一步分割
                      sub_chunks = [content[i:i+MAX_FIELD_LENGTH] for i in range(0, len(content), MAX_FIELD_LENGTH)]
                      for i, chunk in enumerate(sub_chunks):
                          if i == 0:
                              embed.add_embed_field(name=name, value=chunk)
                          else:
                              embed.add_embed_field(name=f"{name}(续{i})", value=chunk)
                  else:
                      embed.add_embed_field(name=name, value=content)
          else:
              embed.add_embed_field(name="📋 更新内容", value="无详细更新内容")

          # 添加下载链接
          html_url = release_info.get("html_url", "")
          if html_url:
              embed.add_embed_field(name="📥 下载链接", value=html_url, inline=False)

          # 设置页脚
          embed.set_footer(text=f"NarratoAI 团队 • {release_date}")
          embed.set_timestamp()

          # 添加嵌入式消息到webhook
          webhook.add_embed(embed)

          # 发送消息
          response = webhook.execute()
          if response:
              print(f"Discord notification sent with status code: {response.status_code}")
          else:
              print("Failed to send Discord notification")
          EOF
          # 执行脚本
          python send_discord_notification.py

View File

@ -1,48 +0,0 @@
name: build_docker
on:
release:
types: [created] # 表示在创建新的 Release 时触发
workflow_dispatch:
jobs:
build_docker:
name: Build docker
runs-on: ubuntu-latest
steps:
- name: Remove unnecessary files
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Checkout
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Extract project version
id: extract_version
run: |
project_version=$(grep 'project_version' config.example.toml | cut -d '"' -f 2)
echo "PROJECT_VERSION=$project_version" >> $GITHUB_ENV
- name: Build and push
id: docker_build
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile
push: true
platforms: linux/amd64,linux/arm64
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/narratoai:${{ env.PROJECT_VERSION }}
${{ secrets.DOCKERHUB_USERNAME }}/narratoai:latest

View File

@ -1,40 +0,0 @@
name: Latest Changes
on:
pull_request_target:
branches:
- main
types:
- closed
workflow_dispatch:
inputs:
number:
description: PR number
required: true
debug_enabled:
description: "在启用 tmate 调试的情况下运行构建 (https://github.com/marketplace/actions/debugging-with-tmate)"
required: false
default: "false"
jobs:
latest-changes:
runs-on: ubuntu-latest
permissions:
pull-requests: read
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v4
with:
# 允许将最新更改提交到主分支
token: ${{ secrets.GIT_TOKEN }}
- uses: tiangolo/latest-changes@0.3.2
with:
token: ${{ secrets.GIT_TOKEN }}
latest_changes_file: ./release-notes.md
latest_changes_header: "## Latest Changes"
end_regex: "^## "
debug_logs: true
label_header_prefix: "### "

View File

@ -1,22 +0,0 @@
name: Release Drafter
on:
push:
branches:
- main
pull_request:
types: [opened, reopened, synchronize]
permissions:
contents: read
jobs:
update_release_draft:
permissions:
contents: write
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: release-drafter/release-drafter@v5
env:
GITHUB_TOKEN: ${{ secrets.GIT_TOKEN }}

4
.gitignore vendored
View File

@ -32,4 +32,8 @@ resource/fonts/*.ttf
resource/fonts/*.otf
resource/srt/*.srt
app/models/faster-whisper-large-v2/*
app/models/faster-whisper-large-v3/*
app/models/bert/*
bug清单.md
task.md

View File

@ -1,197 +0,0 @@
<div align="center">
<h1 align="center" style="font-size: 2cm;"> NarratoAI 😎📽️ </h1>
<h3 align="center">An all-in-one AI-powered tool for film commentary and automated video editing.🎬🎞️ </h3>
<h3>📖 English | <a href="README.md">简体中文</a> | <a href="README-ja.md">日本語</a> </h3>
<div align="center">
[//]: # ( <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
</div>
<br>
NarratoAI is an automated video narration tool that provides an all-in-one solution for script writing, automated video editing, voice-over, and subtitle generation, powered by LLM to enhance efficient content creation.
<br>
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI)
[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE)
[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues)
[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers)
<a href="https://github.com/linyqh/NarratoAI/wiki" target="_blank">💬 Join the open source community to get project updates and the latest news.</a>
<h3>Home</h3>
![](docs/index-en.png)
<h3>Video Review Interface</h3>
![](docs/check-en.png)
</div>
## Future Plans 🥳
- [x] Windows Integration Pack Release
- [ ] Optimized the story generation process and improved the generation effect
- [ ] Support local large model MiniCPM-V
- [ ] Support local large model Qwen2-VL
- [ ] ...
## System Requirements 📦
- Recommended minimum: CPU with 4 cores or more, 8GB RAM or more, GPU is not required
- Windows 10 or MacOS 11.0 or above
## Quick Start 🚀
### 1. Apply for Google AI Studio Account
1. Visit https://aistudio.google.com/app/prompts/new_chat to apply for an account.
2. Click `Get API Key` to request an API Key.
3. Enter the obtained API Key into the `gemini_api_key` setting in the `config.example.toml` file.
### 2. Configure Proxy VPN
> The method to configure VPN is not restricted, as long as you can access Google's network. Here, `clash` is used as an example.
1. Note the port of the clash service, usually `http://127.0.0.1:7890`.
2. If the port is not `7890`, modify the `VPN_PROXY_URL` in the `docker-compose.yml` file to your proxy address.
```yaml
environment:
- "VPN_PROXY_URL=http://host.docker.internal:7890" # Change to your proxy port; host.docker.internal represents the IP of the physical machine.
```
3. (Optional) Or modify the `proxy` settings in the `config.example.toml` file.
```toml
[proxy]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
http = "http://xx.xx.xx.xx:7890"
https = "http://xx.xx.xx.xx:7890"
```
### 3. Get Started 📥 with the Modpack (for Windows users)
NarratoAI Modpack v0.1.2 is released 🚀
Hurry up and follow the WeChat public account [NarratoAI助手] and reply to the keyword [整合包] to get the latest download link! Give it a try!
Note:
- Currently only available for Windows, Mac version is in development, Linux version will be available in a future release.
### 4. Get started 🐳 with docker (for Mac and Linux users)
#### ① clone project, Start Docker
```shell
git clone https://github.com/linyqh/NarratoAI.git
cd NarratoAI
docker-compose up
```
#### ② Access the Web Interface
Open your browser and go to http://127.0.0.1:8501
#### ③ Access the API Documentation
Open your browser and go to http://127.0.0.1:8080/docs or http://127.0.0.1:8080/redoc
## Usage
#### 1. Basic Configuration, Select Model, Enter API Key, and Choose Model
> Currently, only the `Gemini` model is supported. Other modes will be added in future updates. Contributions are welcome via [PR](https://github.com/linyqh/NarratoAI/pulls) to join in the development 🎉🎉🎉
<div align="center">
<img src="docs/img001-en.png" alt="001" width="1000"/>
</div>
#### 2. Select the Video for Narration and Click to Generate Video Script
> A demo video is included in the platform. To use your own video, place the mp4 file in the `resource/videos` directory and refresh your browser.
> Note: The filename can be anything, but it must not contain Chinese characters, special characters, spaces, backslashes, etc.
<div align="center">
<img src="docs/img002-en.png" alt="002" width="400"/>
</div>
#### 3. Save the Script and Start Editing
> After saving the script, refresh the browser, and the newly generated `.json` script file will appear in the script file dropdown. Select the json file and video to start editing.
<div align="center">
<img src="docs/img003-en.png" alt="003" width="400"/>
</div>
#### 4. Review the Video; if there are segments that don't meet the rules, click to regenerate or manually edit them.
<div align="center">
<img src="docs/img004-en.png" alt="003" width="1000"/>
</div>
#### 5. Configure Basic Video Parameters
<div align="center">
<img src="docs/img005-en.png" alt="003" width="700"/>
</div>
#### 6. Start Generating
<div align="center">
<img src="docs/img006-en.png" alt="003" width="1000"/>
</div>
#### 7. Video Generation Complete
<div align="center">
<img src="docs/img007-en.png" alt="003" width="1000"/>
</div>
## Development 💻
1. Install Dependencies
```shell
conda create -n narratoai python=3.10
conda activate narratoai
cd narratoai
pip install -r requirements.txt
```
2. Install ImageMagick
###### Windows:
- Download https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-38-Q16-x64-static.exe
- Install the downloaded ImageMagick, ensuring you do not change the installation path
- Update `imagemagick_path` in the `config.toml` file to your actual installation path (typically `C:\Program Files\ImageMagick-7.1.1-Q16\magick.exe`)
###### MacOS:
```shell
brew install imagemagick
````
###### Ubuntu
```shell
sudo apt-get install imagemagick
```
###### CentOS
```shell
sudo yum install ImageMagick
```
3. initiate webui
```shell
streamlit run ./webui/Main.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False
```
4. Access http://127.0.0.1:8501
## Feedback & Suggestions 📢
### 👏 1. You can submit [issues](https://github.com/linyqh/NarratoAI/issues) or [pull requests](https://github.com/linyqh/NarratoAI/pulls)
### 💬 2. [Join the open source community exchange group](https://github.com/linyqh/NarratoAI/wiki)
### 👉 3. [frequently asked questions](https://thread-marsupial-df8.notion.site/105866888dab80988650fa063b1df4eb)
## Reference Projects 📚
- https://github.com/FujiwaraChoki/MoneyPrinter
- https://github.com/harry0703/MoneyPrinterTurbo
This project was refactored based on the above projects with the addition of video narration features. Thanks to the original authors for their open-source spirit 🥳🥳🥳
## License 📝
Click to view the [`LICENSE`](LICENSE) file
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)

115
README-en.md Normal file
View File

@ -0,0 +1,115 @@
<div align="center">
<h1 align="center" style="font-size: 2cm;"> NarratoAI 😎📽️ </h1>
<h3 align="center">An all-in-one AI-powered tool for film commentary and automated video editing.🎬🎞️ </h3>
<h3>📖 English | <a href="README.md">简体中文</a> | <a href="README-ja.md">日本語</a> </h3>
<div align="center">
[//]: # ( <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
</div>
<br>
NarratoAI is an automated video narration tool that provides an all-in-one solution for script writing, automated video editing, voice-over, and subtitle generation, powered by LLM to enhance efficient content creation.
<br>
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI)
[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE)
[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues)
[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers)
<a href="https://discord.com/invite/V2pbAqqQNb" target="_blank">💬 Join the open source community to get project updates and the latest news.</a>
<h2><a href="https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg?from=from_copylink" target="_blank">🎉🎉🎉 Official Documentation 🎉🎉🎉</a> </h2>
<h3>Home</h3>
![](docs/index-en.png)
<h3>Video Review Interface</h3>
![](docs/check-en.png)
</div>
## Latest News
- 2025.05.11 Released new version 0.6.0, supports **short drama commentary** and optimized editing process
- 2025.03.06 Released new version 0.5.2, supports DeepSeek R1 and DeepSeek V3 models for short drama mixing
- 2024.12.16 Released new version 0.3.9, supports Alibaba Qwen2-VL model for video understanding; supports short drama mixing
- 2024.11.24 Opened Discord community: https://discord.com/invite/V2pbAqqQNb
- 2024.11.11 Migrated open source community, welcome to join! [Join the official community](https://github.com/linyqh/NarratoAI/wiki)
- 2024.11.10 Released official documentation, details refer to [Official Documentation](https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg)
- 2024.11.10 Released new version v0.3.5; optimized video editing process,
## Major Benefits 🎉
From now on, fully support DeepSeek model! Register to enjoy 20 million free tokens (worth 14 yuan platform quota), editing a 10-minute video only costs 0.1 yuan!
🔥 Quick benefits:
1⃣ Click the link to register: https://cloud.siliconflow.cn/i/pyOKqFCV
2⃣ Log in with your phone number, **be sure to fill in the invitation code: pyOKqFCV**
3⃣ Receive a 14 yuan quota, experience high cost-effective AI editing quickly!
💡 Low cost, high creativity:
Silicon Flow API Key can be integrated with one click, doubling intelligent editing efficiency!
(Note: The invitation code is the only proof for benefit collection, automatically credited after registration)
Immediately take action to unlock your AI productivity with "pyOKqFCV"!
😊 Update Steps:
Integration Package: Click update.bat one-click update script
Code Build: Use git pull to fetch the latest code
## Announcement 📢
_**Note⚠: Recently, someone has been impersonating the author on x (Twitter) to issue tokens on the pump.fun platform! This is a scam!!! Do not be deceived! Currently, NarratoAI has not made any official promotions on x (Twitter), please be cautious**_
Below is a screenshot of this person's x (Twitter) homepage
<img src="https://github.com/user-attachments/assets/c492ab99-52cd-4ba2-8695-1bd2073ecf12" alt="Screenshot_20250109_114131_Samsung Internet" style="width:30%; height:auto;">
## Future Plans 🥳
- [x] Windows Integration Pack Release
- [x] Optimized the story generation process and improved the generation effect
- [x] Released version 0.3.5 integration package
- [x] Support Alibaba Qwen2-VL large model for video understanding
- [x] Support short drama commentary
- [x] One-click merge materials
- [x] One-click transcription
- [x] One-click clear cache
- [ ] Support exporting to Jianying drafts
- [ ] Character face matching
- [ ] Support automatic matching based on voiceover, script, and video materials
- [ ] Support more TTS engines
- [ ] ...
## System Requirements 📦
- Recommended minimum: CPU with 4 cores or more, 8GB RAM or more, GPU is not required
- Windows 10/11 or MacOS 11.0 or above
- [Python 3.12+](https://www.python.org/downloads/)
## Feedback & Suggestions 📢
👏 1. You can submit [issue](https://github.com/linyqh/NarratoAI/issues) or [pull request](https://github.com/linyqh/NarratoAI/pulls)
💬 2. [Join the open source community exchange group](https://github.com/linyqh/NarratoAI/wiki)
📷 3. Follow the official account [NarratoAI助手] to grasp the latest news
## Reference Projects 📚
- https://github.com/FujiwaraChoki/MoneyPrinter
- https://github.com/harry0703/MoneyPrinterTurbo
This project was refactored based on the above projects with the addition of video narration features. Thanks to the original authors for their open-source spirit 🥳🥳🥳
## Buy the Author a Cup of Coffee ☕️
<div style="display: flex; justify-content: space-between;">
<img src="https://github.com/user-attachments/assets/5038ccfb-addf-4db1-9966-99415989fd0c" alt="Image 1" style="width: 350px; height: 350px; margin: auto;"/>
<img src="https://github.com/user-attachments/assets/07d4fd58-02f0-425c-8b59-2ab94b4f09f8" alt="Image 2" style="width: 350px; height: 350px; margin: auto;"/>
</div>
## License 📝
Click to view [`LICENSE`](LICENSE) file
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)

View File

@ -39,8 +39,15 @@ NarratoAIは、LLMを活用してスクリプト作成、自動ビデオ編集
- [x] Windows統合パックリリース
- [x] ストーリー生成プロセスの最適化、生成効果の向上
- [x] バージョン0.3.5統合パックリリース
- [ ] アリババQwen2-VL大規模モデルのビデオ理解サポート
- [ ] 短編ドラマの解説サポート
- [x] アリババQwen2-VL大規模モデルのビデオ理解サポート
- [x] 短編ドラマの解説サポート
- [x] 一クリックで素材を統合
- [x] 一クリックで文字起こし
- [x] 一クリックでキャッシュをクリア
- [ ] ジャン映草稿のエクスポートをサポート
- [ ] 主役の顔のマッチング
- [ ] 音声、スクリプト、ビデオ素材に基づいて自動マッチングをサポート
- [ ] より多くのTTSエンジンをサポート
- [ ] ...
## システム要件 📦

View File

@ -4,7 +4,7 @@
<h3 align="center">一站式 AI 影视解说+自动化剪辑工具🎬🎞️ </h3>
<h3>📖 <a href="README-cn.md">English</a> | 简体中文 | <a href="README-ja.md">日本語</a> </h3>
<h3>📖 <a href="README-en.md">English</a> | 简体中文 | <a href="README-ja.md">日本語</a> </h3>
<div align="center">
[//]: # ( <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
@ -32,6 +32,7 @@ NarratoAI 是一个自动化影视解说工具基于LLM实现文案撰写、
</div>
## 最新资讯
- 2025.05.11 发布新版本 0.6.0,支持 **短剧解说** 和 优化剪辑流程
- 2025.03.06 发布新版本 0.5.2,支持 DeepSeek R1 和 DeepSeek V3 模型进行短剧混剪
- 2024.12.16 发布新版本 0.3.9,支持阿里 Qwen2-VL 模型理解视频;支持短剧混剪
- 2024.11.24 开通 discord 社群https://discord.com/invite/V2pbAqqQNb
@ -75,16 +76,17 @@ _**注意⚠️:近期在 x (推特) 上发现有人冒充作者在 pump.fun
- [x] 一键转录
- [x] 一键清理缓存
- [ ] 支持导出剪映草稿
- [ ] 支持短剧解说
- [X] 支持短剧解说
- [ ] 主角人脸匹配
- [ ] 支持根据口播,文案,视频素材自动匹配
- [ ] 支持更多 TTS 引擎
- [ ] ...
## 配置要求 📦
- 建议最低 CPU 4核或以上内存 8G 或以上,显卡非必须
- Windows 10 或 MacOS 11.0 以上系统
- [Python 3.10+](https://www.python.org/downloads/)
- Windows 10/11 或 MacOS 11.0 以上系统
- [Python 3.12+](https://www.python.org/downloads/)
## 反馈建议 📢

View File

@ -13,6 +13,7 @@ from app.config import config
from app.models.exception import HttpException
from app.router import root_api_router
from app.utils import utils
from app.utils import ffmpeg_utils
def exception_handler(request: Request, e: HttpException):
@ -80,3 +81,10 @@ def shutdown_event():
@app.on_event("startup")
def startup_event():
logger.info("startup event")
# 检测FFmpeg硬件加速
hwaccel_info = ffmpeg_utils.detect_hardware_acceleration()
if hwaccel_info["available"]:
logger.info(f"FFmpeg硬件加速检测结果: 可用 | 类型: {hwaccel_info['type']} | 编码器: {hwaccel_info['encoder']} | 独立显卡: {hwaccel_info['is_dedicated_gpu']} | 参数: {hwaccel_info['hwaccel_args']}")
else:
logger.warning(f"FFmpeg硬件加速不可用: {hwaccel_info['message']}, 将使用CPU软件编码")

View File

@ -6,6 +6,19 @@ from loguru import logger
root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
config_file = f"{root_dir}/config.toml"
version_file = f"{root_dir}/project_version"
def get_version_from_file():
    """Read the project version from the module-level `project_version` file.

    Returns:
        str: The stripped contents of the version file, or the fallback
        "0.1.0" when the file is missing or cannot be read.
    """
    fallback_version = "0.1.0"  # 默认版本号
    try:
        # Guard clause: missing file means we fall back to the default.
        if not os.path.isfile(version_file):
            return fallback_version
        with open(version_file, "r", encoding="utf-8") as fh:
            return fh.read().strip()
    except Exception as e:
        # Any read/permission error is logged and the default is used.
        logger.error(f"读取版本号文件失败: {str(e)}")
        return fallback_version
def load_config():
@ -57,7 +70,8 @@ project_description = _cfg.get(
"project_description",
"<a href='https://github.com/linyqh/NarratoAI'>https://github.com/linyqh/NarratoAI</a>",
)
project_version = _cfg.get("app", {}).get("project_version")
# 从文件读取版本号,而不是从配置文件中获取
project_version = get_version_from_file()
reload_debug = False
imagemagick_path = app.get("imagemagick_path", "")

View File

@ -1,6 +1,6 @@
import warnings
from enum import Enum
from typing import Any, List, Optional
from typing import Any, List, Optional, Union
import pydantic
from pydantic import BaseModel, Field
@ -13,6 +13,24 @@ warnings.filterwarnings(
)
class AudioVolumeDefaults:
    """Shared default audio-volume constants, kept in one place for global
    consistency across request models (音量配置默认值常量类)."""

    # Default narration / TTS speech volume.
    VOICE_VOLUME = 1.0
    TTS_VOLUME = 1.0

    # Default original-footage volume (the key value of the volume bug fix).
    ORIGINAL_VOLUME = 0.7

    # Default background-music volume.
    BGM_VOLUME = 0.3

    # Accepted volume range bounds.
    MIN_VOLUME = 0.0
    MAX_VOLUME = 1.0
class VideoConcatMode(str, Enum):
random = "random"
sequential = "sequential"
@ -20,7 +38,9 @@ class VideoConcatMode(str, Enum):
class VideoAspect(str, Enum):
landscape = "16:9"
landscape_2 = "4:3"
portrait = "9:16"
portrait_2 = "3:4"
square = "1:1"
def to_resolution(self):
@ -99,7 +119,7 @@ class VideoParams(BaseModel):
video_subject: str
video_script: str = "" # 用于生成视频的脚本
video_terms: Optional[str | list] = None # 用于生成视频的关键词
video_terms: Optional[Union[str, list]] = None # 用于生成视频的关键词
video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
video_clip_duration: Optional[int] = 5
@ -111,11 +131,11 @@ class VideoParams(BaseModel):
video_language: Optional[str] = "" # auto detect
voice_name: Optional[str] = ""
voice_volume: Optional[float] = 1.0
voice_volume: Optional[float] = AudioVolumeDefaults.VOICE_VOLUME
voice_rate: Optional[float] = 1.0
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
bgm_volume: Optional[float] = AudioVolumeDefaults.BGM_VOLUME
subtitle_enabled: Optional[bool] = True
subtitle_position: Optional[str] = "bottom" # top, bottom, center
@ -155,11 +175,11 @@ class AudioRequest(BaseModel):
video_script: str
video_language: Optional[str] = ""
voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
voice_volume: Optional[float] = 1.0
voice_volume: Optional[float] = AudioVolumeDefaults.VOICE_VOLUME
voice_rate: Optional[float] = 1.2
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
bgm_volume: Optional[float] = AudioVolumeDefaults.BGM_VOLUME
video_source: Optional[str] = "local"
@ -345,7 +365,7 @@ class VideoClipParams(BaseModel):
# video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
voice_volume: Optional[float] = Field(default=1.0, description="解说语音音量")
voice_volume: Optional[float] = Field(default=AudioVolumeDefaults.VOICE_VOLUME, description="解说语音音量")
voice_rate: Optional[float] = Field(default=1.0, description="语速")
voice_pitch: Optional[float] = Field(default=1.0, description="语调")
@ -360,13 +380,14 @@ class VideoClipParams(BaseModel):
text_back_color: Optional[str] = None # 文本背景色
stroke_color: str = "black" # 描边颜色
stroke_width: float = 1.5 # 描边宽度
subtitle_position: str = "bottom" # top, bottom, center, custom
subtitle_position: str = "bottom" # top, bottom, center, custom
custom_position: float = 70.0 # 自定义位置
n_threads: Optional[int] = Field(default=16, description="解说语音音量") # 线程<E7BABF><E7A88B><EFBFBD>,有助于提升视频处理速度
n_threads: Optional[int] = Field(default=16, description="线程数") # 线程数,有助于提升视频处理速度
tts_volume: Optional[float] = Field(default=1.0, description="解说语音音量(后处理)")
original_volume: Optional[float] = Field(default=1.0, description="视频原声音量")
bgm_volume: Optional[float] = Field(default=0.6, description="背景音乐音量")
tts_volume: Optional[float] = Field(default=AudioVolumeDefaults.TTS_VOLUME, description="解说语音音量(后处理)")
original_volume: Optional[float] = Field(default=AudioVolumeDefaults.ORIGINAL_VOLUME, description="视频原声音量")
bgm_volume: Optional[float] = Field(default=AudioVolumeDefaults.BGM_VOLUME, description="背景音乐音量")
class VideoTranscriptionRequest(BaseModel):

View File

@ -6,6 +6,7 @@ class GenerateScriptRequest(BaseModel):
video_path: str
video_theme: Optional[str] = ""
custom_prompt: Optional[str] = ""
frame_interval_input: Optional[int] = 5
skip_seconds: Optional[int] = 0
threshold: Optional[int] = 30
vision_batch_size: Optional[int] = 5

View File

@ -0,0 +1,97 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : prompt
@Author : 小林同学
@Date : 2025/5/9 上午12:57
'''
# 字幕剧情分析提示词
subtitle_plot_analysis_v1 = """
# 角色
你是一位专业的剧本分析师和剧情概括助手
# 任务
我将为你提供一部短剧的完整字幕文本请你基于这些字幕完成以下任务
1. **整体剧情分析**简要概括整个短剧的核心剧情脉络主要冲突和结局如果有的话
2. **分段剧情解析与时间戳定位**
* 将整个短剧划分为若干个关键的剧情段落例如开端发展转折高潮结局或根据具体情节自然划分
* 段落数应该与字幕长度成正比
* 对于每一个剧情段落
* **概括该段落的主要内容**用简洁的语言描述这段剧情发生了什么
* **标注对应的时间戳范围**明确指出该剧情段落对应的开始字幕时间戳和结束字幕时间戳请直接从字幕中提取时间信息
# 输入格式
字幕内容通常包含时间戳和对话例如
```
00:00:05,000 --> 00:00:10,000
[角色A]: 你好吗
00:00:10,500 --> 00:00:15,000
[角色B]: 我很好谢谢发生了一些有趣的事情
... (更多字幕内容) ...
```
我将把实际字幕粘贴在下方
# 输出格式要求
请按照以下格式清晰地呈现分析结果
**整体剧情概括**
[此处填写对整个短剧剧情的概括]
**分段剧情解析**
**剧情段落 1[段落主题/概括例如主角登场与背景介绍]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
**剧情段落 2[段落主题/概括例如第一个冲突出现]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
... (根据实际剧情段落数量继续) ...
**剧情段落 N[段落主题/概括例如结局与反思]**
* **时间戳** [开始时间戳] --> [结束时间戳]
* **内容概要** [对这段剧情的详细描述]
# 注意事项
* 请确保时间戳的准确性直接引用字幕中的时间
* 剧情段落的划分应合乎逻辑能够反映剧情的起承转合
* 语言表达应简洁准确客观
# 限制
1. 严禁输出与分析结果无关的内容
2.
# 请处理以下字幕:
"""
plot_writing = """
我是一个影视解说up主需要为我的粉丝讲解短剧%s的剧情目前正在解说剧情希望能让粉丝通过我的解说了解剧情并且产生 继续观看的兴趣请生成一篇解说脚本包含解说文案以及穿插原声的片段下面<plot>中的内容是短剧的剧情概述
<plot>
%s
</plot>
请使用 json 格式进行输出使用 <output> 中的输出格式
<output>
{
"items": [
{
"_id": 1, # 唯一递增id
"timestamp": "00:00:05,390-00:00:10,430",
"picture": "剧情描述或者备注",
"narration": "解说文案,如果片段为穿插的原片片段,可以直接使用 ‘播放原片+_id 进行占位",
"OST": "值为 0 表示当前片段为解说片段,值为 1 表示当前片段为穿插的原片"
}
}
</output>
<restriction>
1. 只输出 json 内容不要输出其他任何说明性的文字
2. 解说文案的语言使用 简体中文
3. 严禁虚构剧情所有画面只能从 <polt> 中摘取
4. 严禁虚构时间戳所有时间戳范围只能从 <polt> 中摘取
</restriction>
"""

View File

@ -0,0 +1,456 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : 短剧解说
@Author : 小林同学
@Date : 2025/5/9 上午12:36
'''
import os
import json
import requests
from typing import Dict, Any, Optional
from loguru import logger
from app.config import config
from app.utils.utils import get_uuid, storage_dir
from app.services.SDE.prompt import subtitle_plot_analysis_v1, plot_writing
class SubtitleAnalyzer:
"""字幕剧情分析器,负责分析字幕内容并提取关键剧情段落"""
def __init__(
self,
api_key: Optional[str] = None,
model: Optional[str] = None,
base_url: Optional[str] = None,
custom_prompt: Optional[str] = None,
temperature: Optional[float] = 1.0,
):
"""
初始化字幕分析器
Args:
api_key: API密钥如果不提供则从配置中读取
model: 模型名称如果不提供则从配置中读取
base_url: API基础URL如果不提供则从配置中读取或使用默认值
custom_prompt: 自定义提示词如果不提供则使用默认值
temperature: 模型温度
"""
# 使用传入的参数或从配置中获取
self.api_key = api_key
self.model = model
self.base_url = base_url
self.temperature = temperature
# 设置提示词模板
self.prompt_template = custom_prompt or subtitle_plot_analysis_v1
# 初始化HTTP请求所需的头信息
self._init_headers()
def _init_headers(self):
"""初始化HTTP请求头"""
try:
# 基础请求头包含API密钥和内容类型
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}"
}
# logger.debug(f"初始化成功 - API Key: {self.api_key[:8]}... - Base URL: {self.base_url}")
except Exception as e:
logger.error(f"初始化请求头失败: {str(e)}")
raise
def analyze_subtitle(self, subtitle_content: str) -> Dict[str, Any]:
"""
分析字幕内容
Args:
subtitle_content: 字幕内容文本
Returns:
Dict[str, Any]: 包含分析结果的字典
"""
try:
# 构建完整提示词
prompt = f"{self.prompt_template}\n\n{subtitle_content}"
# 构建请求体数据
payload = {
"model": self.model,
"messages": [
{"role": "system", "content": "你是一位专业的剧本分析师和剧情概括助手。"},
{"role": "user", "content": prompt}
],
"temperature": self.temperature
}
# 构建请求地址
url = f"{self.base_url}/chat/completions"
# 发送HTTP请求
response = requests.post(url, headers=self.headers, json=payload)
# 解析响应
if response.status_code == 200:
response_data = response.json()
# 提取响应内容
if "choices" in response_data and len(response_data["choices"]) > 0:
analysis_result = response_data["choices"][0]["message"]["content"]
logger.debug(f"字幕分析完成消耗的tokens: {response_data.get('usage', {}).get('total_tokens', 0)}")
# 返回结果
return {
"status": "success",
"analysis": analysis_result,
"tokens_used": response_data.get("usage", {}).get("total_tokens", 0),
"model": self.model,
"temperature": self.temperature
}
else:
logger.error("字幕分析失败: 未获取到有效响应")
return {
"status": "error",
"message": "未获取到有效响应",
"temperature": self.temperature
}
else:
error_msg = f"请求失败,状态码: {response.status_code}, 响应: {response.text}"
logger.error(error_msg)
return {
"status": "error",
"message": error_msg,
"temperature": self.temperature
}
except Exception as e:
logger.error(f"字幕分析过程中发生错误: {str(e)}")
return {
"status": "error",
"message": str(e),
"temperature": self.temperature
}
def analyze_subtitle_from_file(self, subtitle_file_path: str) -> Dict[str, Any]:
"""
从文件读取字幕并分析
Args:
subtitle_file_path: 字幕文件的路径
Returns:
Dict[str, Any]: 包含分析结果的字典
"""
try:
# 检查文件是否存在
if not os.path.exists(subtitle_file_path):
return {
"status": "error",
"message": f"字幕文件不存在: {subtitle_file_path}",
"temperature": self.temperature
}
# 读取文件内容
with open(subtitle_file_path, 'r', encoding='utf-8') as f:
subtitle_content = f.read()
# 分析字幕
return self.analyze_subtitle(subtitle_content)
except Exception as e:
logger.error(f"从文件读取字幕并分析过程中发生错误: {str(e)}")
return {
"status": "error",
"message": str(e),
"temperature": self.temperature
}
def save_analysis_result(self, analysis_result: Dict[str, Any], output_path: Optional[str] = None) -> str:
"""
保存分析结果到文件
Args:
analysis_result: 分析结果
output_path: 输出文件路径如果不提供则自动生成
Returns:
str: 输出文件的路径
"""
try:
# 如果未提供输出路径,则自动生成
if not output_path:
output_dir = storage_dir("drama_analysis", create=True)
output_path = os.path.join(output_dir, f"analysis_{get_uuid(True)}.txt")
# 确保目录存在
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# 保存结果
with open(output_path, 'w', encoding='utf-8') as f:
if analysis_result["status"] == "success":
f.write(analysis_result["analysis"])
else:
f.write(f"分析失败: {analysis_result['message']}")
logger.info(f"分析结果已保存到: {output_path}")
return output_path
except Exception as e:
logger.error(f"保存分析结果时发生错误: {str(e)}")
return ""
def generate_narration_script(self, short_name:str, plot_analysis: str, temperature: float = 0.7) -> Dict[str, Any]:
"""
根据剧情分析生成解说文案
Args:
short_name: 短剧名称
plot_analysis: 剧情分析内容
temperature: 生成温度控制创造性默认0.7
Returns:
Dict[str, Any]: 包含生成结果的字典
"""
try:
# 构建完整提示词
prompt = plot_writing % (short_name, plot_analysis)
# 构建请求体数据
payload = {
"model": self.model,
"messages": [
{"role": "system", "content": "你是一位专业的短视频解说脚本撰写专家。"},
{"role": "user", "content": prompt}
],
"temperature": temperature
}
# 对特定模型添加响应格式设置
if self.model not in ["deepseek-reasoner"]:
payload["response_format"] = {"type": "json_object"}
# 构建请求地址
url = f"{self.base_url}/chat/completions"
# 发送HTTP请求
response = requests.post(url, headers=self.headers, json=payload)
# 解析响应
if response.status_code == 200:
response_data = response.json()
# 提取响应内容
if "choices" in response_data and len(response_data["choices"]) > 0:
narration_script = response_data["choices"][0]["message"]["content"]
logger.debug(f"解说文案生成完成消耗的tokens: {response_data.get('usage', {}).get('total_tokens', 0)}")
# 返回结果
return {
"status": "success",
"narration_script": narration_script,
"tokens_used": response_data.get("usage", {}).get("total_tokens", 0),
"model": self.model,
"temperature": self.temperature
}
else:
logger.error("解说文案生成失败: 未获取到有效响应")
return {
"status": "error",
"message": "未获取到有效响应",
"temperature": self.temperature
}
else:
error_msg = f"请求失败,状态码: {response.status_code}, 响应: {response.text}"
logger.error(error_msg)
return {
"status": "error",
"message": error_msg,
"temperature": self.temperature
}
except Exception as e:
logger.error(f"解说文案生成过程中发生错误: {str(e)}")
return {
"status": "error",
"message": str(e),
"temperature": self.temperature
}
def save_narration_script(self, narration_result: Dict[str, Any], output_path: Optional[str] = None) -> str:
"""
保存解说文案到文件
Args:
narration_result: 解说文案生成结果
output_path: 输出文件路径如果不提供则自动生成
Returns:
str: 输出文件的路径
"""
try:
# 如果未提供输出路径,则自动生成
if not output_path:
output_dir = storage_dir("narration_scripts", create=True)
output_path = os.path.join(output_dir, f"narration_{get_uuid(True)}.json")
# 确保目录存在
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# 保存结果
with open(output_path, 'w', encoding='utf-8') as f:
if narration_result["status"] == "success":
f.write(narration_result["narration_script"])
else:
f.write(f"生成失败: {narration_result['message']}")
logger.info(f"解说文案已保存到: {output_path}")
return output_path
except Exception as e:
logger.error(f"保存解说文案时发生错误: {str(e)}")
return ""
def analyze_subtitle(
    subtitle_content: str = None,
    subtitle_file_path: str = None,
    api_key: Optional[str] = None,
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    custom_prompt: Optional[str] = None,
    temperature: float = 1.0,
    save_result: bool = False,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """Convenience wrapper: analyse subtitle plot via a SubtitleAnalyzer.

    Exactly one of `subtitle_content` / `subtitle_file_path` should be given;
    when both are provided, `subtitle_content` takes precedence.

    Args:
        subtitle_content: raw subtitle text to analyse
        subtitle_file_path: path of a subtitle file to analyse
        api_key: LLM API key
        model: LLM model name
        base_url: API base URL (OpenAI-compatible endpoint)
        custom_prompt: overrides the default analysis prompt template
        temperature: sampling temperature passed to the model
        save_result: when True, persist a successful analysis to disk
        output_path: target file for the saved result; auto-generated when None

    Returns:
        Dict[str, Any]: dict with "status" ("success"/"error") and either
        "analysis" (plus "output_path" when saved) or "message" on failure
    """
    # Build an analyzer configured for this single request
    analyzer = SubtitleAnalyzer(
        temperature=temperature,
        api_key=api_key,
        model=model,
        base_url=base_url,
        custom_prompt=custom_prompt
    )
    logger.debug(f"使用模型: {analyzer.model} 开始分析, 温度: {analyzer.temperature}")
    # Prefer in-memory content; fall back to reading from a file path
    if subtitle_content:
        result = analyzer.analyze_subtitle(subtitle_content)
    elif subtitle_file_path:
        result = analyzer.analyze_subtitle_from_file(subtitle_file_path)
    else:
        # Neither input supplied — report the error in the standard shape
        return {
            "status": "error",
            "message": "必须提供字幕内容或字幕文件路径",
            "temperature": temperature
        }
    # Optionally persist successful results to disk
    if save_result and result["status"] == "success":
        result["output_path"] = analyzer.save_analysis_result(result, output_path)
    return result
def generate_narration_script(
    short_name: str = None,
    plot_analysis: str = None,
    api_key: Optional[str] = None,
    model: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 1.0,
    save_result: bool = False,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """Convenience wrapper: generate narration copy from a plot analysis.

    Args:
        short_name: title of the short drama
        plot_analysis: plot-analysis text (e.g. output of analyze_subtitle)
        api_key: LLM API key
        model: LLM model name
        base_url: API base URL (OpenAI-compatible endpoint)
        temperature: sampling temperature for the generation call
        save_result: when True, persist a successful script to disk
        output_path: target file for the saved script; auto-generated when None

    Returns:
        Dict[str, Any]: dict with "status" ("success"/"error") and either
        "narration_script" (plus "output_path" when saved) or "message"
    """
    # Build an analyzer configured for this single request
    analyzer = SubtitleAnalyzer(
        temperature=temperature,
        api_key=api_key,
        model=model,
        base_url=base_url
    )
    # Generate the narration script from the analysis text
    result = analyzer.generate_narration_script(short_name, plot_analysis, temperature)
    # Optionally persist successful results to disk
    if save_result and result["status"] == "success":
        result["output_path"] = analyzer.save_narration_script(result, output_path)
    return result
if __name__ == '__main__':
text_api_key = "skxxxx"
text_model = "gemini-2.0-flash"
text_base_url = "https://api.narratoai.cn/v1/chat/completions" # 确保URL不以斜杠结尾便于后续拼接
subtitle_path = "/Users/apple/Desktop/home/NarratoAI/resource/srt/家里家外1-5.srt"
# 示例用法
if subtitle_path:
# 分析字幕总结剧情
analysis_result = analyze_subtitle(
subtitle_file_path=subtitle_path,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
save_result=True
)
if analysis_result["status"] == "success":
print("字幕分析成功!")
print("分析结果:")
print(analysis_result["analysis"])
# 根据剧情生成解说文案
narration_result = generate_narration_script(
plot_analysis=analysis_result["analysis"],
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
save_result=True
)
if narration_result["status"] == "success":
print("\n解说文案生成成功!")
print("解说文案:")
print(narration_result["narration_script"])
else:
print(f"\n解说文案生成失败: {narration_result['message']}")
else:
print(f"分析失败: {analysis_result['message']}")

View File

@ -0,0 +1,37 @@
"""
视频脚本生成pipeline串联各个处理步骤
"""
import os
from .utils.step1_subtitle_analyzer_openai import analyze_subtitle
from .utils.step5_merge_script import merge_script
def generate_script(srt_path: str, api_key: str, model_name: str, output_path: str, base_url: str = None, custom_clips: int = 5):
    """Generate a video mash-up script from an SRT subtitle file.

    Pipeline: analyse the subtitles with an LLM to locate key plot points,
    then merge those points into the final JSON script written to disk.

    Args:
        srt_path: path to the subtitle (.srt) file
        api_key: LLM API key passed through to the subtitle analyzer
        model_name: LLM model name used for the analysis
        output_path: path where the merged JSON script is written
        base_url: optional API base URL (OpenAI-compatible endpoint)
        custom_clips: number of key clips to extract (default 5)

    Returns:
        list: the final merged script items

    Raises:
        FileNotFoundError: when the subtitle file does not exist
    """
    # Validate the input file before making any LLM calls
    if not os.path.exists(srt_path):
        raise FileNotFoundError(f"字幕文件不存在: {srt_path}")
    # Analyse subtitles: returns a plot summary and located plot points
    print("开始分析...")
    openai_analysis = analyze_subtitle(
        srt_path=srt_path,
        api_key=api_key,
        model_name=model_name,
        base_url=base_url,
        custom_clips=custom_clips
    )
    # Merge the located plot points into the final script file
    adjusted_results = openai_analysis['plot_points']
    final_script = merge_script(adjusted_results, output_path)
    return final_script

View File

@ -0,0 +1,60 @@
"""
定义项目中使用的数据类型
"""
from typing import List, Dict, Optional
from dataclasses import dataclass
@dataclass
class PlotPoint:
    """A key plot moment located in the subtitle timeline."""
    timestamp: str  # "start-end" range, e.g. "00:01:15,000-00:04:40,000" — format per the analyzer prompt
    title: str      # short theme/title of the plot point
    picture: str    # description of the plot around this moment
@dataclass
class Commentary:
    """Narration attached to a time range of the video."""
    timestamp: str   # "start-end" time range the commentary covers
    title: str       # theme of the commented segment
    copywriter: str  # presumably the narration copy text — confirm against the producing code
@dataclass
class SubtitleSegment:
    """One parsed subtitle entry."""
    start_time: float  # segment start, in seconds
    end_time: float    # segment end, in seconds
    text: str          # subtitle text content
@dataclass
class ScriptItem:
    """One entry of the final generated script."""
    timestamp: str   # "start-end" time range of the clip
    title: str       # theme of the clip
    picture: str     # plot/scene description
    copywriter: str  # presumably the narration copy for this clip — confirm against the producing code
@dataclass
class PipelineResult:
    """Aggregate output of the script-generation pipeline."""
    output_video_path: str                    # path of the produced video
    plot_points: List[PlotPoint]              # key plot moments found in the subtitles
    subtitle_segments: List[SubtitleSegment]  # parsed subtitle entries
    commentaries: List[Commentary]            # generated narration pieces
    final_script: List[ScriptItem]            # merged final script items
    error: Optional[str] = None               # error message when a stage failed, else None
class VideoProcessingError(Exception):
    """Raised when a video-processing step fails."""
    pass
class SubtitleProcessingError(Exception):
    """Raised when subtitle loading or parsing fails."""
    pass
class PlotAnalysisError(Exception):
    """Raised when plot analysis fails."""
    pass
class CopywritingError(Exception):
    """Raised when narration copywriting fails."""
    pass

View File

@ -0,0 +1,157 @@
"""
使用OpenAI API分析字幕文件返回剧情梗概和爆点
"""
import traceback
from openai import OpenAI, BadRequestError
import os
import json
from .utils import load_srt
def _chat_completion_json(client, model_name: str, messages: list) -> dict:
    """Call the chat-completions API and parse the reply as a JSON object.

    Tries `response_format={"type": "json_object"}` first; models that do not
    support it (e.g. DeepSeek R1/V3) raise BadRequestError, in which case the
    request is retried without it and any ```json fences are stripped.

    Args:
        client: configured OpenAI-compatible client
        model_name: model identifier to use
        messages: chat messages payload

    Returns:
        dict: the parsed JSON object returned by the model

    Raises:
        Exception: when the request or the JSON parsing fails
    """
    try:
        completion = client.chat.completions.create(
            model=model_name,
            messages=messages,
            response_format={"type": "json_object"}
        )
        return json.loads(completion.choices[0].message.content)
    except BadRequestError:
        # Retry without response_format for models that reject it
        completion = client.chat.completions.create(
            model=model_name,
            messages=messages
        )
        # Strip leading ```json and trailing ``` fences before parsing
        content = completion.choices[0].message.content.replace("```json", "").replace("```", "")
        return json.loads(content)
    except Exception as e:
        raise Exception(f"大模型解析发生错误:{str(e)}\n{traceback.format_exc()}")


def analyze_subtitle(
        srt_path: str,
        model_name: str,
        api_key: str = None,
        base_url: str = None,
        custom_clips: int = 5
) -> dict:
    """Analyse an SRT subtitle file and return the plot summary plus key moments.

    Args:
        srt_path: path to the SRT subtitle file
        model_name: LLM model name; names containing "deepseek" are routed to
            the SiliconFlow endpoint
        api_key: API key; falls back to the DeepSeek_API_KEY / OPENAI_API_KEY
            environment variables when None
        base_url: API base URL for non-DeepSeek models
        custom_clips: number of key clips to ask the model for (default 5)

    Returns:
        dict: {"plot_summary": <summary dict>, "plot_points": [<point>, ...]}

    Raises:
        Exception: when subtitle loading or either model call fails
    """
    try:
        # Load subtitles and flatten them to "timestamp\ntext" lines
        subtitles = load_srt(srt_path)
        subtitle_content = "\n".join([f"{sub['timestamp']}\n{sub['text']}" for sub in subtitles])

        # Build the client locally; the previous implementation mutated a
        # module-level `global client`, leaking state between calls.
        if "deepseek" in model_name.lower():
            client = OpenAI(
                api_key=api_key or os.getenv('DeepSeek_API_KEY'),
                base_url="https://api.siliconflow.cn/v1"  # third-party SiliconFlow API
            )
        else:
            client = OpenAI(
                api_key=api_key or os.getenv('OPENAI_API_KEY'),
                base_url=base_url
            )

        # --- Pass 1: overall summary and key plot titles ---
        messages = [
            {
                "role": "system",
                "content": """你是一名经验丰富的短剧编剧,擅长根据字幕内容按照先后顺序分析关键剧情,并找出 %s 个关键片段。
请返回一个JSON对象,包含以下字段:
{
    "summary": "整体剧情梗概",
    "plot_titles": [
        "关键剧情1",
        "关键剧情2",
        "关键剧情3",
        "关键剧情4",
        "关键剧情5",
        "..."
    ]
}
请确保返回的是合法的JSON格式, 请确保返回的是 %s 个片段
""" % (custom_clips, custom_clips)
            },
            {
                "role": "user",
                "content": f"srt字幕如下:{subtitle_content}"
            }
        ]
        summary_data = _chat_completion_json(client, model_name, messages)
        print(json.dumps(summary_data, indent=4, ensure_ascii=False))

        # --- Pass 2: locate each plot point in the subtitle timeline ---
        prompt = f"""剧情梗概:
{summary_data['summary']}

需要定位的爆点内容:
"""
        print(f"找到 {len(summary_data['plot_titles'])} 个片段")
        for i, point in enumerate(summary_data['plot_titles'], 1):
            prompt += f"{i}. {point}\n"

        messages = [
            {
                "role": "system",
                "content": """你是一名短剧编剧,非常擅长根据字幕中分析视频中关键剧情出现的具体时间段。
请仔细阅读剧情梗概和爆点内容,然后在字幕中找出每个爆点发生的具体时间段和爆点前后的详细剧情。
请返回一个JSON对象,包含一个名为"plot_points"的数组,数组中包含多个对象,每个对象都要包含以下字段:
{
    "plot_points": [
        {
            "timestamp": "时间段,格式为xx:xx:xx,xxx-xx:xx:xx,xxx",
            "title": "关键剧情的主题",
            "picture": "关键剧情前后的详细剧情描述"
        }
    ]
}
请确保返回的是合法的JSON格式"""
            },
            {
                "role": "user",
                "content": f"""字幕内容:
{subtitle_content}

{prompt}"""
            }
        ]
        plot_points_data = _chat_completion_json(client, model_name, messages)
        print(json.dumps(plot_points_data, indent=4, ensure_ascii=False))

        # Combine both passes into a single result
        return {
            "plot_summary": summary_data,
            "plot_points": plot_points_data["plot_points"]
        }
    except Exception as e:
        raise Exception(f"分析字幕时发生错误:{str(e)}\n{traceback.format_exc()}")

View File

@ -0,0 +1,69 @@
"""
合并生成最终脚本
"""
import os
import json
from typing import List, Dict, Tuple
def merge_script(
    plot_points: List[Dict],
    output_path: str
):
    """Merge plot points into the final script and write it to disk as JSON.

    Fixes over the previous version: the unused `format_timestamp` helper and
    the discarded parse results were removed; timestamps are still validated
    so malformed entries fail loudly instead of propagating.

    Args:
        plot_points: plot-point dicts, each with "timestamp"
            ("HH:MM:SS[,sss]-HH:MM:SS[,sss]") and "picture" keys
        output_path: path of the JSON file to write

    Returns:
        list: the final script items written to `output_path`

    Raises:
        ValueError: when a plot point carries a malformed timestamp
    """
    def parse_timestamp(ts: str) -> Tuple[float, float]:
        """Parse a 'start-end' timestamp into (start_seconds, end_seconds)."""
        start, end = ts.split('-')

        def parse_time(time_str: str) -> float:
            time_str = time_str.strip()
            if ',' in time_str:
                time_parts, ms_part = time_str.split(',')
                ms = float(ms_part) / 1000
            else:
                time_parts = time_str
                ms = 0
            hours, minutes, seconds = map(int, time_parts.split(':'))
            return hours * 3600 + minutes * 60 + seconds + ms

        return parse_time(start), parse_time(end)

    final_script = []
    for number, plot_point in enumerate(plot_points, start=1):
        # Validate the timestamp format early (raises ValueError on bad input)
        parse_timestamp(plot_point["timestamp"])
        final_script.append({
            "_id": number,
            "timestamp": plot_point["timestamp"],
            "picture": plot_point["picture"],
            # Random hex suffix keeps placeholder narration strings unique
            "narration": f"播放原生_{os.urandom(4).hex()}",
            "OST": 1,  # OST=1: keep the original clip's own audio
        })

    # Persist the merged script
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_script, f, ensure_ascii=False, indent=4)
    print(f"脚本生成完成:{output_path}")

    return final_script

View File

@ -0,0 +1,45 @@
# 公共方法
import json
import requests # 新增
from typing import List, Dict
def load_srt(file_path: str) -> List[Dict]:
    """Load and parse an SRT subtitle file.

    Fix over the previous version: blocks were split on a literal '\\n\\n',
    which broke on CRLF files and on multiple blank lines between entries;
    splitting now tolerates both.

    Args:
        file_path: path to the .srt file

    Returns:
        List[Dict]: one dict per subtitle block with keys 'number',
        'timestamp', 'text', 'start_time', 'end_time'; malformed blocks are
        skipped with a warning.
    """
    import re

    with open(file_path, 'r', encoding='utf-8-sig') as f:
        content = f.read().strip()

    # Split on one-or-more blank lines; tolerates \r\n endings and extra
    # blank lines between subtitle blocks.
    subtitle_blocks = [b for b in re.split(r'(?:\r?\n){2,}', content) if b.strip()]

    subtitles = []
    for block in subtitle_blocks:
        lines = [line.rstrip('\r') for line in block.split('\n')]
        if len(lines) >= 3:  # block needs index, timestamp and at least one text line
            try:
                number = int(lines[0].strip())
                timestamp = lines[1]
                text = ' '.join(lines[2:])
                # Split "start --> end" into its two parts
                start_time, end_time = timestamp.split(' --> ')
                subtitles.append({
                    'number': number,
                    'timestamp': timestamp,
                    'text': text,
                    'start_time': start_time,
                    'end_time': end_time
                })
            except ValueError as e:
                print(f"Warning: 跳过无效的字幕块: {e}")
                continue

    return subtitles

Binary file not shown.

Binary file not shown.

View File

@ -18,15 +18,14 @@ def check_ffmpeg():
return False
def merge_audio_files(task_id: str, audio_files: list, total_duration: float, list_script: list):
def merge_audio_files(task_id: str, total_duration: float, list_script: list):
"""
合并音频文件根据OST设置处理不同的音频轨道
合并音频文件
Args:
task_id: 任务ID
audio_files: TTS生成的音频文件列表
total_duration: 总时长
list_script: 完整脚本信息包含OST设置
list_script: 完整脚本信息包含duration时长和audio路径
Returns:
str: 合并后的音频文件路径
@ -39,36 +38,38 @@ def merge_audio_files(task_id: str, audio_files: list, total_duration: float, li
# 创建一个空的音频片段
final_audio = AudioSegment.silent(duration=total_duration * 1000) # 总时长以毫秒为单位
# 计算每个片段的开始位置基于duration字段
current_position = 0 # 初始位置(秒)
# 遍历脚本中的每个片段
for segment, audio_file in zip(list_script, audio_files):
for segment in list_script:
try:
# 加载TTS音频文件
tts_audio = AudioSegment.from_file(audio_file)
# 获取片段的开始和结束时间
start_time, end_time = segment['new_timestamp'].split('-')
start_seconds = utils.time_to_seconds(start_time)
end_seconds = utils.time_to_seconds(end_time)
# 根据OST设置处理音频
if segment['OST'] == 0:
# 只使用TTS音频
final_audio = final_audio.overlay(tts_audio, position=start_seconds * 1000)
elif segment['OST'] == 1:
# 只使用原声(假设原声已经在视频中)
continue
elif segment['OST'] == 2:
# 混合TTS音频和原声
original_audio = AudioSegment.silent(duration=(end_seconds - start_seconds) * 1000)
mixed_audio = original_audio.overlay(tts_audio)
final_audio = final_audio.overlay(mixed_audio, position=start_seconds * 1000)
# 获取片段时长(秒)
duration = segment['duration']
# 检查audio字段是否为空
if segment['audio'] and os.path.exists(segment['audio']):
# 加载TTS音频文件
tts_audio = AudioSegment.from_file(segment['audio'])
# 将TTS音频添加到最终音频
final_audio = final_audio.overlay(tts_audio, position=current_position * 1000)
else:
# audio为空不添加音频仅保留间隔
logger.info(f"片段 {segment.get('timestamp', '')} 没有音频文件,保留 {duration} 秒的间隔")
# 更新下一个片段的开始位置
current_position += duration
except Exception as e:
logger.error(f"处理音频文件 {audio_file} 时出错: {str(e)}")
logger.error(f"处理音频片段时出错: {str(e)}")
# 即使处理失败,也要更新位置,确保后续片段位置正确
if 'duration' in segment:
current_position += segment['duration']
continue
# 保存合并后的音频文件
output_audio_path = os.path.join(utils.task_dir(task_id), "final_audio.mp3")
output_audio_path = os.path.join(utils.task_dir(task_id), "merger_audio.mp3")
final_audio.export(output_audio_path, format="mp3")
logger.info(f"合并后的音频文件已保存: {output_audio_path}")
@ -93,7 +94,7 @@ def time_to_seconds(time_str):
# 分割时间部分
parts = time_part.split(':')
if len(parts) == 3: # HH:MM:SS
h, m, s = map(int, parts)
seconds = h * 3600 + m * 60 + s
@ -118,11 +119,11 @@ def extract_timestamp(filename):
# 从文件名中提取时间部分
time_part = filename.split('_', 1)[1].split('.')[0] # 获取 "00_06,500-00_24,800" 部分
start_time, end_time = time_part.split('-') # 分割成开始和结束时间
# 将下划线格式转换回冒号格式
start_time = start_time.replace('_', ':')
end_time = end_time.replace('_', ':')
# 将时间戳转换为秒
start_seconds = time_to_seconds(start_time)
end_seconds = time_to_seconds(end_time)
@ -135,17 +136,36 @@ def extract_timestamp(filename):
if __name__ == "__main__":
# 示例用法
audio_files =[
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:06-00:24.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:32-00:38.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:43-00:52.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_00:52-01:09.mp3",
"/Users/apple/Desktop/home/NarratoAI/storage/tasks/test456/audio_01:13-01:15.mp3",
]
total_duration = 38
video_script_path = "/Users/apple/Desktop/home/NarratoAI/resource/scripts/test003.json"
with open(video_script_path, "r", encoding="utf-8") as f:
video_script = json.load(f)
total_duration = 90
output_file = merge_audio_files("test456", audio_files, total_duration, video_script)
video_script = [
{'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
'timestamp': '00:00:00-00:00:26',
'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
'OST': 0, 'duration': 26,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3'},
{'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!', 'timestamp': '00:01:15-00:01:29',
'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
'OST': 0, 'duration': 14,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3'},
{'picture': '画面切到王启年小心翼翼地向范闲汇报。', 'timestamp': '00:04:41-00:04:58',
'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
'OST': 1, 'duration': 17,
'audio': ''},
{'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
'timestamp': '00:04:58-00:05:20',
'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
'OST': 0, 'duration': 22,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3'},
{'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
'timestamp': '00:05:45-00:05:53',
'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
'OST': 0, 'duration': 8,
'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'},
{'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。', 'timestamp': '00:06:00-00:06:03',
'narration': '抓刺客',
'OST': 1, 'duration': 3,
'audio': ''}]
output_file = merge_audio_files("test456", total_duration, video_script)
print(output_file)

237
app/services/clip_video.py Normal file
View File

@ -0,0 +1,237 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : clip_video
@Author : 小林同学
@Date : 2025/5/6 下午6:14
'''
import os
import subprocess
import json
import hashlib
from loguru import logger
from typing import Dict, List, Optional
from pathlib import Path
from app.utils import ffmpeg_utils
def parse_timestamp(timestamp: str) -> tuple:
    """
    Split a combined timestamp string into its start and end parts.

    Args:
        timestamp: string of the form 'HH:MM:SS-HH:MM:SS' or
            'HH:MM:SS,sss-HH:MM:SS,sss'

    Returns:
        tuple: (start, end) time strings, in the same format as the input
    """
    start_part, end_part = timestamp.split('-')
    return start_part, end_part
def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str:
    """
    Compute an end time from a start time plus a duration.

    Args:
        start_time: start time, 'HH:MM:SS' or 'HH:MM:SS,sss' (with milliseconds)
        duration: clip duration in seconds
        extra_seconds: safety margin added on top of the duration (default 1.0)

    Returns:
        str: end time in the same format as `start_time`
    """
    # Detect whether the input carries a millisecond component
    with_ms = ',' in start_time
    if with_ms:
        hms_text, ms_text = start_time.split(',')
        base_ms = int(ms_text)
    else:
        hms_text = start_time
        base_ms = 0
    h, m, s = map(int, hms_text.split(':'))

    # Work entirely in integer milliseconds to avoid float drift
    total_ms = (h * 3600 + m * 60 + s) * 1000 + base_ms + int((duration + extra_seconds) * 1000)

    total_s, ms_out = divmod(total_ms, 1000)
    h_out, rem = divmod(total_s, 3600)
    m_out, s_out = divmod(rem, 60)

    # Mirror the input format on output
    if with_ms:
        return f"{h_out:02d}:{m_out:02d}:{s_out:02d},{ms_out:03d}"
    return f"{h_out:02d}:{m_out:02d}:{s_out:02d}"
def check_hardware_acceleration() -> Optional[str]:
    """
    Detect the hardware-acceleration option supported by this system.

    Returns:
        Optional[str]: hardware-acceleration type name (e.g. "videotoolbox",
        as checked by clip_video), or None when no acceleration is available.
    """
    # Delegate to the centralised detection in ffmpeg_utils
    return ffmpeg_utils.get_ffmpeg_hwaccel_type()
def clip_video(
    video_origin_path: str,
    tts_result: List[Dict],
    output_dir: Optional[str] = None,
    task_id: Optional[str] = None
) -> Dict[str, str]:
    """
    Cut the source video into clips according to the timestamps in `tts_result`.

    Args:
        video_origin_path: path of the original video
        tts_result: list of dicts, each carrying a "timestamp" ("start-end")
            and a "duration" in seconds; an optional "_id" is used as the
            result key (falls back to the timestamp)
        output_dir: output directory; auto-generated under storage/temp/clip_video
            when None
        task_id: unique task id; derived from an MD5 of the inputs when None

    Returns:
        Dict[str, str]: mapping from each item's _id (or timestamp) to the
        path of the corresponding clipped video file

    Raises:
        FileNotFoundError: when the source video does not exist
        RuntimeError: when ffmpeg fails to cut a segment
    """
    # Ensure the source video exists before doing any work
    if not os.path.exists(video_origin_path):
        raise FileNotFoundError(f"视频文件不存在: {video_origin_path}")

    # Derive a deterministic task id from the inputs when none is given
    if task_id is None:
        content_for_hash = f"{video_origin_path}_{json.dumps(tts_result)}"
        task_id = hashlib.md5(content_for_hash.encode()).hexdigest()

    # Default output directory: <project root>/storage/temp/clip_video/<task_id>
    if output_dir is None:
        output_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "storage", "temp", "clip_video", task_id
        )

    # Make sure the output directory exists
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Query hardware-acceleration support once for all clips
    hwaccel = check_hardware_acceleration()
    hwaccel_args = []
    if hwaccel:
        hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args()

    # Mapping of segment id -> clipped file path
    result = {}

    for item in tts_result:
        _id = item.get("_id", item.get("timestamp", "unknown"))
        timestamp = item["timestamp"]
        start_time, _ = parse_timestamp(timestamp)

        # Real end = start + duration (+1s margin via calculate_end_time's default)
        duration = item["duration"]
        calculated_end_time = calculate_end_time(start_time, duration)

        # FFmpeg expects '.' as the millisecond separator, not ','
        ffmpeg_start_time = start_time.replace(',', '.')
        ffmpeg_end_time = calculated_end_time.replace(',', '.')

        # Build a filesystem-safe output name (':' and ',' replaced by '-')
        safe_start_time = start_time.replace(':', '-').replace(',', '-')
        safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-')
        output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4"
        output_path = os.path.join(output_dir, output_filename)

        # Assemble the FFmpeg command; use the videotoolbox encoder when that
        # acceleration is available, otherwise software x264
        ffmpeg_cmd = [
            "ffmpeg", "-y", *hwaccel_args,
            "-i", video_origin_path,
            "-ss", ffmpeg_start_time,
            "-to", ffmpeg_end_time,
            "-c:v", "h264_videotoolbox" if hwaccel == "videotoolbox" else "libx264",
            "-c:a", "aac",
            "-strict", "experimental",
            output_path
        ]

        # Run FFmpeg for this segment
        try:
            logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}{ffmpeg_end_time}")
            # logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}")

            # On Windows, force UTF-8 decoding of ffmpeg output to avoid GBK
            # codec errors when the console encoding differs
            is_windows = os.name == 'nt'
            if is_windows:
                process = subprocess.run(
                    ffmpeg_cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    encoding='utf-8',  # decode ffmpeg output explicitly as UTF-8
                    text=True,
                    check=True
                )
            else:
                process = subprocess.run(
                    ffmpeg_cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    check=True
                )
            result[_id] = output_path
        except subprocess.CalledProcessError as e:
            logger.error(f"裁剪视频片段失败: {timestamp}")
            logger.error(f"错误信息: {e.stderr}")
            raise RuntimeError(f"视频裁剪失败: {e.stderr}")

    return result
if __name__ == "__main__":
video_origin_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4"
tts_result = [{'timestamp': '00:00:00-00:01:15',
'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
'duration': 25.55,
'text': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!'},
{'timestamp': '00:01:15-00:04:40',
'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
'duration': 13.488,
'text': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…'},
{'timestamp': '00:04:58-00:05:45',
'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
'duration': 21.363,
'text': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!'},
{'timestamp': '00:05:45-00:06:00',
'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
'duration': 7.675, 'text': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!'}]
subclip_path_videos = {
'00:00:00-00:01:15': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-00-00-00-01-15.mp4',
'00:01:15-00:04:40': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-01-15-00-04-40.mp4',
'00:04:41-00:04:58': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-41-00-04-58.mp4',
'00:04:58-00:05:45': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-04-58-00-05-45.mp4',
'00:05:45-00:06:00': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-05-45-00-06-00.mp4',
'00:06:00-00:06:03': '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/6e7e343c7592c7d6f9a9636b55000f23/vid-00-06-00-00-06-03.mp4',
}
# 使用方法示例
try:
result = clip_video(video_origin_path, tts_result, subclip_path_videos)
print("裁剪结果:")
print(json.dumps(result, indent=4, ensure_ascii=False))
except Exception as e:
print(f"发生错误: {e}")

View File

@ -0,0 +1,264 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : 生成介绍文案
@Author : 小林同学
@Date : 2025/5/8 上午11:33
'''
import json
import os
import traceback
from openai import OpenAI
from loguru import logger
def parse_frame_analysis_to_markdown(json_file_path):
    """
    Parse a video frame-analysis JSON file and render it as Markdown.

    :param json_file_path: path to the JSON file produced by frame analysis
    :return: Markdown-formatted string, or an error-message string on failure
    """
    # Bail out early with the exact error string callers expect.
    if not os.path.exists(json_file_path):
        return f"错误: 文件 {json_file_path} 不存在"
    try:
        # Load the analysis payload.
        with open(json_file_path, 'r', encoding='utf-8') as fp:
            payload = json.load(fp)

        summaries = payload.get('overall_activity_summaries', [])
        observations = payload.get('frame_observations', [])

        # Group frame observations by their batch index.
        grouped = {}
        for obs in observations:
            grouped.setdefault(obs.get('batch_index'), []).append(obs)

        # Render one "片段" section per batch summary, accumulating parts
        # in a list and joining once at the end.
        parts = []
        for index, summary in enumerate(summaries, 1):
            batch_key = summary.get('batch_index')
            parts.append(f"## 片段 {index}\n")
            parts.append(f"- 时间范围:{summary.get('time_range', '')}\n")
            batch_summary = summary.get('summary', '')
            if batch_summary:
                parts.append(f"- 片段描述:{batch_summary}\n")
            else:
                parts.append(f"- 片段描述:\n")
            parts.append("- 详细描述:\n")
            # Emit one bullet per frame observation in this batch,
            # keeping the observation text verbatim (no splitting).
            for obs in grouped.get(batch_key, []):
                stamp = obs.get('timestamp', '')
                detail = obs.get('observation', '')
                if detail:
                    parts.append(f"  - {stamp}: {detail}\n")
                else:
                    parts.append(f"  - {stamp}: \n")
            parts.append("\n")
        return "".join(parts)
    except Exception:
        # Mirror the original contract: return the traceback as a string.
        return f"处理JSON文件时出错: {traceback.format_exc()}"
def generate_narration(markdown_content, api_key, base_url, model):
    """
    Call an OpenAI-compatible chat API to generate narration copy from the
    Markdown frame-analysis content.

    Fixes in the prompt template: the third example block now closes with
    </example_text_3> (it previously repeated the opening tag, leaving the
    tag unbalanced), and the <output> JSON example is now valid (closing
    bracket for "items" added, trailing comma removed).

    :param markdown_content: Markdown-formatted video frame analysis
    :param api_key: API key for the OpenAI-compatible endpoint
    :param base_url: base URL of the API (for non-official endpoints)
    :param model: model name to use
    :return: generated narration script (JSON string), or an error-message string
    """
    try:
        # Build the prompt. The examples and restrictions are runtime content
        # sent to the model and must stay in the original language.
        prompt = """
我是一名荒野建造解说的博主以下是一些同行的对标文案请你深度学习并总结这些文案的风格特点跟内容特点
<example_text_1>
解压助眠的天花板就是荒野建造沉浸丝滑的搭建过程可以说每一帧都是极致享受我保证强迫症来了都找不出一丁点毛病更别说全屋严丝合缝的拼接工艺还能轻松抵御零下二十度气温让你居住的每一天都温暖如春
在家闲不住的西姆今天也打算来一次野外建造行走没多久他就发现许多倒塌的树任由它们自生自灭不如将其利用起来想到这他就开始挥舞铲子要把地基挖掘出来虽然每次只能挖一点点但架不住他体能惊人没多长时间一个 2x3 的深坑就赫然出现这深度住他一人绰绰有余
随后他去附近收集来原木这些都是搭建墙壁的最好材料而在投入使用前自然要把表皮刮掉防止森林中的白蚁蛀虫处理好一大堆后西姆还在两端打孔使用木钉固定在一起这可不是用来做墙壁的而是做庇护所的承重柱只要木头间的缝隙足够紧密那搭建出的木屋就能足够坚固
每向上搭建一层他都会在中间塞入苔藓防寒保证不会泄露一丝热量其他几面也是用相同方法很快西姆就做好了三面墙壁每一根木头都极其工整保证强迫症来了都要点个赞再走
在继续搭建墙壁前西姆决定将壁炉制作出来毕竟森林夜晚的气温会很低保暖措施可是重中之重完成后他找来一块大树皮用来充当庇护所的大门而上面刮掉的木屑还能作为壁炉的引火物可以说再完美不过
测试了排烟没问题后他才开始搭建最后一面墙壁这一面要预留门和窗所以在搭建到一半后还需要在原木中间开出卡口让自己劈砍时能轻松许多此时只需将另外一根如法炮制两端拼接在一起后就是一扇大小适中的窗户而随着随后一层苔藓铺好最后一根原木落位这个庇护所的雏形就算完成
大门的安装他没选择用合页而是在底端雕刻出榫头门框上则雕刻出榫眼只能说西姆的眼就是一把尺这完全就是严丝合缝此时他才开始搭建屋顶这里西姆用的方法不同他先把最外围的原木固定好随后将原木平铺在上面就能得到完美的斜面屋顶等他将四周的围栏也装好后工整的屋顶看起来十分舒服西姆躺上去都不想动
稍作休息后他利用剩余的苔藓对屋顶的缝隙处密封可这样西姆觉得不够保险于是他找来一些黏土再次对原本的缝隙二次加工保管这庇护所冬天也暖和最后只需要平铺上枯叶以及挖掘出的泥土整个屋顶就算完成
考虑到庇护所的美观性自然少不了覆盖上苔藓翠绿的颜色看起来十分舒服就连门口的庭院旁他都移植了许多小树做点缀让这木屋与周边环境融为一体西姆才刚完成好这件事一场大雨就骤然降临好在此时的他已经不用淋雨更别说这屋顶防水十分不错室内没一点雨水渗透进来
等待温度回升的过程西姆利用墙壁本身的凹槽把床框镶嵌在上面只需要铺上苔藓以及自带的床单枕头一张完美的单人床就做好辛苦劳作一整天西姆可不会亏待自己他将自带的牛肉腌制好后直接放到壁炉中烤只需要等待三十分钟就能享受这美味的一顿
在辛苦建造一星期后他终于可以在自己搭建的庇护所中享受最纯正的野外露营后面西姆回家补给了一堆物资再次回来时森林已经大雪纷飞让他原本翠绿的小屋更换上了冬季限定皮肤好在内部设施没受什么影响和他离开时一样整洁
就是房间中已经没多少柴火让西姆今天又得劈柴寒冷干燥的天气让木头劈起来十分轻松没多久他就收集到一大堆这些足够燃烧好几天虽然此时外面大雪纷飞但小屋中却开始逐渐温暖这次他除了带来一些食物外还有几瓶调味料以及一整套被褥让自己的居住舒适度提高一大截
而秋天他有收集干草的缘故只需要塞入枕套中密封起来就能作为靠垫用就这居住条件比一般人在家过的还要奢侈趁着壁炉木头变木炭的过程西姆则开始不紧不慢的处理食物他取出一块牛排改好花刀以后撒上一堆调料腌制起来接着用锡纸包裹好放到壁炉中直接炭烤搭配上自带的红酒是一个非常好的选择
随着时间来到第二天外面的积雪融化了不少西姆简单做顿煎蛋补充体力后决定制作一个室外篝火堆用来晚上驱散周边野兽搭建这玩意没什么技巧只需要找到一大堆木棍利用大树的夹缝将其掰弯然后将其堆积在一起就是一个简易版的篝火堆看这外形有点像帐篷好在西姆没想那么多
等待天色暗淡下来后他才来到室外将其点燃顺便处理下多余的废料只可惜这场景没朋友陪在身边对西姆来说可能是个遗憾而哪怕森林只有他一个人都依旧做了好几个小时等到里面的篝火彻底燃尽后西姆还找来雪球覆盖到上面将火熄灭这防火意识可谓十分好最后在室内二十五度的高温下裹着被子睡觉
</example_text_1>
<example_text_2>
解压助眠的天花板就是荒野建造沉浸丝滑的搭建过程每一帧都是极致享受全屋严丝合缝的拼接工艺能轻松抵御零下二十度气温居住体验温暖如春
在家闲不住的西姆开启野外建造他发现倒塌的树决定加以利用先挖掘出 2x3 的深坑作为地基接着收集原木刮掉表皮防白蚁蛀虫打孔用木钉固定制作承重柱搭建墙壁时每一层都塞入苔藓防寒很快做好三面墙
为应对森林夜晚低温西姆制作壁炉用大树皮当大门刮下的木屑做引火物搭建最后一面墙时预留门窗通过在原木中间开口拼接做出窗户大门采用榫卯结构安装严丝合缝
搭建屋顶时先固定外围原木再平铺原木形成斜面屋顶之后用苔藓黏土密封缝隙铺上枯叶和泥土为美观在木屋覆盖苔藓移植小树点缀完工时遇大雨木屋防水良好
西姆利用墙壁凹槽镶嵌床框铺上苔藓床单枕头做成床劳作一天后他用壁炉烤牛肉享用建造一星期后他开始野外露营
后来西姆回家补给物资回来时森林大雪纷飞他劈柴储备带回食物调味料和被褥提高居住舒适度还用干草做靠垫他用壁炉烤牛排搭配红酒
第二天积雪融化西姆制作室外篝火堆防野兽用大树夹缝掰弯木棍堆积而成晚上点燃处理废料结束后用雪球灭火最后在室内二十五度的环境中裹被入睡
</example_text_2>
<example_text_3>
如果战争到来这个深埋地下十几米的庇护所绝对是 bug 般的存在即使被敌人发现还能通过快速通道一秒逃出里面不仅有竹子地暖地下水井还自制抽水机在解决用水问题的同时甚至自研无土栽培技术过上完全自给自足的生活
阿伟的老婆美如花但阿伟从来不回家来到野外他乐哈哈一言不合就开挖众所周知当战争来临时地下堡垒的安全性是最高的阿伟苦苦研习两载半只为练就一身挖洞本领在这双逆天麒麟臂的加持下如此坚硬的泥土都只能当做炮灰
得到了充足的空间后他便开始对这些边缘进行打磨随后阿伟将细线捆在木棍上以此描绘出圆柱的轮廓接着再一点点铲掉多余的部分虽然是由泥土一体式打造但这样的桌子保准用上千年都不成问题
考虑到十几米的深度进出非常不方便于是阿伟找来两根长达 66.6 米的木头打算为庇护所打造一条快速通道只见他将木桩牢牢地插入地下并顺着洞口的方向延伸出去直到贯穿整个山洞接着在每个木桩的连接处钉入铁钉确保轨道不能有一毫米的偏差完成后再制作一个木质框架从而达到前后滑动的效果
不得不说阿伟这手艺简直就是大钢管子杵青蛙在上面放上一个木制的车斗还能加快搬运泥土的速度没多久庇护所的内部就已经初见雏形为了住起来更加舒适还需要为自己打造一张床虽然深处的泥土同样很坚固但好处就是不用担心垮塌的风险
阿伟不仅设计了更加符合人体工学的拱形并且还在一旁雕刻处壁龛就是这氛围怎么看着有点不太吉利别看阿伟一身腱子肉但这身体里的艺术细菌可不少每个边缘的地方他都做了精雕细琢瞬间让整个卧室的颜值提升一大截
住在地下的好处就是房子面积全靠挖每平方消耗两个半馒头不仅没有了房贷的压力就连买墓地的钱也省了阿伟将中间的墙壁挖空从而得到取暖的壁炉当然最重要的还有排烟问题要想从上往下打通十几米的山体是件极其困难的事好在阿伟年轻时报过忆坤年的古墓派补习班这打洞技术堪比隔壁学校的土拨鼠专业虽然深度长达十几米但排烟效果却一点不受影响一个字专业
随后阿伟继续对壁炉底部雕刻打通了底部放柴火的空间并制作出放锅的灶头完成后阿伟从侧面将壁炉打通并制作出一条导热的通道以此连接到床铺的位置毕竟住在这么一个风湿宝地不注意保暖除湿很容易得老寒腿
阿伟在床面上挖出一条条管道以便于温度能传输到床的每个角落接下来就可以根据这些通道的长度裁切出同样长短的竹子根据竹筒的大小凿出相互连接的孔洞最后再将竹筒内部打通以达到温度传送的效果
而后阿伟将这些管道安装到凹槽内在他严谨的制作工艺下每根竹子刚好都能镶嵌进去在铺设床面之前还需要用木塞把圆孔堵住防止泥土掉落进管道泥土虽然不能隔绝湿气但却是十分优良的导热材料等他把床面都压平后就可以小心的将这些木塞拔出来最后再用黏土把剩余的管道也遮盖起来直到整个墙面恢复原样
接下来还需要测试一下加热效果当他把火点起来后温度很快就传送到了管道内把火力一点点加大直到热气流淌到更远的床面随着小孔里的青烟冒出也预示着阿伟的地暖可以投入使用而后阿伟制作了一些竹条并用细绳将它们喜结连理
千里之行始于足下美好的家园要靠自己双手打造明明可以靠才艺吃饭的阿伟偏偏要用八块腹肌征服大家就问这样的男人哪个野生婆娘不喜欢完成后阿伟还用自己 35 码的大腚感受了一下真烫
随后阿伟来到野区找到一根上好的雷击木他当即就把木头咔嚓成两段并取下两节较为完整的带了回去刚好能和圆桌配套另外一个在里面凿出凹槽并插入木棍连接得到一个夯土的木锤住过农村的小伙伴都知道这样夯出来的地面堪比水泥地不仅坚硬耐磨还不用担心脚底打滑忙碌了一天的阿伟已经饥渴难耐拿出野生小烤肠安安心心住新房光脚爬上大热炕一觉能睡到天亮
第二天阿伟打算将房间扩宽毕竟吃住的地方有了还要解决个人卫生的问题阿伟在另一侧增加了一个房间他打算将这里打造成洗澡的地方为了防止泥土垮塌他将顶部做成圆弧形等挖出足够的空间后旁边的泥土已经堆成了小山
为了方便清理这些泥土阿伟在之前的轨道增加了转弯交接处依然是用铁钉固定一直延伸到房间的最里面有了运输车的帮助这些成吨的泥土也能轻松的运送出去并且还能体验过山车的感觉很快他就完成了清理工作
为了更方便的在里面洗澡他将底部一点点挖空这么大的浴缸看来阿伟并不打算一个人住完成后他将墙面雕刻的凹凸有致让这里看起来更加豪华接着用洛阳铲挖出排水口并用一根相同大小的竹筒作为开关
由于四周都是泥土还不能防水阿伟特意找了一些白蚁巢用来制作可以防水的野生水泥现在就可以将里里外外能接触到水的地方都涂抹一遍细心的阿伟还找来这种 500 克一斤的鹅卵石对池子表面进行装饰
没错水源问题阿伟早已经考虑在内他打算直接在旁边挖个水井毕竟已经挖了这么深再向下挖一挖应该就能到达地下水的深度经过几日的奋战能看得出阿伟已经消瘦了不少但一想到马上就能拥有的豪宅他直接化身为无情的挖土机器很快就挖到了好几米的深度
考虑到自己的弹跳力有限阿伟在一旁定入木桩然后通过绳子爬上爬下随着深度越来越深井底已经开始渗出水来这也预示着打井成功没多久这里面将渗满泉水仅凭一次就能挖到水源看来这里还真是块风湿宝地
随后阿伟在井口四周挖出凹槽以便于井盖的安置这一量才知道井的深度已经达到了足足的 5 阿伟把木板组合在一起再沿着标记切掉多余部分他甚至还给井盖做了把手可是如何从这么深的井里打水还是个问题但从阿伟坚定的眼神来看他应该想到了解决办法
只见他将树桩锯成两半然后用凿子把里面一点点掏空另外一半也是如法炮制接着还要在底部挖出圆孔要想成功将水从 5 米深的地方抽上来那就不得不提到大家熟知的勾股定理没错这跟勾股定理没什么关系
阿伟给竹筒做了一个木塞并在里面打上安装连接轴的孔为了增加密闭性阿伟不得不牺牲了自己的 AJ剪出与木塞相同的大小后再用木钉固定住随后他收集了一些树胶并放到火上加热融化接下来就可以涂在木塞上增加使用寿命
现在将竹筒组装完成就可以利用虹吸原理将水抽上来完成后就可以把井盖盖上去再用泥土在上面覆盖现在就不用担心失足掉下去了
接下来阿伟去采集了一些大漆将它涂抹在木桶接缝处就能将其二合为一完了再接入旁边浴缸的入水口每个连接的地方都要做好密封不然后面很容易漏水随后就可以安装上活塞并用一根木桩作为省力杠杆根据空气压强的原理将井水抽上来
经过半小时的来回拉扯硕大的浴缸终于被灌满阿伟也是忍不住洗了把脸接下来还需要解决排水的问题阿伟在地上挖出沟渠一直贯穿到屋外然后再用竹筒从出水口连接每个接口处都要抹上胶水就连门外的出水口他都做了隐藏
在野外最重要的就是庇护所水源还有食物既然已经完成了前二者那么阿伟还需要拥有可持续发展的食物来源他先是在地上挖了两排地洞然后在每根竹筒的表面都打上无数孔洞这就是他打算用来种植的载体在此之前还需要用大火对竹筒进行杀菌消毒
趁着这时候他去搬了一麻袋的木屑先用芭蕉叶覆盖在上面再铺上厚厚的黏土隔绝温度在火焰的温度下能让里面的木屑达到生长条件
等到第二天所有材料都晾凉后阿伟才将竹筒内部掏空并将木屑一点点地塞入竹筒一切准备就绪就可以将竹筒插入提前挖好的地洞最后再往竹筒里塞入种子依靠房间内的湿度和温度就能达到大棚种植的效果稍加时日这些种子就会慢慢发芽
虽然暂时还吃不上自己培养的食物但好在阿伟从表哥贺强那里学到不少钓鱼本领哪怕只有一根小小的竹竿也能让他钓上两斤半的大鲶鱼新鲜的食材那肯定是少不了高温消毒的过程趁着鱼没熟阿伟直接爬进浴缸冰凉的井水瞬间洗去了身上的疲惫这一刻的阿伟是无比的享受
不久后鱼也烤得差不多了阿伟的生活现在可以说是有滋有味住在十几米的地下不仅能安全感满满哪怕遇到危险还能通过轨道快速逃生
</example_text_3>
<video_frame_description>
%s
</video_frame_description>
我正在尝试做这个内容的解说纪录片视频我需要你以 <video_frame_description> </video_frame_description> 中的内容为解说目标根据我刚才提供给你的对标文案 <example_text> 特点以及你总结的特点帮我生成一段关于荒野建造的解说文案文案需要符合平台受欢迎的解说风格请使用 json 格式进行输出使用 <output> 中的输出格式
<output>
{
"items": [
{
"_id": 1, # 唯一递增id
"timestamp": "00:00:05,390-00:00:10,430",
"picture": "画面描述",
"narration": "解说文案"
}
]
}
</output>
<restriction>
1. 只输出 json 内容不要输出其他任何说明性的文字
2. 解说文案的语言使用 简体中文
3. 严禁虚构画面所有画面只能从 <video_frame_description> 中摘取
</restriction>
""" % (markdown_content)

        # Initialize the client via the OpenAI SDK (works with any
        # OpenAI-compatible endpoint through base_url).
        client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )

        if model not in ["deepseek-reasoner"]:
            # Models that support JSON mode: request a json_object response.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=1.5,
                response_format={"type": "json_object"},
            )
            # Extract the generated narration from the first choice.
            if response.choices and len(response.choices) > 0:
                narration_script = response.choices[0].message.content
                # Log total token usage for cost tracking.
                logger.debug(f"消耗的tokens: {response.usage.total_tokens}")
                return narration_script
            else:
                return "生成解说文案失败: 未获取到有效响应"
        else:
            # deepseek-reasoner does not support JSON mode, so the reply may be
            # wrapped in ```json fences that must be stripped afterwards.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=1.5,
            )
            if response.choices and len(response.choices) > 0:
                narration_script = response.choices[0].message.content
                logger.debug(f"文案消耗的tokens: {response.usage.total_tokens}")
                # Strip markdown code fences around the JSON payload.
                narration_script = narration_script.replace("```json", "").replace("```", "")
                return narration_script
            else:
                return "生成解说文案失败: 未获取到有效响应"
    except Exception as e:
        # Mirror the module's error-as-string convention.
        return f"调用API生成解说文案时出错: {traceback.format_exc()}"
if __name__ == '__main__':
    # Manual test entry point: parse a frame-analysis JSON and dump it as
    # Markdown for inspection. NOTE(review): all paths and the API key below
    # are developer-local placeholders.
    text_provider = 'openai'
    text_api_key = "sk-xxx"
    text_model = "deepseek-reasoner"
    text_base_url = "https://api.deepseek.com"
    video_frame_description_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1139.json"
    # Exercise parsing against the newer analysis JSON file.
    test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_2258.json"
    markdown_output = parse_frame_analysis_to_markdown(test_file_path)
    # print(markdown_output)
    # Write the Markdown to disk so the formatting can be reviewed by hand.
    output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/家里家外1-5.md"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_output)
    # print(f"\n已将Markdown输出保存到: {output_file}")
    # # 生成解说文案
    # narration = generate_narration(
    #     markdown_output,
    #     text_api_key,
    #     base_url=text_base_url,
    #     model=text_model
    # )
    #
    # # 保存解说文案
    # print(narration)
    # print(type(narration))
    # narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json"
    # with open(narration_file, 'w', encoding='utf-8') as f:
    #     f.write(narration)
    # print(f"\n已将解说文案保存到: {narration_file}")

View File

@ -0,0 +1,426 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : generate_video
@Author : 小林同学
@Date : 2025/5/7 上午11:55
'''
import os
import traceback
from typing import Optional, Dict, Any
from loguru import logger
from moviepy import (
VideoFileClip,
AudioFileClip,
CompositeAudioClip,
CompositeVideoClip,
TextClip,
afx
)
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from app.utils import utils
from app.models.schema import AudioVolumeDefaults
def merge_materials(
    video_path: str,
    audio_path: str,
    output_path: str,
    subtitle_path: Optional[str] = None,
    bgm_path: Optional[str] = None,
    options: Optional[Dict[str, Any]] = None
) -> str:
    """
    Merge video, voice-over audio, BGM and subtitles into the final video.

    Args:
        video_path: path to the source video file
        audio_path: path to the voice-over audio file
        output_path: path of the rendered output file
        subtitle_path: optional path to a subtitle (.srt) file
        bgm_path: optional path to a background-music file
        options: optional dict of extra settings; recognized keys:
            - voice_volume: voice-over volume (default AudioVolumeDefaults.VOICE_VOLUME)
            - bgm_volume: background-music volume (default AudioVolumeDefaults.BGM_VOLUME)
            - original_audio_volume: original-track volume (default AudioVolumeDefaults.ORIGINAL_VOLUME)
            - keep_original_audio: keep the source video's audio (default True)
            - subtitle_font: subtitle font file name, resolved under utils.font_dir()
            - subtitle_font_size: subtitle font size (default 40)
            - subtitle_color: subtitle color (default '#FFFFFF')
            - subtitle_bg_color: subtitle background color (default 'transparent')
            - subtitle_position: 'bottom' | 'top' | 'center' | 'custom' (default 'bottom')
            - custom_position: vertical position in percent for 'custom' (default 70)
            - stroke_color: outline color (default '#000000')
            - stroke_width: outline width (default 1)
            - threads: encoding threads (default 2)
            - fps: output frame rate (default 30)
            - subtitle_enabled: enable subtitle rendering (default True)

    Returns:
        Path of the output video.
    """
    # Normalize options so the .get() lookups below are safe.
    if options is None:
        options = {}
    # Resolve settings, falling back to the unified volume defaults.
    voice_volume = options.get('voice_volume', AudioVolumeDefaults.VOICE_VOLUME)
    bgm_volume = options.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME)
    # Bug fix noted upstream: the original-audio default was raised from 0.0 so
    # the source track stays audible in drama-commentary mode.
    original_audio_volume = options.get('original_audio_volume', AudioVolumeDefaults.ORIGINAL_VOLUME)
    keep_original_audio = options.get('keep_original_audio', True)  # keep source audio by default
    subtitle_font = options.get('subtitle_font', '')
    subtitle_font_size = options.get('subtitle_font_size', 40)
    subtitle_color = options.get('subtitle_color', '#FFFFFF')
    subtitle_bg_color = options.get('subtitle_bg_color', 'transparent')
    subtitle_position = options.get('subtitle_position', 'bottom')
    custom_position = options.get('custom_position', 70)
    stroke_color = options.get('stroke_color', '#000000')
    stroke_width = options.get('stroke_width', 1)
    threads = options.get('threads', 2)
    fps = options.get('fps', 30)
    subtitle_enabled = options.get('subtitle_enabled', True)
    # Log the effective configuration up front to ease debugging.
    logger.info(f"音量配置详情:")
    logger.info(f"  - 配音音量: {voice_volume}")
    logger.info(f"  - 背景音乐音量: {bgm_volume}")
    logger.info(f"  - 原声音量: {original_audio_volume}")
    logger.info(f"  - 是否保留原声: {keep_original_audio}")
    logger.info(f"字幕配置详情:")
    logger.info(f"  - 是否启用字幕: {subtitle_enabled}")
    logger.info(f"  - 字幕文件路径: {subtitle_path}")

    # Clamp every volume into the allowed [MIN_VOLUME, MAX_VOLUME] range,
    # warning when a caller-supplied value is out of bounds.
    def validate_volume(volume, name):
        if not (AudioVolumeDefaults.MIN_VOLUME <= volume <= AudioVolumeDefaults.MAX_VOLUME):
            logger.warning(f"{name}音量 {volume} 超出有效范围 [{AudioVolumeDefaults.MIN_VOLUME}, {AudioVolumeDefaults.MAX_VOLUME}],将被限制")
            return max(AudioVolumeDefaults.MIN_VOLUME, min(volume, AudioVolumeDefaults.MAX_VOLUME))
        return volume
    voice_volume = validate_volume(voice_volume, "配音")
    bgm_volume = validate_volume(bgm_volume, "背景音乐")
    original_audio_volume = validate_volume(original_audio_volume, "原声")
    # MoviePy 2.1.1 does not accept the literal 'transparent'; None means
    # transparent background in the newer API.
    if subtitle_bg_color == 'transparent':
        subtitle_bg_color = None
    # Ensure the output directory exists.
    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"开始合并素材...")
    logger.info(f"  ① 视频: {video_path}")
    logger.info(f"  ② 音频: {audio_path}")
    if subtitle_path:
        logger.info(f"  ③ 字幕: {subtitle_path}")
    if bgm_path:
        logger.info(f"  ④ 背景音乐: {bgm_path}")
    logger.info(f"  ⑤ 输出: {output_path}")
    # Load the source video and, when requested, capture its original audio
    # track (volume-adjusted) before stripping the audio from the clip.
    try:
        video_clip = VideoFileClip(video_path)
        logger.info(f"视频尺寸: {video_clip.size[0]}x{video_clip.size[1]}, 时长: {video_clip.duration}")
        original_audio = None
        if keep_original_audio and original_audio_volume > 0:
            try:
                original_audio = video_clip.audio
                if original_audio:
                    original_audio = original_audio.with_effects([afx.MultiplyVolume(original_audio_volume)])
                    logger.info(f"已提取视频原声,音量设置为: {original_audio_volume}")
                else:
                    logger.warning("视频没有音轨,无法提取原声")
            except Exception as e:
                logger.error(f"提取视频原声失败: {str(e)}")
                original_audio = None
        # Drop the original track here; the final mix is attached below.
        video_clip = video_clip.without_audio()
    except Exception as e:
        logger.error(f"加载视频失败: {str(e)}")
        raise
    # Collect every audio layer (voice-over, original track, BGM) to composite.
    audio_tracks = []
    # Voice-over first.
    if audio_path and os.path.exists(audio_path):
        try:
            voice_audio = AudioFileClip(audio_path).with_effects([afx.MultiplyVolume(voice_volume)])
            audio_tracks.append(voice_audio)
            logger.info(f"已添加配音音频,音量: {voice_volume}")
        except Exception as e:
            logger.error(f"加载配音音频失败: {str(e)}")
    # Then the (already volume-adjusted) original track, if captured above.
    if original_audio is not None:
        audio_tracks.append(original_audio)
        logger.info(f"已添加视频原声,音量: {original_audio_volume}")
    # Finally the BGM: volume-adjusted, faded out, and looped to video length.
    if bgm_path and os.path.exists(bgm_path):
        try:
            bgm_clip = AudioFileClip(bgm_path).with_effects([
                afx.MultiplyVolume(bgm_volume),
                afx.AudioFadeOut(3),
                afx.AudioLoop(duration=video_clip.duration),
            ])
            audio_tracks.append(bgm_clip)
            logger.info(f"已添加背景音乐,音量: {bgm_volume}")
        except Exception as e:
            logger.error(f"添加背景音乐失败: \n{traceback.format_exc()}")
    # Attach the composite mix; without any track the output stays silent.
    if audio_tracks:
        final_audio = CompositeAudioClip(audio_tracks)
        video_clip = video_clip.with_audio(final_audio)
        logger.info(f"已合成所有音频轨道,共{len(audio_tracks)}个")
    else:
        logger.warning("没有可用的音频轨道,输出视频将没有声音")
    # Resolve the subtitle font file under the project font directory.
    font_path = None
    if subtitle_path and subtitle_font:
        font_path = os.path.join(utils.font_dir(), subtitle_font)
        # Forward slashes avoid escaping issues on Windows.
        if os.name == "nt":
            font_path = font_path.replace("\\", "/")
        logger.info(f"使用字体: {font_path}")
    # Video dimensions used for subtitle wrapping and placement.
    video_width, video_height = video_clip.size

    def create_text_clip(subtitle_item):
        """Create one positioned, timed TextClip for a single subtitle entry.

        subtitle_item is ((start_seconds, end_seconds), text).
        """
        phrase = subtitle_item[1]
        max_width = video_width * 0.9
        # Pre-wrap the text to the video width when a font file is available.
        wrapped_txt = phrase
        txt_height = 0
        if font_path:
            wrapped_txt, txt_height = wrap_text(
                phrase,
                max_width=max_width,
                font=font_path,
                fontsize=subtitle_font_size
            )
        # Build the text clip; fall back to a minimal parameter set if the
        # full set (background/stroke) is rejected by this MoviePy build.
        try:
            _clip = TextClip(
                text=wrapped_txt,
                font=font_path,
                font_size=subtitle_font_size,
                color=subtitle_color,
                bg_color=subtitle_bg_color,  # normalized earlier: None means transparent
                stroke_color=stroke_color,
                stroke_width=stroke_width,
            )
        except Exception as e:
            logger.error(f"创建字幕片段失败: {str(e)}, 使用简化参数重试")
            _clip = TextClip(
                text=wrapped_txt,
                font=font_path,
                font_size=subtitle_font_size,
                color=subtitle_color,
            )
        # Apply start/end/duration from the subtitle timing tuple.
        duration = subtitle_item[0][1] - subtitle_item[0][0]
        _clip = _clip.with_start(subtitle_item[0][0])
        _clip = _clip.with_end(subtitle_item[0][1])
        _clip = _clip.with_duration(duration)
        # Position the clip according to the configured placement.
        if subtitle_position == "bottom":
            _clip = _clip.with_position(("center", video_height * 0.95 - _clip.h))
        elif subtitle_position == "top":
            _clip = _clip.with_position(("center", video_height * 0.05))
        elif subtitle_position == "custom":
            # Interpret custom_position as a percentage of the usable height,
            # clamped so the clip stays fully on screen with a 10px margin.
            margin = 10
            max_y = video_height - _clip.h - margin
            min_y = margin
            custom_y = (video_height - _clip.h) * (custom_position / 100)
            custom_y = max(
                min_y, min(custom_y, max_y)
            )
            _clip = _clip.with_position(("center", custom_y))
        else:  # center
            _clip = _clip.with_position(("center", "center"))
        return _clip

    # Factory used by SubtitlesClip to instantiate raw text clips.
    def make_textclip(text):
        return TextClip(
            text=text,
            font=font_path,
            font_size=subtitle_font_size,
            color=subtitle_color,
        )
    # Subtitle rendering — honors the subtitle_enabled switch (bug fix).
    if subtitle_enabled and subtitle_path and os.path.exists(subtitle_path):
        logger.info("字幕已启用,开始处理字幕文件")
        try:
            # Parse the subtitle file.
            sub = SubtitlesClip(
                subtitles=subtitle_path,
                encoding="utf-8",
                make_textclip=make_textclip
            )
            # Build one styled/positioned clip per subtitle entry.
            text_clips = []
            for item in sub.subtitles:
                clip = create_text_clip(subtitle_item=item)
                text_clips.append(clip)
            # Overlay all subtitle clips on the video.
            video_clip = CompositeVideoClip([video_clip, *text_clips])
            logger.info(f"已添加{len(text_clips)}个字幕片段")
        except Exception as e:
            logger.error(f"处理字幕失败: \n{traceback.format_exc()}")
    elif not subtitle_enabled:
        logger.info("字幕已禁用,跳过字幕处理")
    elif not subtitle_path:
        logger.info("未提供字幕文件路径,跳过字幕处理")
    elif not os.path.exists(subtitle_path):
        logger.warning(f"字幕文件不存在: {subtitle_path},跳过字幕处理")
    # Render the final video; resources are released in the finally block
    # whether or not the export succeeds.
    try:
        video_clip.write_videofile(
            output_path,
            audio_codec="aac",
            temp_audiofile_path=output_dir,
            threads=threads,
            fps=fps,
        )
        logger.success(f"素材合并完成: {output_path}")
    except Exception as e:
        logger.error(f"导出视频失败: {str(e)}")
        raise
    finally:
        # Release moviepy resources.
        video_clip.close()
        del video_clip
    return output_path
def wrap_text(text, max_width, font="Arial", fontsize=60):
    """
    Wrap text so each rendered line fits within a pixel width.

    Tries word-boundary wrapping first; if a single word is wider than
    max_width (e.g. CJK text without spaces), falls back to per-character
    wrapping.

    Args:
        text: the text to wrap
        max_width: maximum line width in pixels
        font: font file path (falls back to PIL's default font if unloadable)
        fontsize: font size in points

    Returns:
        Tuple of (wrapped text with '\\n' line breaks, total pixel height).
    """
    try:
        font_obj = ImageFont.truetype(font, fontsize)
    except Exception:
        # Fix: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; fall back to PIL's built-in font when loading fails.
        font_obj = ImageFont.load_default()

    def get_text_size(inner_text):
        # Measure the rendered bounding box of the stripped text.
        inner_text = inner_text.strip()
        left, top, right, bottom = font_obj.getbbox(inner_text)
        return right - left, bottom - top

    width, height = get_text_size(text)
    if width <= max_width:
        # Already fits on one line — return unchanged.
        return text, height

    # First pass: wrap on word boundaries (space-separated).
    processed = True
    _wrapped_lines_ = []
    words = text.split(" ")
    _txt_ = ""
    for word in words:
        _before = _txt_
        _txt_ += f"{word} "
        _width, _height = get_text_size(_txt_)
        if _width <= max_width:
            continue
        else:
            if _txt_.strip() == word.strip():
                # A single word alone exceeds max_width: word wrapping cannot
                # work, switch to character-level wrapping below.
                processed = False
                break
            _wrapped_lines_.append(_before)
            _txt_ = f"{word} "
    _wrapped_lines_.append(_txt_)
    if processed:
        _wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
        result = "\n".join(_wrapped_lines_).strip()
        # Total height = line count x single-line height.
        height = len(_wrapped_lines_) * height
        return result, height

    # Second pass: character-granularity wrapping (handles CJK text).
    _wrapped_lines_ = []
    chars = list(text)
    _txt_ = ""
    for word in chars:
        _txt_ += word
        _width, _height = get_text_size(_txt_)
        if _width <= max_width:
            continue
        else:
            _wrapped_lines_.append(_txt_)
            _txt_ = ""
    _wrapped_lines_.append(_txt_)
    result = "\n".join(_wrapped_lines_).strip()
    height = len(_wrapped_lines_) * height
    return result, height
if __name__ == '__main__':
    # Manual test entry point: merge a pre-cut video with narration audio,
    # subtitles and BGM. NOTE(review): all paths are developer-local samples.
    merger_mp4 = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger.mp4'
    merger_sub = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merged_subtitle_00_00_00-00_01_30.srt'
    merger_audio = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger_audio.mp3'
    bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
    output_video = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/combined_test.mp4'
    # Example invocation options.
    options = {
        'voice_volume': 1.0,  # voice-over volume
        'bgm_volume': 0.1,  # background-music volume
        'original_audio_volume': 1.0,  # original-track volume; 0 disables it
        'keep_original_audio': True,  # keep the source video's audio
        'subtitle_enabled': True,  # subtitle toggle (exercises the subtitle-switch bug fix)
        'subtitle_font': 'MicrosoftYaHeiNormal.ttc',  # relative font name, resolved under font_dir()
        'subtitle_font_size': 40,
        'subtitle_color': '#FFFFFF',
        'subtitle_bg_color': None,  # None means transparent background
        'subtitle_position': 'bottom',
        'threads': 2
    }
    try:
        merge_materials(
            video_path=merger_mp4,
            audio_path=merger_audio,
            subtitle_path=merger_sub,
            bgm_path=bgm_path,
            output_path=output_video,
            options=options
        )
    except Exception as e:
        # Demo script: log the full traceback instead of crashing.
        logger.error(f"合并素材失败: \n{traceback.format_exc()}")

View File

@ -7,7 +7,7 @@ from typing import List
from loguru import logger
from openai import OpenAI
from openai import AzureOpenAI
from moviepy.editor import VideoFileClip
from moviepy import VideoFileClip
from openai.types.chat import ChatCompletion
import google.generativeai as gemini
from googleapiclient.errors import ResumableUploadError

View File

@ -4,15 +4,17 @@ import random
import traceback
from urllib.parse import urlencode
from datetime import datetime
import json
import requests
from typing import List
from typing import List, Optional
from loguru import logger
from moviepy.video.io.VideoFileClip import VideoFileClip
from app.config import config
from app.models.schema import VideoAspect, VideoConcatMode, MaterialInfo
from app.utils import utils
from app.utils import ffmpeg_utils
requested_count = 0
@ -256,10 +258,10 @@ def time_to_seconds(time_str: str) -> float:
"""
将时间字符串转换为秒数
支持格式: 'HH:MM:SS,mmm' (::,毫秒)
Args:
time_str: 时间字符串, "00:00:20,100"
Returns:
float: 转换后的秒数(包含毫秒)
"""
@ -281,7 +283,7 @@ def time_to_seconds(time_str: str) -> float:
raise ValueError("时间格式必须为 HH:MM:SS,mmm")
return seconds + ms
except ValueError as e:
logger.error(f"时间格式错误: {time_str}")
raise ValueError(f"时间格式错误: 必须为 HH:MM:SS,mmm 格式") from e
@ -290,10 +292,10 @@ def time_to_seconds(time_str: str) -> float:
def format_timestamp(seconds: float) -> str:
"""
将秒数转换为可读的时间格式 (HH:MM:SS,mmm)
Args:
seconds: 秒数(可包含毫秒)
Returns:
str: 格式化的时间字符串, "00:00:20,100"
"""
@ -302,14 +304,26 @@ def format_timestamp(seconds: float) -> str:
seconds_remain = seconds % 60
whole_seconds = int(seconds_remain)
milliseconds = int((seconds_remain - whole_seconds) * 1000)
return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
def _detect_hardware_acceleration() -> Optional[str]:
    """Return the system's ffmpeg hardware-acceleration type, if any.

    Thin wrapper over the centralized detection in ffmpeg_utils.

    Returns:
        Optional[str]: the hwaccel identifier, or None when unsupported.
    """
    # Delegate directly to the shared, centralized detection helper.
    return ffmpeg_utils.get_ffmpeg_hwaccel_type()
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> str:
"""
保存剪辑后的视频
Args:
timestamp: 需要裁剪的时间戳,格式为 'HH:MM:SS,mmm-HH:MM:SS,mmm'
例如: '00:00:00,000-00:00:20,100'
@ -328,85 +342,151 @@ def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> di
if not os.path.exists(save_dir):
os.makedirs(save_dir)
# 生成更规范的视频文件名
video_id = f"vid-{timestamp.replace(':', '-').replace(',', '_')}"
video_path = os.path.join(save_dir, f"{video_id}.mp4")
# 解析时间戳
start_str, end_str = timestamp.split('-')
# 格式化输出文件名(使用连字符替代冒号和逗号)
safe_start_time = start_str.replace(':', '-').replace(',', '-')
safe_end_time = end_str.replace(':', '-').replace(',', '-')
output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4"
video_path = os.path.join(save_dir, output_filename)
# 如果视频已存在,直接返回
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
logger.info(f"video already exists: {video_path}")
return {timestamp: video_path}
logger.info(f"视频已存在: {video_path}")
return video_path
try:
# 加载视频获取总时长
video = VideoFileClip(origin_video)
total_duration = video.duration
# 解析时间戳
start_str, end_str = timestamp.split('-')
# 检查视频是否存在
if not os.path.exists(origin_video):
logger.error(f"源视频文件不存在: {origin_video}")
return ''
# 获取视频总时长
try:
probe_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1", origin_video]
total_duration = float(subprocess.check_output(probe_cmd).decode('utf-8').strip())
except subprocess.CalledProcessError as e:
logger.error(f"获取视频时长失败: {str(e)}")
return ''
# 计算时间点
start = time_to_seconds(start_str)
end = time_to_seconds(end_str)
# 验证时间段
if start >= total_duration:
logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)")
video.close()
return {}
return ''
if end > total_duration:
logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒),将自动调整为视频结尾")
end = total_duration
if end <= start:
logger.warning(f"结束时间 {format_timestamp(end)} 必须大于起始时间 {format_timestamp(start)}")
video.close()
return {}
# 剪辑视频
return ''
# 计算剪辑时长
duration = end - start
logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}")
# 剪辑视频
subclip = video.subclip(start, end)
try:
# 检查视频是否有音频轨道并写入文件
subclip.write_videofile(
video_path,
codec='libx264',
audio_codec='aac',
temp_audiofile='temp-audio.m4a',
remove_temp=True,
audio=(subclip.audio is not None),
logger=None
# logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}")
# 获取硬件加速选项
hwaccel = _detect_hardware_acceleration()
hwaccel_args = []
if hwaccel:
hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args()
# 转换为FFmpeg兼容的时间格式逗号替换为点
ffmpeg_start_time = start_str.replace(',', '.')
ffmpeg_end_time = end_str.replace(',', '.')
# 构建FFmpeg命令 - 使用新的智能编码器选择
encoder = ffmpeg_utils.get_optimal_ffmpeg_encoder()
ffmpeg_cmd = [
"ffmpeg", "-y", *hwaccel_args,
"-i", origin_video,
"-ss", ffmpeg_start_time,
"-to", ffmpeg_end_time,
"-c:v", encoder,
"-c:a", "aac",
"-strict", "experimental",
video_path
]
# 根据编码器类型添加特定参数
if "nvenc" in encoder:
ffmpeg_cmd.insert(-1, "-preset")
ffmpeg_cmd.insert(-1, "medium")
elif "videotoolbox" in encoder:
ffmpeg_cmd.insert(-1, "-profile:v")
ffmpeg_cmd.insert(-1, "high")
elif "qsv" in encoder:
ffmpeg_cmd.insert(-1, "-preset")
ffmpeg_cmd.insert(-1, "medium")
elif encoder == "libx264":
ffmpeg_cmd.insert(-1, "-preset")
ffmpeg_cmd.insert(-1, "medium")
ffmpeg_cmd.insert(-1, "-crf")
ffmpeg_cmd.insert(-1, "23")
# 执行FFmpeg命令
# logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}")
# logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}")
# 在Windows系统上使用UTF-8编码处理输出避免GBK编码错误
is_windows = os.name == 'nt'
if is_windows:
process = subprocess.run(
ffmpeg_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
encoding='utf-8', # 明确指定编码为UTF-8
text=True,
check=False # 不抛出异常,我们会检查返回码
)
# 验证生成的视频文件
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
with VideoFileClip(video_path) as clip:
if clip.duration > 0 and clip.fps > 0:
return {timestamp: video_path}
raise ValueError("视频文件验证失败")
except Exception as e:
logger.warning(f"视频文件处理失败: {video_path} => {str(e)}")
else:
process = subprocess.run(
ffmpeg_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=False # 不抛出异常,我们会检查返回码
)
# 检查是否成功
if process.returncode != 0:
logger.error(f"视频剪辑失败: {process.stderr}")
if os.path.exists(video_path):
os.remove(video_path)
except Exception as e:
logger.warning(f"视频剪辑失败: \n{str(traceback.format_exc())}")
return ''
# 验证生成的视频文件
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
# 检查视频是否可播放
probe_cmd = ["ffprobe", "-v", "error", video_path]
# 在Windows系统上使用UTF-8编码
if is_windows:
validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
else:
validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if validate_result.returncode == 0:
logger.info(f"视频剪辑成功: {video_path}")
return video_path
logger.error("视频文件验证失败")
if os.path.exists(video_path):
os.remove(video_path)
finally:
# 确保视频对象被正确关闭
try:
video.close()
if 'subclip' in locals():
subclip.close()
except:
pass
return {}
return ''
except Exception as e:
logger.error(f"视频剪辑过程中发生错误: \n{str(traceback.format_exc())}")
if os.path.exists(video_path):
os.remove(video_path)
return ''
def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict:
@ -428,17 +508,17 @@ def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, pro
try:
saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory)
if saved_video_path:
logger.info(f"video saved: {saved_video_path}")
video_paths.update(saved_video_path)
video_paths.update({index+1:saved_video_path})
# 更新进度
if progress_callback:
progress_callback(index + 1, total_items)
except Exception as e:
logger.error(f"视频裁剪失败: {utils.to_json(item)} =>\n{str(traceback.format_exc())}")
return {}
logger.success(f"裁剪 {len(video_paths)} videos")
# logger.debug(json.dumps(video_paths, indent=4, ensure_ascii=False))
return video_paths

View File

@ -0,0 +1,673 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : merger_video
@Author : 小林同学
@Date : 2025/5/6 下午7:38
'''
import os
import shutil
import subprocess
from enum import Enum
from typing import List, Optional, Tuple
from loguru import logger
from app.utils import ffmpeg_utils
class VideoAspect(Enum):
    """Supported video aspect ratios with their standard output resolutions."""
    landscape = "16:9"    # landscape 16:9
    landscape_2 = "4:3"   # landscape 4:3
    portrait = "9:16"     # portrait 9:16
    portrait_2 = "3:4"    # portrait 3:4
    square = "1:1"        # square 1:1

    def to_resolution(self) -> Tuple[int, int]:
        """Return the standard (width, height) in pixels for this ratio."""
        resolutions = {
            VideoAspect.portrait: (1080, 1920),     # portrait 9:16
            VideoAspect.portrait_2: (720, 1280),    # portrait 3:4 output size
            VideoAspect.landscape: (1920, 1080),    # landscape 16:9
            VideoAspect.landscape_2: (1280, 720),   # landscape 4:3 output size
            VideoAspect.square: (1080, 1080),       # square 1:1
        }
        # Default to portrait 1080x1920 for any unmapped member.
        return resolutions.get(self, (1080, 1920))
def check_ffmpeg_installation() -> bool:
    """Return True when an `ffmpeg` binary is runnable from PATH.

    Probes by running `ffmpeg -version`; any launch or non-zero-exit failure
    is treated as "not installed" and logged.

    Returns:
        bool: True if ffmpeg ran successfully, False otherwise.
    """
    try:
        subprocess.run(
            ['ffmpeg', '-version'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except (subprocess.SubprocessError, FileNotFoundError):
        logger.error("ffmpeg未安装或不在系统PATH中请安装ffmpeg")
        return False
    return True
def get_hardware_acceleration_option() -> Optional[str]:
    """Pick the hardware-acceleration option suited to this system.

    Returns:
        Optional[str]: hwaccel parameter from the shared detection API,
        or None when hardware acceleration is unsupported.
    """
    # Query the newer centralized detection API and hand back its result.
    hwaccel_type = ffmpeg_utils.get_ffmpeg_hwaccel_type()
    return hwaccel_type
def check_video_has_audio(video_path: str) -> bool:
    """Return True when the video file contains an audio stream.

    Uses ffprobe to inspect the first audio stream; a missing file or any
    probing error yields False.

    Args:
        video_path: path of the video file to inspect.

    Returns:
        bool: True if an audio stream is present, False otherwise.
    """
    if not os.path.exists(video_path):
        logger.warning(f"视频文件不存在: {video_path}")
        return False
    cmd = [
        'ffprobe', '-v', 'error',
        '-select_streams', 'a:0',
        '-show_entries', 'stream=codec_type',
        '-of', 'csv=p=0',
        video_path
    ]
    try:
        probe = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=False,
        )
    except Exception as e:
        logger.warning(f"检测视频音频流时出错: {str(e)}")
        return False
    # ffprobe prints 'audio' on stdout when the first audio stream exists.
    return probe.stdout.strip() == 'audio'
def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> str:
    """Write the list file required by ffmpeg's concat demuxer.

    Each video path is converted to an absolute path, normalized per platform
    (forward slashes on Windows; backslash/colon escaping on Unix), with
    single quotes escaped, and emitted as a `file '...'` line.

    Args:
        video_paths: paths of the videos to concatenate, in order.
        concat_file_path: where to write the concat list file.

    Returns:
        str: the concat file path (same as concat_file_path).
    """
    def _format_entry(path: str) -> str:
        # Use the absolute path so ffmpeg resolves entries regardless of cwd.
        abs_path = os.path.abspath(path)
        if os.name == 'nt':  # Windows: forward slashes avoid escape issues
            abs_path = abs_path.replace('\\', '/')
        else:  # Unix/Mac: escape backslashes and colons
            abs_path = abs_path.replace('\\', '\\\\').replace(':', '\\:')
        # Escape any single quotes inside the path.
        abs_path = abs_path.replace("'", "\\'")
        return f"file '{abs_path}'\n"

    with open(concat_file_path, 'w', encoding='utf-8') as f:
        f.writelines(_format_entry(p) for p in video_paths)
    return concat_file_path
def process_single_video(
    input_path: str,
    output_path: str,
    target_width: int,
    target_height: int,
    keep_audio: bool = True,
    hwaccel: Optional[str] = None
) -> str:
    """
    Process a single video: scale/pad it to the target resolution and set a
    fixed 30 fps frame rate, optionally keeping its audio track.

    Encoding strategy: try hardware acceleration (when requested and deemed
    safe), fall back to plain libx264, and finally to a minimal ultrafast/CRF
    command if both fail.

    Args:
        input_path: input video path
        output_path: output video path
        target_width: target width in pixels
        target_height: target height in pixels
        keep_audio: whether to keep the audio track
        hwaccel: hardware-acceleration option (None disables it)
    Returns:
        str: path of the processed video
    Raises:
        FileNotFoundError: if the input file does not exist.
        RuntimeError: if every encoding strategy fails.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"找不到视频文件: {input_path}")
    # Base ffmpeg command; -y overwrites any existing output.
    command = ['ffmpeg', '-y']
    # Safety check: on Windows, re-validate the input before trusting hwaccel.
    is_windows = os.name == 'nt'
    if is_windows and hwaccel:
        logger.info("在Windows系统上检测到硬件加速请求将进行额外的兼容性检查")
        try:
            # Quick probe of the input's basic video properties.
            probe_cmd = [
                'ffprobe', '-v', 'error',
                '-select_streams', 'v:0',
                '-show_entries', 'stream=codec_name,width,height',
                '-of', 'csv=p=0',
                input_path
            ]
            result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False)
            # If probing fails, degrade to software encoding for safety.
            if result.returncode != 0:
                logger.warning(f"视频探测失败,为安全起见,禁用硬件加速: {result.stderr}")
                hwaccel = None
        except Exception as e:
            logger.warning(f"视频探测出错,禁用硬件加速: {str(e)}")
            hwaccel = None
    # Append hwaccel input flags via the centralised detection API.
    if hwaccel:
        try:
            hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args()
            if hwaccel_args:
                command.extend(hwaccel_args)
                logger.debug(f"应用硬件加速参数: {hwaccel_args}")
            else:
                logger.info("硬件加速不可用,将使用软件编码")
                hwaccel = False  # mark hardware acceleration as disabled
        except Exception as e:
            logger.warning(f"应用硬件加速参数时出错: {str(e)},将使用软件编码")
            hwaccel = False  # mark hardware acceleration as disabled
            # Rebuild the command to drop any partially-added hwaccel flags.
            command = ['ffmpeg', '-y']
    # Input file.
    command.extend(['-i', input_path])
    # Audio handling.
    if not keep_audio:
        command.extend(['-an'])  # strip audio
    else:
        # Only request audio encoding when an audio stream actually exists.
        has_audio = check_video_has_audio(input_path)
        if has_audio:
            command.extend(['-c:a', 'aac', '-b:a', '128k'])  # encode audio as AAC
        else:
            logger.warning(f"视频 {input_path} 没有音频流,将会忽略音频设置")
            command.extend(['-an'])  # no audio stream: strip audio settings
    # Video filters: scale down preserving aspect ratio, then pad to target.
    scale_filter = f"scale={target_width}:{target_height}:force_original_aspect_ratio=decrease"
    pad_filter = f"pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2"
    command.extend([
        '-vf', f"{scale_filter},{pad_filter}",
        '-r', '30',  # normalise frame rate to 30 fps
    ])
    # Encoder selection via the smart encoder-detection helper.
    encoder = ffmpeg_utils.get_optimal_ffmpeg_encoder()
    if hwaccel and encoder != "libx264":
        logger.info(f"使用硬件编码器: {encoder}")
        command.extend(['-c:v', encoder])
        # Encoder-family specific tuning flags.
        if "nvenc" in encoder:
            command.extend(['-preset', 'p4', '-profile:v', 'high'])
        elif "videotoolbox" in encoder:
            command.extend(['-profile:v', 'high'])
        elif "qsv" in encoder:
            command.extend(['-preset', 'medium'])
        elif "vaapi" in encoder:
            command.extend(['-profile', '100'])
        elif "amf" in encoder:
            command.extend(['-quality', 'balanced'])
        else:
            command.extend(['-preset', 'medium', '-profile:v', 'high'])
    else:
        logger.info("使用软件编码器(libx264)")
        command.extend(['-c:v', 'libx264', '-preset', 'medium', '-profile:v', 'high'])
    # Bitrate / pixel-format settings shared by all encoders.
    command.extend([
        '-b:v', '5M',
        '-maxrate', '8M',
        '-bufsize', '10M',
        '-pix_fmt', 'yuv420p',  # most broadly compatible pixel format
    ])
    # Output file.
    command.append(output_path)
    # Execute; on failure, fall back progressively to safer encodings.
    try:
        # logger.info(f"执行FFmpeg命令: {' '.join(command)}")
        process = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        logger.info(f"视频处理成功: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        error_msg = e.stderr.decode() if e.stderr else str(e)
        logger.error(f"处理视频失败: {error_msg}")
        # Fallback 1: retry with pure software encoding when hwaccel was used.
        if hwaccel:
            logger.info("硬件加速失败,尝试使用软件编码作为备选方案")
            try:
                # Force software encoding globally for subsequent calls.
                ffmpeg_utils.force_software_encoding()
                # Rebuild the command from scratch with libx264.
                fallback_cmd = ['ffmpeg', '-y', '-i', input_path]
                # Reuse the original audio decision.
                if not keep_audio:
                    fallback_cmd.extend(['-an'])
                else:
                    has_audio = check_video_has_audio(input_path)
                    if has_audio:
                        fallback_cmd.extend(['-c:a', 'aac', '-b:a', '128k'])
                    else:
                        fallback_cmd.extend(['-an'])
                # Reuse the original video filters and rate settings.
                fallback_cmd.extend([
                    '-vf', f"{scale_filter},{pad_filter}",
                    '-r', '30',
                    '-c:v', 'libx264',
                    '-preset', 'medium',
                    '-profile:v', 'high',
                    '-b:v', '5M',
                    '-maxrate', '8M',
                    '-bufsize', '10M',
                    '-pix_fmt', 'yuv420p',
                    output_path
                ])
                logger.info("执行软件编码备选方案")
                subprocess.run(fallback_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                logger.info(f"使用软件编码成功处理视频: {output_path}")
                return output_path
            except subprocess.CalledProcessError as fallback_error:
                fallback_error_msg = fallback_error.stderr.decode() if fallback_error.stderr else str(fallback_error)
                logger.error(f"软件编码备选方案也失败: {fallback_error_msg}")
                # Fallback 2: minimal encode (no scaling/padding) as last resort.
                try:
                    logger.info("尝试最基本的编码参数")
                    basic_cmd = [
                        'ffmpeg', '-y', '-i', input_path,
                        '-c:v', 'libx264', '-preset', 'ultrafast',
                        '-crf', '23', '-pix_fmt', 'yuv420p',
                        output_path
                    ]
                    subprocess.run(basic_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    logger.info(f"使用基本编码参数成功处理视频: {output_path}")
                    return output_path
                except subprocess.CalledProcessError as basic_error:
                    basic_error_msg = basic_error.stderr.decode() if basic_error.stderr else str(basic_error)
                    logger.error(f"基本编码参数也失败: {basic_error_msg}")
                    raise RuntimeError(f"无法处理视频 {input_path}: 所有编码方案都失败")
        # Not a hwaccel issue, or all fallbacks exhausted: surface original error.
        raise RuntimeError(f"处理视频失败: {error_msg}")
def combine_clip_videos(
    output_video_path: str,
    video_paths: List[str],
    video_ost_list: List[int],
    video_aspect: VideoAspect = VideoAspect.portrait,
    threads: int = 4,
    force_software_encoding: bool = False,  # force software encoding, skipping hardware-acceleration detection
) -> str:
    """
    Merge pre-cut sub-videos into a single output video.

    Pipeline: (1) normalise every clip to the target resolution with
    process_single_video; (2) concat the video streams (audio stripped);
    (3) extract each kept audio track, delay it to its position on the merged
    timeline, mix all tracks over a silent base, and mux the mix back onto the
    concat video. If the audio pipeline fails, falls back to a plain
    audio-less concat.

    Args:
        output_video_path: path of the merged output video
        video_paths: list of sub-video paths
        video_ost_list: original-sound flags (0: drop original audio,
            1: keep original audio only, 2: keep original audio and narration)
        video_aspect: target aspect ratio
        threads: ffmpeg thread count for the concat step
        force_software_encoding: force software encoding, skipping
            hardware-acceleration detection
    Returns:
        str: path of the merged video
    Raises:
        RuntimeError: if ffmpeg is missing or every merge strategy fails.
        ValueError: if no valid clip survives preprocessing.
    """
    # Fail fast when ffmpeg is unavailable.
    if not check_ffmpeg_installation():
        raise RuntimeError("未找到ffmpeg请先安装")
    # Ensure the output directory exists.
    output_dir = os.path.dirname(output_video_path)
    os.makedirs(output_dir, exist_ok=True)
    # Resolve the target resolution from the aspect ratio.
    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()
    # Detect available hardware acceleration unless explicitly forced off.
    hwaccel = None if force_software_encoding else get_hardware_acceleration_option()
    if hwaccel:
        logger.info(f"将使用 {hwaccel} 硬件加速")
    elif force_software_encoding:
        logger.info("已强制使用软件编码,跳过硬件加速检测")
    else:
        logger.info("未检测到兼容的硬件加速,将使用软件编码")
    # On Windows keep hwaccel but warn; process_single_video re-validates per clip.
    if os.name == 'nt' and hwaccel:
        logger.warning("在Windows系统上检测到硬件加速但为了提高兼容性建议使用软件编码")
        # Do not force-disable hwaccel here; an extra safety probe runs in process_single_video.
    # Re-group clip paths and original-sound flags into per-clip config dicts.
    video_segments = []
    # Guard against mismatched list lengths.
    if len(video_paths) != len(video_ost_list):
        logger.warning(f"视频路径列表({len(video_paths)})和原声设置列表({len(video_ost_list)})长度不匹配")
        # Truncate both lists to the shorter one.
        min_length = min(len(video_paths), len(video_ost_list))
        video_paths = video_paths[:min_length]
        video_ost_list = video_ost_list[:min_length]
    # Build one processing config per existing clip.
    for i, (video_path, video_ost) in enumerate(zip(video_paths, video_ost_list)):
        if not os.path.exists(video_path):
            logger.warning(f"视频不存在,跳过: {video_path}")
            continue
        # Probe whether the clip actually carries an audio stream.
        has_audio = check_video_has_audio(video_path)
        # Per-clip configuration.
        segment = {
            "index": i,
            "path": video_path,
            "ost": video_ost,
            "has_audio": has_audio,
            "keep_audio": video_ost > 0 and has_audio  # keep audio only if requested AND present
        }
        # Warn when audio was requested but the clip has none.
        if video_ost > 0 and not has_audio:
            logger.warning(f"视频 {video_path} 设置为保留原声(ost={video_ost}),但该视频没有音频流")
        video_segments.append(segment)
    # Intermediates live in a scratch directory, removed in the finally block.
    processed_videos = []
    temp_dir = os.path.join(output_dir, "temp_videos")
    os.makedirs(temp_dir, exist_ok=True)
    try:
        # Phase 1: normalise every clip into an intermediate file.
        for segment in video_segments:
            # Process one clip, keeping or dropping its audio per config.
            temp_output = os.path.join(temp_dir, f"processed_{segment['index']}.mp4")
            try:
                process_single_video(
                    input_path=segment['path'],
                    output_path=temp_output,
                    target_width=video_width,
                    target_height=video_height,
                    keep_audio=segment['keep_audio'],
                    hwaccel=hwaccel
                )
                processed_videos.append({
                    "index": segment["index"],
                    "path": temp_output,
                    "keep_audio": segment["keep_audio"]
                })
                logger.info(f"视频 {segment['index'] + 1}/{len(video_segments)} 处理完成")
            except Exception as e:
                logger.error(f"处理视频 {segment['path']} 时出错: {str(e)}")
                # Retry with software encoding when hwaccel may be at fault.
                if hwaccel and not force_software_encoding:
                    logger.info(f"尝试使用软件编码处理视频 {segment['path']}")
                    try:
                        process_single_video(
                            input_path=segment['path'],
                            output_path=temp_output,
                            target_width=video_width,
                            target_height=video_height,
                            keep_audio=segment['keep_audio'],
                            hwaccel=None  # software encoding
                        )
                        processed_videos.append({
                            "index": segment["index"],
                            "path": temp_output,
                            "keep_audio": segment["keep_audio"]
                        })
                        logger.info(f"使用软件编码成功处理视频 {segment['index'] + 1}/{len(video_segments)}")
                    except Exception as fallback_error:
                        logger.error(f"使用软件编码处理视频 {segment['path']} 也失败: {str(fallback_error)}")
                        continue
                else:
                    continue
        if not processed_videos:
            raise ValueError("没有有效的视频片段可以合并")
        # Restore the original clip order.
        processed_videos.sort(key=lambda x: x["index"])
        # Phase 2: merge step-by-step, avoiding a monolithic filter_complex graph.
        try:
            # 1. Concat all video streams, audio stripped, into one file.
            video_paths_only = [video["path"] for video in processed_videos]
            video_concat_path = os.path.join(temp_dir, "video_concat.mp4")
            # Concat list file for ffmpeg's concat demuxer.
            concat_file = os.path.join(temp_dir, "concat_list.txt")
            create_ffmpeg_concat_file(video_paths_only, concat_file)
            # Merge video streams without audio.
            concat_cmd = [
                'ffmpeg', '-y',
                '-f', 'concat',
                '-safe', '0',
                '-i', concat_file,
                '-c:v', 'libx264',
                '-preset', 'medium',
                '-profile:v', 'high',
                '-an',  # no audio
                '-threads', str(threads),
                video_concat_path
            ]
            subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频流合并完成")
            # 2. Extract audio from clips that keep their original sound.
            audio_segments = [video for video in processed_videos if video["keep_audio"]]
            if not audio_segments:
                # No audio anywhere: the silent concat IS the final result.
                shutil.copy(video_concat_path, output_video_path)
                logger.info("无音频视频合并完成")
                return output_video_path
            # Extract each kept audio track to an intermediate AAC file.
            audio_files = []
            for i, segment in enumerate(audio_segments):
                # Extract audio only.
                audio_file = os.path.join(temp_dir, f"audio_{i}.aac")
                extract_audio_cmd = [
                    'ffmpeg', '-y',
                    '-i', segment["path"],
                    '-vn',  # no video
                    '-c:a', 'aac',
                    '-b:a', '128k',
                    audio_file
                ]
                subprocess.run(extract_audio_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                audio_files.append({
                    "index": segment["index"],
                    "path": audio_file
                })
                logger.info(f"提取音频 {i+1}/{len(audio_segments)} 完成")
            # 3. Compute each audio segment's start offset on the merged timeline.
            audio_timings = []
            current_time = 0.0
            # Probe every processed clip's duration to accumulate offsets.
            for i, video in enumerate(processed_videos):
                duration_cmd = [
                    'ffprobe', '-v', 'error',
                    '-show_entries', 'format=duration',
                    '-of', 'csv=p=0',
                    video["path"]
                ]
                result = subprocess.run(duration_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                duration = float(result.stdout.strip())
                # Record the timeline position of clips that keep audio.
                if video["keep_audio"]:
                    for audio in audio_files:
                        if audio["index"] == video["index"]:
                            audio_timings.append({
                                "file": audio["path"],
                                "start": current_time,
                                "index": video["index"]
                            })
                            break
                current_time += duration
            # 4. Create a silent base audio track spanning the whole timeline.
            silence_audio = os.path.join(temp_dir, "silence.aac")
            create_silence_cmd = [
                'ffmpeg', '-y',
                '-f', 'lavfi',
                '-i', f'anullsrc=r=44100:cl=stereo',
                '-t', str(current_time),  # total timeline duration
                '-c:a', 'aac',
                '-b:a', '128k',
                silence_audio
            ]
            subprocess.run(create_silence_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # 5. Build a filter script that delays each track and mixes them all.
            filter_script = os.path.join(temp_dir, "filter_script.txt")
            with open(filter_script, 'w') as f:
                f.write(f"[0:a]volume=0.0[silence];\n")  # mute the base track
                # adelay each extracted track (both channels) to its offset.
                for i, timing in enumerate(audio_timings):
                    f.write(f"[{i+1}:a]adelay={int(timing['start']*1000)}|{int(timing['start']*1000)}[a{i}];\n")
                # amix the silent base with every delayed track.
                mix_str = "[silence]"
                for i in range(len(audio_timings)):
                    mix_str += f"[a{i}]"
                mix_str += f"amix=inputs={len(audio_timings)+1}:duration=longest[aout]"
                f.write(mix_str)
            # 6. Run the audio mix command.
            audio_inputs = ['-i', silence_audio]
            for timing in audio_timings:
                audio_inputs.extend(['-i', timing["file"]])
            mixed_audio = os.path.join(temp_dir, "mixed_audio.aac")
            audio_mix_cmd = [
                'ffmpeg', '-y'
            ] + audio_inputs + [
                '-filter_complex_script', filter_script,
                '-map', '[aout]',
                '-c:a', 'aac',
                '-b:a', '128k',
                mixed_audio
            ]
            subprocess.run(audio_mix_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("音频混合完成")
            # 7. Mux the concat video stream with the mixed audio track.
            final_cmd = [
                'ffmpeg', '-y',
                '-i', video_concat_path,
                '-i', mixed_audio,
                '-c:v', 'copy',
                '-c:a', 'aac',
                '-map', '0:v:0',
                '-map', '1:a:0',
                '-shortest',
                output_video_path
            ]
            subprocess.run(final_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频最终合并完成")
            return output_video_path
        except subprocess.CalledProcessError as e:
            logger.error(f"合并视频过程中出错: {e.stderr.decode() if e.stderr else str(e)}")
            # Last-resort fallback: simplest possible concat, no audio.
            logger.info("尝试备用合并方法 - 无音频合并")
            try:
                concat_file = os.path.join(temp_dir, "concat_list.txt")
                video_paths_only = [video["path"] for video in processed_videos]
                create_ffmpeg_concat_file(video_paths_only, concat_file)
                backup_cmd = [
                    'ffmpeg', '-y',
                    '-f', 'concat',
                    '-safe', '0',
                    '-i', concat_file,
                    '-c:v', 'copy',
                    '-an',  # no audio
                    output_video_path
                ]
                subprocess.run(backup_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                logger.warning("使用备用方法(无音频)成功合并视频")
                return output_video_path
            except Exception as backup_error:
                logger.error(f"备用合并方法也失败: {str(backup_error)}")
                raise RuntimeError(f"无法合并视频: {str(backup_error)}")
    except Exception as e:
        logger.error(f"合并视频时出错: {str(e)}")
        raise
    finally:
        # Always remove intermediates, even on failure.
        try:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
                logger.info("已清理临时文件")
        except Exception as e:
            logger.warning(f"清理临时文件时出错: {str(e)}")
if __name__ == '__main__':
    # Manual smoke test: merge a handful of pre-cut clips into one portrait
    # video, keeping the original audio of every clip.
    clip_files = [
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E02_00_14_09_440.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E08_00_27_11_110.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E08_00_34_44_480.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E08_00_42_47_630.mp4',
        '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/S01E09_00_29_48_160.mp4',
    ]
    combine_clip_videos(
        output_video_path="/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/merged_123.mp4",
        video_paths=clip_files,
        video_ost_list=[1, 1, 1, 1, 1],
        video_aspect=VideoAspect.portrait,
        force_software_encoding=False,  # let the system decide; do not force software encoding
    )

View File

@ -3,10 +3,11 @@ import json
import time
import asyncio
import requests
from app.utils import video_processor
from loguru import logger
from typing import List, Dict, Any, Callable
from app.utils import utils, gemini_analyzer, video_processor, video_processor_v2
from app.utils import utils, gemini_analyzer, video_processor
from app.utils.script_generator import ScriptProcessor
from app.config import config
@ -21,6 +22,7 @@ class ScriptGenerator:
video_path: str,
video_theme: str = "",
custom_prompt: str = "",
frame_interval_input: int = 5,
skip_seconds: int = 0,
threshold: int = 30,
vision_batch_size: int = 5,
@ -105,20 +107,13 @@ class ScriptGenerator:
os.makedirs(video_keyframes_dir, exist_ok=True)
try:
if config.frames.get("version") == "v2":
processor = video_processor_v2.VideoProcessor(video_path)
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds,
threshold=threshold
)
else:
processor = video_processor.VideoProcessor(video_path)
processor.process_video(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds
)
processor = video_processor.VideoProcessor(video_path)
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=skip_seconds,
threshold=threshold
)
for filename in sorted(os.listdir(video_keyframes_dir)):
if filename.endswith('.jpg'):
keyframe_files.append(os.path.join(video_keyframes_dir, filename))

View File

@ -4,11 +4,11 @@ import re
import traceback
from typing import Optional
from faster_whisper import WhisperModel
# from faster_whisper import WhisperModel
from timeit import default_timer as timer
from loguru import logger
import google.generativeai as genai
from moviepy.editor import VideoFileClip
from moviepy import VideoFileClip
import os
from app.config import config
@ -33,7 +33,7 @@ def create(audio_file, subtitle_file: str = ""):
"""
global model, device, compute_type
if not model:
model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v2"
model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v3"
model_bin_file = f"{model_path}/model.bin"
if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
logger.error(
@ -45,12 +45,25 @@ def create(audio_file, subtitle_file: str = ""):
)
return None
# 尝试使用 CUDA如果失败则回退到 CPU
# 首先使用CPU模式不触发CUDA检查
use_cuda = False
try:
import torch
if torch.cuda.is_available():
# 在函数中延迟导入torch而不是在全局范围内
# 使用安全的方式检查CUDA可用性
def check_cuda_available():
try:
import torch
return torch.cuda.is_available()
except (ImportError, RuntimeError) as e:
logger.warning(f"检查CUDA可用性时出错: {e}")
return False
# 仅当明确需要时才检查CUDA
use_cuda = check_cuda_available()
if use_cuda:
logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
try:
logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
model = WhisperModel(
model_size_or_path=model_path,
device="cuda",
@ -63,18 +76,18 @@ def create(audio_file, subtitle_file: str = ""):
except Exception as e:
logger.warning(f"CUDA 加载失败,错误信息: {str(e)}")
logger.warning("回退到 CPU 模式")
device = "cpu"
compute_type = "int8"
use_cuda = False
else:
logger.info("未检测到 CUDA使用 CPU 模式")
device = "cpu"
compute_type = "int8"
except ImportError:
logger.warning("未安装 torch使用 CPU 模式")
logger.info("使用 CPU 模式")
except Exception as e:
logger.warning(f"CUDA检查过程出错: {e}")
logger.warning("默认使用CPU模式")
use_cuda = False
# 如果CUDA不可用或加载失败使用CPU
if not use_cuda:
device = "cpu"
compute_type = "int8"
if device == "cpu":
logger.info(f"使用 CPU 加载模型: {model_path}")
model = WhisperModel(
model_size_or_path=model_path,
@ -403,7 +416,7 @@ def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "")
logger.info("音频提取完成,开始生成字幕")
# 使用create函数生成字幕
create(audio_file, subtitle_file)
create("/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav", subtitle_file)
# 删除临时音频文件
if os.path.exists(audio_file):
@ -422,8 +435,8 @@ if __name__ == "__main__":
task_id = "123456"
task_dir = utils.task_dir(task_id)
subtitle_file = f"{task_dir}/subtitle_123456.srt"
audio_file = f"{task_dir}/audio.wav"
video_file = "/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_1702.mp4"
audio_file = "/Users/apple/Desktop/WhisperX-zhuanlu/1_qyn2-2_Vocals.wav"
video_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/merge/qyn2-2-720p.mp4"
extract_audio_and_create_subtitle(video_file, subtitle_file)

View File

@ -0,0 +1,202 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : subtitle_merger
@Author : viccy
@Date : 2025/5/6 下午4:00
'''
import re
import os
from datetime import datetime, timedelta
def parse_time(time_str):
    """Parse an SRT timestamp ('HH:MM:SS,mmm') into a timedelta."""
    clock_part, ms_part = time_str.split(',')
    h, m, s = clock_part.split(':')
    return timedelta(
        hours=int(h),
        minutes=int(m),
        seconds=int(s),
        milliseconds=int(ms_part),
    )
def format_time(td):
    """Format a timedelta as an SRT timestamp string 'HH:MM:SS,mmm'."""
    whole_seconds = int(td.total_seconds())
    ms = td.microseconds // 1000
    h, remainder = divmod(whole_seconds, 3600)
    m, s = divmod(remainder, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def parse_edited_time_range(time_range_str):
    """Split an 'HH:MM:SS-HH:MM:SS' range string into (start, end) timedeltas.

    Returns (None, None) when the string is empty or does not contain exactly
    one '-' separator.
    """
    if not time_range_str:
        return None, None
    pieces = time_range_str.split('-')
    if len(pieces) != 2:
        return None, None

    def _to_timedelta(text):
        # Convert an HH:MM:SS string to a timedelta.
        h, m, s = map(int, text.split(':'))
        return timedelta(hours=h, minutes=m, seconds=s)

    return _to_timedelta(pieces[0]), _to_timedelta(pieces[1])
def merge_subtitle_files(subtitle_items, output_file=None):
    """
    Merge multiple SRT subtitle files into one, shifting each file's
    timestamps by the start of its 'editedTimeRange' so they land at the
    right position on the merged timeline.

    Args:
        subtitle_items: list of dicts, each holding a 'subtitle' file path
            and an 'editedTimeRange' string ("HH:MM:SS-HH:MM:SS").
        output_file: output path for the merged file; auto-generated next to
            the first input when None.
    Returns:
        Path of the merged subtitle file.
    """
    # Sort items by the start time of their editedTimeRange.
    # NOTE(review): assumes subtitle_items is non-empty when output_file is
    # None - sorted_items[0] below would raise IndexError otherwise; confirm
    # callers guarantee this.
    sorted_items = sorted(subtitle_items,
                          key=lambda x: parse_edited_time_range(x.get('editedTimeRange', ''))[0] or timedelta())

    merged_subtitles = []
    subtitle_index = 1

    for item in sorted_items:
        # Skip items with a missing or nonexistent subtitle file.
        if not item.get('subtitle') or not os.path.exists(item.get('subtitle')):
            continue

        # The start of editedTimeRange is the offset applied to every cue.
        offset_time, _ = parse_edited_time_range(item.get('editedTimeRange', ''))
        if offset_time is None:
            print(f"警告: 无法从项目 {item.get('_id')} 的editedTimeRange中提取时间范围跳过该项")
            continue

        with open(item['subtitle'], 'r', encoding='utf-8') as file:
            content = file.read()

        # SRT cues are separated by blank lines.
        subtitle_blocks = re.split(r'\n\s*\n', content.strip())

        for block in subtitle_blocks:
            lines = block.strip().split('\n')
            if len(lines) < 3:  # need index line, timing line, and at least one text line
                continue

            # Second line of a cue is the timing line.
            time_line = lines[1]
            time_parts = time_line.split(' --> ')
            if len(time_parts) != 2:
                continue

            start_time = parse_time(time_parts[0])
            end_time = parse_time(time_parts[1])

            # Shift the cue onto the merged timeline.
            adjusted_start_time = start_time + offset_time
            adjusted_end_time = end_time + offset_time

            # Rebuild the cue with a fresh sequential index.
            adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}"
            text_lines = lines[2:]
            new_block = [
                str(subtitle_index),
                adjusted_time_line,
                *text_lines
            ]

            merged_subtitles.append('\n'.join(new_block))
            subtitle_index += 1

    # Derive the output path from the overall covered time range when not given.
    if output_file is None:
        dir_path = os.path.dirname(sorted_items[0]['subtitle'])
        first_start = parse_edited_time_range(sorted_items[0]['editedTimeRange'])[0]
        last_end = parse_edited_time_range(sorted_items[-1]['editedTimeRange'])[1]
        # NOTE(review): uses td.seconds, which drops any whole-day component -
        # presumably ranges never reach 24h; confirm.
        first_start_h, first_start_m, first_start_s = int(first_start.seconds // 3600), int((first_start.seconds % 3600) // 60), int(first_start.seconds % 60)
        last_end_h, last_end_m, last_end_s = int(last_end.seconds // 3600), int((last_end.seconds % 3600) // 60), int(last_end.seconds % 60)
        first_start_str = f"{first_start_h:02d}_{first_start_m:02d}_{first_start_s:02d}"
        last_end_str = f"{last_end_h:02d}_{last_end_m:02d}_{last_end_s:02d}"
        output_file = os.path.join(dir_path, f"merged_subtitle_{first_start_str}-{last_end_str}.srt")

    # Join all cues with blank-line separators.
    merged_content = '\n\n'.join(merged_subtitles)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(merged_content)

    return output_file
if __name__ == '__main__':
    # Manual smoke test: fixtures mirroring the script-generation pipeline's
    # segment schema (picture/narration/OST/audio/subtitle/time-range fields).
    # Only 'subtitle' and 'editedTimeRange' are consumed by merge_subtitle_files.
    test_data = [
        {'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
         'timestamp': '00:00:00-00:01:15',
         'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
         'OST': 0,
         '_id': 1,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
         'sourceTimeRange': '00:00:00-00:00:26',
         'duration': 26,
         'editedTimeRange': '00:00:00-00:00:26'
         },
        {'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!',
         'timestamp': '00:01:15-00:04:40',
         'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
         'OST': 0,
         '_id': 2,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
         'sourceTimeRange': '00:01:15-00:01:29',
         'duration': 14,
         'editedTimeRange': '00:00:26-00:00:40'
         },
        {'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
         'timestamp': '00:04:58-00:05:45',
         'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
         'OST': 0,
         '_id': 4,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
         'sourceTimeRange': '00:04:58-00:05:20',
         'duration': 22,
         'editedTimeRange': '00:00:57-00:01:19'
         },
        {'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
         'timestamp': '00:05:45-00:06:00',
         'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
         'OST': 0,
         '_id': 5,
         'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
         'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
         'sourceTimeRange': '00:05:45-00:05:53',
         'duration': 8,
         'editedTimeRange': '00:01:19-00:01:27'
         }
    ]
    # Merge the fixture subtitles and report where the result was written.
    output_file = merge_subtitle_files(test_data)
    print(f"字幕文件已合并至: {output_file}")

View File

@ -9,167 +9,177 @@ from loguru import logger
from app.config import config
from app.models import const
from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams
from app.services import llm, material, subtitle, video, voice, audio_merger
from app.services import (llm, material, subtitle, video, voice, audio_merger,
subtitle_merger, clip_video, merger_video, update_script, generate_video)
from app.services import state as sm
from app.utils import utils
def generate_script(task_id, params):
logger.info("\n\n## generating video script")
video_script = params.video_script.strip()
if not video_script:
video_script = llm.generate_script(
video_subject=params.video_subject,
language=params.video_language,
paragraph_number=params.paragraph_number,
)
else:
logger.debug(f"video script: \n{video_script}")
# def generate_script(task_id, params):
# logger.info("\n\n## generating video script")
# video_script = params.video_script.strip()
# if not video_script:
# video_script = llm.generate_script(
# video_subject=params.video_subject,
# language=params.video_language,
# paragraph_number=params.paragraph_number,
# )
# else:
# logger.debug(f"video script: \n{video_script}")
if not video_script:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video script.")
return None
# if not video_script:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error("failed to generate video script.")
# return None
return video_script
# return video_script
def generate_terms(task_id, params, video_script):
logger.info("\n\n## generating video terms")
video_terms = params.video_terms
if not video_terms:
video_terms = llm.generate_terms(
video_subject=params.video_subject, video_script=video_script, amount=5
)
else:
if isinstance(video_terms, str):
video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
elif isinstance(video_terms, list):
video_terms = [term.strip() for term in video_terms]
else:
raise ValueError("video_terms must be a string or a list of strings.")
# def generate_terms(task_id, params, video_script):
# logger.info("\n\n## generating video terms")
# video_terms = params.video_terms
# if not video_terms:
# video_terms = llm.generate_terms(
# video_subject=params.video_subject, video_script=video_script, amount=5
# )
# else:
# if isinstance(video_terms, str):
# video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
# elif isinstance(video_terms, list):
# video_terms = [term.strip() for term in video_terms]
# else:
# raise ValueError("video_terms must be a string or a list of strings.")
logger.debug(f"video terms: {utils.to_json(video_terms)}")
# logger.debug(f"video terms: {utils.to_json(video_terms)}")
if not video_terms:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video terms.")
return None
# if not video_terms:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error("failed to generate video terms.")
# return None
return video_terms
# return video_terms
def save_script_data(task_id, video_script, video_terms, params):
script_file = path.join(utils.task_dir(task_id), "script.json")
script_data = {
"script": video_script,
"search_terms": video_terms,
"params": params,
}
# def save_script_data(task_id, video_script, video_terms, params):
# script_file = path.join(utils.task_dir(task_id), "script.json")
# script_data = {
# "script": video_script,
# "search_terms": video_terms,
# "params": params,
# }
with open(script_file, "w", encoding="utf-8") as f:
f.write(utils.to_json(script_data))
# with open(script_file, "w", encoding="utf-8") as f:
# f.write(utils.to_json(script_data))
def generate_audio(task_id, params, video_script):
logger.info("\n\n## generating audio")
audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
sub_maker = voice.tts(
text=video_script,
voice_name=voice.parse_voice_name(params.voice_name),
voice_rate=params.voice_rate,
voice_file=audio_file,
)
if sub_maker is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"""failed to generate audio:
1. check if the language of the voice matches the language of the video script.
2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
""".strip()
)
return None, None, None
# def generate_audio(task_id, params, video_script):
# logger.info("\n\n## generating audio")
# audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
# sub_maker = voice.tts(
# text=video_script,
# voice_name=voice.parse_voice_name(params.voice_name),
# voice_rate=params.voice_rate,
# voice_file=audio_file,
# )
# if sub_maker is None:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# """failed to generate audio:
# 1. check if the language of the voice matches the language of the video script.
# 2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
# """.strip()
# )
# return None, None, None
audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
return audio_file, audio_duration, sub_maker
# audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
# return audio_file, audio_duration, sub_maker
def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
if not params.subtitle_enabled:
return ""
# def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
# if not params.subtitle_enabled:
# return ""
subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
# subtitle_path = path.join(utils.task_dir(task_id), "subtitle111.srt")
# subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
# logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
subtitle_fallback = False
if subtitle_provider == "edge":
voice.create_subtitle(
text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
)
if not os.path.exists(subtitle_path):
subtitle_fallback = True
logger.warning("subtitle file not found, fallback to whisper")
# subtitle_fallback = False
# if subtitle_provider == "edge":
# voice.create_subtitle(
# text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
# )
# if not os.path.exists(subtitle_path):
# subtitle_fallback = True
# logger.warning("subtitle file not found, fallback to whisper")
if subtitle_provider == "whisper" or subtitle_fallback:
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
logger.info("\n\n## correcting subtitle")
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
# if subtitle_provider == "whisper" or subtitle_fallback:
# subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
# logger.info("\n\n## correcting subtitle")
# subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"subtitle file is invalid: {subtitle_path}")
return ""
# subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
# if not subtitle_lines:
# logger.warning(f"subtitle file is invalid: {subtitle_path}")
# return ""
return subtitle_path
# return subtitle_path
def get_video_materials(task_id, params, video_terms, audio_duration):
    """Collect video material paths for a task.

    For the "local" source the user-supplied materials are preprocessed into
    clips; for any other source, clips matching the search terms are
    downloaded. On failure the task state is set to FAILED and None is
    returned.

    Args:
        task_id: task identifier used for state updates.
        params: clip parameters (source, materials, aspect, durations, ...).
        video_terms: search terms used when downloading remote videos.
        audio_duration: narration duration in seconds; scaled by
            params.video_count to size the download.

    Returns:
        A list of material paths/URLs, or None on failure.
    """
    if params.video_source == "local":
        logger.info("\n\n## preprocess local materials")
        processed = video.preprocess_video(
            materials=params.video_materials, clip_duration=params.video_clip_duration
        )
        if not processed:
            sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
            logger.error(
                "no valid materials found, please check the materials and try again."
            )
            return None
        return [item.url for item in processed]

    logger.info(f"\n\n## downloading videos from {params.video_source}")
    fetched = material.download_videos(
        task_id=task_id,
        search_terms=video_terms,
        source=params.video_source,
        video_aspect=params.video_aspect,
        video_contact_mode=params.video_concat_mode,
        audio_duration=audio_duration * params.video_count,
        max_clip_duration=params.video_clip_duration,
    )
    if not fetched:
        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
        logger.error(
            "failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
        )
        return None
    return fetched
# def get_video_materials(task_id, params, video_terms, audio_duration):
# if params.video_source == "local":
# logger.info("\n\n## preprocess local materials")
# materials = video.preprocess_video(
# materials=params.video_materials, clip_duration=params.video_clip_duration
# )
# if not materials:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# "no valid materials found, please check the materials and try again."
# )
# return None
# return [material_info.url for material_info in materials]
# else:
# logger.info(f"\n\n## downloading videos from {params.video_source}")
# downloaded_videos = material.download_videos(
# task_id=task_id,
# search_terms=video_terms,
# source=params.video_source,
# video_aspect=params.video_aspect,
# video_contact_mode=params.video_concat_mode,
# audio_duration=audio_duration * params.video_count,
# max_clip_duration=params.video_clip_duration,
# )
# if not downloaded_videos:
# sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
# logger.error(
# "failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
# )
# return None
# return downloaded_videos
def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: dict):
"""后台任务(自动剪辑视频进行剪辑)"""
"""
后台任务自动剪辑视频进行剪辑
Args:
task_id: 任务ID
params: 视频参数
subclip_path_videos: 视频片段路径
"""
global merged_audio_path, merged_subtitle_path
logger.info(f"\n\n## 开始任务: {task_id}")
# 初始化 ImageMagick
if not utils.init_imagemagick():
logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0)
# tts 角色名称
voice_name = voice.parse_voice_name(params.voice_name)
# # 初始化 ImageMagick
# if not utils.init_imagemagick():
# logger.warning("ImageMagick 初始化失败,字幕可能无法正常显示")
# # tts 角色名称
# voice_name = voice.parse_voice_name(params.voice_name)
"""
1. 加载剪辑脚本
"""
logger.info("\n\n## 1. 加载视频脚本")
video_script_path = path.join(params.video_clip_json_path)
@ -185,174 +195,145 @@ def start_subclip(task_id: str, params: VideoClipParams, subclip_path_videos: di
logger.debug(f"解说完整脚本: \n{video_script}")
logger.debug(f"解说 OST 列表: \n{video_ost}")
logger.debug(f"解说时间戳列表: \n{time_list}")
# 获取视频总时长(单位 s)
last_timestamp = list_script[-1]['new_timestamp']
end_time = last_timestamp.split("-")[1]
total_duration = utils.time_to_seconds(end_time)
except Exception as e:
logger.error(f"无法读取视频json脚本请检查配置是否正确。{e}")
raise ValueError("无法读取视频json脚本请检查配置是否正确")
logger.error(f"无法读取视频json脚本请检查脚本格式是否正确")
raise ValueError("无法读取视频json脚本请检查脚本格式是否正确")
else:
logger.error(f"video_script_path: {video_script_path} \n\n", traceback.format_exc())
raise ValueError("解说脚本不存在!请检查配置是否正确。")
"""
2. 使用 TTS 生成音频素材
"""
logger.info("\n\n## 2. 根据OST设置生成音频列表")
# 只为OST=0或2的片段生成TTS音频
# 只为OST=0 or 2的判断生成音频 OST=0 仅保留解说 OST=2 保留解说和原声
tts_segments = [
segment for segment in list_script
if segment['OST'] in [0, 2]
]
logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}")
# 初始化音频文件路径
audio_files = []
final_audio = ""
tts_results = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
voice_name=params.voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
# """
# 3. (可选) 使用 whisper 生成字幕
# """
# if merged_subtitle_path is None:
# if audio_files:
# merged_subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
# subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
# logger.info(f"\n\n使用 {subtitle_provider} 生成字幕")
#
# subtitle.create(
# audio_file=merged_audio_path,
# subtitle_file=merged_subtitle_path,
# )
# subtitle_lines = subtitle.file_to_subtitles(merged_subtitle_path)
# if not subtitle_lines:
# logger.warning(f"字幕文件无效: {merged_subtitle_path}")
#
# sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
"""
3. 裁剪视频 - 将超出音频长度的视频进行裁剪
"""
logger.info("\n\n## 3. 裁剪视频")
video_clip_result = clip_video.clip_video(params.video_origin_path, tts_results)
# 更新 list_script 中的时间戳
tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
subclip_clip_result = {
tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
}
new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)
"""
4. 合并音频和字幕
"""
logger.info("\n\n## 4. 合并音频和字幕")
total_duration = sum([script["duration"] for script in new_script_list])
if tts_segments:
audio_files, sub_maker_list = voice.tts_multiple(
task_id=task_id,
list_script=tts_segments, # 只传入需要TTS的片段
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_pitch=params.voice_pitch,
force_regenerate=True
)
if audio_files is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("TTS转换音频失败, 可能是网络不可用! 如果您在中国, 请使用VPN.")
return
if audio_files:
logger.info(f"合并音频文件: {audio_files}")
try:
# 传入OST信息以便正确处理音频
final_audio = audio_merger.merge_audio_files(
task_id=task_id,
audio_files=audio_files,
total_duration=total_duration,
list_script=list_script # 传入完整脚本以便处理OST
)
logger.info("音频文件合并成功")
except Exception as e:
logger.error(f"合并音频文件失败: {str(e)}")
final_audio = ""
else:
# 如果没有需要生成TTS的片段创建一个空白音频文件
# 这样可以确保后续的音频处理能正确进行
logger.info("没有需要生成TTS的片段将保留原声和背景音乐")
final_audio = path.join(utils.task_dir(task_id), "empty.mp3")
try:
from moviepy.editor import AudioClip
# 创建一个与视频等长的空白音频
empty_audio = AudioClip(make_frame=lambda t: 0, duration=total_duration)
empty_audio.write_audiofile(final_audio, fps=44100)
logger.info(f"已创建空白音频文件: {final_audio}")
except Exception as e:
logger.error(f"创建空白音频文件失败: {str(e)}")
final_audio = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
subtitle_path = ""
if params.subtitle_enabled:
if audio_files:
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
subtitle.create(
audio_file=final_audio,
subtitle_file=subtitle_path,
# 合并音频文件
merged_audio_path = audio_merger.merge_audio_files(
task_id=task_id,
total_duration=total_duration,
list_script=new_script_list
)
logger.info(f"音频文件合并成功->{merged_audio_path}")
# 合并字幕文件
merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
logger.info(f"字幕文件合并成功->{merged_subtitle_path}")
except Exception as e:
logger.error(f"合并音频文件失败: {str(e)}")
else:
logger.warning("没有需要合并的音频/字幕")
merged_audio_path = ""
merged_subtitle_path = ""
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"字幕文件无效: {subtitle_path}")
subtitle_path = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
logger.info("\n\n## 4. 裁剪视频")
subclip_videos = [x for x in subclip_path_videos.values()]
# logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}")
if not subclip_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"裁剪视频失败,可能是 ImageMagick 不可用")
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
"""
5. 合并视频
"""
final_video_paths = []
combined_video_paths = []
_progress = 50
index = 1
combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4")
logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}")
# 如果 new_script_list 中没有 video则使用 subclip_path_videos 中的视频
video_clips = [new_script['video'] if new_script.get('video') else subclip_path_videos.get(new_script.get('_id', '')) for new_script in new_script_list]
video.combine_clip_videos(
combined_video_path=combined_video_path,
video_paths=subclip_videos,
merger_video.combine_clip_videos(
output_video_path=combined_video_path,
video_paths=video_clips,
video_ost_list=video_ost,
list_script=list_script,
video_aspect=params.video_aspect,
threads=params.n_threads # 多线程
threads=params.n_threads
)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80)
_progress += 50 / 2
sm.state.update_task(task_id, progress=_progress)
"""
6. 合并字幕/BGM/配音/视频
"""
output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}")
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
# bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3'
bgm_path = utils.get_bgm_file()
logger.info(f"\n\n## 6. 最后合成: {index} => {final_video_path}")
# 获取背景音乐
bgm_path = None
if params.bgm_type or params.bgm_file:
try:
bgm_path = utils.get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_path:
logger.info(f"使用背景音乐: {bgm_path}")
except Exception as e:
logger.error(f"获取背景音乐失败: {str(e)}")
# 示例:自定义字幕样式
subtitle_style = {
'fontsize': params.font_size, # 字体大小
'color': params.text_fore_color, # 字体颜色
'stroke_color': params.stroke_color, # 描边颜色
'stroke_width': params.stroke_width, # 描边宽度, 范围0-10
'bg_color': params.text_back_color, # 半透明黑色背景
'position': (params.subtitle_position, 0.2), # 距离顶部60%的位置
'method': 'caption' # 渲染方法
# 调用示例
options = {
'voice_volume': params.tts_volume, # 配音音量
'bgm_volume': params.bgm_volume, # 背景音乐音量
'original_audio_volume': params.original_volume, # 视频原声音量0表示不保留
'keep_original_audio': True, # 是否保留原声
'subtitle_enabled': params.subtitle_enabled, # 是否启用字幕 - 修复字幕开关bug
'subtitle_font': params.font_name, # 这里使用相对字体路径,会自动在 font_dir() 目录下查找
'subtitle_font_size': params.font_size,
'subtitle_color': params.text_fore_color,
'subtitle_bg_color': None, # 直接使用None表示透明背景
'subtitle_position': params.subtitle_position,
'custom_position': params.custom_position,
'threads': params.n_threads
}
# 示例:自定义音量配置
volume_config = {
'original': params.original_volume, # 原声音量80%
'bgm': params.bgm_volume, # BGM音量20%
'narration': params.tts_volume or params.voice_volume, # 解说音量100%
}
font_path = utils.font_dir(params.font_name)
video.generate_video_v3(
generate_video.merge_materials(
video_path=combined_video_path,
subtitle_path=subtitle_path,
audio_path=merged_audio_path,
subtitle_path=merged_subtitle_path,
bgm_path=bgm_path,
narration_path=final_audio,
output_path=final_video_path,
volume_config=volume_config, # 添加音量配置
subtitle_style=subtitle_style,
font_path=font_path
output_path=output_video_path,
options=options
)
_progress += 50 / 2
sm.state.update_task(task_id, progress=_progress)
final_video_paths.append(final_video_path)
final_video_paths.append(output_video_path)
combined_video_paths.append(combined_video_path)
logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.")
@ -400,35 +381,19 @@ def validate_params(video_path, audio_path, output_file, params):
if __name__ == "__main__":
# task_id = "test123"
# subclip_path_videos = {'00:41-01:58': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_41-01_58.mp4',
# '00:06-00:15': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_06-00_15.mp4',
# '01:10-01:17': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_10-01_17.mp4',
# '00:47-01:03': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-00_47-01_03.mp4',
# '01:03-01:10': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-01_03-01_10.mp4',
# '02:40-03:08': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-02_40-03_08.mp4',
# '03:02-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_02-03_20.mp4',
# '03:18-03:20': 'E:\\projects\\NarratoAI\\storage\\cache_videos/vid-03_18-03_20.mp4'}
#
# params = VideoClipParams(
# video_clip_json_path="E:\\projects\\NarratoAI\\resource/scripts/test003.json",
# video_origin_path="E:\\projects\\NarratoAI\\resource/videos/1.mp4",
# )
# start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
task_id = "demo"
task_id = "test456"
subclip_path_videos = {'01:10-01:17': './storage/cache_videos/vid-01_10-01_17.mp4',
'01:58-02:04': './storage/cache_videos/vid-01_58-02_04.mp4',
'02:25-02:31': './storage/cache_videos/vid-02_25-02_31.mp4',
'01:28-01:33': './storage/cache_videos/vid-01_28-01_33.mp4',
'03:14-03:18': './storage/cache_videos/vid-03_14-03_18.mp4',
'00:24-00:28': './storage/cache_videos/vid-00_24-00_28.mp4',
'03:02-03:08': './storage/cache_videos/vid-03_02-03_08.mp4',
'00:41-00:44': './storage/cache_videos/vid-00_41-00_44.mp4',
'02:12-02:25': './storage/cache_videos/vid-02_12-02_25.mp4'}
# 提前裁剪是为了方便检查视频
subclip_path_videos = {
1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-05-390@00-00-57-980.mp4',
2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-28-900@00-00-43-700.mp4',
3: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-01-17-840@00-01-27-600.mp4',
4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-02-35-460@00-02-52-380.mp4',
5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-06-59-520@00-07-29-500.mp4',
}
params = VideoClipParams(
video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/test004.json",
video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4",
video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/2025-0507-223311.json",
video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_4938.mp4",
)
start_subclip(task_id, params, subclip_path_videos=subclip_path_videos)
start_subclip(task_id, params, subclip_path_videos)

View File

@ -0,0 +1,266 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : update_script
@Author : 小林同学
@Date : 2025/5/6 下午11:00
'''
import re
import os
from typing import Dict, List, Any, Tuple, Union
def extract_timestamp_from_video_path(video_path: str) -> str:
    """Pull the clip time range out of a cached-video filename.

    Two naming schemes are recognised:
      * new: ``vid_HH-MM-SS-mmm@HH-MM-SS-mmm.mp4`` -> ``HH:MM:SS,mmm-HH:MM:SS,mmm``
      * old: ``vid-HH-MM-SS-HH-MM-SS.mp4``         -> ``HH:MM:SS-HH:MM:SS``

    Args:
        video_path: path (or bare filename) of the clip.

    Returns:
        The extracted timestamp range, or an empty string when the filename
        matches neither scheme.
    """
    name = os.path.basename(video_path)

    # New scheme carries millisecond precision, separated by '@'.
    new_style = re.search(
        r'vid_(\d{2})-(\d{2})-(\d{2})-(\d{3})@(\d{2})-(\d{2})-(\d{2})-(\d{3})\.mp4',
        name,
    )
    if new_style:
        g = new_style.groups()
        start = f"{g[0]}:{g[1]}:{g[2]},{g[3]}"
        end = f"{g[4]}:{g[5]}:{g[6]},{g[7]}"
        return f"{start}-{end}"

    # Old scheme: whole-second timestamps joined by '-'.
    old_style = re.search(r'vid-(\d{2}-\d{2}-\d{2})-(\d{2}-\d{2}-\d{2})\.mp4', name)
    if old_style:
        return "{}-{}".format(
            old_style.group(1).replace('-', ':'),
            old_style.group(2).replace('-', ':'),
        )

    return ""
def calculate_duration(timestamp: str) -> float:
    """Return the length in seconds of a ``start-end`` timestamp range.

    Either side may be ``HH:MM:SS`` or ``HH:MM:SS,mmm``. Malformed input
    yields 0.0 instead of raising, preserving the best-effort contract.

    Args:
        timestamp: range such as ``00:00:05,390-00:00:57,980``.

    Returns:
        Duration in seconds, rounded to two decimals; 0.0 on parse failure.
    """
    def _to_seconds(stamp: str) -> float:
        # The millisecond part (after the comma) is optional.
        if ',' in stamp:
            clock, _, frac = stamp.partition(',')
            millis = float('0.' + frac) if frac else 0
        else:
            clock, millis = stamp, 0
        h, m, s = map(int, clock.split(':'))
        return h * 3600 + m * 60 + s + millis

    try:
        begin, finish = timestamp.split('-')
        return round(_to_seconds(finish) - _to_seconds(begin), 2)
    except (ValueError, AttributeError):
        # Wrong number of '-' separators, non-numeric fields, or a non-string
        # input all fall through to the neutral duration.
        return 0.0
def update_script_timestamps(
    script_list: List[Dict[str, Any]],
    video_result: Dict[Union[str, int], str],
    audio_result: Dict[Union[str, int], str] = None,
    subtitle_result: Dict[Union[str, int], str] = None,
    calculate_edited_timerange: bool = True
) -> List[Dict[str, Any]]:
    """Enrich each script entry with media paths and recomputed time ranges.

    For every entry the matching video clip's filename timestamp becomes
    ``sourceTimeRange`` (falling back to the entry's original ``timestamp``),
    ``duration`` is derived from it, and — when requested — a running
    ``editedTimeRange`` locates the segment inside the assembled output video.
    Audio/subtitle/video file paths are attached from the result dicts.

    Args:
        script_list: original script entries (not mutated; copies returned).
        video_result: key (``_id`` or original timestamp) -> clip file path.
        audio_result: key -> narration audio path (optional).
        subtitle_result: key -> subtitle file path (optional).
        calculate_edited_timerange: also compute cumulative output-video ranges.

    Returns:
        A new list of updated entry dicts.
    """
    def _fmt(seconds: float) -> str:
        # Seconds -> zero-padded HH:MM:SS (fractions truncated).
        whole = int(seconds)
        return f"{whole // 3600:02d}:{(whole % 3600) // 60:02d}:{whole % 60:02d}"

    def _lookup(table, entry_id, stamp):
        # Prefer the entry's _id key, then fall back to its original timestamp.
        if table:
            if entry_id and entry_id in table:
                return table[entry_id]
            if stamp in table:
                return table[stamp]
        return None

    # Map each video_result key to the time range embedded in its filename.
    clip_ranges = {}
    for key, clip_path in video_result.items():
        stamped = extract_timestamp_from_video_path(clip_path)
        if stamped:
            clip_ranges[key] = {
                'new_timestamp': stamped,
                'video_path': clip_path
            }

    assembled = []
    elapsed = 0.0  # running total used for editedTimeRange

    for entry in script_list:
        record = entry.copy()
        entry_id = record.get('_id')
        source_stamp = record.get('timestamp', '')

        # Attach media paths; empty string when nothing matched.
        record['audio'] = _lookup(audio_result, entry_id, source_stamp) or ""
        record['subtitle'] = _lookup(subtitle_result, entry_id, source_stamp) or ""
        record['video'] = _lookup(video_result, entry_id, source_stamp) or ""

        # Resolve the source time range and its duration.
        span = 0.0
        mapped = _lookup(clip_ranges, entry_id, source_stamp)
        if mapped:
            record['sourceTimeRange'] = mapped['new_timestamp']
            span = calculate_duration(mapped['new_timestamp'])
            record['duration'] = span
        elif source_stamp:
            # No clip match: keep the original timestamp but still report span.
            record['sourceTimeRange'] = source_stamp
            span = calculate_duration(source_stamp)
            record['duration'] = span

        # Place the segment on the output-video timeline.
        if calculate_edited_timerange and span > 0:
            begin, finish = elapsed, elapsed + span
            record['editedTimeRange'] = f"{_fmt(begin)}-{_fmt(finish)}"
            elapsed = finish

        assembled.append(record)

    return assembled
if __name__ == '__main__':
    # Demo fixture: a six-entry narration script mixing commentary segments
    # (OST=0, TTS narration) and original-sound segments (OST=1), used to
    # exercise update_script_timestamps() below.
    list_script = [
        {
            'picture': '【解说】好的,各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!',
            'timestamp': '00:00:00,001-00:01:15,001',
            'narration': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸范闲在北齐""了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
            'OST': 0,
            '_id': 1
        },
        {
            'picture': '【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!',
            'timestamp': '00:01:15,001-00:04:40,001',
            'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
            'OST': 0,
            '_id': 2
        },
        {
            'picture': '画面切到王启年小心翼翼地向范闲汇报。',
            'timestamp': '00:04:41,001-00:04:58,001',
            'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪',
            'OST': 1,
            '_id': 3
        },
        {
            'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。',
            'timestamp': '00:04:58,001-00:05:45,001',
            'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
            'OST': 0,
            '_id': 4
        },
        {
            'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
            'timestamp': '00:05:45,001-00:06:00,001',
            'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
            'OST': 0,
            '_id': 5
        },
        {
            'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。',
            'timestamp': '00:06:00,001-00:06:03,001',
            'narration': '抓刺客',
            'OST': 1,
            '_id': 6
        }]

    # Keyed by script _id; entries 3 and 6 (OST=1) intentionally have no
    # generated clip/audio/subtitle, so they fall back to their original
    # timestamps.
    video_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-00-000@00-00-20-250.mp4',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-30-000@00-00-48-950.mp4',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-00-000@00-01-15-688.mp4',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-30-000@00-01-49-512.mp4'}
    audio_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'}
    sub_res = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt'}

    # Update the script with media paths and recomputed timestamps, then print
    # one summary line per entry.
    updated_list_script = update_script_timestamps(list_script, video_res, audio_res, sub_res)
    for item in updated_list_script:
        print(
            f"ID: {item['_id']} | Picture: {item['picture'][:20]}... | Timestamp: {item['timestamp']} | " +
            f"SourceTimeRange: {item['sourceTimeRange']} | EditedTimeRange: {item.get('editedTimeRange', '')} | " +
            f"Duration: {item['duration']} 秒 | Audio: {item['audio']} | Video: {item['video']} | Subtitle: {item['subtitle']}")

View File

@ -1,13 +1,13 @@
import traceback
import pysrt
# import pysrt
from typing import Optional
from typing import List
from loguru import logger
from moviepy.editor import *
from moviepy import *
from PIL import ImageFont
from contextlib import contextmanager
from moviepy.editor import (
from moviepy import (
VideoFileClip,
AudioFileClip,
TextClip,
@ -105,86 +105,6 @@ def manage_clip(clip):
del clip
def combine_clip_videos(combined_video_path: str,
                        video_paths: List[str],
                        video_ost_list: List[int],
                        list_script: list,
                        video_aspect: VideoAspect = VideoAspect.portrait,
                        threads: int = 2,
                        ) -> str:
    """
    Merge sub-clips into a single video.

    Args:
        combined_video_path: output path of the merged video
        video_paths: paths of the sub-clips to merge
        video_ost_list: per-clip original-audio flags
            (0: drop original audio, 1: keep only original audio,
             2: keep original audio plus narration)
        list_script: editing script (used to report total audio duration)
        video_aspect: target aspect ratio of the output
        threads: number of encoding threads

    Returns:
        str: path of the merged video

    Raises:
        ValueError: when no clip could be opened/processed.
    """
    from app.utils.utils import calculate_total_duration
    audio_duration = calculate_total_duration(list_script)
    logger.info(f"音频的最大持续时间: {audio_duration} s")
    output_dir = os.path.dirname(combined_video_path)

    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()

    clips = []
    for video_path, video_ost in zip(video_paths, video_ost_list):
        try:
            clip = VideoFileClip(video_path)

            if video_ost == 0:  # drop the clip's original audio
                clip = clip.without_audio()
            # video_ost 1 or 2 both keep the original audio; nothing to do.

            # Normalise the frame rate before concatenation.
            clip = clip.set_fps(30)

            # Letterbox/pad clips that don't match the target resolution.
            clip_w, clip_h = clip.size
            if clip_w != video_width or clip_h != video_height:
                clip = resize_video_with_padding(
                    clip,
                    target_width=video_width,
                    target_height=video_height
                )
                logger.info(f"视频 {video_path} 已调整尺寸为 {video_width} x {video_height}")

            clips.append(clip)
        except Exception as e:
            # Best-effort: skip unreadable clips instead of aborting the merge.
            logger.error(f"处理视频 {video_path} 时出错: {str(e)}")
            continue

    if not clips:
        raise ValueError("没有有效的视频片段可以合并")

    # Pre-initialize so the finally block cannot raise NameError when
    # concatenate_videoclips itself fails (which would mask the real error).
    video_clip = None
    try:
        video_clip = concatenate_videoclips(clips)
        video_clip = video_clip.set_fps(30)
        logger.info("开始合并视频... (过程中出现 UserWarning: 不必理会)")
        video_clip.write_videofile(
            filename=combined_video_path,
            threads=threads,
            audio_codec="aac",
            fps=30,
            temp_audiofile=os.path.join(output_dir, "temp-audio.m4a")
        )
    finally:
        # Ensure every clip handle is released even on failure.
        if video_clip is not None:
            video_clip.close()
        for clip in clips:
            clip.close()

    logger.success("视频合并完成")
    return combined_video_path
def resize_video_with_padding(clip, target_width: int, target_height: int):
"""
调整视频尺寸并添加黑边
@ -285,7 +205,8 @@ def generate_video_v3(
bgm_path: Optional[str] = None,
narration_path: Optional[str] = None,
output_path: str = "output.mp4",
font_path: Optional[str] = None
font_path: Optional[str] = None,
subtitle_enabled: bool = True
) -> None:
"""
合并视频素材包括视频字幕BGM和解说音频
@ -300,6 +221,7 @@ def generate_video_v3(
- original: 原声音量0-1默认1.0
- bgm: BGM音量0-1默认0.3
- narration: 解说音量0-1默认1.0
subtitle_enabled: 是否启用字幕默认True
subtitle_style: 字幕样式配置字典可包含以下键
- font: 字体名称
- fontsize: 字体大小
@ -319,8 +241,8 @@ def generate_video_v3(
video = VideoFileClip(video_path)
subtitle_clips = []
# 处理字幕(如果提供)
if subtitle_path:
# 处理字幕(如果启用且提供)- 修复字幕开关bug
if subtitle_enabled and subtitle_path:
if os.path.exists(subtitle_path):
# 检查字体文件
if font_path and not os.path.exists(font_path):
@ -388,30 +310,45 @@ def generate_video_v3(
except Exception as e:
logger.info(f"警告:处理字幕文件时出错: {str(e)}")
else:
logger.info(f"提示:字幕文件不存在: {subtitle_path}")
logger.warning(f"字幕文件不存在: {subtitle_path}")
elif not subtitle_enabled:
logger.info("字幕已禁用,跳过字幕处理")
elif not subtitle_path:
logger.info("未提供字幕文件路径,跳过字幕处理")
# 合并音频
audio_clips = []
# 添加原声(设置音量)
logger.debug(f"音量配置: {volume_config}")
logger.info(f"音量配置详情: {volume_config}")
if video.audio is not None:
original_audio = video.audio.volumex(volume_config['original'])
original_volume = volume_config['original']
logger.info(f"应用原声音量: {original_volume}")
original_audio = video.audio.volumex(original_volume)
audio_clips.append(original_audio)
logger.info("原声音频已添加到合成列表")
else:
logger.warning("视频没有音轨,无法添加原声")
# 添加BGM如果提供
if bgm_path:
logger.info(f"添加背景音乐: {bgm_path}")
bgm = AudioFileClip(bgm_path)
if bgm.duration < video.duration:
bgm = loop_audio_clip(bgm, video.duration)
else:
bgm = bgm.subclip(0, video.duration)
bgm = bgm.volumex(volume_config['bgm'])
bgm_volume = volume_config['bgm']
logger.info(f"应用BGM音量: {bgm_volume}")
bgm = bgm.volumex(bgm_volume)
audio_clips.append(bgm)
# 添加解说音频(如果提供)
if narration_path:
narration = AudioFileClip(narration_path).volumex(volume_config['narration'])
logger.info(f"添加解说音频: {narration_path}")
narration_volume = volume_config['narration']
logger.info(f"应用解说音量: {narration_volume}")
narration = AudioFileClip(narration_path).volumex(narration_volume)
audio_clips.append(narration)
# 合成最终视频(包含字幕)
@ -422,18 +359,53 @@ def generate_video_v3(
final_video = video
if audio_clips:
logger.info(f"合成音频轨道,共 {len(audio_clips)} 个音频片段")
final_audio = CompositeAudioClip(audio_clips)
final_video = final_video.set_audio(final_audio)
logger.info("音频合成完成")
else:
logger.warning("没有音频轨道需要合成")
# 导出视频
logger.info("开始导出视频...") # 调试信息
final_video.write_videofile(
output_path,
codec='libx264',
audio_codec='aac',
fps=video.fps
)
logger.info(f"视频已导出到: {output_path}") # 调试信息
# 导出视频 - 使用优化的编码器
logger.info("开始导出视频...")
# 获取最优编码器
from app.utils import ffmpeg_utils
optimal_encoder = ffmpeg_utils.get_optimal_ffmpeg_encoder()
# 根据编码器类型设置参数
ffmpeg_params = []
if "nvenc" in optimal_encoder:
ffmpeg_params = ['-preset', 'medium', '-profile:v', 'high']
elif "videotoolbox" in optimal_encoder:
ffmpeg_params = ['-profile:v', 'high']
elif "qsv" in optimal_encoder:
ffmpeg_params = ['-preset', 'medium']
elif "vaapi" in optimal_encoder:
ffmpeg_params = ['-profile', '100']
elif optimal_encoder == "libx264":
ffmpeg_params = ['-preset', 'medium', '-crf', '23']
try:
final_video.write_videofile(
output_path,
codec=optimal_encoder,
audio_codec='aac',
fps=video.fps,
ffmpeg_params=ffmpeg_params
)
logger.info(f"视频已导出到: {output_path} (使用编码器: {optimal_encoder})")
except Exception as e:
logger.warning(f"使用 {optimal_encoder} 编码器失败: {str(e)}, 尝试软件编码")
# 降级到软件编码
final_video.write_videofile(
output_path,
codec='libx264',
audio_codec='aac',
fps=video.fps,
ffmpeg_params=['-preset', 'medium', '-crf', '23']
)
logger.info(f"视频已导出到: {output_path} (使用软件编码)")
# 清理资源
video.close()
@ -443,4 +415,3 @@ def generate_video_v3(
bgm.close()
if narration_path:
narration.close()

View File

@ -4,8 +4,6 @@ from loguru import logger
from typing import Dict, List, Optional, Tuple
from app.services import material
from app.models.schema import VideoClipParams
from app.utils import utils
class VideoService:

View File

@ -5,10 +5,11 @@ import traceback
import edge_tts
import asyncio
from loguru import logger
from typing import List
from typing import List, Union
from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts import submaker, SubMaker
from edge_tts.submaker import mktimestamp
from moviepy.video.tools import subtitles
import time
@ -1036,7 +1037,7 @@ def is_azure_v2_voice(voice_name: str):
def tts(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
) -> Union[SubMaker, None]:
if is_azure_v2_voice(voice_name):
return azure_tts_v2(text, voice_name, voice_file)
return azure_tts_v1(text, voice_name, voice_rate, voice_pitch, voice_file)
@ -1064,7 +1065,7 @@ def convert_pitch_to_percent(rate: float) -> str:
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_pitch: float, voice_file: str
) -> [SubMaker, None]:
) -> Union[SubMaker, None]:
voice_name = parse_voice_name(voice_name)
text = text.strip()
rate_str = convert_rate_to_percent(voice_rate)
@ -1087,11 +1088,6 @@ def azure_tts_v1(
)
return sub_maker, audio_data
# 判断音频文件是否已存在
if os.path.exists(voice_file):
logger.info(f"voice file exists, skip tts: {voice_file}")
continue
# 获取音频数据和字幕信息
sub_maker, audio_data = asyncio.run(_do())
@ -1105,8 +1101,6 @@ def azure_tts_v1(
# 数据有效,写入文件
with open(voice_file, "wb") as file:
file.write(audio_data)
logger.info(f"completed, output file: {voice_file}")
return sub_maker
except Exception as e:
logger.error(f"生成音频文件时出错: {str(e)}")
@ -1115,7 +1109,7 @@ def azure_tts_v1(
return None
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
voice_name = is_azure_v2_voice(voice_name)
if not voice_name:
logger.error(f"invalid voice name: {voice_name}")
@ -1203,11 +1197,14 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
def _format_text(text: str) -> str:
# text = text.replace("\n", " ")
text = text.replace("\n", " ")
text = text.replace("\"", " ")
text = text.replace("[", " ")
text = text.replace("]", " ")
text = text.replace("(", " ")
text = text.replace(")", " ")
text = text.replace("", " ")
text = text.replace("", " ")
text = text.replace("{", " ")
text = text.replace("}", " ")
text = text.strip()
@ -1240,7 +1237,7 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis
if script_item['OST']:
continue
start_time, end_time = script_item['new_timestamp'].split('-')
start_time, end_time = script_item['timestamp'].split('-')
if sub_maker_index >= len(sub_maker_list):
logger.error(f"Sub maker list index out of range: {sub_maker_index}")
break
@ -1317,6 +1314,99 @@ def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], lis
traceback.print_exc()
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
"""
优化字幕文件
1. 将字幕文件按照标点符号分割成多行
2. 逐行匹配字幕文件中的文本
3. 生成新的字幕文件
"""
text = _format_text(text)
def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
"""
1
00:00:00,000 --> 00:00:02,360
跑步是一项简单易行的运动
"""
start_t = mktimestamp(start_time).replace(".", ",")
end_t = mktimestamp(end_time).replace(".", ",")
return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n"
start_time = -1.0
sub_items = []
sub_index = 0
script_lines = utils.split_string_by_punctuations(text)
def match_line(_sub_line: str, _sub_index: int):
if len(script_lines) <= _sub_index:
return ""
_line = script_lines[_sub_index]
if _sub_line == _line:
return script_lines[_sub_index].strip()
_sub_line_ = re.sub(r"[^\w\s]", "", _sub_line)
_line_ = re.sub(r"[^\w\s]", "", _line)
if _sub_line_ == _line_:
return _line_.strip()
_sub_line_ = re.sub(r"\W+", "", _sub_line)
_line_ = re.sub(r"\W+", "", _line)
if _sub_line_ == _line_:
return _line.strip()
return ""
sub_line = ""
try:
for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)):
_start_time, end_time = offset
if start_time < 0:
start_time = _start_time
sub = unescape(sub)
sub_line += sub
sub_text = match_line(sub_line, sub_index)
if sub_text:
sub_index += 1
line = formatter(
idx=sub_index,
start_time=start_time,
end_time=end_time,
sub_text=sub_text,
)
sub_items.append(line)
start_time = -1.0
sub_line = ""
if len(sub_items) == len(script_lines):
with open(subtitle_file, "w", encoding="utf-8") as file:
file.write("\n".join(sub_items) + "\n")
try:
sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8")
duration = max([tb for ((ta, tb), txt) in sbs])
logger.info(
f"已创建字幕文件: {subtitle_file}, duration: {duration}"
)
return subtitle_file, duration
except Exception as e:
logger.error(f"failed, error: {str(e)}")
os.remove(subtitle_file)
else:
logger.error(
f"字幕创建失败, 字幕长度: {len(sub_items)}, script_lines len: {len(script_lines)}"
f"\nsub_items:{json.dumps(sub_items, indent=4, ensure_ascii=False)}"
f"\nscript_lines:{json.dumps(script_lines, indent=4, ensure_ascii=False)}"
)
except Exception as e:
logger.error(f"failed, error: {str(e)}")
def get_audio_duration(sub_maker: submaker.SubMaker):
"""
获取音频时长
@ -1326,7 +1416,7 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
return sub_maker.offset[-1][1] / 10000000
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float, force_regenerate: bool = True):
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float):
"""
根据JSON文件中的多段文本进行TTS转换
@ -1334,25 +1424,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
:param list_script: 脚本列表
:param voice_name: 语音名称
:param voice_rate: 语音速率
:param force_regenerate: 是否强制重新生成已存在的音频文件
:return: 生成的音频文件列表
"""
voice_name = parse_voice_name(voice_name)
output_dir = utils.task_dir(task_id)
audio_files = []
sub_maker_list = []
tts_results = []
for item in list_script:
if item['OST'] != 1:
# 将时间戳中的冒号替换为下划线
timestamp = item['new_timestamp'].replace(':', '_')
timestamp = item['timestamp'].replace(':', '_')
audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")
# 检查文件是否已存在,如存在且不强制重新生成,则跳过
if os.path.exists(audio_file) and not force_regenerate:
logger.info(f"音频文件已存在,跳过生成: {audio_file}")
audio_files.append(audio_file)
continue
subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt")
text = item['narration']
@ -1369,9 +1452,18 @@ def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: f
f"如果您在中国请使用VPN; "
f"或者使用其他 tts 引擎")
continue
else:
# 为当前片段生成字幕文件
_, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
audio_files.append(audio_file)
sub_maker_list.append(sub_maker)
tts_results.append({
"_id": item['_id'],
"timestamp": item['timestamp'],
"audio_file": audio_file,
"subtitle_file": subtitle_file,
"duration": duration,
"text": text,
})
logger.info(f"已生成音频文件: {audio_file}")
return audio_files, sub_maker_list
return tts_results

1017
app/utils/ffmpeg_utils.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -61,7 +61,6 @@ class VisionAnalyzer:
try:
# 加载图片
if isinstance(images[0], str):
logger.info("正在加载图片...")
images = self.load_images(images)
# 验证图片列表
@ -81,11 +80,14 @@ class VisionAnalyzer:
images = valid_images
results = []
total_batches = (len(images) + batch_size - 1) // batch_size
# 视频帧总数除以批量处理大小,如果有小数则+1
batches_needed = len(images) // batch_size
if len(images) % batch_size > 0:
batches_needed += 1
logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed}")
logger.debug(f"{total_batches} 个批次,每批次 {batch_size} 张图片")
with tqdm(total=total_batches, desc="分析进度") as pbar:
with tqdm(total=batches_needed, desc="分析进度") as pbar:
for i in range(0, len(images), batch_size):
batch = images[i:i + batch_size]
retry_count = 0
@ -93,8 +95,8 @@ class VisionAnalyzer:
while retry_count < 3:
try:
# 在每个批次处理前添加小延迟
if i > 0:
await asyncio.sleep(2)
# if i > 0:
# await asyncio.sleep(2)
# 确保每个批次的图片都是有效的
valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]

View File

@ -30,7 +30,7 @@ class QwenAnalyzer:
self.model_name = model_name
self.api_key = api_key
self.base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
self.base_url = base_url
# 配置API客户端
self._configure_client()
@ -80,7 +80,7 @@ class QwenAnalyzer:
# 添加文本提示
content.append({
"type": "text",
"text": prompt
"text": prompt % (len(content), len(content), len(content))
})
# 调用API
@ -102,7 +102,7 @@ class QwenAnalyzer:
async def analyze_images(self,
images: Union[List[str], List[PIL.Image.Image]],
prompt: str,
batch_size: int = 5) -> List[Dict]:
batch_size: int) -> List[Dict]:
"""
批量分析多张图片
Args:
@ -118,7 +118,6 @@ class QwenAnalyzer:
# 加载图片
if isinstance(images[0], str):
logger.info("正在加载图片...")
images = self.load_images(images)
# 验证图片列表
@ -141,9 +140,14 @@ class QwenAnalyzer:
images = valid_images
results = []
total_batches = (len(images) + batch_size - 1) // batch_size
# 视频帧总数除以批量处理大小,如果有小数则+1
batches_needed = len(images) // batch_size
if len(images) % batch_size > 0:
batches_needed += 1
logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed}")
with tqdm(total=total_batches, desc="分析进度") as pbar:
with tqdm(total=batches_needed, desc="分析进度") as pbar:
for i in range(0, len(images), batch_size):
batch = images[i:i + batch_size]
batch_paths = valid_paths[i:i + batch_size] if valid_paths else None
@ -151,9 +155,9 @@ class QwenAnalyzer:
while retry_count < 3:
try:
# 在每个批次处理前添加小延迟
if i > 0:
await asyncio.sleep(2)
# 在每个批次处理前加小延迟
# if i > 0:
# await asyncio.sleep(0.5)
# 确保每个批次的图片都是有效的
valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)]
@ -209,7 +213,7 @@ class QwenAnalyzer:
for i, result in enumerate(results):
response_text = result['response']
# 如果有图片路径信息,使用它来生成文件名
# 如果有图片路径信息,用它来生成文件名
if result.get('image_paths'):
image_paths = result['image_paths']
img_name_start = Path(image_paths[0]).stem.split('_')[-1]

View File

@ -2,7 +2,7 @@ import os
import json
import traceback
from loguru import logger
import tiktoken
# import tiktoken
from typing import List, Dict
from datetime import datetime
from openai import OpenAI
@ -94,12 +94,12 @@ class OpenAIGenerator(BaseGenerator):
"user": "script_generator"
}
# 初始化token计数器
try:
self.encoding = tiktoken.encoding_for_model(self.model_name)
except KeyError:
logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器")
self.encoding = tiktoken.get_encoding("cl100k_base")
# # 初始化token计数器
# try:
# self.encoding = tiktoken.encoding_for_model(self.model_name)
# except KeyError:
# logger.warning(f"未找到模型 {self.model_name} 的专用编码器,使用默认编码器")
# self.encoding = tiktoken.get_encoding("cl100k_base")
def _generate(self, messages: list, params: dict) -> any:
"""实现OpenAI特定的生成逻辑"""

View File

@ -197,6 +197,28 @@ def time_convert_seconds_to_hmsm(seconds) -> str:
return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, milliseconds)
def format_time(seconds: float) -> str:
"""
将秒数转换为格式化的时间字符串 (HH:MM:SS,mmm)
参数:
seconds: 需要转换的秒数可以是整数或浮点数
返回:
格式化的时间字符串格式为 HH:MM:SS,mmm
"""
# 计算小时、分钟、秒和毫秒
hours = int(seconds // 3600)
remaining_seconds = seconds % 3600
minutes = int(remaining_seconds // 60)
remaining_seconds = remaining_seconds % 60
secs = int(remaining_seconds)
milliseconds = int((remaining_seconds - secs) * 1000)
# 格式化为时间字符串
return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, milliseconds)
def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str:
start_time = time_convert_seconds_to_hmsm(start_time)
end_time = time_convert_seconds_to_hmsm(end_time)
@ -303,6 +325,15 @@ def video_dir(sub_dir: str = ""):
return d
def subtitle_dir(sub_dir: str = ""):
d = resource_dir(f"srt")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
os.makedirs(d)
return d
def split_timestamp(timestamp):
"""
拆分时间戳
@ -506,7 +537,7 @@ def cut_video(params, progress_callback=None):
st.session_state['subclip_videos'] = subclip_videos
for i, video_script in enumerate(video_script_list):
try:
video_script['path'] = subclip_videos[video_script['timestamp']]
video_script['path'] = subclip_videos[i+1]
except KeyError as err:
logger.error(f"裁剪视频失败: {err}")

View File

@ -1,237 +1,225 @@
import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans
"""
视频帧提取工具
这个模块提供了简单高效的视频帧提取功能主要特点
1. 使用ffmpeg进行视频处理支持硬件加速
2. 按指定时间间隔提取视频关键帧
3. 支持多种视频格式
4. 支持高清视频帧输出
5. 直接从原视频提取高质量关键帧
不依赖OpenCV和sklearn等库只使用ffmpeg作为外部依赖降低了安装和使用的复杂度
"""
import os
import re
from typing import List, Tuple, Generator
import time
import subprocess
from typing import List, Dict
from loguru import logger
import gc
from tqdm import tqdm
from app.utils import ffmpeg_utils
class VideoProcessor:
def __init__(self, video_path: str, batch_size: int = 100):
def __init__(self, video_path: str):
"""
初始化视频处理器
Args:
video_path: 视频文件路径
batch_size: 批处理大小控制内存使用
"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
self.video_path = video_path
self.batch_size = batch_size
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
raise RuntimeError(f"无法打开视频文件: {video_path}")
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
self.video_info = self._get_video_info()
self.fps = float(self.video_info.get('fps', 25))
self.duration = float(self.video_info.get('duration', 0))
self.width = int(self.video_info.get('width', 0))
self.height = int(self.video_info.get('height', 0))
self.total_frames = int(self.fps * self.duration)
def __del__(self):
"""析构函数,确保视频资源被释放"""
if hasattr(self, 'cap'):
self.cap.release()
gc.collect()
def _get_video_info(self) -> Dict[str, str]:
"""
使用ffprobe获取视频信息
def preprocess_video(self) -> Generator[Tuple[int, np.ndarray], None, None]:
"""
使用生成器方式分批读取视频帧
Yields:
Tuple[int, np.ndarray]: (帧索引, 视频帧)
"""
self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
frame_idx = 0
while self.cap.isOpened():
ret, frame = self.cap.read()
if not ret:
break
# 降低分辨率以减少内存使用
frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)
yield frame_idx, frame
frame_idx += 1
# 定期进行垃圾回收
if frame_idx % 1000 == 0:
gc.collect()
def detect_shot_boundaries(self, threshold: int = 70) -> List[int]:
"""
使用批处理方式检测镜头边界
Args:
threshold: 差异阈值
Returns:
List[int]: 镜头边界帧的索引列表
Dict[str, str]: 包含视频基本信息的字典
"""
shot_boundaries = []
prev_frame = None
prev_idx = -1
pbar = tqdm(self.preprocess_video(),
total=self.total_frames,
desc="检测镜头边界",
unit="")
for frame_idx, curr_frame in pbar:
if prev_frame is not None:
prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
diff = np.mean(np.abs(curr_gray.astype(float) - prev_gray.astype(float)))
if diff > threshold:
shot_boundaries.append(frame_idx)
pbar.set_postfix({"检测到边界": len(shot_boundaries)})
prev_frame = curr_frame.copy()
prev_idx = frame_idx
del curr_frame
if frame_idx % 100 == 0:
gc.collect()
return shot_boundaries
cmd = [
"ffprobe",
"-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=width,height,r_frame_rate,duration",
"-of", "default=noprint_wrappers=1:nokey=0",
self.video_path
]
def process_shot(self, shot_frames: List[Tuple[int, np.ndarray]]) -> Tuple[np.ndarray, int]:
"""
处理单个镜头的帧
Args:
shot_frames: 镜头中的帧列表
Returns:
Tuple[np.ndarray, int]: (关键帧, 帧索引)
"""
if not shot_frames:
return None, -1
frame_features = []
frame_indices = []
for idx, frame in tqdm(shot_frames,
desc="处理镜头帧",
unit="",
leave=False):
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
resized_gray = cv2.resize(gray, (32, 32))
frame_features.append(resized_gray.flatten())
frame_indices.append(idx)
frame_features = np.array(frame_features)
kmeans = MiniBatchKMeans(n_clusters=1, batch_size=min(len(frame_features), 100),
random_state=0).fit(frame_features)
center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1))
return shot_frames[center_idx][1], frame_indices[center_idx]
try:
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
lines = result.stdout.strip().split('\n')
info = {}
for line in lines:
if '=' in line:
key, value = line.split('=', 1)
info[key] = value
def extract_keyframes(self, shot_boundaries: List[int]) -> Generator[Tuple[np.ndarray, int], None, None]:
"""
使用生成器方式提取关键帧
Args:
shot_boundaries: 镜头边界列表
Yields:
Tuple[np.ndarray, int]: (关键帧, 帧索引)
"""
shot_frames = []
current_shot_start = 0
for frame_idx, frame in self.preprocess_video():
if frame_idx in shot_boundaries:
if shot_frames:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
# 清理内存
shot_frames.clear()
gc.collect()
current_shot_start = frame_idx
shot_frames.append((frame_idx, frame))
# 控制单个镜头的最大帧数
if len(shot_frames) > self.batch_size:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
shot_frames.clear()
gc.collect()
# 处理最后一个镜头
if shot_frames:
keyframe, keyframe_idx = self.process_shot(shot_frames)
if keyframe is not None:
yield keyframe, keyframe_idx
# 处理帧率(可能是分数形式)
if 'r_frame_rate' in info:
try:
num, den = map(int, info['r_frame_rate'].split('/'))
info['fps'] = str(num / den)
except ValueError:
info['fps'] = info.get('r_frame_rate', '25')
def process_video(self, output_dir: str, skip_seconds: float = 0) -> None:
return info
except subprocess.CalledProcessError as e:
logger.error(f"获取视频信息失败: {e.stderr}")
return {
'width': '1280',
'height': '720',
'fps': '25',
'duration': '0'
}
def extract_frames_by_interval(self, output_dir: str, interval_seconds: float = 5.0,
use_hw_accel: bool = True) -> List[int]:
"""
处理视频并提取关键帧使用分批处理方式
按指定时间间隔提取视频帧
Args:
output_dir: 输出目录
skip_seconds: 跳过视频开头的秒数
interval_seconds: 帧提取间隔
use_hw_accel: 是否使用硬件加速
Returns:
List[int]: 提取的帧号列表
"""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# 计算起始时间和帧提取点
start_time = 0
end_time = self.duration
extraction_times = []
current_time = start_time
while current_time < end_time:
extraction_times.append(current_time)
current_time += interval_seconds
if not extraction_times:
logger.warning("未找到需要提取的帧")
return []
# 确定硬件加速器选项
hw_accel = []
if use_hw_accel and ffmpeg_utils.is_ffmpeg_hwaccel_available():
hw_accel = ffmpeg_utils.get_ffmpeg_hwaccel_args()
# 提取帧
frame_numbers = []
for i, timestamp in enumerate(tqdm(extraction_times, desc="提取视频帧")):
frame_number = int(timestamp * self.fps)
frame_numbers.append(frame_number)
# 格式化时间戳字符串 (HHMMSSmmm)
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
milliseconds = int((timestamp % 1) * 1000)
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
output_path = os.path.join(output_dir, f"keyframe_{frame_number:06d}_{time_str}.jpg")
# 使用ffmpeg提取单帧
cmd = [
"ffmpeg",
"-hide_banner",
"-loglevel", "error",
]
# 添加硬件加速参数
cmd.extend(hw_accel)
cmd.extend([
"-ss", str(timestamp),
"-i", self.video_path,
"-vframes", "1",
"-q:v", "1", # 最高质量
"-y",
output_path
])
try:
subprocess.run(cmd, check=True, capture_output=True)
except subprocess.CalledProcessError as e:
logger.warning(f"提取帧 {frame_number} 失败: {e.stderr}")
logger.info(f"成功提取了 {len(frame_numbers)} 个视频帧")
return frame_numbers
def _detect_hw_accelerator(self) -> List[str]:
"""
检测系统可用的硬件加速器
Returns:
List[str]: 硬件加速器ffmpeg命令参数
"""
# 使用集中式硬件加速检测
if ffmpeg_utils.is_ffmpeg_hwaccel_available():
return ffmpeg_utils.get_ffmpeg_hwaccel_args()
return []
def process_video_pipeline(self,
output_dir: str,
interval_seconds: float = 5.0, # 帧提取间隔(秒)
use_hw_accel: bool = True) -> None:
"""
执行简化的视频处理流程直接从原视频按固定时间间隔提取帧
Args:
output_dir: 输出目录
interval_seconds: 帧提取间隔
use_hw_accel: 是否使用硬件加速
"""
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
try:
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
# 计算要跳过的帧数
skip_frames = int(skip_seconds * self.fps)
self.cap.set(cv2.CAP_PROP_POS_FRAMES, skip_frames)
# 检测镜头边界
logger.info("开始检测镜头边界...")
shot_boundaries = self.detect_shot_boundaries()
# 提取关键帧
logger.info("开始提取关键帧...")
frame_count = 0
pbar = tqdm(self.extract_keyframes(shot_boundaries),
desc="提取关键帧",
unit="")
for keyframe, frame_idx in pbar:
if frame_idx < skip_frames:
continue
# 计算时间戳
timestamp = frame_idx / self.fps
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}"
# 保存关键帧
output_path = os.path.join(output_dir,
f'keyframe_{frame_idx:06d}_{time_str}.jpg')
cv2.imwrite(output_path, keyframe)
frame_count += 1
pbar.set_postfix({"已保存": frame_count})
if frame_count % 10 == 0:
gc.collect()
logger.info(f"关键帧提取完成,共保存 {frame_count} 帧到 {output_dir}")
# 直接从原视频提取关键帧
logger.info(f"从视频间隔 {interval_seconds} 秒提取关键帧...")
self.extract_frames_by_interval(
output_dir,
interval_seconds=interval_seconds,
use_hw_accel=use_hw_accel
)
logger.info(f"处理完成!视频帧已保存在: {output_dir}")
except Exception as e:
logger.error(f"视频处理失败: {str(e)}")
import traceback
logger.error(f"视频处理失败: \n{traceback.format_exc()}")
raise
finally:
# 确保资源被释放
self.cap.release()
gc.collect()
if __name__ == "__main__":
import time
start_time = time.time()
# 使用示例
processor = VideoProcessor("./resource/videos/test.mp4")
# 设置间隔为3秒提取帧
processor.process_video_pipeline(
output_dir="output",
interval_seconds=3.0,
use_hw_accel=True
)
end_time = time.time()
print(f"处理完成!总耗时: {end_time - start_time:.2f}")

View File

@ -1,382 +0,0 @@
import cv2
import numpy as np
from sklearn.cluster import KMeans
import os
import re
from typing import List, Tuple, Generator
from loguru import logger
import subprocess
from tqdm import tqdm
class VideoProcessor:
def __init__(self, video_path: str):
"""
初始化视频处理器
Args:
video_path: 视频文件路径
"""
if not os.path.exists(video_path):
raise FileNotFoundError(f"视频文件不存在: {video_path}")
self.video_path = video_path
self.cap = cv2.VideoCapture(video_path)
if not self.cap.isOpened():
raise RuntimeError(f"无法打开视频文件: {video_path}")
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
def __del__(self):
"""析构函数,确保视频资源被释放"""
if hasattr(self, 'cap'):
self.cap.release()
def preprocess_video(self) -> Generator[np.ndarray, None, None]:
"""
使用生成器方式读取视频帧
Yields:
np.ndarray: 视频帧
"""
self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # 重置到视频开始
while self.cap.isOpened():
ret, frame = self.cap.read()
if not ret:
break
yield frame
def detect_shot_boundaries(self, frames: List[np.ndarray], threshold: int = 30) -> List[int]:
"""
使用帧差法检测镜头边界
Args:
frames: 视频帧列表
threshold: 差异阈值默认值调低为30
Returns:
List[int]: 镜头边界帧的索引列表
"""
shot_boundaries = []
if len(frames) < 2: # 添加帧数检查
logger.warning("视频帧数过少,无法检测场景边界")
return [len(frames) - 1] # 返回最后一帧作为边界
for i in range(1, len(frames)):
prev_frame = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY)
curr_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
# 计算帧差
diff = np.mean(np.abs(curr_frame.astype(float) - prev_frame.astype(float)))
if diff > threshold:
shot_boundaries.append(i)
# 如果没有检测到任何边界,至少返回最后一帧
if not shot_boundaries:
logger.warning("未检测到场景边界,将视频作为单个场景处理")
shot_boundaries.append(len(frames) - 1)
return shot_boundaries
def extract_keyframes(self, frames: List[np.ndarray], shot_boundaries: List[int]) -> Tuple[
List[np.ndarray], List[int]]:
"""
从每个镜头中提取关键帧
Args:
frames: 视频帧列表
shot_boundaries: 镜头边界列表
Returns:
Tuple[List[np.ndarray], List[int]]: 关键帧列表和对应的帧索引
"""
keyframes = []
keyframe_indices = []
for i in tqdm(range(len(shot_boundaries)), desc="提取关键帧"):
start = shot_boundaries[i - 1] if i > 0 else 0
end = shot_boundaries[i]
shot_frames = frames[start:end]
if not shot_frames:
continue
# 将每一帧转换为灰度图并展平为一维数组
frame_features = np.array([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).flatten()
for frame in shot_frames])
try:
# 尝试使用 KMeans
kmeans = KMeans(n_clusters=1, random_state=0).fit(frame_features)
center_idx = np.argmin(np.sum((frame_features - kmeans.cluster_centers_[0]) ** 2, axis=1))
except Exception as e:
logger.warning(f"KMeans 聚类失败,使用备选方案: {str(e)}")
# 备选方案:选择镜头中间的帧作为关键帧
center_idx = len(shot_frames) // 2
keyframes.append(shot_frames[center_idx])
keyframe_indices.append(start + center_idx)
return keyframes, keyframe_indices
def save_keyframes(self, keyframes: List[np.ndarray], keyframe_indices: List[int],
output_dir: str, desc: str = "保存关键帧") -> None:
"""
保存关键帧到指定目录文件名格式为keyframe_帧序号_时间戳.jpg
时间戳精确到毫秒格式为HHMMSSmmm
"""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
for keyframe, frame_idx in tqdm(zip(keyframes, keyframe_indices),
total=len(keyframes),
desc=desc):
# 计算精确到毫秒的时间戳
timestamp = frame_idx / self.fps
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
output_path = os.path.join(output_dir,
f'keyframe_{frame_idx:06d}_{time_str}.jpg')
cv2.imwrite(output_path, keyframe)
def extract_frames_by_numbers(self, frame_numbers: List[int], output_folder: str) -> None:
"""
根据指定的帧号提取帧如果多个帧在同一毫秒内只保留一个
"""
if not frame_numbers:
raise ValueError("未提供帧号列表")
if any(fn >= self.total_frames or fn < 0 for fn in frame_numbers):
raise ValueError("存在无效的帧号")
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# 用于记录已处理的时间戳(毫秒)
processed_timestamps = set()
for frame_number in tqdm(frame_numbers, desc="提取高清帧"):
# 计算精确到毫秒的时间戳
timestamp = frame_number / self.fps
timestamp_ms = int(timestamp * 1000) # 转换为毫秒
# 如果这一毫秒已经处理过,跳过
if timestamp_ms in processed_timestamps:
continue
self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
ret, frame = self.cap.read()
if ret:
# 记录这一毫秒已经处理
processed_timestamps.add(timestamp_ms)
# 计算时间戳字符串
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
milliseconds = int((timestamp % 1) * 1000) # 计算毫秒部分
time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"
output_path = os.path.join(output_folder,
f"keyframe_{frame_number:06d}_{time_str}.jpg")
cv2.imwrite(output_path, frame)
else:
logger.info(f"无法读取帧 {frame_number}")
logger.info(f"共提取了 {len(processed_timestamps)} 个不同时间戳的帧")
@staticmethod
def extract_numbers_from_folder(folder_path: str) -> List[int]:
"""
从文件夹中提取帧号
Args:
folder_path: 关键帧文件夹路径
Returns:
List[int]: 排序后的帧号列表
"""
files = [f for f in os.listdir(folder_path) if f.endswith('.jpg')]
# 更新正则表达式以匹配新的文件名格式keyframe_000123_010534123.jpg
pattern = re.compile(r'keyframe_(\d+)_\d{9}\.jpg$')
numbers = []
for f in files:
match = pattern.search(f)
if match:
numbers.append(int(match.group(1)))
else:
logger.warning(f"文件名格式不匹配: {f}")
if not numbers:
logger.error(f"在目录 {folder_path} 中未找到有效的关键帧文件")
return sorted(numbers)
def process_video(self, output_dir: str, skip_seconds: float = 0, threshold: int = 30) -> None:
"""
处理视频并提取关键帧
Args:
output_dir: 输出目录
skip_seconds: 跳过视频开头的秒数
"""
skip_frames = int(skip_seconds * self.fps)
logger.info("读取视频帧...")
frames = []
for frame in tqdm(self.preprocess_video(),
total=self.total_frames,
desc="读取视频"):
frames.append(frame)
frames = frames[skip_frames:]
if not frames:
raise ValueError(f"跳过 {skip_seconds} 秒后没有剩余帧可以处理")
logger.info("检测场景边界...")
shot_boundaries = self.detect_shot_boundaries(frames, threshold)
logger.info(f"检测到 {len(shot_boundaries)} 个场景边界")
keyframes, keyframe_indices = self.extract_keyframes(frames, shot_boundaries)
adjusted_indices = [idx + skip_frames for idx in keyframe_indices]
self.save_keyframes(keyframes, adjusted_indices, output_dir, desc="保存压缩关键帧")
def process_video_pipeline(self,
output_dir: str,
skip_seconds: float = 0,
threshold: int = 20, # 降低默认阈值
compressed_width: int = 320,
keep_temp: bool = False) -> None:
"""
执行完整的视频处理流程
Args:
threshold: 降低默认阈值为20使场景检测更敏感
"""
os.makedirs(output_dir, exist_ok=True)
temp_dir = os.path.join(output_dir, 'temp')
compressed_dir = os.path.join(temp_dir, 'compressed')
mini_frames_dir = os.path.join(temp_dir, 'mini_frames')
hd_frames_dir = output_dir
os.makedirs(temp_dir, exist_ok=True)
os.makedirs(compressed_dir, exist_ok=True)
os.makedirs(mini_frames_dir, exist_ok=True)
os.makedirs(hd_frames_dir, exist_ok=True)
mini_processor = None
compressed_video = None
try:
# 1. 压缩视频
video_name = os.path.splitext(os.path.basename(self.video_path))[0]
compressed_video = os.path.join(compressed_dir, f"{video_name}_compressed.mp4")
# 获取原始视频的宽度和高度
original_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
original_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
logger.info("步骤1: 压缩视频...")
if original_width > original_height:
# 横版视频
scale_filter = f'scale={compressed_width}:-1'
else:
# 竖版视频
scale_filter = f'scale=-1:{compressed_width}'
ffmpeg_cmd = [
'ffmpeg', '-i', self.video_path,
'-vf', scale_filter,
'-y',
compressed_video
]
try:
subprocess.run(ffmpeg_cmd, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
logger.error(f"FFmpeg 错误输出: {e.stderr}")
raise
# 2. 从压缩视频中提取关键帧
logger.info("\n步骤2: 从压缩视频提取关键帧...")
mini_processor = VideoProcessor(compressed_video)
mini_processor.process_video(mini_frames_dir, skip_seconds, threshold)
# 3. 从原始视频提取高清关键帧
logger.info("\n步骤3: 提取高清关键帧...")
frame_numbers = self.extract_numbers_from_folder(mini_frames_dir)
if not frame_numbers:
raise ValueError("未能从压缩视频中提取到有效的关键帧")
self.extract_frames_by_numbers(frame_numbers, hd_frames_dir)
logger.info(f"处理完成!高清关键帧保存在: {hd_frames_dir}")
except Exception as e:
import traceback
logger.error(f"视频处理失败: \n{traceback.format_exc()}")
raise
finally:
# 释放资源
if mini_processor:
mini_processor.cap.release()
del mini_processor
# 确保视频文件句柄被释放
if hasattr(self, 'cap'):
self.cap.release()
# 等待资源释放
import time
time.sleep(0.5)
if not keep_temp:
try:
# 先删除压缩视频文件
if compressed_video and os.path.exists(compressed_video):
try:
os.remove(compressed_video)
except Exception as e:
logger.warning(f"删除压缩视频失败: {e}")
# 再删除临时目录
import shutil
if os.path.exists(temp_dir):
max_retries = 3
for i in range(max_retries):
try:
shutil.rmtree(temp_dir)
break
except Exception as e:
if i == max_retries - 1:
logger.warning(f"清理临时文件失败: {e}")
else:
time.sleep(1) # 等待1秒后重试
continue
logger.info("临时文件已清理")
except Exception as e:
logger.warning(f"清理临时文件时出错: {e}")
if __name__ == "__main__":
import time
start_time = time.time()
processor = VideoProcessor("E:\\projects\\NarratoAI\\resource\\videos\\test.mp4")
processor.process_video_pipeline(output_dir="output")
end_time = time.time()
print(f"处理完成!总耗时: {end_time - start_time:.2f}")

View File

@ -1,184 +1,89 @@
[app]
project_version="0.5.3"
project_version="0.6.5"
# 支持视频理解的大模型提供商
# gemini
# qwenvl
vision_llm_provider="qwenvl"
vision_analysis_prompt = "你是资深视频内容分析专家,擅长分析视频画面信息,分析下面视频画面内容,只输出客观的画面描述不要给任何总结或评价"
# gemini (谷歌, 需要 VPN)
# siliconflow (硅基流动)
# qwenvl (通义千问)
vision_llm_provider="Siliconflow"
########## Vision Gemini API Key
########## Gemini 视觉模型
vision_gemini_api_key = ""
vision_gemini_model_name = "gemini-2.0-flash"
vision_gemini_model_name = "gemini-2.0-flash-lite"
########## Vision Qwen API Key (默认使用“硅基流动”的QwenVL模型)
########## QwenVL 视觉模型
vision_qwenvl_api_key = ""
vision_qwenvl_model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
vision_qwenvl_base_url = "https://api.siliconflow.cn/v1"
vision_qwenvl_model_name = "qwen2.5-vl-32b-instruct"
vision_qwenvl_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
########### Vision NarratoAPI Key
narrato_api_key = "ggyY91BAO-_ULvAqKum3XexcyN1G3dP86DEzvjZDcrg"
narrato_api_url = "https://narratoinsight.scsmtech.cn/api/v1"
narrato_vision_model = "gemini-1.5-flash"
narrato_vision_key = ""
narrato_llm_model = "gpt-4o"
narrato_llm_key = ""
########## siliconflow 视觉模型
vision_siliconflow_api_key = ""
vision_siliconflow_model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
vision_siliconflow_base_url = "https://api.siliconflow.cn/v1"
########## OpenAI 视觉模型
vision_openai_api_key = ""
vision_openai_model_name = "gpt-4.1-nano-2025-04-14"
vision_openai_base_url = "https://api.openai.com/v1"
########### NarratoAPI 微调模型 (未发布)
narrato_api_key = ""
narrato_api_url = ""
narrato_model = "narra-1.0-2025-05-09"
# 用于生成文案的大模型支持的提供商 (Supported providers):
# openai (默认)
# deepseek (默认使用“硅基流动”的模型)
# moonshot (月之暗面)
# openai (默认, 需要 VPN)
# siliconflow (硅基流动)
# deepseek (深度求索)
# gemini (谷歌, 需要 VPN)
# qwen (通义千问)
# gemini
text_llm_provider="deepseek"
# moonshot (月之暗面)
text_llm_provider="openai"
########## OpenAI API Key
# Get your API key at https://platform.openai.com/api-keys
text_openai_api_key = ""
text_openai_base_url = "https://api.openai.com/v1"
text_openai_model_name = "gpt-4o-mini"
text_openai_model_name = "gpt-4.1-mini-2025-04-14"
########## SiliconFlow API Key
# 使用 硅基流动 第三方 API Key使用手机号注册https://cloud.siliconflow.cn/i/pyOKqFCV
# 访问 https://cloud.siliconflow.cn/account/ak 获取你的 API 密钥
text_siliconflow_api_key = ""
text_siliconflow_base_url = "https://api.siliconflow.cn/v1"
text_siliconflow_model_name = "deepseek-ai/DeepSeek-R1"
########## DeepSeek API Key
# 访问 https://platform.deepseek.com/api_keys 获取你的 API 密钥
# 注意: 若想通过硅基流动调用 DeepSeek 模型, 请使用上方 siliconflow 配置项
text_deepseek_api_key = ""
text_deepseek_base_url = "https://api.deepseek.com"
text_deepseek_model_name = "deepseek-chat"
########## Moonshot API Key
# Visit https://platform.moonshot.cn/console/api-keys to get your API key.
text_moonshot_api_key = ""
text_moonshot_base_url = "https://api.moonshot.cn/v1"
text_moonshot_model_name = "moonshot-v1-8k"
########## G4F
# Visit https://github.com/xtekky/gpt4free to get more details
# Supported model list: https://github.com/xtekky/gpt4free/blob/main/g4f/models.py
text_g4f_model_name = "gpt-3.5-turbo"
########## Azure API Key
# Visit https://learn.microsoft.com/zh-cn/azure/ai-services/openai/ to get more details
# API documentation: https://learn.microsoft.com/zh-cn/azure/ai-services/openai/reference
text_azure_api_key = ""
text_azure_base_url = ""
text_azure_model_name = "gpt-35-turbo" # replace with your model deployment name
text_azure_api_version = "2024-02-15-preview"
########## Gemini API Key
text_gemini_api_key = ""
text_gemini_model_name = "gemini-2.0-flash"
text_gemini_base_url = "https://generativelanguage.googleapis.com/v1beta/openai"
########## Qwen API Key
# Visit https://dashscope.console.aliyun.com/apiKey to get your API key
# Visit below links to get more details
# https://tongyi.aliyun.com/qianwen/
# https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction
# 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥
text_qwen_api_key = ""
text_qwen_model_name = "qwen-plus-1127"
text_qwen_base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
# 字幕提供商、可选,支持 whisper 和 faster-whisper-large-v2"whisper"
# 默认为 faster-whisper-large-v2 模型地址https://huggingface.co/guillaumekln/faster-whisper-large-v2
subtitle_provider = "faster-whisper-large-v2"
subtitle_enabled = true
# ImageMagick
# 安装后,将自动检测到 ImageMagickWindows 除外!
# 例如,在 Windows 上 "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
# 下载位置 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
# imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
# FFMPEG
#
# 通常情况下ffmpeg 会被自动下载,并且会被自动检测到。
# 但是如果你的环境有问题,无法自动下载,可能会遇到如下错误:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path下载地址https://www.gyan.dev/ffmpeg/builds/
# ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
#########################################################################################
# 当视频生成成功后API服务提供的视频下载接入点默认为当前服务的地址和监听端口
# 比如 http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# 如果你需要使用域名对外提供服务一般会用nginx做代理则可以设置为你的域名
# 比如 https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# When the video is successfully generated, the API service provides a download endpoint for the video, defaulting to the service's current address and listening port.
# For example, http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# If you need to provide the service externally using a domain name (usually done with nginx as a proxy), you can set it to your domain name.
# For example, https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
endpoint = ""
# Video material storage location
# material_directory = "" # Indicates that video materials will be downloaded to the default folder, the default folder is ./storage/cache_videos under the current project
# material_directory = "/user/harry/videos" # Indicates that video materials will be downloaded to a specified folder
# material_directory = "task" # Indicates that video materials will be downloaded to the current task's folder, this method does not allow sharing of already downloaded video materials
# 视频素材存放位置
# material_directory = "" #表示将视频素材下载到默认的文件夹,默认文件夹为当前项目下的 ./storage/cache_videos
# material_directory = "/user/harry/videos" #表示将视频素材下载到指定的文件夹中
# material_directory = "task" #表示将视频素材下载到当前任务的文件夹中,这种方式无法共享已经下载的视频素材
material_directory = ""
# 用于任务的状态管理
enable_redis = false
redis_host = "localhost"
redis_port = 6379
redis_db = 0
redis_password = ""
# 文生视频时的最大并发任务数
max_concurrent_tasks = 5
# NOTE(review): 此处原有一段重复的 Moonshot API Key 配置text_moonshot_*
# 与上方 Moonshot 段落的键名和取值完全相同TOML 不允许重复定义同一键
# (tomli 解析时会直接报错),故已移除重复段落。
# webui界面是否显示配置项
hide_config = false
[whisper]
# Only effective when subtitle_provider is "whisper"
# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# Run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# Run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
# recommended model_size: "large-v3"
model_size = "faster-whisper-large-v2"
# 如果要使用 GPU请设置 device="cuda"
# 注意: ctranslate2/faster-whisper 的 device 取值为小写 "cpu" / "cuda" / "auto"
# 原值为大写 "CPU",可能不被识别,已改为小写。
device = "cpu"
compute_type = "int8"
hide_config = true
[proxy]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
### NOTE(review): 默认值指向本机 7890 端口(常见的本地代理端口)。
### 若本机没有运行代理服务,保留该默认值可能导致请求失败 —— 请按需清空或修改;
### 代理是否实际生效取决于应用侧的代理开关逻辑,待确认。
http = "http://127.0.0.1:7890"
https = "http://127.0.0.1:7890"
[azure]
# Azure Speech API Key
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
speech_key=""
# Azure Speech 服务所在区域,例如 "eastus"
speech_region=""
# 是否启用 Azure Speech默认关闭
enabled = false
[frames]
# 跳过视频开头的秒数(具体语义以关键帧提取实现为准 —— TODO confirm
skip_seconds = 0
# threshold差异阈值用于判断两个连续帧之间是否发生了场景切换
# 较小的阈值(如 20更敏感能捕捉到细微的场景变化但可能会误判关键帧图片更多
# 较大的阈值(如 40更保守只捕捉明显的场景切换但可能会漏掉渐变场景关键帧图片更少
# 默认值 30在实践中是一个比较平衡的选择
threshold = 30
version = "v2"
# 提取关键帧的间隔时间(秒)
frame_interval_input = 3
# 大模型单次处理的关键帧数量
# NOTE(review): 原文件中该键重复定义(先 5 后 10TOML 不允许重复键,
# 已保留后出现的取值 10。
vision_batch_size = 10

1
project_version Normal file
View File

@ -0,0 +1 @@
0.6.5

View File

@ -2,6 +2,9 @@
## Latest Changes
* docs(README): 更新README. PR [#138](https://github.com/linyqh/NarratoAI/pull/138) by [@linyqh](https://github.com/linyqh).
* Dev 0.6.0. PR [#137](https://github.com/linyqh/NarratoAI/pull/137) by [@linyqh](https://github.com/linyqh).
* Dev 0.6.0 . PR [#134](https://github.com/linyqh/NarratoAI/pull/134) by [@linyqh](https://github.com/linyqh).
* Dev-0.3.9. PR [#73](https://github.com/linyqh/NarratoAI/pull/73) by [@linyqh](https://github.com/linyqh).
* 0.3.9 版本发布. PR [#71](https://github.com/linyqh/NarratoAI/pull/71) by [@linyqh](https://github.com/linyqh).
* docs: add Japanese README. PR [#66](https://github.com/linyqh/NarratoAI/pull/66) by [@eltociear](https://github.com/eltociear).

View File

@ -1,38 +1,47 @@
requests~=2.31.0
moviepy==2.0.0.dev2
faster-whisper~=1.0.1
uvicorn~=0.27.1
fastapi~=0.115.4
tomli~=2.0.1
streamlit~=1.40.0
loguru~=0.7.2
aiohttp~=3.10.10
urllib3~=2.2.1
pydantic~=2.6.3
g4f~=0.3.0.4
dashscope~=1.15.0
google.generativeai>=0.8.3
python-multipart~=0.0.9
redis==5.0.3
opencv-python~=4.10.0.84
# for azure speech
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0
git-changelog~=2.5.2
watchdog==5.0.2
pydub==0.25.1
psutil>=5.9.0
opencv-python~=4.10.0.84
scikit-learn~=1.5.2
google-generativeai~=0.8.3
pillow==10.3.0
python-dotenv~=1.0.1
openai~=1.53.0
tqdm>=4.66.6
tenacity>=9.0.0
tiktoken==0.8.0
yt-dlp==2024.11.18
pysrt==1.1.2
httpx==0.27.2
transformers==4.47.0
# 必须项
requests~=2.32.0
moviepy==2.1.1
edge-tts==6.1.19
streamlit~=1.45.0
watchdog==6.0.0
loguru~=0.7.3
tomli~=2.2.1
pydub==0.25.1
pysrt==1.1.2
openai~=1.77.0
google-generativeai>=0.8.5
# 待优化项
# opencv-python==4.11.0.86
# scikit-learn==1.6.1
# fastapi~=0.115.4
# uvicorn~=0.27.1
# pydantic~=2.11.4
# faster-whisper~=1.0.1
# tomli~=2.0.1
# aiohttp~=3.10.10
# httpx==0.27.2
# urllib3~=2.2.1
# python-multipart~=0.0.9
# redis==5.0.3
# opencv-python~=4.10.0.84
# azure-cognitiveservices-speech~=1.37.0
# git-changelog~=2.5.2
# watchdog==5.0.2
# pydub==0.25.1
# psutil>=5.9.0
# scikit-learn~=1.5.2
# pillow==10.3.0
# python-dotenv~=1.0.1
# tqdm>=4.66.6
# tenacity>=9.0.0
# tiktoken==0.8.0
# pysrt==1.1.2
# transformers==4.50.0
# yt-dlp==2025.4.30

252
webui.py
View File

@ -1,13 +1,15 @@
import streamlit as st
import os
import sys
from uuid import uuid4
from loguru import logger
from app.config import config
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, review_settings, merge_settings, system_settings
from webui.utils import cache, file_utils
from webui.components import basic_settings, video_settings, audio_settings, subtitle_settings, script_settings, \
review_settings, merge_settings, system_settings
# from webui.utils import cache, file_utils
from app.utils import utils
from app.utils import ffmpeg_utils
from app.models.schema import VideoClipParams, VideoAspect
from webui.utils.performance import PerformanceMonitor
# 初始化配置 - 必须是第一个 Streamlit 命令
st.set_page_config(
@ -17,7 +19,7 @@ st.set_page_config(
initial_sidebar_state="auto",
menu_items={
"Report a bug": "https://github.com/linyqh/NarratoAI/issues",
'About': f"# NarratoAI:sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
'About': f"# Narrato:blue[AI] :sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
f"自动化影视解说视频详情请移步https://github.com/linyqh/NarratoAI"
},
)
@ -28,6 +30,7 @@ hide_streamlit_style = """
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
def init_log():
"""初始化日志配置"""
from loguru import logger
@ -35,17 +38,7 @@ def init_log():
_lvl = "DEBUG"
def format_record(record):
# 增加更多需要过滤的警告消息
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
for msg in ignore_messages:
if msg in record["message"]:
return ""
# 简化日志格式化处理不尝试按特定字符串过滤torch相关内容
file_path = record["file"].path
relative_path = os.path.relpath(file_path, config.root_dir)
record["file"].path = f"./{relative_path}"
@ -57,23 +50,54 @@ def init_log():
'- <level>{message}</>' + "\n"
return _format
# 优化日志过滤器
def log_filter(record):
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
return not any(msg in record["message"] for msg in ignore_messages)
# 替换为更简单的过滤方式避免在过滤时访问message内容
# 此处先不设置复杂的过滤器,等应用启动后再动态添加
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
filter=log_filter
colorize=True
)
# 应用启动后,可以再添加更复杂的过滤器
def setup_advanced_filters():
"""在应用完全启动后设置高级过滤器"""
try:
for handler_id in logger._core.handlers:
logger.remove(handler_id)
# 重新添加带有高级过滤的处理器
def advanced_filter(record):
"""更复杂的过滤器,在应用启动后安全使用"""
ignore_messages = [
"Examining the path of torch.classes raised",
"torch.cuda.is_available()",
"CUDA initialization"
]
return not any(msg in record["message"] for msg in ignore_messages)
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
filter=advanced_filter
)
except Exception as e:
# 如果过滤器设置失败,确保日志仍然可用
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True
)
logger.error(f"设置高级日志过滤器失败: {e}")
# 将高级过滤器设置放到启动主逻辑后
import threading
threading.Timer(5.0, setup_advanced_filters).start()
def init_global_state():
"""初始化全局状态"""
if 'video_clip_json' not in st.session_state:
@ -85,6 +109,7 @@ def init_global_state():
if 'subclip_videos' not in st.session_state:
st.session_state['subclip_videos'] = {}
def tr(key):
"""翻译函数"""
i18n_dir = os.path.join(os.path.dirname(__file__), "webui", "i18n")
@ -92,90 +117,107 @@ def tr(key):
loc = locales.get(st.session_state['ui_language'], {})
return loc.get("Translation", {}).get(key, key)
def render_generate_button():
"""渲染生成按钮和处理逻辑"""
if st.button(tr("Generate Video"), use_container_width=True, type="primary"):
from app.services import task as tm
# 重置日志容器和记录
log_container = st.empty()
log_records = []
def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))
from loguru import logger
logger.add(log_received)
config.save_config()
task_id = st.session_state.get('task_id')
if not task_id:
st.error(tr("请先裁剪视频"))
return
if not st.session_state.get('video_clip_json_path'):
st.error(tr("脚本文件不能为空"))
return
if not st.session_state.get('video_origin_path'):
st.error(tr("视频文件不能为空"))
return
st.toast(tr("生成视频"))
logger.info(tr("开始生成视频"))
# 获取所有参数
script_params = script_settings.get_script_params()
video_params = video_settings.get_video_params()
audio_params = audio_settings.get_audio_params()
subtitle_params = subtitle_settings.get_subtitle_params()
# 合并所有参数
all_params = {
**script_params,
**video_params,
**audio_params,
**subtitle_params
}
# 创建参数对象
params = VideoClipParams(**all_params)
result = tm.start_subclip(
task_id=task_id,
params=params,
subclip_path_videos=st.session_state['subclip_videos']
)
video_files = result.get("videos", [])
st.success(tr("视生成完成"))
try:
from app.services import task as tm
import torch
# 重置日志容器和记录
log_container = st.empty()
log_records = []
if video_files:
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))
# file_utils.open_task_folder(config.root_dir, task_id)
logger.info(tr("视频生成完成"))
from loguru import logger
logger.add(log_received)
config.save_config()
task_id = st.session_state.get('task_id')
if not task_id:
st.error(tr("请先裁剪视频"))
return
if not st.session_state.get('video_clip_json_path'):
st.error(tr("脚本文件不能为空"))
return
if not st.session_state.get('video_origin_path'):
st.error(tr("视频文件不能为空"))
return
st.toast(tr("生成视频"))
logger.info(tr("开始生成视频"))
# 获取所有参数
script_params = script_settings.get_script_params()
video_params = video_settings.get_video_params()
audio_params = audio_settings.get_audio_params()
subtitle_params = subtitle_settings.get_subtitle_params()
# 合并所有参数
all_params = {
**script_params,
**video_params,
**audio_params,
**subtitle_params
}
# 创建参数对象
params = VideoClipParams(**all_params)
result = tm.start_subclip(
task_id=task_id,
params=params,
subclip_path_videos=st.session_state['subclip_videos']
)
video_files = result.get("videos", [])
st.success(tr("视生成完成"))
try:
if video_files:
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
file_utils.open_task_folder(config.root_dir, task_id)
logger.info(tr("视频生成完成"))
finally:
PerformanceMonitor.cleanup_resources()
# 全局变量,记录是否已经打印过硬件加速信息
_HAS_LOGGED_HWACCEL_INFO = False
def main():
"""主函数"""
global _HAS_LOGGED_HWACCEL_INFO
init_log()
init_global_state()
utils.init_resources()
st.title(f"NarratoAI :sunglasses:📽️")
# 检测FFmpeg硬件加速但只打印一次日志
hwaccel_info = ffmpeg_utils.detect_hardware_acceleration()
if not _HAS_LOGGED_HWACCEL_INFO:
if hwaccel_info["available"]:
logger.info(f"FFmpeg硬件加速检测结果: 可用 | 类型: {hwaccel_info['type']} | 编码器: {hwaccel_info['encoder']} | 独立显卡: {hwaccel_info['is_dedicated_gpu']} | 参数: {hwaccel_info['hwaccel_args']}")
else:
logger.warning(f"FFmpeg硬件加速不可用: {hwaccel_info['message']}, 将使用CPU软件编码")
_HAS_LOGGED_HWACCEL_INFO = True
# 仅初始化基本资源避免过早地加载依赖PyTorch的资源
# 检查是否能分解utils.init_resources()为基本资源和高级资源(如依赖PyTorch的资源)
try:
utils.init_resources()
except Exception as e:
logger.warning(f"资源初始化时出现警告: {e}")
st.title(f"Narrato:blue[AI]:sunglasses: 📽️")
st.write(tr("Get Help"))
# 首先渲染不依赖PyTorch的UI部分
# 渲染基础设置面板
basic_settings.render_basic_settings(tr)
# 渲染合并设置
@ -190,14 +232,18 @@ def main():
audio_settings.render_audio_panel(tr)
with panel[2]:
subtitle_settings.render_subtitle_panel(tr)
# 渲染系统设置面板
system_settings.render_system_panel(tr)
# 渲染视频审查面板
review_settings.render_review_panel(tr)
# 渲染生成按钮和处理逻辑
# 放到最后渲染可能使用PyTorch的部分
# 渲染系统设置面板
with panel[2]:
system_settings.render_system_panel(tr)
# 放到最后渲染生成按钮和处理逻辑
render_generate_button()
if __name__ == "__main__":
main()

View File

@ -8,7 +8,7 @@ from webui.components import (
audio_settings,
subtitle_settings
)
from webui.utils import cache, file_utils, performance
from webui.utils import cache, file_utils
__all__ = [
'config',
@ -17,6 +17,5 @@ __all__ = [
'audio_settings',
'subtitle_settings',
'cache',
'file_utils',
'performance'
'file_utils'
]

View File

@ -3,6 +3,7 @@ import os
from uuid import uuid4
from app.config import config
from app.services import voice
from app.models.schema import AudioVolumeDefaults
from app.utils import utils
from webui.utils.cache import get_songs_cache
@ -94,12 +95,12 @@ def render_azure_v2_settings(tr):
def render_voice_parameters(tr):
"""渲染语音参数设置"""
# 音量
# 音量 - 使用统一的默认值
voice_volume = st.slider(
tr("Speech Volume"),
min_value=0.0,
max_value=1.0,
value=1.0,
min_value=AudioVolumeDefaults.MIN_VOLUME,
max_value=AudioVolumeDefaults.MAX_VOLUME,
value=AudioVolumeDefaults.VOICE_VOLUME,
step=0.01,
help=tr("Adjust the volume of the original audio")
)
@ -187,12 +188,12 @@ def render_bgm_settings(tr):
if custom_bgm_file and os.path.exists(custom_bgm_file):
st.session_state['bgm_file'] = custom_bgm_file
# 背景音乐音量
# 背景音乐音量 - 使用统一的默认值
bgm_volume = st.slider(
tr("Background Music Volume"),
min_value=0.0,
max_value=1.0,
value=0.3,
min_value=AudioVolumeDefaults.MIN_VOLUME,
max_value=AudioVolumeDefaults.MAX_VOLUME,
value=AudioVolumeDefaults.BGM_VOLUME,
step=0.01,
help=tr("Adjust the volume of the original audio")
)
@ -203,10 +204,10 @@ def get_audio_params():
"""获取音频参数"""
return {
'voice_name': config.ui.get("voice_name", ""),
'voice_volume': st.session_state.get('voice_volume', 1.0),
'voice_volume': st.session_state.get('voice_volume', AudioVolumeDefaults.VOICE_VOLUME),
'voice_rate': st.session_state.get('voice_rate', 1.0),
'voice_pitch': st.session_state.get('voice_pitch', 1.0),
'bgm_type': st.session_state.get('bgm_type', 'random'),
'bgm_file': st.session_state.get('bgm_file', ''),
'bgm_volume': st.session_state.get('bgm_volume', 0.3),
'bgm_volume': st.session_state.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME),
}

View File

@ -1,7 +1,10 @@
import traceback
import streamlit as st
import os
from app.config import config
from app.utils import utils
from loguru import logger
def render_basic_settings(tr):
@ -61,25 +64,25 @@ def render_proxy_settings(tr):
proxy_enabled = st.checkbox(tr("Enable Proxy"), value=proxy_enabled)
# 保存代理开关状态
config.proxy["enabled"] = proxy_enabled
# config.proxy["enabled"] = proxy_enabled
# 只有在代理启用时才显示代理设置输入框
if proxy_enabled:
HTTP_PROXY = st.text_input(tr("HTTP_PROXY"), value=proxy_url_http)
HTTPS_PROXY = st.text_input(tr("HTTPs_PROXY"), value=proxy_url_https)
if HTTP_PROXY:
if HTTP_PROXY and HTTPS_PROXY:
config.proxy["http"] = HTTP_PROXY
os.environ["HTTP_PROXY"] = HTTP_PROXY
if HTTPS_PROXY:
config.proxy["https"] = HTTPS_PROXY
os.environ["HTTP_PROXY"] = HTTP_PROXY
os.environ["HTTPS_PROXY"] = HTTPS_PROXY
# logger.debug(f"代理已启用: {HTTP_PROXY}")
else:
# 当代理被禁用时,清除环境变量和配置
os.environ.pop("HTTP_PROXY", None)
os.environ.pop("HTTPS_PROXY", None)
config.proxy["http"] = ""
config.proxy["https"] = ""
# config.proxy["http"] = ""
# config.proxy["https"] = ""
def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
@ -105,29 +108,6 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
return True, tr("gemini model is available")
except Exception as e:
return False, f"{tr('gemini model is not available')}: {str(e)}"
elif provider.lower() == 'qwenvl':
from openai import OpenAI
try:
client = OpenAI(
api_key=api_key,
base_url=base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
)
# 发送一个简单的测试请求
response = client.chat.completions.create(
model=model_name or "qwen-vl-max-latest",
messages=[{"role": "user", "content": "直接回复我文本'当前网络可用'"}]
)
if response and response.choices:
return True, tr("QwenVL model is available")
else:
return False, tr("QwenVL model returned invalid response")
except Exception as e:
return False, f"{tr('QwenVL model is not available')}: {str(e)}"
elif provider.lower() == 'narratoapi':
import requests
try:
@ -145,9 +125,46 @@ def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
return False, f"{tr('NarratoAPI is not available')}: HTTP {response.status_code}"
except Exception as e:
return False, f"{tr('NarratoAPI is not available')}: {str(e)}"
else:
return False, f"{tr('Unsupported provider')}: {provider}"
from openai import OpenAI
try:
client = OpenAI(
api_key=api_key,
base_url=base_url,
)
response = client.chat.completions.create(
model=model_name,
messages=[
{
"role": "system",
"content": [{"type": "text", "text": "You are a helpful assistant."}],
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
},
},
{"type": "text", "text": "回复我网络可用即可"},
],
},
],
)
if response and response.choices:
return True, tr("QwenVL model is available")
else:
return False, tr("QwenVL model returned invalid response")
except Exception as e:
# logger.debug(api_key)
# logger.debug(base_url)
# logger.debug(model_name)
return False, f"{tr('QwenVL model is not available')}: {str(e)}"
def render_vision_llm_settings(tr):
@ -155,7 +172,7 @@ def render_vision_llm_settings(tr):
st.subheader(tr("Vision Model Settings"))
# 视频分析模型提供商选择
vision_providers = ['Gemini', 'QwenVL', 'NarratoAPI(待发布)']
vision_providers = ['Siliconflow', 'Gemini', 'QwenVL', 'OpenAI']
saved_vision_provider = config.app.get("vision_llm_provider", "Gemini").lower()
saved_provider_index = 0
@ -191,8 +208,8 @@ def render_vision_llm_settings(tr):
)
st_vision_model_name = st.text_input(
tr("Vision Model Name"),
value=vision_model_name or "gemini-1.5-flash",
help=tr("Default: gemini-1.5-flash")
value=vision_model_name or "gemini-2.0-flash-lite",
help=tr("Default: gemini-2.0-flash-lite")
)
elif vision_provider == 'qwenvl':
st_vision_base_url = st.text_input(
@ -258,53 +275,45 @@ def test_text_model_connection(api_key, base_url, model_name, provider, tr):
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
# 如果没有指定base_url使用默认值
if not base_url:
if provider.lower() == 'openai':
base_url = "https://api.openai.com/v1"
elif provider.lower() == 'moonshot':
base_url = "https://api.moonshot.cn/v1"
elif provider.lower() == 'deepseek':
base_url = "https://api.deepseek.com/v1"
# 构建测试URL
test_url = f"{base_url.rstrip('/')}/chat/completions"
# 特殊处理Gemini
if provider.lower() == 'gemini':
import google.generativeai as genai
try:
genai.configure(api_key=api_key)
model = genai.GenerativeModel(model_name or 'gemini-pro')
model = genai.GenerativeModel(model_name)
model.generate_content("直接回复我文本'当前网络可用'")
return True, tr("Gemini model is available")
except Exception as e:
return False, f"{tr('Gemini model is not available')}: {str(e)}"
# 构建测试消息
test_data = {
"model": model_name,
"messages": [
{"role": "user", "content": "直接回复我文本'当前网络可用'"}
],
"max_tokens": 10
}
# 发送测试请求
response = requests.post(
test_url,
headers=headers,
json=test_data,
timeout=10
)
if response.status_code == 200:
return True, tr("Text model is available")
else:
return False, f"{tr('Text model is not available')}: HTTP {response.status_code}"
test_url = f"{base_url.rstrip('/')}/chat/completions"
# 构建测试消息
test_data = {
"model": model_name,
"messages": [
{"role": "user", "content": "直接回复我文本'当前网络可用'"}
],
"stream": False
}
# 发送测试请求
response = requests.post(
test_url,
headers=headers,
json=test_data,
)
# logger.debug(model_name)
# logger.debug(api_key)
# logger.debug(test_url)
if response.status_code == 200:
return True, tr("Text model is available")
else:
return False, f"{tr('Text model is not available')}: HTTP {response.status_code}"
except Exception as e:
logger.error(traceback.format_exc())
return False, f"{tr('Connection failed')}: {str(e)}"
@ -313,8 +322,8 @@ def render_text_llm_settings(tr):
st.subheader(tr("Text Generation Model Settings"))
# 文案生成模型提供商选择
text_providers = ['DeepSeek', 'OpenAI', 'Qwen', 'Moonshot', 'Gemini']
saved_text_provider = config.app.get("text_llm_provider", "DeepSeek").lower()
text_providers = ['OpenAI', 'Siliconflow', 'DeepSeek', 'Gemini', 'Qwen', 'Moonshot']
saved_text_provider = config.app.get("text_llm_provider", "OpenAI").lower()
saved_provider_index = 0
for i, provider in enumerate(text_providers):
@ -331,9 +340,9 @@ def render_text_llm_settings(tr):
config.app["text_llm_provider"] = text_provider
# 获取已保存的文本模型配置
text_api_key = config.app.get(f"text_{text_provider}_api_key", "")
text_base_url = config.app.get(f"text_{text_provider}_base_url", "")
text_model_name = config.app.get(f"text_{text_provider}_model_name", "")
text_api_key = config.app.get(f"text_{text_provider}_api_key")
text_base_url = config.app.get(f"text_{text_provider}_base_url")
text_model_name = config.app.get(f"text_{text_provider}_model_name")
# 渲染文本模型配置输入框
st_text_api_key = st.text_input(tr("Text API Key"), value=text_api_key, type="password")
@ -364,11 +373,11 @@ def render_text_llm_settings(tr):
if st_text_model_name:
config.app[f"text_{text_provider}_model_name"] = st_text_model_name
# Cloudflare 特殊配置
if text_provider == 'cloudflare':
st_account_id = st.text_input(
tr("Account ID"),
value=config.app.get(f"text_{text_provider}_account_id", "")
)
if st_account_id:
config.app[f"text_{text_provider}_account_id"] = st_account_id
# # Cloudflare 特殊配置
# if text_provider == 'cloudflare':
# st_account_id = st.text_input(
# tr("Account ID"),
# value=config.app.get(f"text_{text_provider}_account_id", "")
# )
# if st_account_id:
# config.app[f"text_{text_provider}_account_id"] = st_account_id

View File

@ -1,20 +1,13 @@
import os
import time
import math
import sys
import tempfile
import traceback
import shutil
import streamlit as st
from loguru import logger
from typing import List, Dict, Tuple
from typing import List, Dict
from dataclasses import dataclass
from streamlit.runtime.uploaded_file_manager import UploadedFile
from webui.utils.merge_video import merge_videos_and_subtitles
from app.utils.utils import video_dir, srt_dir
from app.services.subtitle import extract_audio_and_create_subtitle
# 定义临时目录路径
TEMP_MERGE_DIR = os.path.join("storage", "temp", "merge")
@ -169,38 +162,38 @@ def render_merge_settings(tr):
else:
st.warning(tr("Missing Subtitle"))
# 如果有视频但没有字幕,显示一键转录按钮
if os.path.exists(video_path):
if st.button(tr("One-Click Transcribe"), key=f"transcribe_{base_name}"):
with st.spinner(tr("Transcribing...")):
try:
# 生成字幕文件
result = extract_audio_and_create_subtitle(video_path, subtitle_path)
if result:
# 读取生成的字幕文件内容并显示预览
with open(subtitle_path, 'r', encoding='utf-8') as f:
subtitle_content = f.read()
st.markdown(tr("Subtitle Preview"))
st.text_area(
"Subtitle Content",
value=subtitle_content,
height=150,
label_visibility="collapsed",
key=f"subtitle_preview_transcribed_{base_name}"
)
st.success(tr("Transcription Complete!"))
# 更新pair的字幕文件路径
pair.subtitle_file = subtitle_path
else:
st.error(tr("Transcription Failed. Please try again."))
except Exception as e:
error_message = str(e)
logger.error(traceback.format_exc())
if "rate limit exceeded" in error_message.lower():
st.error(tr("API rate limit exceeded. Please wait about an hour and try again."))
elif "resource_exhausted" in error_message.lower():
st.error(tr("Resources exhausted. Please try again later."))
else:
st.error(f"{tr('Transcription Failed')}: {str(e)}")
# if os.path.exists(video_path):
# if st.button(tr("One-Click Transcribe"), key=f"transcribe_{base_name}"):
# with st.spinner(tr("Transcribing...")):
# try:
# # 生成字幕文件
# result = extract_audio_and_create_subtitle(video_path, subtitle_path)
# if result:
# # 读取生成的字幕文件内容并显示预览
# with open(subtitle_path, 'r', encoding='utf-8') as f:
# subtitle_content = f.read()
# st.markdown(tr("Subtitle Preview"))
# st.text_area(
# "Subtitle Content",
# value=subtitle_content,
# height=150,
# label_visibility="collapsed",
# key=f"subtitle_preview_transcribed_{base_name}"
# )
# st.success(tr("Transcription Complete!"))
# # 更新pair的字幕文件路径
# pair.subtitle_file = subtitle_path
# else:
# st.error(tr("Transcription Failed. Please try again."))
# except Exception as e:
# error_message = str(e)
# logger.error(traceback.format_exc())
# if "rate limit exceeded" in error_message.lower():
# st.error(tr("API rate limit exceeded. Please wait about an hour and try again."))
# elif "resource_exhausted" in error_message.lower():
# st.error(tr("Resources exhausted. Please try again later."))
# else:
# st.error(f"{tr('Transcription Failed')}: {str(e)}")
# 排序输入框
order = st.number_input(
@ -285,8 +278,8 @@ def render_merge_settings(tr):
error_message = str(e)
if "moviepy" in error_message.lower():
st.error(tr("Error processing video files. Please check if the videos are valid MP4 files."))
elif "pysrt" in error_message.lower():
st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
# elif "pysrt" in error_message.lower():
# st.error(tr("Error processing subtitle files. Please check if the subtitles are valid SRT files."))
else:
st.error(f"{tr('Error during merge')}: {error_message}")

View File

@ -33,7 +33,7 @@ def render_video_item(tr, video_list, subclip_videos, index):
video_script = video_list[index]
# 显示时间戳
timestamp = video_script.get('timestamp', '')
timestamp = video_script.get('_id', '')
st.text_area(
tr("Timestamp"),
value=timestamp,

View File

@ -11,6 +11,7 @@ from app.models.schema import VideoClipParams
from app.utils import utils, check_script
from webui.tools.generate_script_docu import generate_script_docu
from webui.tools.generate_script_short import generate_script_short
from webui.tools.generate_short_summary import generate_script_short_sunmmary
def render_script_panel(tr):
@ -27,15 +28,20 @@ def render_script_panel(tr):
# 获取当前选择的脚本类型
script_path = st.session_state.get('video_clip_json_path', '')
# 根据脚本类型显示不同的布局
if script_path == "short":
# Short Generate模式下显示的内容
render_short_generate_options(tr)
else:
# 其他模式下保持原有布局
# 渲染视频主题和提示词
if script_path == "auto":
# 画面解说
render_video_details(tr)
elif script_path == "short":
# 短剧混剪
render_short_generate_options(tr)
elif script_path == "summary":
# 短剧解说
short_drama_summary(tr)
else:
# 默认为空
pass
# 渲染脚本操作按钮
render_script_buttons(tr, params)
@ -44,10 +50,11 @@ def render_script_panel(tr):
def render_script_file(tr, params):
"""渲染脚本文件选择"""
script_list = [
(tr("None"), ""),
(tr("Auto Generate"), "auto"),
(tr("None"), ""),
(tr("Auto Generate"), "auto"),
(tr("Short Generate"), "short"),
(tr("Upload Script"), "upload_script") # 新增上传脚本选项
(tr("Short Drama Summary"), "summary"),
(tr("Upload Script"), "upload_script")
]
# 获取已有脚本文件
@ -100,11 +107,11 @@ def render_script_file(tr, params):
# 读取上传的JSON内容并验证格式
script_content = uploaded_file.read().decode('utf-8')
json_data = json.loads(script_content)
# 保存到脚本目录
script_file_path = os.path.join(script_dir, uploaded_file.name)
file_name, file_extension = os.path.splitext(uploaded_file.name)
# 如果文件已存在,添加时间戳
if os.path.exists(script_file_path):
timestamp = time.strftime("%Y%m%d%H%M%S")
@ -114,14 +121,14 @@ def render_script_file(tr, params):
# 写入文件
with open(script_file_path, "w", encoding='utf-8') as f:
json.dump(json_data, f, ensure_ascii=False, indent=2)
# 更新状态
st.success(tr("Script Uploaded Successfully"))
st.session_state['video_clip_json_path'] = script_file_path
params.video_clip_json_path = script_file_path
time.sleep(1)
st.rerun()
except json.JSONDecodeError:
st.error(tr("Invalid JSON format"))
except Exception as e:
@ -180,6 +187,7 @@ def render_short_generate_options(tr):
渲染Short Generate模式下的特殊选项
在Short Generate模式下替换原有的输入框为自定义片段选项
"""
short_drama_summary(tr)
# 显示自定义片段数量选择器
custom_clips = st.number_input(
tr("自定义片段"),
@ -193,7 +201,7 @@ def render_short_generate_options(tr):
def render_video_details(tr):
"""渲染视频主题和提示词"""
"""画面解说 渲染视频主题和提示词"""
video_theme = st.text_input(tr("Video Theme"))
custom_prompt = st.text_area(
tr("Generation Prompt"),
@ -201,57 +209,104 @@ def render_video_details(tr):
help=tr("Custom prompt for LLM, leave empty to use default prompt"),
height=180
)
# 非短视频模式下显示原有的三个输入框
input_cols = st.columns(2)
with input_cols[0]:
st.number_input(
tr("Frame Interval (seconds)"),
min_value=0,
value=st.session_state.get('frame_interval_input', config.frames.get('frame_interval_input', 3)),
help=tr("Frame Interval (seconds) (More keyframes consume more tokens)"),
key="frame_interval_input"
)
with input_cols[1]:
st.number_input(
tr("Batch Size"),
min_value=0,
value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 10)),
help=tr("Batch Size (More keyframes consume more tokens)"),
key="vision_batch_size"
)
st.session_state['video_theme'] = video_theme
st.session_state['custom_prompt'] = custom_prompt
return video_theme, custom_prompt
def short_drama_summary(tr):
"""短剧解说 渲染视频主题和提示词"""
# 检查是否已经处理过字幕文件
if 'subtitle_file_processed' not in st.session_state:
st.session_state['subtitle_file_processed'] = False
subtitle_file = st.file_uploader(
tr("上传字幕文件"),
type=["srt"],
accept_multiple_files=False,
key="subtitle_file_uploader" # 添加唯一key
)
# 显示当前已上传的字幕文件路径
if 'subtitle_path' in st.session_state and st.session_state['subtitle_path']:
st.info(f"已上传字幕: {os.path.basename(st.session_state['subtitle_path'])}")
if st.button(tr("清除已上传字幕")):
st.session_state['subtitle_path'] = None
st.session_state['subtitle_file_processed'] = False
st.rerun()
# 只有当有文件上传且尚未处理时才执行处理逻辑
if subtitle_file is not None and not st.session_state['subtitle_file_processed']:
try:
# 读取上传的SRT内容
script_content = subtitle_file.read().decode('utf-8')
# 保存到字幕目录
script_file_path = os.path.join(utils.subtitle_dir(), subtitle_file.name)
file_name, file_extension = os.path.splitext(subtitle_file.name)
# 如果文件已存在,添加时间戳
if os.path.exists(script_file_path):
timestamp = time.strftime("%Y%m%d%H%M%S")
file_name_with_timestamp = f"{file_name}_{timestamp}"
script_file_path = os.path.join(utils.subtitle_dir(), file_name_with_timestamp + file_extension)
# 直接写入SRT内容不进行JSON转换
with open(script_file_path, "w", encoding='utf-8') as f:
f.write(script_content)
# 更新状态
st.success(tr("字幕上传成功"))
st.session_state['subtitle_path'] = script_file_path
st.session_state['subtitle_file_processed'] = True # 标记已处理
# 避免使用rerun使用更新状态的方式
# st.rerun()
except Exception as e:
st.error(f"{tr('Upload failed')}: {str(e)}")
# 名称输入框
video_theme = st.text_input(tr("短剧名称"))
st.session_state['video_theme'] = video_theme
# 数字输入框
temperature = st.slider("temperature", 0.0, 2.0, 0.7)
st.session_state['temperature'] = temperature
return video_theme
def render_script_buttons(tr, params):
"""渲染脚本操作按钮"""
# 获取当前选择的脚本类型
script_path = st.session_state.get('video_clip_json_path', '')
# 根据脚本类型显示不同的设置
if script_path != "short":
# 非短视频模式下显示原有的三个输入框
input_cols = st.columns(3)
with input_cols[0]:
skip_seconds = st.number_input(
"skip_seconds",
min_value=0,
value=st.session_state.get('skip_seconds', config.frames.get('skip_seconds', 0)),
help=tr("Skip the first few seconds"),
key="skip_seconds_input"
)
st.session_state['skip_seconds'] = skip_seconds
with input_cols[1]:
threshold = st.number_input(
"threshold",
min_value=0,
value=st.session_state.get('threshold', config.frames.get('threshold', 30)),
help=tr("Difference threshold"),
key="threshold_input"
)
st.session_state['threshold'] = threshold
with input_cols[2]:
vision_batch_size = st.number_input(
"vision_batch_size",
min_value=1,
max_value=20,
value=st.session_state.get('vision_batch_size', config.frames.get('vision_batch_size', 5)),
help=tr("Vision processing batch size"),
key="vision_batch_size_input"
)
st.session_state['vision_batch_size'] = vision_batch_size
# 生成/加载按钮
if script_path == "auto":
button_name = tr("Generate Video Script")
elif script_path == "short":
button_name = tr("Generate Short Video Script")
elif script_path == "summary":
button_name = tr("生成短剧解说脚本")
elif script_path.endswith("json"):
button_name = tr("Load Video Script")
else:
@ -259,12 +314,18 @@ def render_script_buttons(tr, params):
if st.button(button_name, key="script_action", disabled=not script_path):
if script_path == "auto":
generate_script_docu(tr, params)
# 执行纪录片视频脚本生成(视频无字幕无配音)
generate_script_docu(params)
elif script_path == "short":
# 获取自定义片段数量参数
custom_clips = st.session_state.get('custom_clips', 5)
# 直接将custom_clips作为参数传递而不是通过params对象
# 执行 短剧混剪 脚本生成
custom_clips = st.session_state.get('custom_clips')
generate_script_short(tr, params, custom_clips)
elif script_path == "summary":
# 执行 短剧解说 脚本生成
subtitle_path = st.session_state.get('subtitle_path')
video_theme = st.session_state.get('video_theme')
temperature = st.session_state.get('temperature')
generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature)
else:
load_script(tr, script_path)
@ -366,12 +427,11 @@ def crop_video(tr, params):
utils.cut_video(params, update_progress)
time.sleep(0.5)
progress_bar.progress(100)
status_text.text("剪完成!")
st.success("视频剪辑成功完成!")
except Exception as e:
st.error(f"剪辑过程中发生错误: {str(e)}")
finally:
time.sleep(2)
time.sleep(1)
progress_bar.empty()
status_text.empty()

View File

@ -127,7 +127,7 @@ def get_subtitle_params():
'font_name': st.session_state.get('font_name', ''),
'font_size': st.session_state.get('font_size', 60),
'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'),
'position': st.session_state.get('subtitle_position', 'bottom'),
'subtitle_position': st.session_state.get('subtitle_position', 'bottom'),
'custom_position': st.session_state.get('custom_position', 70.0),
'stroke_color': st.session_state.get('stroke_color', '#000000'),
'stroke_width': st.session_state.get('stroke_width', 1.5),

View File

@ -1,5 +1,5 @@
import streamlit as st
from app.models.schema import VideoClipParams, VideoAspect
from app.models.schema import VideoClipParams, VideoAspect, AudioVolumeDefaults
def render_video_panel(tr):
@ -41,12 +41,12 @@ def render_video_config(tr, params):
)
st.session_state['video_quality'] = video_qualities[quality_index][1]
# 原声音量
# 原声音量 - 使用统一的默认值
params.original_volume = st.slider(
tr("Original Volume"),
min_value=0.0,
max_value=1.0,
value=0.7,
min_value=AudioVolumeDefaults.MIN_VOLUME,
max_value=AudioVolumeDefaults.MAX_VOLUME,
value=AudioVolumeDefaults.ORIGINAL_VOLUME,
step=0.01,
help=tr("Adjust the volume of the original audio")
)
@ -58,5 +58,5 @@ def get_video_params():
return {
'video_aspect': st.session_state.get('video_aspect', VideoAspect.portrait.value),
'video_quality': st.session_state.get('video_quality', '1080p'),
'original_volume': st.session_state.get('original_volume', 0.7)
'original_volume': st.session_state.get('original_volume', AudioVolumeDefaults.ORIGINAL_VOLUME)
}

View File

@ -4,6 +4,21 @@ from loguru import logger
from typing import Dict, Any, Optional
from dataclasses import dataclass
def get_version_from_file():
"""从project_version文件中读取版本号"""
try:
version_file = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
"project_version"
)
if os.path.isfile(version_file):
with open(version_file, "r", encoding="utf-8") as f:
return f.read().strip()
return "0.1.0" # 默认版本号
except Exception as e:
logger.error(f"读取版本号文件失败: {str(e)}")
return "0.1.0" # 默认版本号
@dataclass
class WebUIConfig:
"""WebUI配置类"""
@ -16,7 +31,7 @@ class WebUIConfig:
# Azure配置
azure: Dict[str, str] = None
# 项目版本
project_version: str = "0.1.0"
project_version: str = get_version_from_file()
# 项目根目录
root_dir: str = None
# Gemini API Key
@ -71,13 +86,13 @@ def load_config(config_path: Optional[str] = None) -> WebUIConfig:
with open(config_path, "rb") as f:
config_dict = tomli.load(f)
# 创建配置对象
# 创建配置对象,使用从文件读取的版本号
config = WebUIConfig(
ui=config_dict.get("ui", {}),
proxy=config_dict.get("proxy", {}),
app=config_dict.get("app", {}),
azure=config_dict.get("azure", {}),
project_version=config_dict.get("project_version", "0.1.0")
# 不再从配置文件中获取project_version
)
return config
@ -105,13 +120,13 @@ def save_config(config: WebUIConfig, config_path: Optional[str] = None) -> bool:
# 确保目录存在
os.makedirs(os.path.dirname(config_path), exist_ok=True)
# 转换为字典
# 转换为字典,不再保存版本号到配置文件
config_dict = {
"ui": config.ui,
"proxy": config.proxy,
"app": config.app,
"azure": config.azure,
"project_version": config.project_version
"azure": config.azure
# 不再保存project_version到配置文件
}
# 保存配置
@ -153,8 +168,7 @@ def update_config(config_dict: Dict[str, Any]) -> bool:
config.app.update(config_dict["app"])
if "azure" in config_dict:
config.azure.update(config_dict["azure"])
if "project_version" in config_dict:
config.project_version = config_dict["project_version"]
# 不再从配置字典更新project_version
# 保存配置
return save_config(config)

View File

@ -85,6 +85,7 @@
"TTS Provider": "TTS Provider",
"Hide Log": "Hide Log",
"Upload Local Files": "Upload Local Files",
"File Uploaded Successfully": "File Uploaded Successfully"
"File Uploaded Successfully": "File Uploaded Successfully",
"Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)"
}
}

View File

@ -115,7 +115,6 @@
"Text Generation Model Settings": "文案生成模型设置",
"LLM Model Name": "大语言模型名称",
"LLM Model API Key": "大语言模型 API 密钥",
"Batch Size": "批处理大小",
"Text Model Provider": "文案生成模型提供商",
"Text API Key": "文案生成 API 密钥",
"Text Base URL": "文案生成接口地址",
@ -144,7 +143,7 @@
"Merge All Files": "合并所有文件",
"Merge Function Not Implemented": "合并功能待实现",
"No Matched Pairs Found": "未找到匹配的文件对",
"Missing Subtitle": "缺少对应的字幕文件",
"Missing Subtitle": "缺少对应的字幕文件, 请使用其他软件完成字幕转录,比如剪映等",
"Missing Video": "缺少对应的视频文件",
"All Uploaded Files": "所有上传的文件",
"Order": "排序序号",
@ -192,6 +191,11 @@
"Generate Short Video Script": "AI生成短剧混剪脚本",
"Adjust the volume of the original audio": "调整原始音频的音量",
"Original Volume": "视频音量",
"Auto Generate": "纪录片解说 (画面解说)"
"Auto Generate": "纪录片解说 (画面解说)",
"Frame Interval (seconds)": "帧间隔 (秒)",
"Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)",
"Batch Size": "批处理大小",
"Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多",
"Short Drama Summary": "短剧解说(仅支持 gemini-2.0-flash)"
}
}
}

View File

@ -24,15 +24,13 @@ def create_vision_analyzer(provider, api_key, model, base_url):
"""
if provider == 'gemini':
return gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key)
elif provider == 'qwenvl':
else:
# 只传入必要的参数
return qwenvl_analyzer.QwenAnalyzer(
model_name=model,
api_key=api_key,
base_url=base_url
)
else:
raise ValueError(f"不支持的视觉分析提供商: {provider}")
def get_batch_timestamps(batch_files, prev_batch_files=None):
@ -152,7 +150,7 @@ def chekc_video_config(video_params):
session.mount("https://", adapter)
try:
session.post(
f"{config.app.get('narrato_api_url')}/video/config",
f"https://dev.narratoai.cn/api/v1/admin/external-api-config/services",
headers=headers,
json=video_params,
timeout=30,

View File

@ -4,21 +4,20 @@ import json
import time
import asyncio
import traceback
import requests
import streamlit as st
from loguru import logger
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datetime import datetime
from app.config import config
from app.utils.script_generator import ScriptProcessor
from app.utils import utils, video_processor, video_processor_v2, qwenvl_analyzer
from app.utils import utils, video_processor
from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps, chekc_video_config
def generate_script_docu(tr, params):
def generate_script_docu(params):
"""
生成 纪录片 视频脚本
要求: 原视频无字幕无配音
适合场景: 纪录片动物搞笑解说荒野建造等
"""
progress_bar = st.progress(0)
status_text = st.empty()
@ -35,8 +34,9 @@ def generate_script_docu(tr, params):
if not params.video_origin_path:
st.error("请先选择视频文件")
return
# ===================提取键帧===================
"""
1. 提取键帧
"""
update_progress(10, "正在提取关键帧...")
# 创建临时目录用于存储关键帧
@ -64,21 +64,12 @@ def generate_script_docu(tr, params):
os.makedirs(video_keyframes_dir, exist_ok=True)
# 初始化视频处理器
if config.frames.get("version") == "v2":
processor = video_processor_v2.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
skip_seconds=st.session_state.get('skip_seconds'),
threshold=st.session_state.get('threshold')
)
else:
processor = video_processor.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video(
output_dir=video_keyframes_dir,
skip_seconds=0
)
processor = video_processor.VideoProcessor(params.video_origin_path)
# 处理视频并提取关键帧
processor.process_video_pipeline(
output_dir=video_keyframes_dir,
interval_seconds=st.session_state.get('frame_interval_input'),
)
# 获取所有关键文件路径
for filename in sorted(os.listdir(video_keyframes_dir)):
@ -101,9 +92,12 @@ def generate_script_docu(tr, params):
raise Exception(f"关键帧提取失败: {str(e)}")
# 根据不同的 LLM 提供商处理
"""
2. 视觉分析(批量分析每一帧)
"""
vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
logger.debug(f"Vision LLM 提供商: {vision_llm_provider}")
llm_params = dict()
logger.debug(f"VLM 视觉大模型提供商: {vision_llm_provider}")
try:
# ===================初始化视觉分析器===================
@ -114,14 +108,18 @@ def generate_script_docu(tr, params):
vision_api_key = st.session_state.get('vision_gemini_api_key')
vision_model = st.session_state.get('vision_gemini_model_name')
vision_base_url = st.session_state.get('vision_gemini_base_url')
elif vision_llm_provider == 'qwenvl':
vision_api_key = st.session_state.get('vision_qwenvl_api_key')
vision_model = st.session_state.get('vision_qwenvl_model_name', 'qwen-vl-max-latest')
vision_base_url = st.session_state.get('vision_qwenvl_base_url')
else:
raise ValueError(f"不支持的视觉分析提供商: {vision_llm_provider}")
vision_api_key = st.session_state.get(f'vision_{vision_llm_provider}_api_key')
vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name')
vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url')
# 创建视觉分析器实例
llm_params = {
"vision_provider": vision_llm_provider,
"vision_api_key": vision_api_key,
"vision_model_name": vision_model,
"vision_base_url": vision_base_url,
}
analyzer = create_vision_analyzer(
provider=vision_llm_provider,
api_key=vision_api_key,
@ -137,111 +135,245 @@ def generate_script_docu(tr, params):
# 执行异步分析
vision_batch_size = st.session_state.get('vision_batch_size') or config.frames.get("vision_batch_size")
vision_analysis_prompt = """
我提供了 %s 张视频帧它们按时间顺序排列代表一个连续的视频片段请仔细分析每一帧的内容并关注帧与帧之间的变化以理解整个片段的活动
首先请详细描述每一帧的关键视觉信息包含主要内容人物动作和场景
然后基于所有帧的分析请用**简洁的语言**总结整个视频片段中发生的主要活动或事件流程
请务必使用 JSON 格式输出你的结果JSON 结构应如下
{
"frame_observations": [
{
"frame_number": 1, // 或其他标识帧的方式
"observation": "描述每张视频帧中的主要内容、人物、动作和场景。"
},
// ... 更多帧的观察 ...
],
"overall_activity_summary": "在这里填写你总结的整个片段的主要活动,保持简洁。"
}
请务必不要遗漏视频帧我提供了 %s 张视频帧frame_observations 必须包含 %s 个元素
请只返回 JSON 字符串不要包含任何其他解释性文字
"""
results = loop.run_until_complete(
analyzer.analyze_images(
images=keyframe_files,
prompt=config.app.get('vision_analysis_prompt'),
prompt=vision_analysis_prompt,
batch_size=vision_batch_size
)
)
loop.close()
"""
3. 处理分析结果格式化为 json 数据
"""
# ===================处理分析结果===================
update_progress(60, "正在整理分析结果...")
# 合并所有批次的析结果
# 合并所有批次的析结果
frame_analysis = ""
merged_frame_observations = [] # 合并所有批次的帧观察
overall_activity_summaries = [] # 合并所有批次的整体总结
prev_batch_files = None
frame_counter = 1 # 初始化帧计数器,用于给所有帧分配连续的序号
# logger.debug(json.dumps(results, indent=4, ensure_ascii=False))
# 确保分析目录存在
analysis_dir = os.path.join(utils.storage_dir(), "temp", "analysis")
os.makedirs(analysis_dir, exist_ok=True)
origin_res = os.path.join(analysis_dir, "frame_analysis.json")
with open(origin_res, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# 开始处理
for result in results:
if 'error' in result:
logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}")
# 获取当前批次的文件列表 keyframe_001136_000045.jpg 将 000045 精度提升到 毫秒
continue
# 获取当前批次的文件列表
batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
logger.debug(f"批次 {result['batch_index']} 处理完成,共 {len(batch_files)} 张图片")
# logger.debug(batch_files)
first_timestamp, last_timestamp, _ = get_batch_timestamps(batch_files, prev_batch_files)
# 获取批次的时间戳范围
first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
logger.debug(f"处理时间戳: {first_timestamp}-{last_timestamp}")
# 添加带时间戳的分析结果
frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
frame_analysis += result['response']
frame_analysis += "\n"
# 解析响应中的JSON数据
response_text = result['response']
try:
# 处理可能包含```json```格式的响应
if "```json" in response_text:
json_content = response_text.split("```json")[1].split("```")[0].strip()
elif "```" in response_text:
json_content = response_text.split("```")[1].split("```")[0].strip()
else:
json_content = response_text.strip()
response_data = json.loads(json_content)
# 提取frame_observations和overall_activity_summary
if "frame_observations" in response_data:
frame_obs = response_data["frame_observations"]
overall_summary = response_data.get("overall_activity_summary", "")
# 添加时间戳信息到每个帧观察
for i, obs in enumerate(frame_obs):
if i < len(batch_files):
# 从文件名中提取时间戳
file_path = batch_files[i]
file_name = os.path.basename(file_path)
# 提取时间戳字符串 (格式如: keyframe_000675_000027000.jpg)
# 格式解析: keyframe_帧序号_毫秒时间戳.jpg
timestamp_parts = file_name.split('_')
if len(timestamp_parts) >= 3:
timestamp_str = timestamp_parts[-1].split('.')[0]
try:
# 修正时间戳解析逻辑
# 格式为000100000表示00:01:00,000即1分钟
# 需要按照对应位数进行解析:
# 前两位是小时,中间两位是分钟,后面是秒和毫秒
if len(timestamp_str) >= 9: # 确保格式正确
hours = int(timestamp_str[0:2])
minutes = int(timestamp_str[2:4])
seconds = int(timestamp_str[4:6])
milliseconds = int(timestamp_str[6:9])
# 计算总秒数
timestamp_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000
formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳
else:
# 兼容旧的解析方式
timestamp_seconds = int(timestamp_str) / 1000 # 转换为秒
formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳
except ValueError:
logger.warning(f"无法解析时间戳: {timestamp_str}")
timestamp_seconds = 0
formatted_time = "00:00:00,000"
else:
logger.warning(f"文件名格式不符合预期: {file_name}")
timestamp_seconds = 0
formatted_time = "00:00:00,000"
# 添加额外信息到帧观察
obs["frame_path"] = file_path
obs["timestamp"] = formatted_time
obs["timestamp_seconds"] = timestamp_seconds
obs["batch_index"] = result['batch_index']
# 使用全局递增的帧计数器替换原始的frame_number
if "frame_number" in obs:
obs["original_frame_number"] = obs["frame_number"] # 保留原始编号作为参考
obs["frame_number"] = frame_counter # 赋值连续的帧编号
frame_counter += 1 # 增加帧计数器
# 添加到合并列表
merged_frame_observations.append(obs)
# 添加批次整体总结信息
if overall_summary:
# 从文件名中提取时间戳数值
first_time_str = first_timestamp.split('_')[-1].split('.')[0]
last_time_str = last_timestamp.split('_')[-1].split('.')[0]
# 转换为毫秒并计算持续时间(秒)
try:
# 修正解析逻辑,与上面相同的方式解析时间戳
if len(first_time_str) >= 9 and len(last_time_str) >= 9:
# 解析第一个时间戳
first_hours = int(first_time_str[0:2])
first_minutes = int(first_time_str[2:4])
first_seconds = int(first_time_str[4:6])
first_ms = int(first_time_str[6:9])
first_time_seconds = first_hours * 3600 + first_minutes * 60 + first_seconds + first_ms / 1000
# 解析第二个时间戳
last_hours = int(last_time_str[0:2])
last_minutes = int(last_time_str[2:4])
last_seconds = int(last_time_str[4:6])
last_ms = int(last_time_str[6:9])
last_time_seconds = last_hours * 3600 + last_minutes * 60 + last_seconds + last_ms / 1000
batch_duration = last_time_seconds - first_time_seconds
else:
# 兼容旧的解析方式
first_time_ms = int(first_time_str)
last_time_ms = int(last_time_str)
batch_duration = (last_time_ms - first_time_ms) / 1000
except ValueError:
# 使用 utils.time_to_seconds 函数处理格式化的时间戳
first_time_seconds = utils.time_to_seconds(first_time_str.replace('_', ':').replace('-', ','))
last_time_seconds = utils.time_to_seconds(last_time_str.replace('_', ':').replace('-', ','))
batch_duration = last_time_seconds - first_time_seconds
overall_activity_summaries.append({
"batch_index": result['batch_index'],
"time_range": f"{first_timestamp}-{last_timestamp}",
"duration_seconds": batch_duration,
"summary": overall_summary
})
except Exception as e:
logger.error(f"解析批次 {result['batch_index']} 的响应数据失败: {str(e)}")
# 添加原始响应作为回退
frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n"
frame_analysis += response_text
frame_analysis += "\n"
# 更新上一个批次的文件
prev_batch_files = batch_files
# 将合并后的结果转为JSON字符串
merged_results = {
"frame_observations": merged_frame_observations,
"overall_activity_summaries": overall_activity_summaries
}
# 使用当前时间创建文件名
now = datetime.now()
timestamp_str = now.strftime("%Y%m%d_%H%M")
# 保存完整的分析结果为JSON
analysis_filename = f"frame_analysis_{timestamp_str}.json"
analysis_json_path = os.path.join(analysis_dir, analysis_filename)
with open(analysis_json_path, 'w', encoding='utf-8') as f:
json.dump(merged_results, f, ensure_ascii=False, indent=2)
logger.info(f"分析结果已保存到: {analysis_json_path}")
if not frame_analysis.strip():
raise Exception("未能生成有效的帧分析结果")
# 保存分析结果
analysis_path = os.path.join(utils.temp_dir(), "frame_analysis.txt")
with open(analysis_path, 'w', encoding='utf-8') as f:
f.write(frame_analysis)
update_progress(70, "正在生成脚本...")
"""
4. 生成文案
"""
logger.info("开始准备生成解说文案")
update_progress(80, "正在生成文案...")
from app.services.generate_narration_script import parse_frame_analysis_to_markdown, generate_narration
# 从配置中获取文本生成相关配置
text_provider = config.app.get('text_llm_provider', 'gemini').lower()
text_api_key = config.app.get(f'text_{text_provider}_api_key')
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
# 构建帧内容列表
frame_content_list = []
prev_batch_files = None
for i, result in enumerate(results):
if 'error' in result:
continue
batch_files = get_batch_files(keyframe_files, result, vision_batch_size)
_, _, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files)
frame_content = {
"timestamp": timestamp_range,
"picture": result['response'],
"narration": "",
"OST": 2
}
frame_content_list.append(frame_content)
logger.debug(f"添加帧内容: 时间范围={timestamp_range}, 分析结果长度={len(result['response'])}")
# 更新上一个批次的文件
prev_batch_files = batch_files
if not frame_content_list:
raise Exception("没有有效的帧内容可以处理")
# ===================开始生成文案===================
update_progress(80, "正在生成文案...")
# 校验配置
api_params = {
"vision_api_key": vision_api_key,
"vision_model_name": vision_model,
"vision_base_url": vision_base_url or "",
llm_params.update({
"text_provider": text_provider,
"text_api_key": text_api_key,
"text_model_name": text_model,
"text_base_url": text_base_url or ""
}
chekc_video_config(api_params)
custom_prompt = st.session_state.get('custom_prompt', '')
processor = ScriptProcessor(
model_name=text_model,
api_key=text_api_key,
prompt=custom_prompt,
base_url=text_base_url or "",
video_theme=st.session_state.get('video_theme', '')
"text_base_url": text_base_url
})
chekc_video_config(llm_params)
# 整理帧分析数据
markdown_output = parse_frame_analysis_to_markdown(analysis_json_path)
# 生成解说文案
narration = generate_narration(
markdown_output,
text_api_key,
base_url=text_base_url,
model=text_model
)
# 处理帧内容生成脚本
script_result = processor.process_frames(frame_content_list)
narration_dict = json.loads(narration)['items']
# 为 narration_dict 中每个 item 新增一个 OST: 2 的字段, 代表保留原声和配音
narration_dict = [{**item, "OST": 2} for item in narration_dict]
logger.debug(f"解说文案创作完成:\n{"\n".join([item['narration'] for item in narration_dict])}")
# 结果转换为JSON字符串
script = json.dumps(script_result, ensure_ascii=False, indent=2)
script = json.dumps(narration_dict, ensure_ascii=False, indent=2)
except Exception as e:
logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}")
@ -250,7 +382,7 @@ def generate_script_docu(tr, params):
if script is None:
st.error("生成脚本失败,请检查日志")
st.stop()
logger.info(f"脚本生成完成")
logger.success(f"剪辑脚本生成完成")
if isinstance(script, list):
st.session_state['video_clip_json'] = script
elif isinstance(script, str):

View File

@ -36,9 +36,10 @@ def generate_script_short(tr, params, custom_clips=5):
text_api_key = config.app.get(f'text_{text_provider}_api_key')
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
vision_api_key = st.session_state.get(f'vision_{text_provider}_api_key', "")
vision_model = st.session_state.get(f'vision_{text_provider}_model_name', "")
vision_base_url = st.session_state.get(f'vision_{text_provider}_base_url', "")
vision_llm_provider = st.session_state.get('vision_llm_providers').lower()
vision_api_key = st.session_state.get(f'vision_{vision_llm_provider}_api_key', "")
vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name', "")
vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url', "")
narrato_api_key = config.app.get('narrato_api_key')
update_progress(20, "开始准备生成脚本")
@ -50,9 +51,11 @@ def generate_script_short(tr, params, custom_clips=5):
st.stop()
api_params = {
"vision_provider": vision_llm_provider,
"vision_api_key": vision_api_key,
"vision_model_name": vision_model,
"vision_base_url": vision_base_url or "",
"text_provider": text_provider,
"text_api_key": text_api_key,
"text_model_name": text_model,
"text_base_url": text_base_url or ""
@ -65,8 +68,6 @@ def generate_script_short(tr, params, custom_clips=5):
api_key=text_api_key,
model_name=text_model,
base_url=text_base_url,
narrato_api_key=narrato_api_key,
bert_path="app/models/bert/",
custom_clips=custom_clips,
)

View File

@ -0,0 +1,127 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project: NarratoAI
@File : 短剧解说脚本生成
@Author : 小林同学
@Date : 2025/5/10 下午10:26
'''
import os
import json
import time
import traceback
import streamlit as st
from loguru import logger
from app.config import config
from app.services.SDE.short_drama_explanation import analyze_subtitle, generate_narration_script
def generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature):
"""
生成 短剧解说 视频脚本
要求: 提供高质量短剧字幕
适合场景: 短剧
"""
progress_bar = st.progress(0)
status_text = st.empty()
def update_progress(progress: float, message: str = ""):
progress_bar.progress(progress)
if message:
status_text.text(f"{progress}% - {message}")
else:
status_text.text(f"进度: {progress}%")
try:
with st.spinner("正在生成脚本..."):
if not params.video_origin_path:
st.error("请先选择视频文件")
return
"""
1. 获取字幕
"""
update_progress(30, "正在解析字幕...")
# 判断字幕文件是否存在
if not os.path.exists(subtitle_path):
st.error("字幕文件不存在")
return
"""
2. 分析字幕总结剧情
"""
text_provider = config.app.get('text_llm_provider', 'gemini').lower()
text_api_key = config.app.get(f'text_{text_provider}_api_key')
text_model = config.app.get(f'text_{text_provider}_model_name')
text_base_url = config.app.get(f'text_{text_provider}_base_url')
analysis_result = analyze_subtitle(
subtitle_file_path=subtitle_path,
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
save_result=True,
temperature=temperature
)
"""
3. 根据剧情生成解说文案
"""
if analysis_result["status"] == "success":
logger.info("字幕分析成功!")
update_progress(60, "正在生成文案...")
# 根据剧情生成解说文案
narration_result = generate_narration_script(
short_name=video_theme,
plot_analysis=analysis_result["analysis"],
api_key=text_api_key,
model=text_model,
base_url=text_base_url,
save_result=True,
temperature=temperature
)
if narration_result["status"] == "success":
logger.info("\n解说文案生成成功!")
logger.info(narration_result["narration_script"])
else:
logger.info(f"\n解说文案生成失败: {narration_result['message']}")
st.error("生成脚本失败,请检查日志")
st.stop()
else:
logger.error(f"分析失败: {analysis_result['message']}")
st.error("生成脚本失败,请检查日志")
st.stop()
"""
4. 生成文案
"""
logger.info("开始准备生成解说文案")
# 结果转换为JSON字符串
narration_script = narration_result["narration_script"]
narration_dict = json.loads(narration_script)
script = json.dumps(narration_dict['items'], ensure_ascii=False, indent=2)
if script is None:
st.error("生成脚本失败,请检查日志")
st.stop()
logger.success(f"剪辑脚本生成完成")
if isinstance(script, list):
st.session_state['video_clip_json'] = script
elif isinstance(script, str):
st.session_state['video_clip_json'] = json.loads(script)
update_progress(90, "整理输出...")
time.sleep(0.1)
progress_bar.progress(100)
status_text.text("脚本生成完成!")
st.success("视频脚本生成成功!")
except Exception as err:
st.error(f"生成过程中发生错误: {str(err)}")
logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}")
finally:
time.sleep(2)
progress_bar.empty()
status_text.empty()

View File

@ -1,8 +0,0 @@
from .performance import monitor_performance, PerformanceMonitor
from .cache import *
from .file_utils import *
__all__ = [
'monitor_performance',
'PerformanceMonitor'
]

View File

@ -1,9 +1,9 @@
"""
合并视频和字幕文件
"""
from moviepy.editor import VideoFileClip, concatenate_videoclips
import pysrt
import os
import pysrt
from moviepy import VideoFileClip, concatenate_videoclips
def get_video_duration(video_path):

View File

@ -1,37 +0,0 @@
import psutil
import os
from loguru import logger
import torch
class PerformanceMonitor:
@staticmethod
def monitor_memory():
process = psutil.Process(os.getpid())
memory_info = process.memory_info()
logger.debug(f"Memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
if torch.cuda.is_available():
gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024
logger.debug(f"GPU Memory usage: {gpu_memory:.2f} MB")
@staticmethod
def cleanup_resources():
if torch.cuda.is_available():
torch.cuda.empty_cache()
import gc
gc.collect()
PerformanceMonitor.monitor_memory()
def monitor_performance(func):
"""性能监控装饰器"""
def wrapper(*args, **kwargs):
try:
PerformanceMonitor.monitor_memory()
result = func(*args, **kwargs)
return result
finally:
PerformanceMonitor.cleanup_resources()
return wrapper