first commit

linyqh 2024-08-12 21:35:06 +08:00
commit e874999bd1
57 changed files with 7945 additions and 0 deletions

24
.dockerignore Normal file
@@ -0,0 +1,24 @@
# Exclude common Python files and directories
venv/
__pycache__/
*.pyc
*.pyo
*.pyd
*.pyz
*.pyw
*.pyi
*.egg-info/
# Exclude development and local files
.env
.env.*
*.log
*.db
# Exclude version control system files
.git/
.gitignore
.svn/
storage/
config.toml

24
.github/workflows/codeReview.yml vendored Normal file
@@ -0,0 +1,24 @@
name: Code Review
permissions:
contents: read
pull-requests: write
on:
# Triggered when a pull request is opened
pull_request:
types: [opened, reopened]
workflow_dispatch:
jobs:
codeReview:
runs-on: ubuntu-latest
steps:
- name: GPT code logic review
uses: anc95/ChatGPT-CodeReview@main
env:
GITHUB_TOKEN: ${{ secrets.GIT_TOKEN }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_API_ENDPOINT: https://api.groq.com/openai/v1
MODEL: llama-3.1-70b-versatile
LANGUAGE: Chinese

@@ -0,0 +1,35 @@
name: build_docker
on:
release:
types: [created] # triggered when a new Release is created
jobs:
build_docker:
name: Build docker
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and push
id: docker_build
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile
push: true
platforms: linux/amd64,linux/arm64
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/${{ github.event.repository.name }}:${{ github.ref_name }}
${{ secrets.DOCKERHUB_USERNAME }}/${{ github.event.repository.name }}:latest

29
.gitignore vendored Normal file
@@ -0,0 +1,29 @@
.DS_Store
/config.toml
/storage/
/.idea/
/app/services/__pycache__
/app/__pycache__/
/app/config/__pycache__/
/app/models/__pycache__/
/app/utils/__pycache__/
/*/__pycache__/*
.vscode
/**/.streamlit
__pycache__
logs/
node_modules
# VuePress default temporary file directory
/sites/docs/.vuepress/.temp
# VuePress default cache directory
/sites/docs/.vuepress/.cache
# VuePress default build output directory
/sites/docs/.vuepress/dist
# model directory
/models/
./models/*
resource/scripts/*
resource/videos/*
resource/songs/*
resource/fonts/*

45
Dockerfile Normal file
@@ -0,0 +1,45 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim-bullseye
# Set the working directory in the container
WORKDIR /NarratoAI
# Set /NarratoAI directory permissions to 777
RUN chmod 777 /NarratoAI
ENV PYTHONPATH="/NarratoAI"
# Install system dependencies
RUN apt-get update && apt-get install -y \
git \
imagemagick \
ffmpeg \
wget \
&& rm -rf /var/lib/apt/lists/*
# Fix security policy for ImageMagick
RUN sed -i '/<policy domain="path" rights="none" pattern="@\*"/d' /etc/ImageMagick-6/policy.xml
# Copy only the requirements.txt first to leverage Docker cache
COPY requirements.txt ./
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Now copy the rest of the codebase into the image
COPY . .
# Expose the port the app runs on
EXPOSE 8501
# Command to run the application
CMD ["streamlit", "run", "./webui/Main.py","--browser.serverAddress=127.0.0.1","--server.enableCORS=True","--browser.gatherUsageStats=False"]
# 1. Build the Docker image using the following command
# docker build -t narratoai .
# 2. Run the Docker container using the following command
## For Linux or MacOS:
# docker run -v $(pwd)/config.toml:/NarratoAI/config.toml -v $(pwd)/storage:/NarratoAI/storage -p 8501:8501 narratoai
## For Windows:
# docker run -v %cd%/config.toml:/NarratoAI/config.toml -v %cd%/storage:/NarratoAI/storage -p 8501:8501 narratoai

21
LICENSE Normal file
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 linyq
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

170
README-en.md Normal file
@@ -0,0 +1,170 @@
<div align="center">
<h1 align="center" style="font-size: 2cm;"> NarratoAI 😎 </h1>
<h3 align="center">All-in-One AI-Powered Video Narration + Automated Editing Tool🎬</h3>
<h3> 📖 <a href="README.md">Simplified Chinese</a> | English </h3>
<div align="center">
<a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
<br>
NarratoAI is an automated video narration tool that provides an all-in-one solution for script writing, automated video editing, voice-over, and subtitle generation, powered by LLM to enhance efficient content creation.
<br>
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI)
[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE)
[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues)
[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers)
[![Discord](https://img.shields.io/discord/1134848537704804432?style=for-the-badge)](https://dsc.gg/fuji-community)
<h3>Home</h3>
![](docs/index.png)
<h3>Video Review Interface</h3>
![](docs/check.png)
</div>
## System Requirements 📦
- Recommended minimum: CPU with 4 cores or more, 8GB RAM or more, GPU is not required
- Windows 10 or MacOS 11.0 or above
## Quick Start 🚀
### Apply for Google AI Studio Account
1. Visit https://aistudio.google.com/app/prompts/new_chat to apply for an account.
2. Click `Get API Key` to request an API Key.
3. Enter the obtained API Key into the `gemini_api_key` setting in the `config.example.toml` file.
### Configure Proxy VPN
> The method to configure VPN is not restricted, as long as you can access Google's network. Here, `clash` is used as an example.
1. Note the port of the clash service, usually `http://127.0.0.1:7890`.
2. If the port is not `7890`, modify the `VPN_PROXY_URL` in the `docker-compose.yml` file to your proxy address.
```yaml
environment:
- "VPN_PROXY_URL=http://host.docker.internal:7890" # Change to your proxy port; host.docker.internal represents the IP of the physical machine.
```
3. (Optional) Or modify the `proxy` settings in the `config.example.toml` file.
```toml
[proxy]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
http = "http://xx.xx.xx.xx:7890"
https = "http://xx.xx.xx.xx:7890"
```
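To confirm the proxy is reachable before starting the containers, a quick check along these lines can help (assuming clash listens on `7890`; adjust to your setup):
```shell
# Should print an HTTP status line if the proxy forwards traffic correctly
curl -I -x http://127.0.0.1:7890 https://www.google.com
```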
### Docker Deployment 🐳
#### ① Clone the Project and Start Docker
```shell
git clone https://github.com/linyqh/NarratoAI.git
cd NarratoAI
docker-compose up
```
#### ② Access the Web Interface
Open your browser and go to http://127.0.0.1:8501
#### ③ Access the API Documentation
Open your browser and go to http://127.0.0.1:8080/docs or http://127.0.0.1:8080/redoc
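A `/ping` health-check endpoint is defined in `app/controllers/ping.py`; assuming it is wired into the root router, the API can be probed with:
```shell
curl http://127.0.0.1:8080/ping
# Expected response: "pong"
```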
## Usage
#### 1. Basic Configuration: Select the Model and Enter the API Key
> Currently, only the `Gemini` model is supported. Other models will be added in future updates. Contributions are welcome via [PR](https://github.com/linyqh/NarratoAI/pulls) to join in the development 🎉🎉🎉 A minimal configuration sketch is shown after the screenshot below.
<div align="center">
<img src="docs/img001.png" alt="001" width="1000"/>
</div>
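For reference, a minimal `config.toml` sketch for the Gemini setup described above (key names follow `app/services/llm.py`; the model name is an assumption, substitute the one you have access to):
```toml
[app]
llm_provider = "gemini"
gemini_api_key = "your-api-key"
gemini_model_name = "gemini-1.5-pro"  # assumed model name
```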
#### 2. Select the Video for Narration and Click to Generate Video Script
> A demo video is included in the platform. To use your own video, place the mp4 file in the `resource/videos` directory and refresh your browser.
> Note: The filename can be anything, but it must not contain Chinese characters, special characters, spaces, backslashes, etc.
<div align="center">
<img src="docs/img002.png" alt="002" width="400"/>
</div>
#### 3. Save the Script and Start Editing
> After saving the script, refresh the browser, and the newly generated `.json` script file will appear in the script file dropdown. Select the json file and video to start editing.
<div align="center">
<img src="docs/img003.png" alt="003" width="400"/>
</div>
#### 4. Review the Video; if there are segments that don't meet the rules, click to regenerate or manually edit them.
<div align="center">
<img src="docs/img004.png" alt="003" width="1000"/>
</div>
#### 5. Configure Basic Video Parameters
<div align="center">
<img src="docs/img005.png" alt="003" width="700"/>
</div>
#### 6. Start Generating
<div align="center">
<img src="docs/img006.png" alt="003" width="1000"/>
</div>
#### 7. Video Generation Complete
<div align="center">
<img src="docs/img007.png" alt="003" width="1000"/>
</div>
## Development 💻
1. Install Dependencies
```shell
conda create -n narratoai python=3.10
conda activate narratoai
cd narratoai
pip install -r requirements.txt
```
2. Install ImageMagick
###### Windows:
- Download https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-36-Q16-x64-static.exe
- Install the downloaded ImageMagick, ensuring you do not change the installation path
- Update `imagemagick_path` in the `config.toml` file to your actual installation path (typically `C:\Program Files\ImageMagick-7.1.1-Q16\magick.exe`)
###### MacOS:
```shell
brew install imagemagick
```
###### Ubuntu
```shell
sudo apt-get install imagemagick
```
###### CentOS
```shell
sudo yum install ImageMagick
```
3. Launch the WebUI
```shell
streamlit run ./webui/Main.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False
```
4. Access http://127.0.0.1:8501
## Feedback & Suggestions 📢
### 👏👏👏 You can submit [issues](https://github.com/linyqh/NarratoAI/issues) or [pull requests](https://github.com/linyqh/NarratoAI/pulls) 🎉🎉🎉
## Reference Projects 📚
- https://github.com/FujiwaraChoki/MoneyPrinter
- https://github.com/harry0703/MoneyPrinterTurbo
This project was refactored based on the above projects with the addition of video narration features. Thanks to the original authors for their open-source spirit 🥳🥳🥳
## License 📝
Click to view the [`LICENSE`](LICENSE) file
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)

175
README.md Normal file
@@ -0,0 +1,175 @@
<div align="center">
<h1 align="center" style="font-size: 2cm;"> NarratoAI 😎📽️ </h1>
<h3 align="center">All-in-One AI-Powered Video Narration + Automated Editing Tool🎬🎞️ </h3>
<h3>📖 Simplified Chinese | <a href="README-en.md">English</a></h3>
<div align="center">
[//]: # ( <a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FNarratoAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>)
</div>
<br>
NarratoAI is an automated video narration tool that provides an all-in-one solution for script writing, automated video editing, voice-over, and subtitle generation, powered by LLMs to enable efficient content creation.
<br>
[![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI)
[![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE)
[![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues)
[![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers)
[![Discord](https://img.shields.io/discord/1134848537704804432?style=for-the-badge)](https://discord.gg/WBKChhmZ)
<h3>Home</h3>
![](docs/index.png)
<h3>Video Review Interface</h3>
![](docs/check.png)
</div>
## System Requirements 📦
- Recommended minimum: 4-core CPU or above, 8GB RAM or above; a GPU is not required
- Windows 10 or macOS 11.0 and above
## Quick Start 🚀
### Apply for a Google AI Studio Account
1. Visit https://aistudio.google.com/app/prompts/new_chat to sign up
2. Click `Get API Key` to request an API Key
3. Enter the obtained API Key into the `gemini_api_key` setting in the `config.example.toml` file
### Configure Proxy VPN
> Any VPN setup works as long as Google services are reachable; `clash` is used as the example here
1. Note the port of the clash service, usually `http://127.0.0.1:7890`
2. If the port is not `7890`, change `VPN_PROXY_URL` in the `docker-compose.yml` file to your proxy address
```yaml
environment:
- "VPN_PROXY_URL=http://host.docker.internal:7890" # 修改为你的代理端口host.docker.internal表示物理机的IP
```
3. (Optional) Or modify the `proxy` settings in the `config.example.toml` file
```toml
[proxy]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
http = "http://xx.xx.xx.xx:7890"
https = "http://xx.xx.xx.xx:7890"
```
### Docker Deployment 🐳
#### ① Clone the Project and Start Docker
```shell
git clone https://github.com/linyqh/NarratoAI.git
cd NarratoAI
docker-compose up
```
#### ② Access the Web Interface
Open your browser and go to http://127.0.0.1:8501
#### ③ Access the API Documentation
Open your browser and go to http://127.0.0.1:8080/docs or http://127.0.0.1:8080/redoc
## Usage
#### 1. Basic Configuration: Select the Model and Enter the API Key
> Currently, only the `Gemini` model is supported; other models will be added in future updates. Contributions are welcome via [PR](https://github.com/linyqh/NarratoAI/pulls) to join in the development 🎉🎉🎉
<div align="center">
<img src="docs/img001.png" alt="001" width="1000"/>
</div>
#### 2. Select the Video to Narrate and Click to Generate the Video Script
> A demo video is included in the platform. To use your own video, place the mp4 file in the `resource/videos` directory and refresh the browser.
> Note: the filename can be anything, but it must not contain Chinese characters, special characters, spaces, backslashes, etc.
<div align="center">
<img src="docs/img002.png" alt="002" width="400"/>
</div>
#### 3. Save the Script and Start Editing
> After saving the script, refresh the browser, and the newly generated `.json` script file will appear in the script file dropdown. Select the json file and the video to start editing.
<div align="center">
<img src="docs/img003.png" alt="003" width="400"/>
</div>
#### 4. Review the Video; if any segments don't meet the rules, click to regenerate or edit them manually
<div align="center">
<img src="docs/img004.png" alt="003" width="1000"/>
</div>
#### 5. Configure Basic Video Parameters
<div align="center">
<img src="docs/img005.png" alt="003" width="700"/>
</div>
#### 6. Start Generating
<div align="center">
<img src="docs/img006.png" alt="003" width="1000"/>
</div>
#### 7. Video Generation Complete
<div align="center">
<img src="docs/img007.png" alt="003" width="1000"/>
</div>
## Development 💻
1. Install Dependencies
```shell
conda create -n narratoai python=3.10
conda activate narratoai
cd narratoai
pip install -r requirements.txt
```
2. Install ImageMagick
###### Windows:
- Download https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-36-Q16-x64-static.exe
- Install the downloaded ImageMagick, and do not change the installation path
- Update `imagemagick_path` in the `config.toml` file to your actual installation path (typically `C:\Program Files\ImageMagick-7.1.1-Q16\magick.exe`)
###### MacOS:
```shell
brew install imagemagick
```
###### Ubuntu
```shell
sudo apt-get install imagemagick
```
###### CentOS
```shell
sudo yum install ImageMagick
```
3. Launch the WebUI
```shell
streamlit run ./webui/Main.py --browser.serverAddress=127.0.0.1 --server.enableCORS=True --browser.gatherUsageStats=False
```
4. Open http://127.0.0.1:8501
## Feedback & Suggestions 📢
### 👏👏👏 You can submit [issues](https://github.com/linyqh/NarratoAI/issues) or [pull requests](https://github.com/linyqh/NarratoAI/pulls) 🎉🎉🎉
## Reference Projects 📚
- https://github.com/FujiwaraChoki/MoneyPrinter
- https://github.com/harry0703/MoneyPrinterTurbo
This project was refactored from the above projects, adding video narration features. Thanks to the original authors for their open-source spirit 🥳🥳🥳
## License 📝
Click to view the [`LICENSE`](LICENSE) file
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date)

0
app/__init__.py Normal file

82
app/asgi.py Normal file
@@ -0,0 +1,82 @@
"""Application implementation - ASGI."""
import os
from fastapi import FastAPI, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from loguru import logger
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from app.config import config
from app.models.exception import HttpException
from app.router import root_api_router
from app.utils import utils
def exception_handler(request: Request, e: HttpException):
return JSONResponse(
status_code=e.status_code,
content=utils.get_response(e.status_code, e.data, e.message),
)
def validation_exception_handler(request: Request, e: RequestValidationError):
return JSONResponse(
status_code=400,
content=utils.get_response(
status=400, data=e.errors(), message="field required"
),
)
def get_application() -> FastAPI:
"""Initialize FastAPI application.
Returns:
FastAPI: Application object instance.
"""
instance = FastAPI(
title=config.project_name,
description=config.project_description,
version=config.project_version,
debug=False,
)
instance.include_router(root_api_router)
instance.add_exception_handler(HttpException, exception_handler)
instance.add_exception_handler(RequestValidationError, validation_exception_handler)
return instance
app = get_application()
# Configures the CORS middleware for the FastAPI app
cors_allowed_origins_str = os.getenv("CORS_ALLOWED_ORIGINS", "")
origins = cors_allowed_origins_str.split(",") if cors_allowed_origins_str else ["*"]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
task_dir = utils.task_dir()
app.mount(
"/tasks", StaticFiles(directory=task_dir, html=True, follow_symlink=True), name=""
)
public_dir = utils.public_dir()
app.mount("/", StaticFiles(directory=public_dir, html=True), name="")
@app.on_event("shutdown")
def shutdown_event():
logger.info("shutdown event")
@app.on_event("startup")
def startup_event():
logger.info("startup event")

56
app/config/__init__.py Normal file
@@ -0,0 +1,56 @@
import os
import sys
from loguru import logger
from app.config import config
from app.utils import utils
def __init_logger():
# _log_file = utils.storage_dir("logs/server.log")
_lvl = config.log_level
root_dir = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
def format_record(record):
# get the full file path from the log record
file_path = record["file"].path
# convert the absolute path to one relative to the project root
relative_path = os.path.relpath(file_path, root_dir)
# update the file path in the record
record["file"].path = f"./{relative_path}"
# return the modified format string
# adjust the format here as needed
_format = (
"<green>{time:%Y-%m-%d %H:%M:%S}</> | "
+ "<level>{level}</> | "
+ '"{file.path}:{line}":<blue> {function}</> '
+ "- <level>{message}</>"
+ "\n"
)
return _format
logger.remove()
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
)
# logger.add(
# _log_file,
# level=_lvl,
# format=format_record,
# rotation="00:00",
# retention="3 days",
# backtrace=True,
# diagnose=True,
# enqueue=True,
# )
__init_logger()
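With the `format_record` above, a log line renders roughly like this (illustrative output; the version string is hypothetical):
```
2024-08-12 21:35:06 | INFO | "./app/config/config.py:70": <module> - NarratoAI v0.1.0
```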

70
app/config/config.py Normal file
@@ -0,0 +1,70 @@
import os
import socket
import toml
import shutil
from loguru import logger
root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
config_file = f"{root_dir}/config.toml"
def load_config():
# fix: IsADirectoryError: [Errno 21] Is a directory: '/NarratoAI/config.toml'
if os.path.isdir(config_file):
shutil.rmtree(config_file)
if not os.path.isfile(config_file):
example_file = f"{root_dir}/config.example.toml"
if os.path.isfile(example_file):
shutil.copyfile(example_file, config_file)
logger.info(f"copy config.example.toml to config.toml")
logger.info(f"load config from file: {config_file}")
try:
_config_ = toml.load(config_file)
except Exception as e:
logger.warning(f"load config failed: {str(e)}, try to load as utf-8-sig")
with open(config_file, mode="r", encoding="utf-8-sig") as fp:
_cfg_content = fp.read()
_config_ = toml.loads(_cfg_content)
return _config_
def save_config():
with open(config_file, "w", encoding="utf-8") as f:
_cfg["app"] = app
_cfg["azure"] = azure
_cfg["ui"] = ui
f.write(toml.dumps(_cfg))
_cfg = load_config()
app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
proxy = _cfg.get("proxy", {})
azure = _cfg.get("azure", {})
ui = _cfg.get("ui", {})
hostname = socket.gethostname()
log_level = _cfg.get("log_level", "DEBUG")
listen_host = _cfg.get("listen_host", "0.0.0.0")
listen_port = _cfg.get("listen_port", 8080)
project_name = _cfg.get("project_name", "NarratoAI")
project_description = _cfg.get(
"project_description",
"<a href='https://github.com/linyqh/NarratoAI'>https://github.com/linyqh/NarratoAI</a>",
)
project_version = _cfg.get("app", {}).get("project_version")
reload_debug = False
imagemagick_path = app.get("imagemagick_path", "")
if imagemagick_path and os.path.isfile(imagemagick_path):
os.environ["IMAGEMAGICK_BINARY"] = imagemagick_path
ffmpeg_path = app.get("ffmpeg_path", "")
if ffmpeg_path and os.path.isfile(ffmpeg_path):
os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_path
logger.info(f"{project_name} v{project_version}")

31
app/controllers/base.py Normal file
@@ -0,0 +1,31 @@
from uuid import uuid4
from fastapi import Request
from app.config import config
from app.models.exception import HttpException
def get_task_id(request: Request):
task_id = request.headers.get("x-task-id")
if not task_id:
task_id = uuid4()
return str(task_id)
def get_api_key(request: Request):
api_key = request.headers.get("x-api-key")
return api_key
def verify_token(request: Request):
token = get_api_key(request)
if token != config.app.get("api_key", ""):
request_id = get_task_id(request)
request_url = request.url
user_agent = request.headers.get("user-agent")
raise HttpException(
task_id=request_id,
status_code=401,
message=f"invalid token: {request_url}, {user_agent}",
)
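The `verify_token` dependency above reads the key from an `x-api-key` header; when enabled, an authenticated request would look like this (hypothetical key, using the `/api/v1/musics` route defined later in `app/controllers/v1/video.py`):
```shell
curl -H "x-api-key: your-api-key" http://127.0.0.1:8080/api/v1/musics
```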

64
app/controllers/manager/base_manager.py Normal file
@@ -0,0 +1,64 @@
import threading
from typing import Callable, Any, Dict
class TaskManager:
def __init__(self, max_concurrent_tasks: int):
self.max_concurrent_tasks = max_concurrent_tasks
self.current_tasks = 0
self.lock = threading.Lock()
self.queue = self.create_queue()
def create_queue(self):
raise NotImplementedError()
def add_task(self, func: Callable, *args: Any, **kwargs: Any):
with self.lock:
if self.current_tasks < self.max_concurrent_tasks:
print(f"add task: {func.__name__}, current_tasks: {self.current_tasks}")
self.execute_task(func, *args, **kwargs)
else:
print(
f"enqueue task: {func.__name__}, current_tasks: {self.current_tasks}"
)
self.enqueue({"func": func, "args": args, "kwargs": kwargs})
def execute_task(self, func: Callable, *args: Any, **kwargs: Any):
thread = threading.Thread(
target=self.run_task, args=(func, *args), kwargs=kwargs
)
thread.start()
def run_task(self, func: Callable, *args: Any, **kwargs: Any):
try:
with self.lock:
self.current_tasks += 1
func(*args, **kwargs)  # invoke the function here, forwarding *args and **kwargs
finally:
self.task_done()
def check_queue(self):
with self.lock:
if (
self.current_tasks < self.max_concurrent_tasks
and not self.is_queue_empty()
):
task_info = self.dequeue()
func = task_info["func"]
args = task_info.get("args", ())
kwargs = task_info.get("kwargs", {})
self.execute_task(func, *args, **kwargs)
def task_done(self):
with self.lock:
self.current_tasks -= 1
self.check_queue()
def enqueue(self, task: Dict):
raise NotImplementedError()
def dequeue(self):
raise NotImplementedError()
def is_queue_empty(self):
raise NotImplementedError()

18
app/controllers/manager/memory_manager.py Normal file
@@ -0,0 +1,18 @@
from queue import Queue
from typing import Dict
from app.controllers.manager.base_manager import TaskManager
class InMemoryTaskManager(TaskManager):
def create_queue(self):
return Queue()
def enqueue(self, task: Dict):
self.queue.put(task)
def dequeue(self):
return self.queue.get()
def is_queue_empty(self):
return self.queue.empty()
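A minimal usage sketch of the managers above (the `render` function is hypothetical, standing in for a long-running job such as `task.start`). Note that `add_task` checks `current_tasks` under the lock, but the counter is only incremented once the worker thread starts, so a rapid burst of submissions can briefly exceed the limit:
```python
import time

from app.controllers.manager.memory_manager import InMemoryTaskManager


def render(task_id: str):
    # stand-in for a long-running job
    time.sleep(1)
    print(f"done: {task_id}")


manager = InMemoryTaskManager(max_concurrent_tasks=2)
for i in range(5):
    manager.add_task(render, task_id=str(i))
```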

56
app/controllers/manager/redis_manager.py Normal file
@@ -0,0 +1,56 @@
import json
from typing import Dict
import redis
from app.controllers.manager.base_manager import TaskManager
from app.models.schema import VideoParams
from app.services import task as tm
FUNC_MAP = {
"start": tm.start,
# 'start_test': tm.start_test
}
class RedisTaskManager(TaskManager):
def __init__(self, max_concurrent_tasks: int, redis_url: str):
self.redis_client = redis.Redis.from_url(redis_url)
super().__init__(max_concurrent_tasks)
def create_queue(self):
return "task_queue"
def enqueue(self, task: Dict):
task_with_serializable_params = task.copy()
if "params" in task["kwargs"] and isinstance(
task["kwargs"]["params"], VideoParams
):
task_with_serializable_params["kwargs"]["params"] = task["kwargs"][
"params"
].dict()
# convert the function object to its name
task_with_serializable_params["func"] = task["func"].__name__
self.redis_client.rpush(self.queue, json.dumps(task_with_serializable_params))
def dequeue(self):
task_json = self.redis_client.lpop(self.queue)
if task_json:
task_info = json.loads(task_json)
# convert the function name back to the function object
task_info["func"] = FUNC_MAP[task_info["func"]]
if "params" in task_info["kwargs"] and isinstance(
task_info["kwargs"]["params"], dict
):
task_info["kwargs"]["params"] = VideoParams(
**task_info["kwargs"]["params"]
)
return task_info
return None
def is_queue_empty(self):
return self.redis_client.llen(self.queue) == 0

14
app/controllers/ping.py Normal file
@@ -0,0 +1,14 @@
from fastapi import APIRouter
from fastapi import Request
router = APIRouter()
@router.get(
"/ping",
tags=["Health Check"],
description="Check service availability",
response_description="pong",
)
def ping(request: Request) -> str:
return "pong"

11
app/controllers/v1/base.py Normal file
@@ -0,0 +1,11 @@
from fastapi import APIRouter, Depends
def new_router(dependencies=None):
router = APIRouter()
router.tags = ["V1"]
router.prefix = "/api/v1"
# apply the auth dependency to all routes
if dependencies:
router.dependencies = dependencies
return router

44
app/controllers/v1/llm.py Normal file
@@ -0,0 +1,44 @@
from fastapi import Request
from app.controllers.v1.base import new_router
from app.models.schema import (
VideoScriptResponse,
VideoScriptRequest,
VideoTermsResponse,
VideoTermsRequest,
)
from app.services import llm
from app.utils import utils
# auth dependency
# router = new_router(dependencies=[Depends(base.verify_token)])
router = new_router()
@router.post(
"/scripts",
response_model=VideoScriptResponse,
summary="Create a script for the video",
)
def generate_video_script(request: Request, body: VideoScriptRequest):
video_script = llm.generate_script(
video_subject=body.video_subject,
language=body.video_language,
paragraph_number=body.paragraph_number,
)
response = {"video_script": video_script}
return utils.get_response(200, response)
@router.post(
"/terms",
response_model=VideoTermsResponse,
summary="Generate video terms based on the video script",
)
def generate_video_terms(request: Request, body: VideoTermsRequest):
video_terms = llm.generate_terms(
video_subject=body.video_subject,
video_script=body.video_script,
amount=body.amount,
)
response = {"video_terms": video_terms}
return utils.get_response(200, response)
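For reference, the `/scripts` endpoint above can be exercised like this (field names follow `VideoScriptParams` in `app/models/schema.py`):
```shell
curl -X POST http://127.0.0.1:8080/api/v1/scripts \
  -H "Content-Type: application/json" \
  -d '{"video_subject": "春天的花海", "video_language": "zh-CN", "paragraph_number": 1}'
```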

271
app/controllers/v1/video.py Normal file
@@ -0,0 +1,271 @@
import glob
import os
import pathlib
import shutil
from typing import Union
from fastapi import BackgroundTasks, Depends, Path, Request, UploadFile
from fastapi.params import File
from fastapi.responses import FileResponse, StreamingResponse
from loguru import logger
from app.config import config
from app.controllers import base
from app.controllers.manager.memory_manager import InMemoryTaskManager
from app.controllers.manager.redis_manager import RedisTaskManager
from app.controllers.v1.base import new_router
from app.models.exception import HttpException
from app.models.schema import (
AudioRequest,
BgmRetrieveResponse,
BgmUploadResponse,
SubtitleRequest,
TaskDeletionResponse,
TaskQueryRequest,
TaskQueryResponse,
TaskResponse,
TaskVideoRequest,
)
from app.services import state as sm
from app.services import task as tm
from app.utils import utils
# auth dependency
# router = new_router(dependencies=[Depends(base.verify_token)])
router = new_router()
_enable_redis = config.app.get("enable_redis", False)
_redis_host = config.app.get("redis_host", "localhost")
_redis_port = config.app.get("redis_port", 6379)
_redis_db = config.app.get("redis_db", 0)
_redis_password = config.app.get("redis_password", None)
_max_concurrent_tasks = config.app.get("max_concurrent_tasks", 5)
redis_url = f"redis://:{_redis_password}@{_redis_host}:{_redis_port}/{_redis_db}"
# choose the appropriate task manager based on configuration
if _enable_redis:
task_manager = RedisTaskManager(
max_concurrent_tasks=_max_concurrent_tasks, redis_url=redis_url
)
else:
task_manager = InMemoryTaskManager(max_concurrent_tasks=_max_concurrent_tasks)
@router.post("/videos", response_model=TaskResponse, summary="Generate a short video")
def create_video(
background_tasks: BackgroundTasks, request: Request, body: TaskVideoRequest
):
return create_task(request, body, stop_at="video")
@router.post("/subtitle", response_model=TaskResponse, summary="Generate subtitle only")
def create_subtitle(
background_tasks: BackgroundTasks, request: Request, body: SubtitleRequest
):
return create_task(request, body, stop_at="subtitle")
@router.post("/audio", response_model=TaskResponse, summary="Generate audio only")
def create_audio(
background_tasks: BackgroundTasks, request: Request, body: AudioRequest
):
return create_task(request, body, stop_at="audio")
def create_task(
request: Request,
body: Union[TaskVideoRequest, SubtitleRequest, AudioRequest],
stop_at: str,
):
task_id = utils.get_uuid()
request_id = base.get_task_id(request)
try:
task = {
"task_id": task_id,
"request_id": request_id,
"params": body.model_dump(),
}
sm.state.update_task(task_id)
task_manager.add_task(tm.start, task_id=task_id, params=body, stop_at=stop_at)
logger.success(f"Task created: {utils.to_json(task)}")
return utils.get_response(200, task)
except ValueError as e:
raise HttpException(
task_id=task_id, status_code=400, message=f"{request_id}: {str(e)}"
)
@router.get(
"/tasks/{task_id}", response_model=TaskQueryResponse, summary="Query task status"
)
def get_task(
request: Request,
task_id: str = Path(..., description="Task ID"),
query: TaskQueryRequest = Depends(),
):
endpoint = config.app.get("endpoint", "")
if not endpoint:
endpoint = str(request.base_url)
endpoint = endpoint.rstrip("/")
request_id = base.get_task_id(request)
task = sm.state.get_task(task_id)
if task:
task_dir = utils.task_dir()
def file_to_uri(file):
if not file.startswith(endpoint):
_uri_path = file.replace(task_dir, "tasks").replace("\\", "/")
_uri_path = f"{endpoint}/{_uri_path}"
else:
_uri_path = file
return _uri_path
if "videos" in task:
videos = task["videos"]
urls = []
for v in videos:
urls.append(file_to_uri(v))
task["videos"] = urls
if "combined_videos" in task:
combined_videos = task["combined_videos"]
urls = []
for v in combined_videos:
urls.append(file_to_uri(v))
task["combined_videos"] = urls
return utils.get_response(200, task)
raise HttpException(
task_id=task_id, status_code=404, message=f"{request_id}: task not found"
)
@router.delete(
"/tasks/{task_id}",
response_model=TaskDeletionResponse,
summary="Delete a generated short video task",
)
def delete_video(request: Request, task_id: str = Path(..., description="Task ID")):
request_id = base.get_task_id(request)
task = sm.state.get_task(task_id)
if task:
tasks_dir = utils.task_dir()
current_task_dir = os.path.join(tasks_dir, task_id)
if os.path.exists(current_task_dir):
shutil.rmtree(current_task_dir)
sm.state.delete_task(task_id)
logger.success(f"video deleted: {utils.to_json(task)}")
return utils.get_response(200)
raise HttpException(
task_id=task_id, status_code=404, message=f"{request_id}: task not found"
)
@router.get(
"/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
)
def get_bgm_list(request: Request):
suffix = "*.mp3"
song_dir = utils.song_dir()
files = glob.glob(os.path.join(song_dir, suffix))
bgm_list = []
for file in files:
bgm_list.append(
{
"name": os.path.basename(file),
"size": os.path.getsize(file),
"file": file,
}
)
response = {"files": bgm_list}
return utils.get_response(200, response)
@router.post(
"/musics",
response_model=BgmUploadResponse,
summary="Upload the BGM file to the songs directory",
)
def upload_bgm_file(request: Request, file: UploadFile = File(...)):
request_id = base.get_task_id(request)
# check file ext
if file.filename.endswith("mp3"):
song_dir = utils.song_dir()
save_path = os.path.join(song_dir, file.filename)
# save file
with open(save_path, "wb+") as buffer:
# If the file already exists, it will be overwritten
file.file.seek(0)
buffer.write(file.file.read())
response = {"file": save_path}
return utils.get_response(200, response)
raise HttpException(
"", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded"
)
@router.get("/stream/{file_path:path}")
async def stream_video(request: Request, file_path: str):
tasks_dir = utils.task_dir()
video_path = os.path.join(tasks_dir, file_path)
range_header = request.headers.get("Range")
video_size = os.path.getsize(video_path)
start, end = 0, video_size - 1
length = video_size
if range_header:
range_ = range_header.split("bytes=")[1]
start, end = [int(part) if part else None for part in range_.split("-")]
if start is None:
start = video_size - end
end = video_size - 1
if end is None:
end = video_size - 1
length = end - start + 1
def file_iterator(file_path, offset=0, bytes_to_read=None):
with open(file_path, "rb") as f:
f.seek(offset, os.SEEK_SET)
remaining = bytes_to_read or video_size
while remaining > 0:
bytes_to_read = min(4096, remaining)
data = f.read(bytes_to_read)
if not data:
break
remaining -= len(data)
yield data
response = StreamingResponse(
file_iterator(video_path, start, length), media_type="video/mp4"
)
response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}"
response.headers["Accept-Ranges"] = "bytes"
response.headers["Content-Length"] = str(length)
response.status_code = 206 # Partial Content
return response
@router.get("/download/{file_path:path}")
async def download_video(_: Request, file_path: str):
"""
download video
:param _: Request request
:param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4
:return: video file
"""
tasks_dir = utils.task_dir()
video_path = os.path.join(tasks_dir, file_path)
file_path = pathlib.Path(video_path)
filename = file_path.stem
extension = file_path.suffix
headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"}
return FileResponse(
path=video_path,
headers=headers,
filename=f"{filename}{extension}",
media_type=f"video/{extension[1:]}",
)
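The `/stream` endpoint above honors HTTP `Range` headers and replies with `206 Partial Content`; a quick check (task ID and file name are hypothetical, following the pattern in the `/download` docstring):
```shell
curl -s -D - -o /dev/null \
  -H "Range: bytes=0-1023" \
  "http://127.0.0.1:8080/api/v1/stream/cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4"
# Expect: HTTP/1.1 206 Partial Content and Content-Range: bytes 0-1023/<file size>
```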

0
app/models/__init__.py Normal file

25
app/models/const.py Normal file
@@ -0,0 +1,25 @@
PUNCTUATIONS = [
"?",
",",
".",
"",
";",
":",
"!",
"",
"",
"",
"",
"",
"",
"",
"",
"...",
]
TASK_STATE_FAILED = -1
TASK_STATE_COMPLETE = 1
TASK_STATE_PROCESSING = 4
FILE_TYPE_VIDEOS = ["mp4", "mov", "mkv", "webm"]
FILE_TYPE_IMAGES = ["jpg", "jpeg", "png", "bmp"]

28
app/models/exception.py Normal file
@@ -0,0 +1,28 @@
import traceback
from typing import Any
from loguru import logger
class HttpException(Exception):
def __init__(
self, task_id: str, status_code: int, message: str = "", data: Any = None
):
self.message = message
self.status_code = status_code
self.data = data
# capture the exception stack trace
tb_str = traceback.format_exc().strip()
if not tb_str or tb_str == "NoneType: None":
msg = f"HttpException: {status_code}, {task_id}, {message}"
else:
msg = f"HttpException: {status_code}, {task_id}, {message}\n{tb_str}"
if status_code == 400:
logger.warning(msg)
else:
logger.error(msg)
class FileNotFoundException(Exception):
pass

370
app/models/schema.py Normal file
@@ -0,0 +1,370 @@
import warnings
from enum import Enum
from typing import Any, List, Optional
import pydantic
from pydantic import BaseModel
# ignore specific Pydantic warnings
warnings.filterwarnings(
"ignore",
category=UserWarning,
message="Field name.*shadows an attribute in parent.*",
)
class VideoConcatMode(str, Enum):
random = "random"
sequential = "sequential"
class VideoAspect(str, Enum):
landscape = "16:9"
portrait = "9:16"
square = "1:1"
def to_resolution(self):
if self == VideoAspect.landscape.value:
return 1920, 1080
elif self == VideoAspect.portrait.value:
return 1080, 1920
elif self == VideoAspect.square.value:
return 1080, 1080
return 1080, 1920
class _Config:
arbitrary_types_allowed = True
@pydantic.dataclasses.dataclass(config=_Config)
class MaterialInfo:
provider: str = "pexels"
url: str = ""
duration: int = 0
# VoiceNames = [
# # zh-CN
# "female-zh-CN-XiaoxiaoNeural",
# "female-zh-CN-XiaoyiNeural",
# "female-zh-CN-liaoning-XiaobeiNeural",
# "female-zh-CN-shaanxi-XiaoniNeural",
#
# "male-zh-CN-YunjianNeural",
# "male-zh-CN-YunxiNeural",
# "male-zh-CN-YunxiaNeural",
# "male-zh-CN-YunyangNeural",
#
# # "female-zh-HK-HiuGaaiNeural",
# # "female-zh-HK-HiuMaanNeural",
# # "male-zh-HK-WanLungNeural",
# #
# # "female-zh-TW-HsiaoChenNeural",
# # "female-zh-TW-HsiaoYuNeural",
# # "male-zh-TW-YunJheNeural",
#
# # en-US
# "female-en-US-AnaNeural",
# "female-en-US-AriaNeural",
# "female-en-US-AvaNeural",
# "female-en-US-EmmaNeural",
# "female-en-US-JennyNeural",
# "female-en-US-MichelleNeural",
#
# "male-en-US-AndrewNeural",
# "male-en-US-BrianNeural",
# "male-en-US-ChristopherNeural",
# "male-en-US-EricNeural",
# "male-en-US-GuyNeural",
# "male-en-US-RogerNeural",
# "male-en-US-SteffanNeural",
# ]
class VideoParams(BaseModel):
"""
{
"video_subject": "",
"video_aspect": "横屏 16:9西瓜视频",
"voice_name": "女生-晓晓",
"bgm_name": "random",
"font_name": "STHeitiMedium 黑体-中",
"text_color": "#FFFFFF",
"font_size": 60,
"stroke_color": "#000000",
"stroke_width": 1.5
}
"""
video_subject: str
video_script: str = ""  # script used to generate the video
video_terms: Optional[str | list] = None  # keywords used to generate the video
video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
video_clip_duration: Optional[int] = 5
video_count: Optional[int] = 1
video_source: Optional[str] = "pexels"
video_materials: Optional[List[MaterialInfo]] = None  # materials used to generate the video
video_language: Optional[str] = "" # auto detect
voice_name: Optional[str] = ""
voice_volume: Optional[float] = 1.0
voice_rate: Optional[float] = 1.0
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
subtitle_enabled: Optional[bool] = True
subtitle_position: Optional[str] = "bottom" # top, bottom, center
custom_position: float = 70.0
font_name: Optional[str] = "STHeitiMedium.ttc"
text_fore_color: Optional[str] = "#FFFFFF"
text_background_color: Optional[str] = "transparent"
font_size: int = 60
stroke_color: Optional[str] = "#000000"
stroke_width: float = 1.5
n_threads: Optional[int] = 2
paragraph_number: Optional[int] = 1
class SubtitleRequest(BaseModel):
video_script: str
video_language: Optional[str] = ""
voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
voice_volume: Optional[float] = 1.0
voice_rate: Optional[float] = 1.2
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
subtitle_position: Optional[str] = "bottom"
font_name: Optional[str] = "STHeitiMedium.ttc"
text_fore_color: Optional[str] = "#FFFFFF"
text_background_color: Optional[str] = "transparent"
font_size: int = 60
stroke_color: Optional[str] = "#000000"
stroke_width: float = 1.5
video_source: Optional[str] = "local"
subtitle_enabled: Optional[str] = "true"
class AudioRequest(BaseModel):
video_script: str
video_language: Optional[str] = ""
voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
voice_volume: Optional[float] = 1.0
voice_rate: Optional[float] = 1.2
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
video_source: Optional[str] = "local"
class VideoScriptParams:
"""
{
"video_subject": "春天的花海",
"video_language": "",
"paragraph_number": 1
}
"""
video_subject: Optional[str] = "春天的花海"
video_language: Optional[str] = ""
paragraph_number: Optional[int] = 1
class VideoTermsParams:
"""
{
"video_subject": "",
"video_script": "",
"amount": 5
}
"""
video_subject: Optional[str] = "春天的花海"
video_script: Optional[str] = (
"春天的花海,如诗如画般展现在眼前。万物复苏的季节里,大地披上了一袭绚丽多彩的盛装。金黄的迎春、粉嫩的樱花、洁白的梨花、艳丽的郁金香……"
)
amount: Optional[int] = 5
class BaseResponse(BaseModel):
status: int = 200
message: Optional[str] = "success"
data: Any = None
class TaskVideoRequest(VideoParams, BaseModel):
pass
class TaskQueryRequest(BaseModel):
pass
class VideoScriptRequest(VideoScriptParams, BaseModel):
pass
class VideoTermsRequest(VideoTermsParams, BaseModel):
pass
######################################################################################################
######################################################################################################
######################################################################################################
######################################################################################################
class TaskResponse(BaseResponse):
class TaskResponseData(BaseModel):
task_id: str
data: TaskResponseData
class Config:
json_schema_extra = {
"example": {
"status": 200,
"message": "success",
"data": {"task_id": "6c85c8cc-a77a-42b9-bc30-947815aa0558"},
},
}
class TaskQueryResponse(BaseResponse):
class Config:
json_schema_extra = {
"example": {
"status": 200,
"message": "success",
"data": {
"state": 1,
"progress": 100,
"videos": [
"http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/final-1.mp4"
],
"combined_videos": [
"http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4"
],
},
},
}
class TaskDeletionResponse(BaseResponse):
class Config:
json_schema_extra = {
"example": {
"status": 200,
"message": "success",
"data": {
"state": 1,
"progress": 100,
"videos": [
"http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/final-1.mp4"
],
"combined_videos": [
"http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4"
],
},
},
}
class VideoScriptResponse(BaseResponse):
class Config:
json_schema_extra = {
"example": {
"status": 200,
"message": "success",
"data": {
"video_script": "春天的花海,是大自然的一幅美丽画卷。在这个季节里,大地复苏,万物生长,花朵争相绽放,形成了一片五彩斑斓的花海..."
},
},
}
class VideoTermsResponse(BaseResponse):
class Config:
json_schema_extra = {
"example": {
"status": 200,
"message": "success",
"data": {"video_terms": ["sky", "tree"]},
},
}
class BgmRetrieveResponse(BaseResponse):
class Config:
json_schema_extra = {
"example": {
"status": 200,
"message": "success",
"data": {
"files": [
{
"name": "output013.mp3",
"size": 1891269,
"file": "/NarratoAI/resource/songs/output013.mp3",
}
]
},
},
}
class BgmUploadResponse(BaseResponse):
class Config:
json_schema_extra = {
"example": {
"status": 200,
"message": "success",
"data": {"file": "/NarratoAI/resource/songs/example.mp3"},
},
}
class VideoClipParams(BaseModel):
video_subject: Optional[str] = "春天的花海让人心旷神怡"
video_clip_json: Optional[str] = ""  # video editing script
video_origin_path: Optional[str] = ""  # path of the original video
video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value  # video aspect ratio
video_clip_duration: Optional[int] = 5  # duration of each video clip
video_count: Optional[int] = 1  # number of video clips
video_source: Optional[str] = "local"
video_language: Optional[str] = ""  # auto detect
video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
# # Female
# "zh-CN-XiaoxiaoNeural",
# "zh-CN-XiaoyiNeural",
# # Male
# "zh-CN-YunjianNeural",
# "zh-CN-YunyangNeural",
# "zh-CN-YunxiNeural",
voice_name: Optional[str] = "zh-CN-YunjianNeural"  # voice name, chosen from the list above
voice_volume: Optional[float] = 1.0  # voice volume
voice_rate: Optional[float] = 1.0  # speech rate
bgm_name: Optional[str] = "random"  # background music name
bgm_type: Optional[str] = "random"  # background music type
bgm_file: Optional[str] = ""  # background music file
bgm_volume: Optional[float] = 0.2  # background music volume
subtitle_enabled: Optional[bool] = True  # whether subtitles are enabled
subtitle_position: Optional[str] = "bottom"  # top, bottom, center
font_name: Optional[str] = "STHeitiMedium.ttc"  # font name
text_fore_color: Optional[str] = "#FFFFFF"  # text foreground color
text_background_color: Optional[str] = "transparent"  # text background color
font_size: int = 60  # font size
stroke_color: Optional[str] = "#000000"  # text stroke color
stroke_width: float = 1.5  # text stroke width
n_threads: Optional[int] = 2  # number of threads
paragraph_number: Optional[int] = 1  # number of paragraphs

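A small sketch of how these Pydantic models behave (values beyond the defaults above are hypothetical):
```python
from app.models.schema import VideoAspect, VideoClipParams

params = VideoClipParams(
    video_origin_path="resource/videos/test.mp4",  # hypothetical local clip
    video_aspect=VideoAspect.landscape,
)
payload = params.model_dump(mode="json")  # JSON-safe dict; enums serialize to "16:9" etc.
print(payload["video_aspect"], params.voice_name)  # 16:9 zh-CN-YunjianNeural
```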
17
app/router.py Normal file
@@ -0,0 +1,17 @@
"""Application configuration - root APIRouter.
Defines all FastAPI application endpoints.
Resources:
1. https://fastapi.tiangolo.com/tutorial/bigger-applications
"""
from fastapi import APIRouter
from app.controllers.v1 import llm, video
root_api_router = APIRouter()
# v1
root_api_router.include_router(video.router)
root_api_router.include_router(llm.router)

0
app/services/__init__.py Normal file

477
app/services/llm.py Normal file
@@ -0,0 +1,477 @@
import logging
import re
import os
import json
from typing import List
from loguru import logger
from openai import OpenAI
from openai import AzureOpenAI
from openai.types.chat import ChatCompletion
import google.generativeai as gemini
from app.config import config
_max_retries = 5
def _generate_response(prompt: str) -> str:
content = ""
llm_provider = config.app.get("llm_provider", "openai")
logger.info(f"llm provider: {llm_provider}")
if llm_provider == "g4f":
model_name = config.app.get("g4f_model_name", "")
if not model_name:
model_name = "gpt-3.5-turbo-16k-0613"
import g4f
content = g4f.ChatCompletion.create(
model=model_name,
messages=[{"role": "user", "content": prompt}],
)
else:
api_version = "" # for azure
if llm_provider == "moonshot":
api_key = config.app.get("moonshot_api_key")
model_name = config.app.get("moonshot_model_name")
base_url = "https://api.moonshot.cn/v1"
elif llm_provider == "ollama":
# api_key = config.app.get("openai_api_key")
api_key = "ollama" # any string works but you are required to have one
model_name = config.app.get("ollama_model_name")
base_url = config.app.get("ollama_base_url", "")
if not base_url:
base_url = "http://localhost:11434/v1"
elif llm_provider == "openai":
api_key = config.app.get("openai_api_key")
model_name = config.app.get("openai_model_name")
base_url = config.app.get("openai_base_url", "")
if not base_url:
base_url = "https://api.openai.com/v1"
elif llm_provider == "oneapi":
api_key = config.app.get("oneapi_api_key")
model_name = config.app.get("oneapi_model_name")
base_url = config.app.get("oneapi_base_url", "")
elif llm_provider == "azure":
api_key = config.app.get("azure_api_key")
model_name = config.app.get("azure_model_name")
base_url = config.app.get("azure_base_url", "")
api_version = config.app.get("azure_api_version", "2024-02-15-preview")
elif llm_provider == "gemini":
api_key = config.app.get("gemini_api_key")
model_name = config.app.get("gemini_model_name")
base_url = "***"
elif llm_provider == "qwen":
api_key = config.app.get("qwen_api_key")
model_name = config.app.get("qwen_model_name")
base_url = "***"
elif llm_provider == "cloudflare":
api_key = config.app.get("cloudflare_api_key")
model_name = config.app.get("cloudflare_model_name")
account_id = config.app.get("cloudflare_account_id")
base_url = "***"
elif llm_provider == "deepseek":
api_key = config.app.get("deepseek_api_key")
model_name = config.app.get("deepseek_model_name")
base_url = config.app.get("deepseek_base_url")
if not base_url:
base_url = "https://api.deepseek.com"
elif llm_provider == "ernie":
api_key = config.app.get("ernie_api_key")
secret_key = config.app.get("ernie_secret_key")
base_url = config.app.get("ernie_base_url")
model_name = "***"
if not secret_key:
raise ValueError(
f"{llm_provider}: secret_key is not set, please set it in the config.toml file."
)
else:
raise ValueError(
"llm_provider is not set, please set it in the config.toml file."
)
if not api_key:
raise ValueError(
f"{llm_provider}: api_key is not set, please set it in the config.toml file."
)
if not model_name:
raise ValueError(
f"{llm_provider}: model_name is not set, please set it in the config.toml file."
)
if not base_url:
raise ValueError(
f"{llm_provider}: base_url is not set, please set it in the config.toml file."
)
if llm_provider == "qwen":
import dashscope
from dashscope.api_entities.dashscope_response import GenerationResponse
dashscope.api_key = api_key
response = dashscope.Generation.call(
model=model_name, messages=[{"role": "user", "content": prompt}]
)
if response:
if isinstance(response, GenerationResponse):
status_code = response.status_code
if status_code != 200:
raise Exception(
f'[{llm_provider}] returned an error response: "{response}"'
)
content = response["output"]["text"]
return content.replace("\n", "")
else:
raise Exception(
f'[{llm_provider}] returned an invalid response: "{response}"'
)
else:
raise Exception(f"[{llm_provider}] returned an empty response")
if llm_provider == "gemini":
import google.generativeai as genai
genai.configure(api_key=api_key, transport="rest")
generation_config = {
"temperature": 0.5,
"top_p": 1,
"top_k": 1,
"max_output_tokens": 2048,
}
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_ONLY_HIGH",
},
]
model = genai.GenerativeModel(
model_name=model_name,
generation_config=generation_config,
safety_settings=safety_settings,
)
generated_text = ""
try:
response = model.generate_content(prompt)
candidates = response.candidates
generated_text = candidates[0].content.parts[0].text
except (AttributeError, IndexError) as e:
print("Gemini Error:", e)
return generated_text
if llm_provider == "cloudflare":
import requests
response = requests.post(
f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model_name}",
headers={"Authorization": f"Bearer {api_key}"},
json={
"messages": [
{"role": "system", "content": "You are a friendly assistant"},
{"role": "user", "content": prompt},
]
},
)
result = response.json()
logger.info(result)
return result["result"]["response"]
if llm_provider == "ernie":
import requests
params = {
"grant_type": "client_credentials",
"client_id": api_key,
"client_secret": secret_key,
}
access_token = (
requests.post("https://aip.baidubce.com/oauth/2.0/token", params=params)
.json()
.get("access_token")
)
url = f"{base_url}?access_token={access_token}"
payload = json.dumps(
{
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.5,
"top_p": 0.8,
"penalty_score": 1,
"disable_search": False,
"enable_citation": False,
"response_format": "text",
}
)
headers = {"Content-Type": "application/json"}
response = requests.request(
"POST", url, headers=headers, data=payload
).json()
return response.get("result")
if llm_provider == "azure":
client = AzureOpenAI(
api_key=api_key,
api_version=api_version,
azure_endpoint=base_url,
)
else:
client = OpenAI(
api_key=api_key,
base_url=base_url,
)
response = client.chat.completions.create(
model=model_name, messages=[{"role": "user", "content": prompt}]
)
if response:
if isinstance(response, ChatCompletion):
content = response.choices[0].message.content
else:
raise Exception(
f'[{llm_provider}] returned an invalid response: "{response}", please check your network '
f"connection and try again."
)
else:
raise Exception(
f"[{llm_provider}] returned an empty response, please check your network connection and try again."
)
return content.replace("\n", "")
def generate_script(
video_subject: str, language: str = "", paragraph_number: int = 1
) -> str:
prompt = f"""
# Role: Video Script Generator
## Goals:
Generate a script for a video, depending on the subject of the video.
## Constrains:
1. the script is to be returned as a string with the specified number of paragraphs.
2. do not under any circumstance reference this prompt in your response.
3. get straight to the point, don't start with unnecessary things like, "welcome to this video".
4. you must not include any type of markdown or formatting in the script, never use a title.
5. only return the raw content of the script.
6. do not include "voiceover", "narrator" or similar indicators of what should be spoken at the beginning of each paragraph or line.
7. you must not mention the prompt, or anything about the script itself. also, never talk about the amount of paragraphs or lines. just write the script.
8. respond in the same language as the video subject.
# Initialization:
- video subject: {video_subject}
- number of paragraphs: {paragraph_number}
""".strip()
if language:
prompt += f"\n- language: {language}"
final_script = ""
logger.info(f"subject: {video_subject}")
def format_response(response):
# Clean the script
# Remove asterisks, hashes
response = response.replace("*", "")
response = response.replace("#", "")
# Remove markdown syntax
response = re.sub(r"\[.*\]", "", response)
response = re.sub(r"\(.*\)", "", response)
# Split the script into paragraphs
paragraphs = response.split("\n\n")
# Select the specified number of paragraphs
selected_paragraphs = paragraphs[:paragraph_number]
# Join the selected paragraphs into a single string
return "\n\n".join(paragraphs)
for i in range(_max_retries):
try:
response = _generate_response(prompt=prompt)
if response:
final_script = format_response(response)
else:
logging.error("gpt returned an empty response")
# g4f may return an error message
if final_script and "当日额度已消耗完" in final_script:
raise ValueError(final_script)
if final_script:
break
except Exception as e:
logger.error(f"failed to generate script: {e}")
if i < _max_retries:
logger.warning(f"failed to generate video script, trying again... {i + 1}")
logger.success(f"completed: \n{final_script}")
return final_script.strip()
def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
prompt = f"""
# Role: Video Search Terms Generator
## Goals:
Generate {amount} search terms for stock videos, depending on the subject of a video.
## Constrains:
1. the search terms are to be returned as a json-array of strings.
2. each search term should consist of 1-3 words, always add the main subject of the video.
3. you must only return the json-array of strings. you must not return anything else. you must not return the script.
4. the search terms must be related to the subject of the video.
5. reply with english search terms only.
## Output Example:
["search term 1", "search term 2", "search term 3","search term 4","search term 5"]
## Context:
### Video Subject
{video_subject}
### Video Script
{video_script}
Please note that you must use English for generating video search terms; Chinese is not accepted.
""".strip()
logger.info(f"subject: {video_subject}")
search_terms = []
response = ""
for i in range(_max_retries):
try:
response = _generate_response(prompt)
search_terms = json.loads(response)
if not isinstance(search_terms, list) or not all(
isinstance(term, str) for term in search_terms
):
logger.error("response is not a list of strings.")
continue
except Exception as e:
logger.warning(f"failed to generate video terms: {str(e)}")
if response:
match = re.search(r"\[.*]", response)
if match:
try:
search_terms = json.loads(match.group())
except Exception as e:
logger.warning(f"failed to generate video terms: {str(e)}")
pass
if search_terms and len(search_terms) > 0:
break
if i < _max_retries:
logger.warning(f"failed to generate video terms, trying again... {i + 1}")
logger.success(f"completed: \n{search_terms}")
return search_terms
def gemini_video2json(video_origin_name: str, video_origin_path: str, video_plot: str) -> str:
'''
Use gemini-1.5-pro to analyze a video for narration
Args:
video_origin_name: str - original name of the video
video_origin_path: str - original path of the video
video_plot: str - synopsis or plot summary of the video
Return:
str - parsed result as a JSON-formatted string
'''
api_key = config.app.get("gemini_api_key")
model_name = config.app.get("gemini_model_name")
gemini.configure(api_key=api_key)
model = gemini.GenerativeModel(model_name=model_name)
prompt = """
# Role: 影视解说专家
## Background:
擅长根据剧情描述视频的画面和故事能够生成一段非常有趣的解说文案
## Goals:
1. 根据剧情描述视频的画面和故事并对重要的画面进行展开叙述
2. 根据剧情内容生成符合 tiktok/抖音 风格的影视解说文案
3. 将结果直接以json格式输出给用户需要包含字段 picture 画面描述 timestamp 时间戳 narration 解说文案
4. 剧情内容如下{%s}
## Skills
- 精通 tiktok/抖音 等短视频影视解说文案撰写
- 能够理解视频中的故事和画面表现
- 能精准匹配视频中的画面和时间戳
- 能精准把控旁白和时长
- 精通中文
- 精通JSON数据格式
## Constrains
- 解说文案的时长要和时间戳的时长尽量匹配
- 忽略视频中关于广告的内容
- 忽略视频中片头和片尾
- 不得在脚本中包含任何类型的 Markdown 或格式
## Format
- 对应JSON的key为picture timestamp narration
""" % video_plot
logger.debug(f"视频名称: {video_origin_name}")
try:
gemini_video_file = gemini.upload_file(video_origin_path)
logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}")
while gemini_video_file.state.name == "PROCESSING":
import time
time.sleep(1)
gemini_video_file = gemini.get_file(gemini_video_file.name)
logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}")
if gemini_video_file.state.name == "FAILED":
raise ValueError(gemini_video_file.state.name)
    except Exception as e:
        logger.error(f"上传视频至 Google cloud 失败: {e}")
        raise TimeoutError("上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确") from e
streams = model.generate_content([prompt, gemini_video_file], stream=True)
response = []
for chunk in streams:
response.append(chunk.text)
response = "".join(response)
logger.success(f"llm response: \n{response}")
return response
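# Expected (but not enforced) response shape, per the prompt above: a JSON array like
# [{"picture": "...", "timestamp": "00:00-00:20", "narration": "..."}, ...]
# Callers should still validate the output, since the model may deviate from the format.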
if __name__ == "__main__":
juqin = ""
res = gemini_video2json("test", "/NarratoAI/resource/videos/test.mp4", juqin)
print(res)
# video_subject = "生命的意义是什么"
# script = generate_script(
# video_subject=video_subject, language="zh-CN", paragraph_number=1
# )
# print("######################")
# print(script)
# search_terms = generate_terms(
# video_subject=video_subject, video_script=script, amount=5
# )
# print("######################")
# print(search_terms)

335
app/services/material.py Normal file
View File

@ -0,0 +1,335 @@
import os
import random
from urllib.parse import urlencode
import requests
from typing import List
from loguru import logger
from moviepy.video.io.VideoFileClip import VideoFileClip
from app.config import config
from app.models.schema import VideoAspect, VideoConcatMode, MaterialInfo
from app.utils import utils
requested_count = 0
def get_api_key(cfg_key: str):
api_keys = config.app.get(cfg_key)
if not api_keys:
raise ValueError(
f"\n\n##### {cfg_key} is not set #####\n\nPlease set it in the config.toml file: {config.config_file}\n\n"
f"{utils.to_json(config.app)}"
)
# if only one key is provided, return it
if isinstance(api_keys, str):
return api_keys
global requested_count
requested_count += 1
return api_keys[requested_count % len(api_keys)]
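# Round-robin sketch (hypothetical keys): with pexels_api_keys = ["key-a", "key-b"]
# in config.toml, the first call returns "key-b", the next "key-a", and so on --
# a simple modulo rotation over a shared counter (note: not thread-safe).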
def search_videos_pexels(
search_term: str,
minimum_duration: int,
video_aspect: VideoAspect = VideoAspect.portrait,
) -> List[MaterialInfo]:
aspect = VideoAspect(video_aspect)
video_orientation = aspect.name
video_width, video_height = aspect.to_resolution()
api_key = get_api_key("pexels_api_keys")
headers = {"Authorization": api_key}
# Build URL
params = {"query": search_term, "per_page": 20, "orientation": video_orientation}
query_url = f"https://api.pexels.com/videos/search?{urlencode(params)}"
logger.info(f"searching videos: {query_url}, with proxies: {config.proxy}")
try:
r = requests.get(
query_url,
headers=headers,
proxies=config.proxy,
verify=False,
timeout=(30, 60),
)
response = r.json()
video_items = []
if "videos" not in response:
logger.error(f"search videos failed: {response}")
return video_items
videos = response["videos"]
# loop through each video in the result
for v in videos:
duration = v["duration"]
# check if video has desired minimum duration
if duration < minimum_duration:
continue
video_files = v["video_files"]
# loop through each url to determine the best quality
for video in video_files:
w = int(video["width"])
h = int(video["height"])
if w == video_width and h == video_height:
item = MaterialInfo()
item.provider = "pexels"
item.url = video["link"]
item.duration = duration
video_items.append(item)
break
return video_items
except Exception as e:
logger.error(f"search videos failed: {str(e)}")
return []
def search_videos_pixabay(
search_term: str,
minimum_duration: int,
video_aspect: VideoAspect = VideoAspect.portrait,
) -> List[MaterialInfo]:
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
api_key = get_api_key("pixabay_api_keys")
# Build URL
params = {
"q": search_term,
"video_type": "all", # Accepted values: "all", "film", "animation"
"per_page": 50,
"key": api_key,
}
query_url = f"https://pixabay.com/api/videos/?{urlencode(params)}"
logger.info(f"searching videos: {query_url}, with proxies: {config.proxy}")
try:
r = requests.get(
query_url, proxies=config.proxy, verify=False, timeout=(30, 60)
)
response = r.json()
video_items = []
if "hits" not in response:
logger.error(f"search videos failed: {response}")
return video_items
videos = response["hits"]
# loop through each video in the result
for v in videos:
duration = v["duration"]
# check if video has desired minimum duration
if duration < minimum_duration:
continue
video_files = v["videos"]
# loop through each url to determine the best quality
for video_type in video_files:
video = video_files[video_type]
w = int(video["width"])
h = int(video["height"])
if w >= video_width:
item = MaterialInfo()
item.provider = "pixabay"
item.url = video["url"]
item.duration = duration
video_items.append(item)
break
return video_items
except Exception as e:
logger.error(f"search videos failed: {str(e)}")
return []
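# Note: unlike the Pexels search above, which requires an exact match on the
# target resolution, the Pixabay search accepts the first rendition at least as
# wide as the target and relies on the later resize step in video.combine_videos.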
def save_video(video_url: str, save_dir: str = "") -> str:
if not save_dir:
save_dir = utils.storage_dir("cache_videos")
if not os.path.exists(save_dir):
os.makedirs(save_dir)
url_without_query = video_url.split("?")[0]
url_hash = utils.md5(url_without_query)
video_id = f"vid-{url_hash}"
video_path = f"{save_dir}/{video_id}.mp4"
# if video already exists, return the path
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
logger.info(f"video already exists: {video_path}")
return video_path
# if video does not exist, download it
with open(video_path, "wb") as f:
f.write(
requests.get(
video_url, proxies=config.proxy, verify=False, timeout=(60, 240)
).content
)
    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
        try:
            clip = VideoFileClip(video_path)
            duration = clip.duration
            fps = clip.fps
            clip.close()
            if duration > 0 and fps > 0:
                return video_path
        except Exception as e:
            logger.warning(f"无效的视频文件: {video_path} => {str(e)}")
        # remove the broken file so it is not served from the cache next time
        try:
            os.remove(video_path)
        except OSError as e:
            logger.warning(f"删除无效视频文件失败: {video_path} => {str(e)}")
    return ""
def download_videos(
task_id: str,
search_terms: List[str],
source: str = "pexels",
video_aspect: VideoAspect = VideoAspect.portrait,
    video_concat_mode: VideoConcatMode = VideoConcatMode.random,
audio_duration: float = 0.0,
max_clip_duration: int = 5,
) -> List[str]:
valid_video_items = []
valid_video_urls = []
found_duration = 0.0
search_videos = search_videos_pexels
if source == "pixabay":
search_videos = search_videos_pixabay
for search_term in search_terms:
video_items = search_videos(
search_term=search_term,
minimum_duration=max_clip_duration,
video_aspect=video_aspect,
)
logger.info(f"found {len(video_items)} videos for '{search_term}'")
for item in video_items:
if item.url not in valid_video_urls:
valid_video_items.append(item)
valid_video_urls.append(item.url)
found_duration += item.duration
logger.info(
f"found total videos: {len(valid_video_items)}, required duration: {audio_duration} seconds, found duration: {found_duration} seconds"
)
video_paths = []
material_directory = config.app.get("material_directory", "").strip()
if material_directory == "task":
material_directory = utils.task_dir(task_id)
elif material_directory and not os.path.isdir(material_directory):
material_directory = ""
    if video_concat_mode.value == VideoConcatMode.random.value:
random.shuffle(valid_video_items)
total_duration = 0.0
for item in valid_video_items:
try:
logger.info(f"downloading video: {item.url}")
saved_video_path = save_video(
video_url=item.url, save_dir=material_directory
)
if saved_video_path:
logger.info(f"video saved: {saved_video_path}")
video_paths.append(saved_video_path)
seconds = min(max_clip_duration, item.duration)
total_duration += seconds
if total_duration > audio_duration:
logger.info(
f"total duration of downloaded videos: {total_duration} seconds, skip downloading more"
)
break
except Exception as e:
logger.error(f"failed to download video: {utils.to_json(item)} => {str(e)}")
logger.success(f"downloaded {len(video_paths)} videos")
return video_paths
def save_clip_video(timestamp: str, origin_video: str, save_dir: str = "") -> dict:
"""
保存剪辑后的视频
Args:
timestamp: 需要裁剪的单个时间戳'00:36-00:40'
origin_video: 原视频路径
save_dir: 存储目录
Returns:
裁剪后的视频路径
"""
if not save_dir:
save_dir = utils.storage_dir("cache_videos")
if not os.path.exists(save_dir):
os.makedirs(save_dir)
video_id = f"vid-{timestamp.replace(':', '_')}"
video_path = f"{save_dir}/{video_id}.mp4"
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
logger.info(f"video already exists: {video_path}")
return {timestamp: video_path}
    # cut the requested segment out of the source video
start, end = utils.split_timestamp(timestamp)
video = VideoFileClip(origin_video).subclip(start, end)
video.write_videofile(video_path)
    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
        try:
            clip = VideoFileClip(video_path)
            duration = clip.duration
            fps = clip.fps
            clip.close()
            if duration > 0 and fps > 0:
                return {timestamp: video_path}
        except Exception as e:
            logger.warning(str(e))
        # remove the broken clip so it is not reused from the cache
        try:
            os.remove(video_path)
        except OSError as e:
            logger.warning(str(e))
    logger.warning(f"无效的视频文件: {video_path}")
    return {}
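# Usage sketch (hypothetical paths): save_clip_video("00:36-00:40", "movie.mp4")
# returns {"00:36-00:40": "<save_dir>/vid-00_36-00_40.mp4"} on success, {} otherwise.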
def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, ) -> dict:
"""
剪辑视频
Args:
task_id: 任务id
timestamp_terms: 需要剪辑的时间戳列表:['00:00-00:20', '00:36-00:40', '07:07-07:22']
origin_video: 原视频路径
Returns:
剪辑后的视频路径
"""
    video_paths = {}
    logger.info(f"需要从 '{origin_video}' 裁剪 {len(timestamp_terms)} 个视频")
    material_directory = config.app.get("material_directory", "").strip()
    if material_directory == "task":
        material_directory = utils.task_dir(task_id)
    elif material_directory and not os.path.isdir(material_directory):
        material_directory = ""
    for item in timestamp_terms:
        try:
            logger.info(f"clip video: {item}")
            saved_video_path = save_clip_video(timestamp=item, origin_video=origin_video, save_dir=material_directory)
            if saved_video_path:
                logger.info(f"video saved: {saved_video_path}")
                video_paths.update(saved_video_path)
        except Exception as e:
            logger.error(f"视频裁剪失败: {utils.to_json(item)} => {str(e)}")
            return {}
    logger.success(f"裁剪完成, 共 {len(video_paths)} 个视频")
    return video_paths
if __name__ == "__main__":
download_videos(
"test123", ["Money Exchange Medium"], audio_duration=100, source="pixabay"
)

122
app/services/state.py Normal file
View File

@ -0,0 +1,122 @@
import ast
from abc import ABC, abstractmethod
from app.config import config
from app.models import const
# Base class for state management
class BaseState(ABC):
@abstractmethod
def update_task(self, task_id: str, state: int, progress: int = 0, **kwargs):
pass
    @abstractmethod
    def get_task(self, task_id: str):
        pass
    @abstractmethod
    def delete_task(self, task_id: str):
        pass
# Memory state management
class MemoryState(BaseState):
def __init__(self):
self._tasks = {}
def update_task(
self,
task_id: str,
state: int = const.TASK_STATE_PROCESSING,
progress: int = 0,
**kwargs,
):
progress = int(progress)
if progress > 100:
progress = 100
self._tasks[task_id] = {
"state": state,
"progress": progress,
**kwargs,
}
def get_task(self, task_id: str):
return self._tasks.get(task_id, None)
def delete_task(self, task_id: str):
if task_id in self._tasks:
del self._tasks[task_id]
# Redis state management
class RedisState(BaseState):
def __init__(self, host="localhost", port=6379, db=0, password=None):
import redis
self._redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
def update_task(
self,
task_id: str,
state: int = const.TASK_STATE_PROCESSING,
progress: int = 0,
**kwargs,
):
progress = int(progress)
if progress > 100:
progress = 100
fields = {
"state": state,
"progress": progress,
**kwargs,
}
for field, value in fields.items():
self._redis.hset(task_id, field, str(value))
def get_task(self, task_id: str):
task_data = self._redis.hgetall(task_id)
if not task_data:
return None
task = {
key.decode("utf-8"): self._convert_to_original_type(value)
for key, value in task_data.items()
}
return task
def delete_task(self, task_id: str):
self._redis.delete(task_id)
@staticmethod
def _convert_to_original_type(value):
"""
Convert the value from byte string to its original data type.
You can extend this method to handle other data types as needed.
"""
value_str = value.decode("utf-8")
try:
# try to convert byte string array to list
return ast.literal_eval(value_str)
except (ValueError, SyntaxError):
pass
if value_str.isdigit():
return int(value_str)
# Add more conversions here if needed
return value_str
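# Conversion examples: b"['a', 'b']" -> ['a', 'b'], b"42" -> 42, b"1.5" -> 1.5,
# and b"hello" -> "hello" (anything ast.literal_eval cannot parse stays a string).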
# Global state
_enable_redis = config.app.get("enable_redis", False)
_redis_host = config.app.get("redis_host", "localhost")
_redis_port = config.app.get("redis_port", 6379)
_redis_db = config.app.get("redis_db", 0)
_redis_password = config.app.get("redis_password", None)
state = (
RedisState(
host=_redis_host, port=_redis_port, db=_redis_db, password=_redis_password
)
if _enable_redis
else MemoryState()
)
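# Note: MemoryState lives in a single process; when the API runs with multiple
# workers, set enable_redis = true in config.toml so tasks can share state.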

299
app/services/subtitle.py Normal file
View File

@ -0,0 +1,299 @@
import json
import os.path
import re
from faster_whisper import WhisperModel
from timeit import default_timer as timer
from loguru import logger
from app.config import config
from app.utils import utils
model_size = config.whisper.get("model_size", "large-v3")
device = config.whisper.get("device", "cpu")
compute_type = config.whisper.get("compute_type", "int8")
model = None
def create(audio_file, subtitle_file: str = ""):
global model
if not model:
model_path = f"{utils.root_dir()}/models/whisper-{model_size}"
model_bin_file = f"{model_path}/model.bin"
if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
model_path = model_size
logger.info(
f"loading model: {model_path}, device: {device}, compute_type: {compute_type}"
)
try:
model = WhisperModel(
model_size_or_path=model_path, device=device, compute_type=compute_type
)
except Exception as e:
logger.error(
f"failed to load model: {e} \n\n"
f"********************************************\n"
f"this may be caused by network issue. \n"
f"please download the model manually and put it in the 'models' folder. \n"
f"see [README.md FAQ](https://github.com/harry0703/NarratoAI) for more details.\n"
f"********************************************\n\n"
)
return None
logger.info(f"start, output file: {subtitle_file}")
if not subtitle_file:
subtitle_file = f"{audio_file}.srt"
segments, info = model.transcribe(
audio_file,
beam_size=5,
word_timestamps=True,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500),
)
logger.info(
f"detected language: '{info.language}', probability: {info.language_probability:.2f}"
)
start = timer()
subtitles = []
def recognized(seg_text, seg_start, seg_end):
seg_text = seg_text.strip()
if not seg_text:
return
msg = "[%.2fs -> %.2fs] %s" % (seg_start, seg_end, seg_text)
logger.debug(msg)
subtitles.append(
{"msg": seg_text, "start_time": seg_start, "end_time": seg_end}
)
for segment in segments:
words_idx = 0
words_len = len(segment.words)
seg_start = 0
seg_end = 0
seg_text = ""
if segment.words:
is_segmented = False
for word in segment.words:
if not is_segmented:
seg_start = word.start
is_segmented = True
seg_end = word.end
                # break the sentence when the word carries punctuation
                seg_text += word.word
                if utils.str_contains_punctuation(word.word):
                    # strip the trailing punctuation character
                    seg_text = seg_text[:-1]
if not seg_text:
continue
recognized(seg_text, seg_start, seg_end)
is_segmented = False
seg_text = ""
if words_idx == 0 and segment.start < word.start:
seg_start = word.start
if words_idx == (words_len - 1) and segment.end > word.end:
seg_end = word.end
words_idx += 1
if not seg_text:
continue
recognized(seg_text, seg_start, seg_end)
end = timer()
diff = end - start
logger.info(f"complete, elapsed: {diff:.2f} s")
idx = 1
lines = []
for subtitle in subtitles:
text = subtitle.get("msg")
if text:
lines.append(
utils.text_to_srt(
idx, text, subtitle.get("start_time"), subtitle.get("end_time")
)
)
idx += 1
sub = "\n".join(lines) + "\n"
with open(subtitle_file, "w", encoding="utf-8") as f:
f.write(sub)
logger.info(f"subtitle file created: {subtitle_file}")
def file_to_subtitles(filename):
if not filename or not os.path.isfile(filename):
return []
times_texts = []
current_times = None
current_text = ""
index = 0
with open(filename, "r", encoding="utf-8") as f:
for line in f:
times = re.findall("([0-9]*:[0-9]*:[0-9]*,[0-9]*)", line)
if times:
current_times = line
elif line.strip() == "" and current_times:
index += 1
times_texts.append((index, current_times.strip(), current_text.strip()))
current_times, current_text = None, ""
elif current_times:
current_text += line
return times_texts
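# Returned shape: a list of (index, time_range, text) tuples, e.g.
# [(1, "00:00:00,000 --> 00:00:02,500", "hello world"), ...]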
def levenshtein_distance(s1, s2):
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def similarity(a, b):
distance = levenshtein_distance(a.lower(), b.lower())
max_length = max(len(a), len(b))
return 1 - (distance / max_length)
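# Worked example: similarity("hello world", "hello word") = 1 - 1/11 ≈ 0.91
# (one deletion over a maximum length of 11 characters).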
def correct(subtitle_file, video_script):
subtitle_items = file_to_subtitles(subtitle_file)
script_lines = utils.split_string_by_punctuations(video_script)
corrected = False
new_subtitle_items = []
script_index = 0
subtitle_index = 0
while script_index < len(script_lines) and subtitle_index < len(subtitle_items):
script_line = script_lines[script_index].strip()
subtitle_line = subtitle_items[subtitle_index][2].strip()
if script_line == subtitle_line:
new_subtitle_items.append(subtitle_items[subtitle_index])
script_index += 1
subtitle_index += 1
else:
combined_subtitle = subtitle_line
start_time = subtitle_items[subtitle_index][1].split(" --> ")[0]
end_time = subtitle_items[subtitle_index][1].split(" --> ")[1]
next_subtitle_index = subtitle_index + 1
while next_subtitle_index < len(subtitle_items):
next_subtitle = subtitle_items[next_subtitle_index][2].strip()
if similarity(
script_line, combined_subtitle + " " + next_subtitle
) > similarity(script_line, combined_subtitle):
combined_subtitle += " " + next_subtitle
end_time = subtitle_items[next_subtitle_index][1].split(" --> ")[1]
next_subtitle_index += 1
else:
break
if similarity(script_line, combined_subtitle) > 0.8:
logger.warning(
f"Merged/Corrected - Script: {script_line}, Subtitle: {combined_subtitle}"
)
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
f"{start_time} --> {end_time}",
script_line,
)
)
corrected = True
else:
logger.warning(
f"Mismatch - Script: {script_line}, Subtitle: {combined_subtitle}"
)
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
f"{start_time} --> {end_time}",
script_line,
)
)
corrected = True
script_index += 1
subtitle_index = next_subtitle_index
    # handle any remaining script lines
while script_index < len(script_lines):
logger.warning(f"Extra script line: {script_lines[script_index]}")
if subtitle_index < len(subtitle_items):
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
subtitle_items[subtitle_index][1],
script_lines[script_index],
)
)
subtitle_index += 1
else:
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
"00:00:00,000 --> 00:00:00,000",
script_lines[script_index],
)
)
script_index += 1
corrected = True
if corrected:
with open(subtitle_file, "w", encoding="utf-8") as fd:
for i, item in enumerate(new_subtitle_items):
fd.write(f"{i + 1}\n{item[1]}\n{item[2]}\n\n")
logger.info("Subtitle corrected")
else:
logger.success("Subtitle is correct")
if __name__ == "__main__":
task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072"
task_dir = utils.task_dir(task_id)
subtitle_file = f"{task_dir}/subtitle.srt"
audio_file = f"{task_dir}/audio.mp3"
subtitles = file_to_subtitles(subtitle_file)
print(subtitles)
script_file = f"{task_dir}/script.json"
with open(script_file, "r") as f:
script_content = f.read()
s = json.loads(script_content)
script = s.get("script")
correct(subtitle_file, script)
subtitle_file = f"{task_dir}/subtitle-test.srt"
create(audio_file, subtitle_file)

473
app/services/task.py Normal file
View File

@ -0,0 +1,473 @@
import math
import json
import os.path
import re
from os import path
from edge_tts import SubMaker
from loguru import logger
from app.config import config
from app.models import const
from app.models.schema import VideoConcatMode, VideoParams, VideoClipParams
from app.services import llm, material, subtitle, video, voice
from app.services import state as sm
from app.utils import utils
def generate_script(task_id, params):
logger.info("\n\n## generating video script")
video_script = params.video_script.strip()
if not video_script:
video_script = llm.generate_script(
video_subject=params.video_subject,
language=params.video_language,
paragraph_number=params.paragraph_number,
)
else:
logger.debug(f"video script: \n{video_script}")
if not video_script:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video script.")
return None
return video_script
def generate_terms(task_id, params, video_script):
logger.info("\n\n## generating video terms")
video_terms = params.video_terms
if not video_terms:
video_terms = llm.generate_terms(
video_subject=params.video_subject, video_script=video_script, amount=5
)
else:
if isinstance(video_terms, str):
video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
elif isinstance(video_terms, list):
video_terms = [term.strip() for term in video_terms]
else:
raise ValueError("video_terms must be a string or a list of strings.")
logger.debug(f"video terms: {utils.to_json(video_terms)}")
if not video_terms:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video terms.")
return None
return video_terms
def save_script_data(task_id, video_script, video_terms, params):
script_file = path.join(utils.task_dir(task_id), "script.json")
script_data = {
"script": video_script,
"search_terms": video_terms,
"params": params,
}
with open(script_file, "w", encoding="utf-8") as f:
f.write(utils.to_json(script_data))
def generate_audio(task_id, params, video_script):
logger.info("\n\n## generating audio")
audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
sub_maker = voice.tts(
text=video_script,
voice_name=voice.parse_voice_name(params.voice_name),
voice_rate=params.voice_rate,
voice_file=audio_file,
)
if sub_maker is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"""failed to generate audio:
1. check if the language of the voice matches the language of the video script.
2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
""".strip()
)
return None, None, None
audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
return audio_file, audio_duration, sub_maker
def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
if not params.subtitle_enabled:
return ""
subtitle_path = path.join(utils.task_dir(task_id), "subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
subtitle_fallback = False
if subtitle_provider == "edge":
voice.create_subtitle(
text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
)
if not os.path.exists(subtitle_path):
subtitle_fallback = True
logger.warning("subtitle file not found, fallback to whisper")
if subtitle_provider == "whisper" or subtitle_fallback:
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
logger.info("\n\n## correcting subtitle")
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"subtitle file is invalid: {subtitle_path}")
return ""
return subtitle_path
def get_video_materials(task_id, params, video_terms, audio_duration):
if params.video_source == "local":
logger.info("\n\n## preprocess local materials")
materials = video.preprocess_video(
materials=params.video_materials, clip_duration=params.video_clip_duration
)
if not materials:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"no valid materials found, please check the materials and try again."
)
return None
return [material_info.url for material_info in materials]
else:
logger.info(f"\n\n## downloading videos from {params.video_source}")
downloaded_videos = material.download_videos(
task_id=task_id,
search_terms=video_terms,
source=params.video_source,
video_aspect=params.video_aspect,
            video_concat_mode=params.video_concat_mode,
audio_duration=audio_duration * params.video_count,
max_clip_duration=params.video_clip_duration,
)
if not downloaded_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
)
return None
return downloaded_videos
def generate_final_videos(
task_id, params, downloaded_videos, audio_file, subtitle_path
):
final_video_paths = []
combined_video_paths = []
video_concat_mode = (
params.video_concat_mode if params.video_count == 1 else VideoConcatMode.random
)
_progress = 50
for i in range(params.video_count):
index = i + 1
combined_video_path = path.join(
utils.task_dir(task_id), f"combined-{index}.mp4"
)
logger.info(f"\n\n## combining video: {index} => {combined_video_path}")
video.combine_videos(
combined_video_path=combined_video_path,
video_paths=downloaded_videos,
audio_file=audio_file,
video_aspect=params.video_aspect,
video_concat_mode=video_concat_mode,
max_clip_duration=params.video_clip_duration,
threads=params.n_threads,
)
_progress += 50 / params.video_count / 2
sm.state.update_task(task_id, progress=_progress)
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
logger.info(f"\n\n## generating video: {index} => {final_video_path}")
video.generate_video(
video_path=combined_video_path,
audio_path=audio_file,
subtitle_path=subtitle_path,
output_file=final_video_path,
params=params,
)
_progress += 50 / params.video_count / 2
sm.state.update_task(task_id, progress=_progress)
final_video_paths.append(final_video_path)
combined_video_paths.append(combined_video_path)
return final_video_paths, combined_video_paths
def start(task_id, params: VideoParams, stop_at: str = "video"):
logger.info(f"start task: {task_id}, stop_at: {stop_at}")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
    if isinstance(params.video_concat_mode, str):
        params.video_concat_mode = VideoConcatMode(params.video_concat_mode)
# 1. Generate script
video_script = generate_script(task_id, params)
if not video_script:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
if stop_at == "script":
sm.state.update_task(
task_id, state=const.TASK_STATE_COMPLETE, progress=100, script=video_script
)
return {"script": video_script}
# 2. Generate terms
video_terms = ""
if params.video_source != "local":
video_terms = generate_terms(task_id, params, video_script)
if not video_terms:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
save_script_data(task_id, video_script, video_terms, params)
if stop_at == "terms":
sm.state.update_task(
task_id, state=const.TASK_STATE_COMPLETE, progress=100, terms=video_terms
)
return {"script": video_script, "terms": video_terms}
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
# 3. Generate audio
audio_file, audio_duration, sub_maker = generate_audio(task_id, params, video_script)
if not audio_file:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
if stop_at == "audio":
sm.state.update_task(
task_id,
state=const.TASK_STATE_COMPLETE,
progress=100,
audio_file=audio_file,
)
return {"audio_file": audio_file, "audio_duration": audio_duration}
# 4. Generate subtitle
subtitle_path = generate_subtitle(task_id, params, video_script, sub_maker, audio_file)
if stop_at == "subtitle":
sm.state.update_task(
task_id,
state=const.TASK_STATE_COMPLETE,
progress=100,
subtitle_path=subtitle_path,
)
return {"subtitle_path": subtitle_path}
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
# 5. Get video materials
downloaded_videos = get_video_materials(
task_id, params, video_terms, audio_duration
)
if not downloaded_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
if stop_at == "materials":
sm.state.update_task(
task_id,
state=const.TASK_STATE_COMPLETE,
progress=100,
materials=downloaded_videos,
)
return {"materials": downloaded_videos}
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
# 6. Generate final videos
final_video_paths, combined_video_paths = generate_final_videos(
task_id, params, downloaded_videos, audio_file, subtitle_path
)
if not final_video_paths:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
logger.success(
f"task {task_id} finished, generated {len(final_video_paths)} videos."
)
kwargs = {
"videos": final_video_paths,
"combined_videos": combined_video_paths,
"script": video_script,
"terms": video_terms,
"audio_file": audio_file,
"audio_duration": audio_duration,
"subtitle_path": subtitle_path,
"materials": downloaded_videos,
}
sm.state.update_task(
task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs
)
return kwargs
def start_subclip(task_id, params: VideoClipParams, subclip_path_videos):
"""
后台任务自动剪辑视频进行剪辑
"""
logger.info(f"\n\n## 开始任务: {task_id}")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
voice_name = voice.parse_voice_name(params.voice_name)
# voice_name = 'zh-CN-XiaoyiNeural'
paragraph_number = params.paragraph_number
n_threads = params.n_threads
max_clip_duration = params.video_clip_duration
logger.info("\n\n## 1. 读取json")
    # read the narration script, a json file such as the one produced by llm.gemini_video2json
    video_script_path = path.join(params.video_clip_json)
    # make sure the json script exists
    if path.exists(video_script_path):
        # load the json content as a list of dicts
with open(video_script_path, "r", encoding="utf-8") as f:
list_script = json.load(f)
video_list = [i['narration'] for i in list_script]
time_list = [i['timestamp'] for i in list_script]
video_script = " ".join(video_list)
logger.debug(f"原json脚本: \n{video_script}")
logger.debug(f"原json时间戳: \n{time_list}")
    else:
        logger.error(f"解说文案不存在: {video_script_path}")
        raise ValueError("解说文案不存在!检查文案名称是否正确。")
# video_script = llm.text_polishing(context=video_script, language=params.video_language)
# logger.debug(f"润色后的视频脚本: \n{video_script}")
# sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
logger.info("\n\n## 2. 生成音频")
    audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_file=audio_file, voice_rate=params.voice_rate)
if sub_maker is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"无法生成音频可能是网络不可用。如果您在中国请使用VPN。或者手动选择 zh-CN-Yunjian-男性 音频")
return
audio_duration = voice.get_audio_duration(sub_maker)
audio_duration = math.ceil(audio_duration)
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
subtitle_path = ""
if params.subtitle_enabled:
        subtitle_path = path.join(utils.task_dir(task_id), "subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## 3. 生成字幕、提供程序是: {subtitle_provider}")
subtitle_fallback = False
if subtitle_provider == "edge":
voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path)
if not os.path.exists(subtitle_path):
subtitle_fallback = True
logger.warning("找不到字幕文件回退到whisper")
if subtitle_provider == "whisper" or subtitle_fallback:
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
logger.info("\n\n## 更正字幕")
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"字幕文件无效: {subtitle_path}")
subtitle_path = ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
logger.info("\n\n## 4. 裁剪视频")
    subclip_videos = list(subclip_path_videos.values())
# subclip_videos = material.clip_videos(task_id=task_id,
# timestamp_terms=time_list,
# origin_video=params.video_origin_path
# )
logger.debug(f"\n\n## 裁剪后的视频文件列表: \n{subclip_videos}")
if not subclip_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"裁剪视频失败,可能是 ImageMagick 不可用")
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
final_video_paths = []
combined_video_paths = []
_progress = 50
for i in range(params.video_count):
index = i + 1
combined_video_path = path.join(utils.task_dir(task_id), f"combined-{index}.mp4")
logger.info(f"\n\n## 5. 合并视频: {index} => {combined_video_path}")
video.combine_clip_videos(combined_video_path=combined_video_path,
video_paths=subclip_videos,
video_script_list=video_list,
audio_file=audio_file,
video_aspect=params.video_aspect,
threads=n_threads)
_progress += 50 / params.video_count / 2
sm.state.update_task(task_id, progress=_progress)
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
logger.info(f"\n\n## 6. 生成视频: {index} => {final_video_path}")
        # composite everything together
video.generate_video(video_path=combined_video_path,
audio_path=audio_file,
subtitle_path=subtitle_path,
output_file=final_video_path,
params=params,
)
_progress += 50 / params.video_count / 2
sm.state.update_task(task_id, progress=_progress)
final_video_paths.append(final_video_path)
combined_video_paths.append(combined_video_path)
logger.success(f"任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.")
kwargs = {
"videos": final_video_paths,
"combined_videos": combined_video_paths
}
sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs)
return kwargs
if __name__ == "__main__":
task_id = "task_id"
params = VideoParams(
video_subject="金钱的作用",
voice_name="zh-CN-XiaoyiNeural-Female",
voice_rate=1.0,
)
start(task_id, params, stop_at="video")

520
app/services/video.py Normal file
View File

@ -0,0 +1,520 @@
import glob
import random
from typing import List
from typing import Union
from loguru import logger
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from app.models import const
from app.models.schema import MaterialInfo, VideoAspect, VideoConcatMode, VideoParams, VideoClipParams
from app.utils import utils
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
if not bgm_type:
return ""
if bgm_file and os.path.exists(bgm_file):
return bgm_file
if bgm_type == "random":
suffix = "*.mp3"
song_dir = utils.song_dir()
files = glob.glob(os.path.join(song_dir, suffix))
return random.choice(files)
return ""
def combine_videos(
combined_video_path: str,
video_paths: List[str],
audio_file: str,
video_aspect: VideoAspect = VideoAspect.portrait,
video_concat_mode: VideoConcatMode = VideoConcatMode.random,
max_clip_duration: int = 5,
threads: int = 2,
) -> str:
audio_clip = AudioFileClip(audio_file)
audio_duration = audio_clip.duration
logger.info(f"max duration of audio: {audio_duration} seconds")
    # Required duration of each clip; max_clip_duration is the effective cap
    req_dur = max_clip_duration
    logger.info(f"each clip will be maximum {req_dur} seconds long")
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
clips = []
video_duration = 0
raw_clips = []
for video_path in video_paths:
clip = VideoFileClip(video_path).without_audio()
clip_duration = clip.duration
start_time = 0
while start_time < clip_duration:
end_time = min(start_time + max_clip_duration, clip_duration)
split_clip = clip.subclip(start_time, end_time)
raw_clips.append(split_clip)
# logger.info(f"splitting from {start_time:.2f} to {end_time:.2f}, clip duration {clip_duration:.2f}, split_clip duration {split_clip.duration:.2f}")
start_time = end_time
if video_concat_mode.value == VideoConcatMode.sequential.value:
break
# random video_paths order
if video_concat_mode.value == VideoConcatMode.random.value:
random.shuffle(raw_clips)
# Add downloaded clips over and over until the duration of the audio (max_duration) has been reached
while video_duration < audio_duration:
for clip in raw_clips:
# Check if clip is longer than the remaining audio
if (audio_duration - video_duration) < clip.duration:
clip = clip.subclip(0, (audio_duration - video_duration))
# Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image
elif req_dur < clip.duration:
clip = clip.subclip(0, req_dur)
clip = clip.set_fps(30)
# Not all videos are same size, so we need to resize them
clip_w, clip_h = clip.size
if clip_w != video_width or clip_h != video_height:
clip_ratio = clip.w / clip.h
video_ratio = video_width / video_height
                if clip_ratio == video_ratio:
                    # same aspect ratio: scale straight to the target size
                    clip = clip.resize((video_width, video_height))
                else:
                    # different aspect ratio: scale proportionally, then letterbox
                    if clip_ratio > video_ratio:
                        # scale to the target width
                        scale_factor = video_width / clip_w
                    else:
                        # scale to the target height
                        scale_factor = video_height / clip_h
new_width = int(clip_w * scale_factor)
new_height = int(clip_h * scale_factor)
clip_resized = clip.resize(newsize=(new_width, new_height))
background = ColorClip(
size=(video_width, video_height), color=(0, 0, 0)
)
clip = CompositeVideoClip(
[
background.set_duration(clip.duration),
clip_resized.set_position("center"),
]
)
logger.info(
f"resizing video to {video_width} x {video_height}, clip size: {clip_w} x {clip_h}"
)
if clip.duration > max_clip_duration:
clip = clip.subclip(0, max_clip_duration)
clips.append(clip)
video_duration += clip.duration
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info("writing")
# https://github.com/harry0703/NarratoAI/issues/111#issuecomment-2032354030
video_clip.write_videofile(
filename=combined_video_path,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec="aac",
fps=30,
)
video_clip.close()
logger.success("completed")
return combined_video_path
def wrap_text(text, max_width, font="Arial", fontsize=60):
    # create the font object used to measure rendered text width
    font = ImageFont.truetype(font, fontsize)
def get_text_size(inner_text):
inner_text = inner_text.strip()
left, top, right, bottom = font.getbbox(inner_text)
return right - left, bottom - top
width, height = get_text_size(text)
if width <= max_width:
return text, height
# logger.warning(f"wrapping text, max_width: {max_width}, text_width: {width}, text: {text}")
processed = True
_wrapped_lines_ = []
words = text.split(" ")
_txt_ = ""
for word in words:
_before = _txt_
_txt_ += f"{word} "
_width, _height = get_text_size(_txt_)
if _width <= max_width:
continue
else:
if _txt_.strip() == word.strip():
processed = False
break
_wrapped_lines_.append(_before)
_txt_ = f"{word} "
_wrapped_lines_.append(_txt_)
if processed:
_wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
# logger.warning(f"wrapped text: {result}")
return result, height
_wrapped_lines_ = []
chars = list(text)
_txt_ = ""
for word in chars:
_txt_ += word
_width, _height = get_text_size(_txt_)
if _width <= max_width:
continue
else:
_wrapped_lines_.append(_txt_)
_txt_ = ""
_wrapped_lines_.append(_txt_)
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
# logger.warning(f"wrapped text: {result}")
return result, height
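# Sketch of the behavior (hypothetical values): wrap_text("some long caption",
# max_width=300, font=".../STHeitiMedium.ttc", fontsize=60) returns the text with
# "\n" inserted wherever the rendered width would exceed 300 px -- word by word
# for space-separated text, character by character otherwise -- plus the total
# height of the wrapped block.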
def generate_video(
video_path: str,
audio_path: str,
subtitle_path: str,
output_file: str,
params: Union[VideoParams, VideoClipParams],
):
aspect = VideoAspect(params.video_aspect)
video_width, video_height = aspect.to_resolution()
logger.info(f"start, video size: {video_width} x {video_height}")
logger.info(f" ① video: {video_path}")
logger.info(f" ② audio: {audio_path}")
logger.info(f" ③ subtitle: {subtitle_path}")
logger.info(f" ④ output: {output_file}")
# https://github.com/harry0703/NarratoAI/issues/217
# PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'final-1.mp4.tempTEMP_MPY_wvf_snd.mp3'
# write into the same directory as the output file
output_dir = os.path.dirname(output_file)
font_path = ""
if params.subtitle_enabled:
if not params.font_name:
params.font_name = "STHeitiMedium.ttc"
font_path = os.path.join(utils.font_dir(), params.font_name)
if os.name == "nt":
font_path = font_path.replace("\\", "/")
logger.info(f"using font: {font_path}")
def create_text_clip(subtitle_item):
phrase = subtitle_item[1]
max_width = video_width * 0.9
wrapped_txt, txt_height = wrap_text(
phrase, max_width=max_width, font=font_path, fontsize=params.font_size
)
_clip = TextClip(
wrapped_txt,
font=font_path,
fontsize=params.font_size,
color=params.text_fore_color,
bg_color=params.text_background_color,
stroke_color=params.stroke_color,
stroke_width=params.stroke_width,
print_cmd=False,
)
duration = subtitle_item[0][1] - subtitle_item[0][0]
_clip = _clip.set_start(subtitle_item[0][0])
_clip = _clip.set_end(subtitle_item[0][1])
_clip = _clip.set_duration(duration)
if params.subtitle_position == "bottom":
_clip = _clip.set_position(("center", video_height * 0.95 - _clip.h))
elif params.subtitle_position == "top":
_clip = _clip.set_position(("center", video_height * 0.05))
elif params.subtitle_position == "custom":
            # keep the subtitle fully inside the frame
            margin = 10  # extra margin, in pixels
            max_y = video_height - _clip.h - margin
            min_y = margin
            custom_y = (video_height - _clip.h) * (params.custom_position / 100)
            custom_y = max(min_y, min(custom_y, max_y))  # clamp y into the valid range
_clip = _clip.set_position(("center", custom_y))
else: # center
_clip = _clip.set_position(("center", "center"))
return _clip
video_clip = VideoFileClip(video_path)
audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume)
if subtitle_path and os.path.exists(subtitle_path):
sub = SubtitlesClip(subtitles=subtitle_path, encoding="utf-8")
text_clips = []
for item in sub.subtitles:
clip = create_text_clip(subtitle_item=item)
text_clips.append(clip)
video_clip = CompositeVideoClip([video_clip, *text_clips])
bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_file:
try:
bgm_clip = (
AudioFileClip(bgm_file).volumex(params.bgm_volume).audio_fadeout(3)
)
bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration)
audio_clip = CompositeAudioClip([audio_clip, bgm_clip])
except Exception as e:
logger.error(f"failed to add bgm: {str(e)}")
video_clip = video_clip.set_audio(audio_clip)
video_clip.write_videofile(
output_file,
audio_codec="aac",
temp_audiofile_path=output_dir,
threads=params.n_threads or 2,
logger=None,
fps=30,
)
video_clip.close()
del video_clip
logger.success("completed")
def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
for material in materials:
if not material.url:
continue
ext = utils.parse_extension(material.url)
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"video is too small, width: {width}, height: {height}")
continue
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
            # build an image clip with the requested duration
            clip = (
                ImageClip(material.url)
                .set_duration(clip_duration)
                .set_position("center")
            )
            # add a gentle zoom with resize(); the lambda scales the image up over
            # time, from 100% to roughly (100 + clip_duration * 3)% of the original.
            # t is the current time and clip.duration the total length; 1 means 100%.
            zoom_clip = clip.resize(
                lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
            )
            # wrap the zoomed clip in a composite clip, useful when other
            # elements are layered onto the video later
            final_clip = CompositeVideoClip([zoom_clip])
            # render the image clip out as a video
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
final_clip.close()
del final_clip
material.url = video_file
logger.success(f"completed: {video_file}")
return materials
def combine_clip_videos(combined_video_path: str,
video_paths: List[str],
video_script_list: List[str],
audio_file: str,
video_aspect: VideoAspect = VideoAspect.portrait,
threads: int = 2,
) -> str:
"""
合并子视频
Args:
combined_video_path: 合并后的存储路径
video_paths: 子视频路径列表
audio_file: mp3旁白
video_aspect: 屏幕比例
threads: 线程数
Returns:
"""
audio_clip = AudioFileClip(audio_file)
audio_duration = audio_clip.duration
logger.info(f"音频的最大持续时间: {audio_duration} s")
    # nominal duration of each clip
    req_dur = audio_duration / len(video_paths)
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
clips = []
video_duration = 0
    # keep appending the clips until the audio duration (max_duration) is covered
while video_duration < audio_duration:
for video_path, video_script in zip(video_paths, video_script_list):
clip = VideoFileClip(video_path).without_audio()
            # check whether the clip is longer than the remaining audio
            if (audio_duration - video_duration) < clip.duration:
                clip = clip.subclip(0, (audio_duration - video_duration))
            # only shorten the clip when the calculated length req_dur is shorter
            # than the actual clip, to avoid freezing on a still frame
            elif req_dur < clip.duration:
                clip = clip.subclip(0, req_dur)
            clip = clip.set_fps(30)
            # the clips are not all the same size, so resize them
clip_w, clip_h = clip.size
if clip_w != video_width or clip_h != video_height:
clip_ratio = clip.w / clip.h
video_ratio = video_width / video_height
                if clip_ratio == video_ratio:
                    # same aspect ratio: scale straight to the target size
                    clip = clip.resize((video_width, video_height))
                else:
                    # different aspect ratio: scale proportionally, then letterbox
                    if clip_ratio > video_ratio:
                        # scale to the target width
                        scale_factor = video_width / clip_w
                    else:
                        # scale to the target height
                        scale_factor = video_height / clip_h
new_width = int(clip_w * scale_factor)
new_height = int(clip_h * scale_factor)
clip_resized = clip.resize(newsize=(new_width, new_height))
background = ColorClip(size=(video_width, video_height), color=(0, 0, 0))
clip = CompositeVideoClip([
background.set_duration(clip.duration),
clip_resized.set_position("center")
])
logger.info(f"将视频 {video_path} 大小调整为 {video_width} x {video_height}, 剪辑尺寸: {clip_w} x {clip_h}")
            # TODO: overly long clips should be shortened, but there is no good solution yet
# if clip.duration > 5:
# ctime = utils.reduce_video_time(txt=video_script)
# if clip.duration > (2 * ctime):
# clip = clip.subclip(ctime, 2*ctime)
# else:
# clip = clip.subclip(0, ctime)
# logger.info(f"视频 {video_path} 片段时长较长,将剪辑时长缩短至 {ctime} 秒")
clips.append(clip)
video_duration += clip.duration
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info(f"合并中...")
video_clip.write_videofile(filename=combined_video_path,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec="aac",
fps=30,
)
video_clip.close()
logger.success(f"completed")
return combined_video_path
if __name__ == "__main__":
from app.utils import utils
suffix = "*.mp4"
song_dir = utils.video_dir()
files = glob.glob(os.path.join(song_dir, suffix))
print(files)
# m = MaterialInfo()
# m.url = "/Users/harry/Downloads/IMG_2915.JPG"
# m.provider = "local"
# materials = preprocess_video([m], clip_duration=4)
# print(materials)
# txt_en = "Here's your guide to travel hacks for budget-friendly adventures"
# txt_zh = "测试长字段这是您的旅行技巧指南帮助您进行预算友好的冒险"
# font = utils.resource_dir() + "/fonts/STHeitiMedium.ttc"
# for txt in [txt_en, txt_zh]:
# t, h = wrap_text(text=txt, max_width=1000, font=font, fontsize=60)
# print(t)
#
# task_id = "aa563149-a7ea-49c2-b39f-8c32cc225baf"
# task_dir = utils.task_dir(task_id)
# video_file = f"{task_dir}/combined-1.mp4"
# audio_file = f"{task_dir}/audio.mp3"
# subtitle_file = f"{task_dir}/subtitle.srt"
# output_file = f"{task_dir}/final.mp4"
#
# # video_paths = []
# # for file in os.listdir(utils.storage_dir("test")):
# # if file.endswith(".mp4"):
# # video_paths.append(os.path.join(utils.storage_dir("test"), file))
# #
# # combine_videos(combined_video_path=video_file,
# # audio_file=audio_file,
# # video_paths=video_paths,
# # video_aspect=VideoAspect.portrait,
# # video_concat_mode=VideoConcatMode.random,
# # max_clip_duration=5,
# # threads=2)
#
# cfg = VideoParams()
# cfg.video_aspect = VideoAspect.portrait
# cfg.font_name = "STHeitiMedium.ttc"
# cfg.font_size = 60
# cfg.stroke_color = "#000000"
# cfg.stroke_width = 1.5
# cfg.text_fore_color = "#FFFFFF"
# cfg.text_background_color = "transparent"
# cfg.bgm_type = "random"
# cfg.bgm_file = ""
# cfg.bgm_volume = 1.0
# cfg.subtitle_enabled = True
# cfg.subtitle_position = "bottom"
# cfg.n_threads = 2
# cfg.paragraph_number = 1
#
# cfg.voice_volume = 1.0
#
# generate_video(video_path=video_file,
# audio_path=audio_file,
# subtitle_path=subtitle_file,
# output_file=output_file,
# params=cfg
# )

1354
app/services/voice.py Normal file

File diff suppressed because it is too large Load Diff

271
app/utils/utils.py Normal file
View File

@ -0,0 +1,271 @@
import locale
import os
import platform
import threading
from typing import Any
from loguru import logger
import json
from uuid import uuid4
import urllib3
from app.models import const
urllib3.disable_warnings()
def get_response(status: int, data: Any = None, message: str = ""):
obj = {
"status": status,
}
if data:
obj["data"] = data
if message:
obj["message"] = message
return obj
def to_json(obj):
try:
        # helper that normalizes objects of different types
        def serialize(o):
            # directly serializable types are returned as-is
            if isinstance(o, (int, float, bool, str)) or o is None:
                return o
            # binary data is replaced with a placeholder string
            elif isinstance(o, bytes):
                return "*** binary data ***"
            # dicts: recurse into every key/value pair
            elif isinstance(o, dict):
                return {k: serialize(v) for k, v in o.items()}
            # lists and tuples: recurse into every element
            elif isinstance(o, (list, tuple)):
                return [serialize(item) for item in o]
            # custom objects: fall back to their __dict__
            elif hasattr(o, "__dict__"):
                return serialize(o.__dict__)
            # anything else becomes None (raising would also be an option)
            else:
                return None
        # normalize the input object
        serialized_obj = serialize(obj)
        # dump the normalized object as a JSON string
        return json.dumps(serialized_obj, ensure_ascii=False, indent=4)
    except Exception:
        return None
def get_uuid(remove_hyphen: bool = False):
u = str(uuid4())
if remove_hyphen:
u = u.replace("-", "")
return u
def root_dir():
return os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
def storage_dir(sub_dir: str = "", create: bool = False):
d = os.path.join(root_dir(), "storage")
if sub_dir:
d = os.path.join(d, sub_dir)
if create and not os.path.exists(d):
os.makedirs(d)
return d
def resource_dir(sub_dir: str = ""):
d = os.path.join(root_dir(), "resource")
if sub_dir:
d = os.path.join(d, sub_dir)
return d
def task_dir(sub_dir: str = ""):
d = os.path.join(storage_dir(), "tasks")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
os.makedirs(d)
return d
def font_dir(sub_dir: str = ""):
d = resource_dir(f"fonts")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
os.makedirs(d)
return d
def song_dir(sub_dir: str = ""):
d = resource_dir(f"songs")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
os.makedirs(d)
return d
def public_dir(sub_dir: str = ""):
d = resource_dir(f"public")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
os.makedirs(d)
return d
def run_in_background(func, *args, **kwargs):
def run():
try:
func(*args, **kwargs)
except Exception as e:
logger.error(f"run_in_background error: {e}")
thread = threading.Thread(target=run)
thread.start()
return thread
def time_convert_seconds_to_hmsm(seconds) -> str:
hours = int(seconds // 3600)
seconds = seconds % 3600
minutes = int(seconds // 60)
milliseconds = int(seconds * 1000) % 1000
seconds = int(seconds % 60)
return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, milliseconds)
def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str:
start_time = time_convert_seconds_to_hmsm(start_time)
end_time = time_convert_seconds_to_hmsm(end_time)
srt = """%d
%s --> %s
%s
""" % (
idx,
start_time,
end_time,
msg,
)
return srt
def str_contains_punctuation(word):
for p in const.PUNCTUATIONS:
if p in word:
return True
return False
def split_string_by_punctuations(s):
result = []
txt = ""
previous_char = ""
next_char = ""
for i in range(len(s)):
char = s[i]
if char == "\n":
result.append(txt.strip())
txt = ""
continue
if i > 0:
previous_char = s[i - 1]
if i < len(s) - 1:
next_char = s[i + 1]
if char == "." and previous_char.isdigit() and next_char.isdigit():
# 取现1万按2.5%收取手续费, 2.5 中的 . 不能作为换行标记
txt += char
continue
if char not in const.PUNCTUATIONS:
txt += char
else:
result.append(txt.strip())
txt = ""
result.append(txt.strip())
# filter empty string
result = list(filter(None, result))
return result
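# Example (assuming const.PUNCTUATIONS contains "," and "。"):
# split_string_by_punctuations("取现1万,按2.5%收取手续费。") -> ["取现1万", "按2.5%收取手续费"]
# the "." in "2.5" is kept because it sits between two digits.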
def md5(text):
import hashlib
return hashlib.md5(text.encode("utf-8")).hexdigest()
def get_system_locale():
try:
loc = locale.getdefaultlocale()
# zh_CN, zh_TW return zh
# en_US, en_GB return en
language_code = loc[0].split("_")[0]
return language_code
    except Exception:
        return "en"
def load_locales(i18n_dir):
_locales = {}
for root, dirs, files in os.walk(i18n_dir):
for file in files:
if file.endswith(".json"):
lang = file.split(".")[0]
with open(os.path.join(root, file), "r", encoding="utf-8") as f:
_locales[lang] = json.loads(f.read())
return _locales
def parse_extension(filename):
return os.path.splitext(filename)[1].strip().lower().replace(".", "")
def script_dir(sub_dir: str = ""):
d = resource_dir(f"scripts")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
os.makedirs(d)
return d
def video_dir(sub_dir: str = ""):
d = resource_dir(f"videos")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
os.makedirs(d)
return d
def split_timestamp(timestamp):
    """
    Split an 'MM:SS-MM:SS' timestamp range into ('00:MM:SS', '00:MM:SS').
    """
    start, end = timestamp.split('-')
    start_min, start_sec = map(int, start.split(':'))
    end_min, end_sec = map(int, end.split(':'))
    start_time = '00:{:02d}:{:02d}'.format(start_min, start_sec)
    end_time = '00:{:02d}:{:02d}'.format(end_min, end_sec)
    return start_time, end_time
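# Example: split_timestamp("00:36-00:40") -> ("00:00:36", "00:00:40")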
def reduce_video_time(txt: str, duration: float = 0.21531):
"""
按照字数缩减视频时长一个字耗时约 0.21531 s,
Returns:
"""
# 返回结果四舍五入为整数
duration = len(txt) * duration
return int(duration)
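# Example: reduce_video_time("一段大约十个字的文案") -> int(10 * 0.21531) = 2 (seconds)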

17
changelog.py Normal file
View File

@ -0,0 +1,17 @@
from git_changelog.cli import build_and_render
# Run this script to generate the CHANGELOG.md file automatically
build_and_render(
repository=".",
output="CHANGELOG.md",
convention="angular",
provider="github",
template="keepachangelog",
parse_trailers=True,
parse_refs=False,
sections=["build", "deps", "feat", "fix", "refactor"],
versioning="pep440",
bump="1.1.2", # 指定bump版本
in_place=True,
)

194
config.example.toml Normal file
View File

@ -0,0 +1,194 @@
[app]
project_version="0.1.2"
video_source = "pexels" # "pexels" or "pixabay"
# Pexels API Key
# Register at https://www.pexels.com/api/ to get your API key.
# You can use multiple keys to avoid rate limits.
# For example: pexels_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
# Note the format: wrap each key in double quotes and separate multiple keys with commas
pexels_api_keys = []
# Pixabay API Key
# Register at https://pixabay.com/api/docs/ to get your API key.
# You can use multiple keys to avoid rate limits.
# For example: pixabay_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
# Note the format: wrap each key in double quotes and separate multiple keys with commas
pixabay_api_keys = []
# If you don't have an OpenAI API Key, you can use g4f instead, or the China-based Moonshot API
# 支持的提供商 (Supported providers):
# openai
# moonshot (月之暗面)
# oneapi
# g4f
# azure
# qwen (通义千问)
# gemini
llm_provider="openai"
########## Ollama Settings
# No need to set it unless you want to use your own proxy
ollama_base_url = ""
# Check your available models at https://ollama.com/library
ollama_model_name = ""
########## OpenAI API Key
# Get your API key at https://platform.openai.com/api-keys
openai_api_key = ""
# No need to set it unless you want to use your own proxy
openai_base_url = ""
# Check your available models at https://platform.openai.com/account/limits
openai_model_name = "gpt-4-turbo"
########## Moonshot API Key
# Visit https://platform.moonshot.cn/console/api-keys to get your API key.
moonshot_api_key=""
moonshot_base_url = "https://api.moonshot.cn/v1"
moonshot_model_name = "moonshot-v1-8k"
########## OneAPI API Key
# Visit https://github.com/songquanpeng/one-api to get your API key
oneapi_api_key=""
oneapi_base_url=""
oneapi_model_name=""
########## G4F
# Visit https://github.com/xtekky/gpt4free to get more details
# Supported model list: https://github.com/xtekky/gpt4free/blob/main/g4f/models.py
g4f_model_name = "gpt-3.5-turbo"
########## Azure API Key
# Visit https://learn.microsoft.com/zh-cn/azure/ai-services/openai/ to get more details
# API documentation: https://learn.microsoft.com/zh-cn/azure/ai-services/openai/reference
azure_api_key = ""
azure_base_url=""
azure_model_name="gpt-35-turbo" # replace with your model deployment name
azure_api_version = "2024-02-15-preview"
########## Gemini API Key
gemini_api_key=""
gemini_model_name = "gemini-1.5-flash"
########## Qwen API Key
# Visit https://dashscope.console.aliyun.com/apiKey to get your API key
# Visit below links to get more details
# https://tongyi.aliyun.com/qianwen/
# https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction
qwen_api_key = ""
qwen_model_name = "qwen-max"
########## DeepSeek API Key
# Visit https://platform.deepseek.com/api_keys to get your API key
deepseek_api_key = ""
deepseek_base_url = "https://api.deepseek.com"
deepseek_model_name = "deepseek-chat"
# Subtitle Provider, "edge" or "whisper"
# If empty, the subtitle will not be generated
subtitle_provider = "edge"
#
# ImageMagick
#
# Once you have installed it, ImageMagick will be automatically detected, except on Windows!
# On Windows, for example "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
# Download from https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
# imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
#
# FFMPEG
#
# Under normal circumstances, ffmpeg is downloaded automatically and detected automatically.
# However, if there is an issue with your environment that prevents automatic downloading, you might encounter the following error:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# In such cases, you can manually download ffmpeg and set the ffmpeg_path, download link: https://www.gyan.dev/ffmpeg/builds/
# ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
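# As an alternative to ffmpeg_path (the paths below are illustrative), you can point
# imageio at an existing ffmpeg binary via the environment variable it checks:
#   Windows:      set IMAGEIO_FFMPEG_EXE=C:\ffmpeg\bin\ffmpeg.exe
#   Linux/macOS:  export IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg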
#########################################################################################
# When the video is successfully generated, the API service provides a download endpoint for the video, defaulting to the service's current address and listening port.
# For example, http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# If you need to provide the service externally using a domain name (usually done with nginx as a proxy), you can set it to your domain name.
# For example, https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
endpoint=""
# Video material storage location
# material_directory = "" # Indicates that video materials will be downloaded to the default folder, the default folder is ./storage/cache_videos under the current project
# material_directory = "/user/harry/videos" # Indicates that video materials will be downloaded to a specified folder
# material_directory = "task" # Indicates that video materials will be downloaded to the current task's folder, this method does not allow sharing of already downloaded video materials
material_directory = ""
# Used for state management of the task
enable_redis = false
redis_host = "localhost"
redis_port = 6379
redis_db = 0
redis_password = ""
# Maximum number of concurrent text-to-video tasks
max_concurrent_tasks = 5
# Whether to hide the basic config panel in the webui
hide_config = false
[whisper]
# Only effective when subtitle_provider is "whisper"
# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# Run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# Run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
# recommended model_size: "large-v3"
model_size="large-v3"
# if you want to use GPU, set device="cuda"
device="CPU"
compute_type="int8"
[proxy]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
# http = "http://10.10.1.10:3128"
# https = "http://10.10.1.10:1080"
[azure]
# Azure Speech API Key
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
speech_key=""
speech_region=""

26
docker-compose.yml Normal file
View File

@ -0,0 +1,26 @@
x-common-volumes: &common-volumes
- ./:/NarratoAI
services:
webui:
build:
context: .
dockerfile: Dockerfile
container_name: "webui"
ports:
- "8501:8501"
command: [ "bash", "webui.sh" ]
volumes: *common-volumes
environment:
- "VPN_PROXY_URL=http://host.docker.internal:7890"
restart: always
api:
build:
context: .
dockerfile: Dockerfile
container_name: "api"
ports:
- "8080:8080"
command: [ "python3", "main.py" ]
volumes: *common-volumes
restart: always

BIN
docs/check.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 434 KiB

BIN
docs/img001.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 132 KiB

BIN
docs/img002.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

BIN
docs/img003.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

BIN
docs/img004.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 696 KiB

BIN
docs/img005.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 144 KiB

BIN
docs/img006.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 143 KiB

BIN
docs/img007.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 300 KiB

BIN
docs/index.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 95 KiB

941
docs/voice-list.txt Normal file
View File

@ -0,0 +1,941 @@
Name: af-ZA-AdriNeural
Gender: Female
Name: af-ZA-WillemNeural
Gender: Male
Name: am-ET-AmehaNeural
Gender: Male
Name: am-ET-MekdesNeural
Gender: Female
Name: ar-AE-FatimaNeural
Gender: Female
Name: ar-AE-HamdanNeural
Gender: Male
Name: ar-BH-AliNeural
Gender: Male
Name: ar-BH-LailaNeural
Gender: Female
Name: ar-DZ-AminaNeural
Gender: Female
Name: ar-DZ-IsmaelNeural
Gender: Male
Name: ar-EG-SalmaNeural
Gender: Female
Name: ar-EG-ShakirNeural
Gender: Male
Name: ar-IQ-BasselNeural
Gender: Male
Name: ar-IQ-RanaNeural
Gender: Female
Name: ar-JO-SanaNeural
Gender: Female
Name: ar-JO-TaimNeural
Gender: Male
Name: ar-KW-FahedNeural
Gender: Male
Name: ar-KW-NouraNeural
Gender: Female
Name: ar-LB-LaylaNeural
Gender: Female
Name: ar-LB-RamiNeural
Gender: Male
Name: ar-LY-ImanNeural
Gender: Female
Name: ar-LY-OmarNeural
Gender: Male
Name: ar-MA-JamalNeural
Gender: Male
Name: ar-MA-MounaNeural
Gender: Female
Name: ar-OM-AbdullahNeural
Gender: Male
Name: ar-OM-AyshaNeural
Gender: Female
Name: ar-QA-AmalNeural
Gender: Female
Name: ar-QA-MoazNeural
Gender: Male
Name: ar-SA-HamedNeural
Gender: Male
Name: ar-SA-ZariyahNeural
Gender: Female
Name: ar-SY-AmanyNeural
Gender: Female
Name: ar-SY-LaithNeural
Gender: Male
Name: ar-TN-HediNeural
Gender: Male
Name: ar-TN-ReemNeural
Gender: Female
Name: ar-YE-MaryamNeural
Gender: Female
Name: ar-YE-SalehNeural
Gender: Male
Name: az-AZ-BabekNeural
Gender: Male
Name: az-AZ-BanuNeural
Gender: Female
Name: bg-BG-BorislavNeural
Gender: Male
Name: bg-BG-KalinaNeural
Gender: Female
Name: bn-BD-NabanitaNeural
Gender: Female
Name: bn-BD-PradeepNeural
Gender: Male
Name: bn-IN-BashkarNeural
Gender: Male
Name: bn-IN-TanishaaNeural
Gender: Female
Name: bs-BA-GoranNeural
Gender: Male
Name: bs-BA-VesnaNeural
Gender: Female
Name: ca-ES-EnricNeural
Gender: Male
Name: ca-ES-JoanaNeural
Gender: Female
Name: cs-CZ-AntoninNeural
Gender: Male
Name: cs-CZ-VlastaNeural
Gender: Female
Name: cy-GB-AledNeural
Gender: Male
Name: cy-GB-NiaNeural
Gender: Female
Name: da-DK-ChristelNeural
Gender: Female
Name: da-DK-JeppeNeural
Gender: Male
Name: de-AT-IngridNeural
Gender: Female
Name: de-AT-JonasNeural
Gender: Male
Name: de-CH-JanNeural
Gender: Male
Name: de-CH-LeniNeural
Gender: Female
Name: de-DE-AmalaNeural
Gender: Female
Name: de-DE-ConradNeural
Gender: Male
Name: de-DE-FlorianMultilingualNeural
Gender: Male
Name: de-DE-KatjaNeural
Gender: Female
Name: de-DE-KillianNeural
Gender: Male
Name: de-DE-SeraphinaMultilingualNeural
Gender: Female
Name: el-GR-AthinaNeural
Gender: Female
Name: el-GR-NestorasNeural
Gender: Male
Name: en-AU-NatashaNeural
Gender: Female
Name: en-AU-WilliamNeural
Gender: Male
Name: en-CA-ClaraNeural
Gender: Female
Name: en-CA-LiamNeural
Gender: Male
Name: en-GB-LibbyNeural
Gender: Female
Name: en-GB-MaisieNeural
Gender: Female
Name: en-GB-RyanNeural
Gender: Male
Name: en-GB-SoniaNeural
Gender: Female
Name: en-GB-ThomasNeural
Gender: Male
Name: en-HK-SamNeural
Gender: Male
Name: en-HK-YanNeural
Gender: Female
Name: en-IE-ConnorNeural
Gender: Male
Name: en-IE-EmilyNeural
Gender: Female
Name: en-IN-NeerjaExpressiveNeural
Gender: Female
Name: en-IN-NeerjaNeural
Gender: Female
Name: en-IN-PrabhatNeural
Gender: Male
Name: en-KE-AsiliaNeural
Gender: Female
Name: en-KE-ChilembaNeural
Gender: Male
Name: en-NG-AbeoNeural
Gender: Male
Name: en-NG-EzinneNeural
Gender: Female
Name: en-NZ-MitchellNeural
Gender: Male
Name: en-NZ-MollyNeural
Gender: Female
Name: en-PH-JamesNeural
Gender: Male
Name: en-PH-RosaNeural
Gender: Female
Name: en-SG-LunaNeural
Gender: Female
Name: en-SG-WayneNeural
Gender: Male
Name: en-TZ-ElimuNeural
Gender: Male
Name: en-TZ-ImaniNeural
Gender: Female
Name: en-US-AnaNeural
Gender: Female
Name: en-US-AndrewNeural
Gender: Male
Name: en-US-AriaNeural
Gender: Female
Name: en-US-AvaNeural
Gender: Female
Name: en-US-BrianNeural
Gender: Male
Name: en-US-ChristopherNeural
Gender: Male
Name: en-US-EmmaNeural
Gender: Female
Name: en-US-EricNeural
Gender: Male
Name: en-US-GuyNeural
Gender: Male
Name: en-US-JennyNeural
Gender: Female
Name: en-US-MichelleNeural
Gender: Female
Name: en-US-RogerNeural
Gender: Male
Name: en-US-SteffanNeural
Gender: Male
Name: en-ZA-LeahNeural
Gender: Female
Name: en-ZA-LukeNeural
Gender: Male
Name: es-AR-ElenaNeural
Gender: Female
Name: es-AR-TomasNeural
Gender: Male
Name: es-BO-MarceloNeural
Gender: Male
Name: es-BO-SofiaNeural
Gender: Female
Name: es-CL-CatalinaNeural
Gender: Female
Name: es-CL-LorenzoNeural
Gender: Male
Name: es-CO-GonzaloNeural
Gender: Male
Name: es-CO-SalomeNeural
Gender: Female
Name: es-CR-JuanNeural
Gender: Male
Name: es-CR-MariaNeural
Gender: Female
Name: es-CU-BelkysNeural
Gender: Female
Name: es-CU-ManuelNeural
Gender: Male
Name: es-DO-EmilioNeural
Gender: Male
Name: es-DO-RamonaNeural
Gender: Female
Name: es-EC-AndreaNeural
Gender: Female
Name: es-EC-LuisNeural
Gender: Male
Name: es-ES-AlvaroNeural
Gender: Male
Name: es-ES-ElviraNeural
Gender: Female
Name: es-ES-XimenaNeural
Gender: Female
Name: es-GQ-JavierNeural
Gender: Male
Name: es-GQ-TeresaNeural
Gender: Female
Name: es-GT-AndresNeural
Gender: Male
Name: es-GT-MartaNeural
Gender: Female
Name: es-HN-CarlosNeural
Gender: Male
Name: es-HN-KarlaNeural
Gender: Female
Name: es-MX-DaliaNeural
Gender: Female
Name: es-MX-JorgeNeural
Gender: Male
Name: es-NI-FedericoNeural
Gender: Male
Name: es-NI-YolandaNeural
Gender: Female
Name: es-PA-MargaritaNeural
Gender: Female
Name: es-PA-RobertoNeural
Gender: Male
Name: es-PE-AlexNeural
Gender: Male
Name: es-PE-CamilaNeural
Gender: Female
Name: es-PR-KarinaNeural
Gender: Female
Name: es-PR-VictorNeural
Gender: Male
Name: es-PY-MarioNeural
Gender: Male
Name: es-PY-TaniaNeural
Gender: Female
Name: es-SV-LorenaNeural
Gender: Female
Name: es-SV-RodrigoNeural
Gender: Male
Name: es-US-AlonsoNeural
Gender: Male
Name: es-US-PalomaNeural
Gender: Female
Name: es-UY-MateoNeural
Gender: Male
Name: es-UY-ValentinaNeural
Gender: Female
Name: es-VE-PaolaNeural
Gender: Female
Name: es-VE-SebastianNeural
Gender: Male
Name: et-EE-AnuNeural
Gender: Female
Name: et-EE-KertNeural
Gender: Male
Name: fa-IR-DilaraNeural
Gender: Female
Name: fa-IR-FaridNeural
Gender: Male
Name: fi-FI-HarriNeural
Gender: Male
Name: fi-FI-NooraNeural
Gender: Female
Name: fil-PH-AngeloNeural
Gender: Male
Name: fil-PH-BlessicaNeural
Gender: Female
Name: fr-BE-CharlineNeural
Gender: Female
Name: fr-BE-GerardNeural
Gender: Male
Name: fr-CA-AntoineNeural
Gender: Male
Name: fr-CA-JeanNeural
Gender: Male
Name: fr-CA-SylvieNeural
Gender: Female
Name: fr-CA-ThierryNeural
Gender: Male
Name: fr-CH-ArianeNeural
Gender: Female
Name: fr-CH-FabriceNeural
Gender: Male
Name: fr-FR-DeniseNeural
Gender: Female
Name: fr-FR-EloiseNeural
Gender: Female
Name: fr-FR-HenriNeural
Gender: Male
Name: fr-FR-RemyMultilingualNeural
Gender: Male
Name: fr-FR-VivienneMultilingualNeural
Gender: Female
Name: ga-IE-ColmNeural
Gender: Male
Name: ga-IE-OrlaNeural
Gender: Female
Name: gl-ES-RoiNeural
Gender: Male
Name: gl-ES-SabelaNeural
Gender: Female
Name: gu-IN-DhwaniNeural
Gender: Female
Name: gu-IN-NiranjanNeural
Gender: Male
Name: he-IL-AvriNeural
Gender: Male
Name: he-IL-HilaNeural
Gender: Female
Name: hi-IN-MadhurNeural
Gender: Male
Name: hi-IN-SwaraNeural
Gender: Female
Name: hr-HR-GabrijelaNeural
Gender: Female
Name: hr-HR-SreckoNeural
Gender: Male
Name: hu-HU-NoemiNeural
Gender: Female
Name: hu-HU-TamasNeural
Gender: Male
Name: id-ID-ArdiNeural
Gender: Male
Name: id-ID-GadisNeural
Gender: Female
Name: is-IS-GudrunNeural
Gender: Female
Name: is-IS-GunnarNeural
Gender: Male
Name: it-IT-DiegoNeural
Gender: Male
Name: it-IT-ElsaNeural
Gender: Female
Name: it-IT-GiuseppeNeural
Gender: Male
Name: it-IT-IsabellaNeural
Gender: Female
Name: ja-JP-KeitaNeural
Gender: Male
Name: ja-JP-NanamiNeural
Gender: Female
Name: jv-ID-DimasNeural
Gender: Male
Name: jv-ID-SitiNeural
Gender: Female
Name: ka-GE-EkaNeural
Gender: Female
Name: ka-GE-GiorgiNeural
Gender: Male
Name: kk-KZ-AigulNeural
Gender: Female
Name: kk-KZ-DauletNeural
Gender: Male
Name: km-KH-PisethNeural
Gender: Male
Name: km-KH-SreymomNeural
Gender: Female
Name: kn-IN-GaganNeural
Gender: Male
Name: kn-IN-SapnaNeural
Gender: Female
Name: ko-KR-HyunsuNeural
Gender: Male
Name: ko-KR-InJoonNeural
Gender: Male
Name: ko-KR-SunHiNeural
Gender: Female
Name: lo-LA-ChanthavongNeural
Gender: Male
Name: lo-LA-KeomanyNeural
Gender: Female
Name: lt-LT-LeonasNeural
Gender: Male
Name: lt-LT-OnaNeural
Gender: Female
Name: lv-LV-EveritaNeural
Gender: Female
Name: lv-LV-NilsNeural
Gender: Male
Name: mk-MK-AleksandarNeural
Gender: Male
Name: mk-MK-MarijaNeural
Gender: Female
Name: ml-IN-MidhunNeural
Gender: Male
Name: ml-IN-SobhanaNeural
Gender: Female
Name: mn-MN-BataaNeural
Gender: Male
Name: mn-MN-YesuiNeural
Gender: Female
Name: mr-IN-AarohiNeural
Gender: Female
Name: mr-IN-ManoharNeural
Gender: Male
Name: ms-MY-OsmanNeural
Gender: Male
Name: ms-MY-YasminNeural
Gender: Female
Name: mt-MT-GraceNeural
Gender: Female
Name: mt-MT-JosephNeural
Gender: Male
Name: my-MM-NilarNeural
Gender: Female
Name: my-MM-ThihaNeural
Gender: Male
Name: nb-NO-FinnNeural
Gender: Male
Name: nb-NO-PernilleNeural
Gender: Female
Name: ne-NP-HemkalaNeural
Gender: Female
Name: ne-NP-SagarNeural
Gender: Male
Name: nl-BE-ArnaudNeural
Gender: Male
Name: nl-BE-DenaNeural
Gender: Female
Name: nl-NL-ColetteNeural
Gender: Female
Name: nl-NL-FennaNeural
Gender: Female
Name: nl-NL-MaartenNeural
Gender: Male
Name: pl-PL-MarekNeural
Gender: Male
Name: pl-PL-ZofiaNeural
Gender: Female
Name: ps-AF-GulNawazNeural
Gender: Male
Name: ps-AF-LatifaNeural
Gender: Female
Name: pt-BR-AntonioNeural
Gender: Male
Name: pt-BR-FranciscaNeural
Gender: Female
Name: pt-BR-ThalitaNeural
Gender: Female
Name: pt-PT-DuarteNeural
Gender: Male
Name: pt-PT-RaquelNeural
Gender: Female
Name: ro-RO-AlinaNeural
Gender: Female
Name: ro-RO-EmilNeural
Gender: Male
Name: ru-RU-DmitryNeural
Gender: Male
Name: ru-RU-SvetlanaNeural
Gender: Female
Name: si-LK-SameeraNeural
Gender: Male
Name: si-LK-ThiliniNeural
Gender: Female
Name: sk-SK-LukasNeural
Gender: Male
Name: sk-SK-ViktoriaNeural
Gender: Female
Name: sl-SI-PetraNeural
Gender: Female
Name: sl-SI-RokNeural
Gender: Male
Name: so-SO-MuuseNeural
Gender: Male
Name: so-SO-UbaxNeural
Gender: Female
Name: sq-AL-AnilaNeural
Gender: Female
Name: sq-AL-IlirNeural
Gender: Male
Name: sr-RS-NicholasNeural
Gender: Male
Name: sr-RS-SophieNeural
Gender: Female
Name: su-ID-JajangNeural
Gender: Male
Name: su-ID-TutiNeural
Gender: Female
Name: sv-SE-MattiasNeural
Gender: Male
Name: sv-SE-SofieNeural
Gender: Female
Name: sw-KE-RafikiNeural
Gender: Male
Name: sw-KE-ZuriNeural
Gender: Female
Name: sw-TZ-DaudiNeural
Gender: Male
Name: sw-TZ-RehemaNeural
Gender: Female
Name: ta-IN-PallaviNeural
Gender: Female
Name: ta-IN-ValluvarNeural
Gender: Male
Name: ta-LK-KumarNeural
Gender: Male
Name: ta-LK-SaranyaNeural
Gender: Female
Name: ta-MY-KaniNeural
Gender: Female
Name: ta-MY-SuryaNeural
Gender: Male
Name: ta-SG-AnbuNeural
Gender: Male
Name: ta-SG-VenbaNeural
Gender: Female
Name: te-IN-MohanNeural
Gender: Male
Name: te-IN-ShrutiNeural
Gender: Female
Name: th-TH-NiwatNeural
Gender: Male
Name: th-TH-PremwadeeNeural
Gender: Female
Name: tr-TR-AhmetNeural
Gender: Male
Name: tr-TR-EmelNeural
Gender: Female
Name: uk-UA-OstapNeural
Gender: Male
Name: uk-UA-PolinaNeural
Gender: Female
Name: ur-IN-GulNeural
Gender: Female
Name: ur-IN-SalmanNeural
Gender: Male
Name: ur-PK-AsadNeural
Gender: Male
Name: ur-PK-UzmaNeural
Gender: Female
Name: uz-UZ-MadinaNeural
Gender: Female
Name: uz-UZ-SardorNeural
Gender: Male
Name: vi-VN-HoaiMyNeural
Gender: Female
Name: vi-VN-NamMinhNeural
Gender: Male
Name: zh-CN-XiaoxiaoNeural
Gender: Female
Name: zh-CN-XiaoyiNeural
Gender: Female
Name: zh-CN-YunjianNeural
Gender: Male
Name: zh-CN-YunxiNeural
Gender: Male
Name: zh-CN-YunxiaNeural
Gender: Male
Name: zh-CN-YunyangNeural
Gender: Male
Name: zh-CN-liaoning-XiaobeiNeural
Gender: Female
Name: zh-CN-shaanxi-XiaoniNeural
Gender: Female
Name: zh-HK-HiuGaaiNeural
Gender: Female
Name: zh-HK-HiuMaanNeural
Gender: Female
Name: zh-HK-WanLungNeural
Gender: Male
Name: zh-TW-HsiaoChenNeural
Gender: Female
Name: zh-TW-HsiaoYuNeural
Gender: Female
Name: zh-TW-YunJheNeural
Gender: Male
Name: zu-ZA-ThandoNeural
Gender: Female
Name: zu-ZA-ThembaNeural
Gender: Male

16
main.py Normal file
View File

@ -0,0 +1,16 @@
import uvicorn
from loguru import logger
from app.config import config
if __name__ == "__main__":
logger.info(
"start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs"
)
uvicorn.run(
app="app.asgi:app",
host=config.listen_host,
port=config.listen_port,
reload=config.reload_debug,
log_level="warning",
)
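# Usage sketch: `python main.py` starts the API service via uvicorn;
# listen_host and listen_port come from config.toml, and the interactive
# docs are then served at http://<listen_host>:<listen_port>/docs.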

26
requirements.txt Normal file
View File

@ -0,0 +1,26 @@
requests~=2.31.0
moviepy~=2.0.0.dev2
openai~=1.13.3
faster-whisper~=1.0.1
edge_tts~=6.1.10
uvicorn~=0.27.1
fastapi~=0.110.0
tomli~=2.0.1
streamlit~=1.33.0
loguru~=0.7.2
aiohttp~=3.9.3
urllib3~=2.2.1
pillow~=10.3.0
pydantic~=2.6.3
g4f~=0.3.0.4
dashscope~=1.15.0
google.generativeai>=0.7.2
python-multipart~=0.0.9
redis==5.0.3
# With pillow~=10.3.0, resizing video raises "module 'PIL.Image' has no attribute 'ANTIALIAS'";
# installing opencv-python works around that error
opencv-python~=4.9.0.80
# for azure speech
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0
git-changelog~=2.5.2
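# Install everything with: pip install -r requirements.txt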

View File

@ -0,0 +1,19 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>NarratoAI</title>
</head>
<body>
<h1>NarratoAI</h1>
<a href="https://github.com/harry0703/NarratoAI">https://github.com/harry0703/NarratoAI</a>
<p>
只需提供一个视频 主题 或 关键词 ,就可以全自动生成视频文案、视频素材、视频字幕、视频背景音乐,然后合成一个高清的短视频。
</p>
<p>
Simply provide a topic or keyword for a video, and it will automatically generate the video copy, video materials,
video subtitles, and video background music before synthesizing a high-definition short video.
</p>
</body>
</html>

43
webui.bat Normal file
View File

@ -0,0 +1,43 @@
@echo off
set CURRENT_DIR=%CD%
echo ***** Current directory: %CURRENT_DIR% *****
set PYTHONPATH=%CURRENT_DIR%
setlocal enabledelayedexpansion
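rem Usage sketch: run this script from the project root; on first run it
rem downloads fonts, a demo video and background songs, then starts the webui.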
rem Arrays of download URLs and their target paths
set "urls_paths[0]=https://zenodo.org/records/13293144/files/MicrosoftYaHeiBold.ttc|.\resource\fonts"
set "urls_paths[1]=https://zenodo.org/records/13293144/files/MicrosoftYaHeiNormal.ttc|.\resource\fonts"
set "urls_paths[2]=https://zenodo.org/records/13293144/files/STHeitiLight.ttc|.\resource\fonts"
set "urls_paths[3]=https://zenodo.org/records/13293144/files/STHeitiMedium.ttc|.\resource\fonts"
set "urls_paths[4]=https://zenodo.org/records/13293144/files/UTM%20Kabel%20KT.ttf|.\resource\fonts"
set "urls_paths[5]=https://zenodo.org/records/13293129/files/demo.mp4|.\resource\videos"
set "urls_paths[6]=https://zenodo.org/records/13293150/files/output000.mp3|.\resource\songs"
set "urls_paths[7]=https://zenodo.org/records/13293150/files/output001.mp3|.\resource\songs"
set "urls_paths[8]=https://zenodo.org/records/13293150/files/output002.mp3|.\resource\songs"
set "urls_paths[9]=https://zenodo.org/records/13293150/files/output003.mp3|.\resource\songs"
set "urls_paths[10]=https://zenodo.org/records/13293150/files/output004.mp3|.\resource\songs"
set "urls_paths[11]=https://zenodo.org/records/13293150/files/output005.mp3|.\resource\songs"
set "urls_paths[12]=https://zenodo.org/records/13293150/files/output006.mp3|.\resource\songs"
set "urls_paths[13]=https://zenodo.org/records/13293150/files/output007.mp3|.\resource\songs"
set "urls_paths[14]=https://zenodo.org/records/13293150/files/output008.mp3|.\resource\songs"
set "urls_paths[15]=https://zenodo.org/records/13293150/files/output009.mp3|.\resource\songs"
set "urls_paths[16]=https://zenodo.org/records/13293150/files/output010.mp3|.\resource\songs"
rem Download every file into its target path
for /L %%i in (0,1,16) do (
for /f "tokens=1,2 delims=|" %%a in ("!urls_paths[%%i]!") do (
if not exist "%%b" mkdir "%%b"
echo Downloading %%a to %%b
curl -o "%%b\%%~nxa" %%a
)
)
echo All files have been downloaded to their target directories
endlocal
pause
rem set HF_ENDPOINT=https://hf-mirror.com
streamlit run .\webui\Main.py --browser.gatherUsageStats=False --server.enableCORS=True

58
webui.sh Normal file
View File

@ -0,0 +1,58 @@
#!/bin/bash
# Load the VPN proxy URL from the environment
vpn_proxy_url="$VPN_PROXY_URL"
# Abort if it is not set
if [ -z "$vpn_proxy_url" ]; then
echo "VPN proxy URL is not set; please set the VPN_PROXY_URL environment variable"
exit 1
fi
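# Usage sketch (the proxy address is illustrative):
#   VPN_PROXY_URL=http://127.0.0.1:7890 bash webui.sh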
# Route subsequent downloads through the proxy
export http_proxy="$vpn_proxy_url"
export https_proxy="$vpn_proxy_url"
# Map download URLs to their target paths
declare -A urls_paths=(
["https://zenodo.org/records/13293144/files/MicrosoftYaHeiBold.ttc"]="./resource/fonts"
["https://zenodo.org/records/13293144/files/MicrosoftYaHeiNormal.ttc"]="./resource/fonts"
["https://zenodo.org/records/13293144/files/STHeitiLight.ttc"]="./resource/fonts"
["https://zenodo.org/records/13293144/files/STHeitiMedium.ttc"]="./resource/fonts"
["https://zenodo.org/records/13293144/files/UTM%20Kabel%20KT.ttf"]="./resource/fonts"
["https://zenodo.org/records/13293129/files/demo.mp4"]="./resource/videos"
["https://zenodo.org/records/13293150/files/output000.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output001.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output002.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output003.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output004.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output005.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output006.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output007.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output008.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output009.mp3"]="./resource/songs"
["https://zenodo.org/records/13293150/files/output010.mp3"]="./resource/songs"
# Add more URLs and their target paths here
)
# Download every file into its target path
for url in "${!urls_paths[@]}"; do
output_dir="${urls_paths[$url]}"
mkdir -p "$output_dir" # create the directory if it does not exist
# extract the file name from the URL
filename=$(basename "$url")
# skip files that already exist
if [ -f "$output_dir/$filename" ]; then
echo "File $filename already exists; skipping download"
else
wget -P "$output_dir" "$url" &
fi
done
# wait for all background downloads to finish
wait
echo "All files have been downloaded to their target directories"
streamlit run ./webui/Main.py --browser.serverAddress="0.0.0.0" --server.enableCORS=True --browser.gatherUsageStats=False

746
webui/Main.py Normal file
View File

@ -0,0 +1,746 @@
import sys
import os
import glob
import json
import datetime
# Add the project root to sys.path so modules can be imported from the project
root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
if root_dir not in sys.path:
sys.path.append(root_dir)
print("******** sys.path ********")
print(sys.path)
print("")
import streamlit as st
from uuid import uuid4
import platform
import streamlit.components.v1 as components
from loguru import logger
from app.config import config
st.set_page_config(
page_title="NarratoAI",
page_icon="📽️",
layout="wide",
initial_sidebar_state="auto",
menu_items={
"Report a bug": "https://github.com/linyqh/NarratoAI/issues",
'About': f"# NarratoAI:sunglasses: 📽️ \n #### Version: v{config.project_version} \n "
f"自动化影视解说视频详情请移步https://github.com/linyqh/NarratoAI"
},
)
from app.models.const import FILE_TYPE_IMAGES, FILE_TYPE_VIDEOS
from app.models.schema import VideoClipParams, VideoAspect, VideoConcatMode
from app.services import task as tm, llm, voice, material
from app.utils import utils
os.environ["HTTP_PROXY"] = config.proxy.get("http", "") or os.getenv("VPN_PROXY_URL", "")
os.environ["HTTPS_PROXY"] = config.proxy.get("https", "") or os.getenv("VPN_PROXY_URL", "")
hide_streamlit_style = """
<style>#root > div:nth-child(1) > div > div > div > div > section > div {padding-top: 6px; padding-bottom: 10px; padding-left: 20px; padding-right: 20px;}</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
st.title(f"NarratoAI :sunglasses:📽️")
support_locales = [
"zh-CN",
"zh-HK",
"zh-TW",
"de-DE",
"en-US",
"fr-FR",
"vi-VN",
"th-TH",
]
font_dir = os.path.join(root_dir, "resource", "fonts")
song_dir = os.path.join(root_dir, "resource", "songs")
i18n_dir = os.path.join(root_dir, "webui", "i18n")
config_file = os.path.join(root_dir, "webui", ".streamlit", "webui.toml")
system_locale = utils.get_system_locale()
if 'video_subject' not in st.session_state:
st.session_state['video_subject'] = ''
if 'video_clip_json' not in st.session_state:
st.session_state['video_clip_json'] = ''
if 'video_plot' not in st.session_state:
st.session_state['video_plot'] = ''
if 'ui_language' not in st.session_state:
st.session_state['ui_language'] = config.ui.get("language", system_locale)
def get_all_fonts():
fonts = []
for root, dirs, files in os.walk(font_dir):
for file in files:
if file.endswith(".ttf") or file.endswith(".ttc"):
fonts.append(file)
fonts.sort()
return fonts
def get_all_songs():
songs = []
for root, dirs, files in os.walk(song_dir):
for file in files:
if file.endswith(".mp3"):
songs.append(file)
return songs
def open_task_folder(task_id):
try:
system = platform.system()
path = os.path.join(root_dir, "storage", "tasks", task_id)
if os.path.exists(path):
if system == 'Windows':
os.system(f"start {path}")
if system == 'Darwin':
os.system(f"open {path}")
except Exception as e:
logger.error(e)
def scroll_to_bottom():
js = f"""
<script>
console.log("scroll_to_bottom");
function scroll(dummy_var_to_force_repeat_execution){{
var sections = parent.document.querySelectorAll('section.main');
console.log(sections);
for(let index = 0; index<sections.length; index++) {{
sections[index].scrollTop = sections[index].scrollHeight;
}}
}}
scroll(1);
</script>
"""
st.components.v1.html(js, height=0, width=0)
def init_log():
logger.remove()
_lvl = "DEBUG"
def format_record(record):
# Full path of the file that emitted the log record
file_path = record["file"].path
# Convert the absolute path into one relative to the project root
relative_path = os.path.relpath(file_path, root_dir)
# Update the record's file path
record["file"].path = f"./{relative_path}"
# Build the format string; adjust it here as needed
record['message'] = record['message'].replace(root_dir, ".")
_format = '<green>{time:%Y-%m-%d %H:%M:%S}</> | ' + \
'<level>{level}</> | ' + \
'"{file.path}:{line}":<blue> {function}</> ' + \
'- <level>{message}</>' + "\n"
return _format
logger.add(
sys.stdout,
level=_lvl,
format=format_record,
colorize=True,
)
init_log()
locales = utils.load_locales(i18n_dir)
def tr(key):
loc = locales.get(st.session_state['ui_language'], {})
return loc.get("Translation", {}).get(key, key)
st.write(tr("Get Help"))
# Basic settings
with st.expander(tr("Basic Settings"), expanded=False):
config_panels = st.columns(3)
left_config_panel = config_panels[0]
middle_config_panel = config_panels[1]
right_config_panel = config_panels[2]
with left_config_panel:
display_languages = []
selected_index = 0
for i, code in enumerate(locales.keys()):
display_languages.append(f"{code} - {locales[code].get('Language')}")
if code == st.session_state['ui_language']:
selected_index = i
selected_language = st.selectbox(tr("Language"), options=display_languages,
index=selected_index)
if selected_language:
code = selected_language.split(" - ")[0].strip()
st.session_state['ui_language'] = code
config.ui['language'] = code
with middle_config_panel:
# openai
# moonshot (Moonshot AI)
# oneapi
# g4f
# azure
# qwen (Tongyi Qianwen)
# gemini
# ollama
llm_providers = ['OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Gemini', 'Ollama', 'G4f', 'OneAPI', "Cloudflare"]
saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower()
saved_llm_provider_index = 0
for i, provider in enumerate(llm_providers):
if provider.lower() == saved_llm_provider:
saved_llm_provider_index = i
break
llm_provider = st.selectbox(tr("LLM Provider"), options=llm_providers, index=saved_llm_provider_index)
llm_provider = llm_provider.lower()
config.app["llm_provider"] = llm_provider
llm_api_key = config.app.get(f"{llm_provider}_api_key", "")
llm_base_url = config.app.get(f"{llm_provider}_base_url", "")
llm_model_name = config.app.get(f"{llm_provider}_model_name", "")
llm_account_id = config.app.get(f"{llm_provider}_account_id", "")
st_llm_api_key = st.text_input(tr("API Key"), value=llm_api_key, type="password")
st_llm_base_url = st.text_input(tr("Base Url"), value=llm_base_url)
st_llm_model_name = st.text_input(tr("Model Name"), value=llm_model_name)
if st_llm_api_key:
config.app[f"{llm_provider}_api_key"] = st_llm_api_key
if st_llm_base_url:
config.app[f"{llm_provider}_base_url"] = st_llm_base_url
if st_llm_model_name:
config.app[f"{llm_provider}_model_name"] = st_llm_model_name
if llm_provider == 'cloudflare':
st_llm_account_id = st.text_input(tr("Account ID"), value=llm_account_id)
if st_llm_account_id:
config.app[f"{llm_provider}_account_id"] = st_llm_account_id
with right_config_panel:
pexels_api_keys = config.app.get("pexels_api_keys", [])
if isinstance(pexels_api_keys, str):
pexels_api_keys = [pexels_api_keys]
pexels_api_key = ", ".join(pexels_api_keys)
pexels_api_key = st.text_input(tr("Pexels API Key"), value=pexels_api_key, type="password")
pexels_api_key = pexels_api_key.replace(" ", "")
if pexels_api_key:
config.app["pexels_api_keys"] = pexels_api_key.split(",")
panel = st.columns(3)
left_panel = panel[0]
middle_panel = panel[1]
right_panel = panel[2]
params = VideoClipParams()
# Left panel
with left_panel:
with st.container(border=True):
st.write(tr("Video Script Configuration"))
# Script language
video_languages = [
(tr("Auto Detect"), ""),
]
for code in ["zh-CN", "zh-TW", "de-DE", "en-US", "vi-VN"]:
video_languages.append((code, code))
selected_index = st.selectbox(tr("Script Language"),
index=0,
options=range(len(video_languages)), # 使用索引作为内部选项值
format_func=lambda x: video_languages[x][0] # 显示给用户的是标签
)
params.video_language = video_languages[selected_index][1]
# Script files
suffix = "*.json"
script_dir = utils.script_dir()
files = glob.glob(os.path.join(script_dir, suffix))
script_list = []
for file in files:
script_list.append({
"name": os.path.basename(file),
"size": os.path.getsize(file),
"file": file,
})
script_path = [(tr("Auto Generate"), ""), ]
for code in [file['file'] for file in script_list]:
script_path.append((code, code))
selected_json2 = st.selectbox(tr("Script Files"),
index=0,
options=range(len(script_path)), # 使用索引作为内部选项值
format_func=lambda x: script_path[x][0] # 显示给用户的是标签
)
params.video_clip_json = script_path[selected_json2][1]
video_json_file = params.video_clip_json
# Video files
suffix = "*.mp4"
video_dir = utils.video_dir()
files = glob.glob(os.path.join(video_dir, suffix))
video_list = []
for file in files:
video_list.append({
"name": os.path.basename(file),
"size": os.path.getsize(file),
"file": file,
})
video_path = [(tr("None"), ""), ]
for code in [file['file'] for file in video_list]:
video_path.append((code, code))
selected_index2 = st.selectbox(tr("Video File"),
index=0,
options=range(len(video_path)), # 使用索引作为内部选项值
format_func=lambda x: video_path[x][0] # 显示给用户的是标签
)
params.video_origin_path = video_path[selected_index2][1]
# Plot description
video_plot = st.text_area(
tr("Plot Description"),
value=st.session_state['video_plot'],
height=180
)
if st.button(tr("Video Script Generate"), key="auto_generate_script"):
with st.spinner(tr("Video Script Generate")):
if video_json_file == "" and params.video_origin_path != "":
script = llm.gemini_video2json(
video_origin_name=os.path.basename(params.video_origin_path),
video_origin_path=params.video_origin_path,
video_plot=video_plot
)
st.session_state['video_clip_json'] = script
cleaned_string = script.strip("```json").strip("```")
st.session_state['video_script_list'] = json.loads(cleaned_string)
else:
with open(video_json_file, 'r', encoding='utf-8') as f:
script = f.read()
st.session_state['video_clip_json'] = script
cleaned_string = script.strip("```json").strip("```")
st.session_state['video_script_list'] = json.loads(cleaned_string)
video_clip_json_details = st.text_area(
tr("Video Script"),
value=st.session_state['video_clip_json'],
height=180
)
button_columns = st.columns(2)
with button_columns[0]:
if st.button(tr("Save Script"), key="auto_generate_terms", use_container_width=True):
if not video_clip_json_details:
st.error(tr("请输入视频脚本"))
st.stop()
with st.spinner(tr("保存脚本")):
script_dir = utils.script_dir()
# 获取当前时间戳,形如 2024-0618-171820
timestamp = datetime.datetime.now().strftime("%Y-%m%d-%H%M%S")
save_path = os.path.join(script_dir, f"{timestamp}.json")
# 尝试解析输入的 JSON 数据
input_json = str(video_clip_json_details).replace("'", '"')
input_json = input_json.strip('```json').strip('```')
try:
data = json.loads(input_json)
except:
raise ValueError("视频脚本格式错误,请检查脚本是否符合 JSON 格式")
# 检查是否是一个列表
if not isinstance(data, list):
raise ValueError("JSON is not a list")
# 检查列表中的每个元素是否包含所需的键
required_keys = {"picture", "timestamp", "narration"}
for item in data:
if not isinstance(item, dict):
raise ValueError("List 元素不是字典")
if not required_keys.issubset(item.keys()):
raise ValueError("Dict 元素不包含必需的键")
# 存储为新的 JSON 文件
with open(save_path, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
# 将data的值存储到 session_state 中,类似缓存
st.session_state['video_script_list'] = data
logger.debug(f"脚本内容已成功保存到 {save_path}")
with button_columns[1]:
if st.button(tr("Crop Video"), key="auto_crop_video", use_container_width=True):
with st.spinner(tr("裁剪视频中...")):
st.session_state['task_id'] = str(uuid4())
if st.session_state.get('video_script_list', None) is not None:
video_script_list = st.session_state.video_script_list
time_list = [i['timestamp'] for i in video_script_list]
subclip_videos = material.clip_videos(
task_id=st.session_state['task_id'],
timestamp_terms=time_list,
origin_video=params.video_origin_path
)
if subclip_videos is None:
st.error(tr("裁剪视频失败"))
st.stop()
st.session_state['subclip_videos'] = subclip_videos
for video_script in video_script_list:
try:
video_script['path'] = subclip_videos[video_script['timestamp']]
except KeyError as e:
st.error(f"裁剪视频失败: {e}")
# logger.debug(f"当前的脚本为:{st.session_state.video_script_list}")
else:
st.error(tr("请先生成视频脚本"))
# Middle panel
with middle_panel:
with st.container(border=True):
st.write(tr("Video Settings"))
video_concat_modes = [
(tr("Sequential"), "sequential"),
(tr("Random"), "random"),
]
# video_sources = [
# (tr("Pexels"), "pexels"),
# (tr("Pixabay"), "pixabay"),
# (tr("Local file"), "local"),
# (tr("TikTok"), "douyin"),
# (tr("Bilibili"), "bilibili"),
# (tr("Xiaohongshu"), "xiaohongshu"),
# ]
#
# saved_video_source_name = config.app.get("video_source", "pexels")
# saved_video_source_index = [v[1] for v in video_sources].index(
# saved_video_source_name
# )
#
# selected_index = st.selectbox(
# tr("Video Source"),
# options=range(len(video_sources)),
# format_func=lambda x: video_sources[x][0],
# index=saved_video_source_index,
# )
# params.video_source = video_sources[selected_index][1]
# config.app["video_source"] = params.video_source
#
# if params.video_source == "local":
# _supported_types = FILE_TYPE_VIDEOS + FILE_TYPE_IMAGES
# uploaded_files = st.file_uploader(
# "Upload Local Files",
# type=["mp4", "mov", "avi", "flv", "mkv", "jpg", "jpeg", "png"],
# accept_multiple_files=True,
# )
selected_index = st.selectbox(
tr("Video Concat Mode"),
index=1,
options=range(len(video_concat_modes)), # indices are the internal option values
format_func=lambda x: video_concat_modes[x][0], # labels are shown to the user
)
params.video_concat_mode = VideoConcatMode(
video_concat_modes[selected_index][1]
)
video_aspect_ratios = [
(tr("Portrait"), VideoAspect.portrait.value),
(tr("Landscape"), VideoAspect.landscape.value),
]
selected_index = st.selectbox(
tr("Video Ratio"),
options=range(len(video_aspect_ratios)), # indices are the internal option values
format_func=lambda x: video_aspect_ratios[x][0], # labels are shown to the user
)
params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1])
params.video_clip_duration = st.selectbox(
tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1
)
params.video_count = st.selectbox(
tr("Number of Videos Generated Simultaneously"),
options=[1, 2, 3, 4, 5],
index=0,
)
with st.container(border=True):
st.write(tr("Audio Settings"))
# tts_providers = ['edge', 'azure']
# tts_provider = st.selectbox(tr("TTS Provider"), tts_providers)
voices = voice.get_all_azure_voices(filter_locals=support_locales)
friendly_names = {
v: v.replace("Female", tr("Female"))
.replace("Male", tr("Male"))
.replace("Neural", "")
for v in voices
}
saved_voice_name = config.ui.get("voice_name", "")
saved_voice_name_index = 0
if saved_voice_name in friendly_names:
saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
else:
for i, v in enumerate(voices):
if (
v.lower().startswith(st.session_state["ui_language"].lower())
and "V2" not in v
):
saved_voice_name_index = i
break
selected_friendly_name = st.selectbox(
tr("Speech Synthesis"),
options=list(friendly_names.values()),
index=saved_voice_name_index,
)
voice_name = list(friendly_names.keys())[
list(friendly_names.values()).index(selected_friendly_name)
]
params.voice_name = voice_name
config.ui["voice_name"] = voice_name
if st.button(tr("Play Voice")):
play_content = params.video_subject
if not play_content:
play_content = params.video_script
if not play_content:
play_content = tr("Voice Example")
with st.spinner(tr("Synthesizing Voice")):
temp_dir = utils.storage_dir("temp", create=True)
audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
sub_maker = voice.tts(
text=play_content,
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_file=audio_file,
)
# if the voice file generation failed, try again with a default content.
if not sub_maker:
play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
sub_maker = voice.tts(
text=play_content,
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_file=audio_file,
)
if sub_maker and os.path.exists(audio_file):
st.audio(audio_file, format="audio/mp3")
if os.path.exists(audio_file):
os.remove(audio_file)
if voice.is_azure_v2_voice(voice_name):
saved_azure_speech_region = config.azure.get("speech_region", "")
saved_azure_speech_key = config.azure.get("speech_key", "")
azure_speech_region = st.text_input(
tr("Speech Region"), value=saved_azure_speech_region
)
azure_speech_key = st.text_input(
tr("Speech Key"), value=saved_azure_speech_key, type="password"
)
config.azure["speech_region"] = azure_speech_region
config.azure["speech_key"] = azure_speech_key
params.voice_volume = st.selectbox(
tr("Speech Volume"),
options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0],
index=2,
)
params.voice_rate = st.selectbox(
tr("Speech Rate"),
options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
index=2,
)
bgm_options = [
(tr("No Background Music"), ""),
(tr("Random Background Music"), "random"),
(tr("Custom Background Music"), "custom"),
]
selected_index = st.selectbox(
tr("Background Music"),
index=1,
options=range(len(bgm_options)), # indices are the internal option values
format_func=lambda x: bgm_options[x][0], # labels are shown to the user
)
# The selected background-music type
params.bgm_type = bgm_options[selected_index][1]
# Show or hide widgets depending on the selection
if params.bgm_type == "custom":
custom_bgm_file = st.text_input(tr("Custom Background Music File"))
if custom_bgm_file and os.path.exists(custom_bgm_file):
params.bgm_file = custom_bgm_file
# st.write(f":red[已选择自定义背景音乐]**{custom_bgm_file}**")
params.bgm_volume = st.selectbox(
tr("Background Music Volume"),
options=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
index=2,
)
# Right panel
with right_panel:
with st.container(border=True):
st.write(tr("Subtitle Settings"))
params.subtitle_enabled = st.checkbox(tr("Enable Subtitles"), value=True)
font_names = get_all_fonts()
saved_font_name = config.ui.get("font_name", "")
saved_font_name_index = 0
if saved_font_name in font_names:
saved_font_name_index = font_names.index(saved_font_name)
params.font_name = st.selectbox(
tr("Font"), font_names, index=saved_font_name_index
)
config.ui["font_name"] = params.font_name
subtitle_positions = [
(tr("Top"), "top"),
(tr("Center"), "center"),
(tr("Bottom"), "bottom"),
(tr("Custom"), "custom"),
]
selected_index = st.selectbox(
tr("Position"),
index=2,
options=range(len(subtitle_positions)),
format_func=lambda x: subtitle_positions[x][0],
)
params.subtitle_position = subtitle_positions[selected_index][1]
if params.subtitle_position == "custom":
custom_position = st.text_input(
tr("Custom Position (% from top)"), value="70.0"
)
try:
params.custom_position = float(custom_position)
if params.custom_position < 0 or params.custom_position > 100:
st.error(tr("Please enter a value between 0 and 100"))
except ValueError:
st.error(tr("Please enter a valid number"))
font_cols = st.columns([0.3, 0.7])
with font_cols[0]:
saved_text_fore_color = config.ui.get("text_fore_color", "#FFFFFF")
params.text_fore_color = st.color_picker(
tr("Font Color"), saved_text_fore_color
)
config.ui["text_fore_color"] = params.text_fore_color
with font_cols[1]:
saved_font_size = config.ui.get("font_size", 60)
params.font_size = st.slider(tr("Font Size"), 30, 100, saved_font_size)
config.ui["font_size"] = params.font_size
stroke_cols = st.columns([0.3, 0.7])
with stroke_cols[0]:
params.stroke_color = st.color_picker(tr("Stroke Color"), "#000000")
with stroke_cols[1]:
params.stroke_width = st.slider(tr("Stroke Width"), 0.0, 10.0, 1.5)
# Video review panel
with st.expander(tr("视频审查"), expanded=False):
video_list = st.session_state.get('video_script_list', [])
# Work out the grid dimensions
num_videos = len(video_list)
cols_per_row = 3
rows = (num_videos + cols_per_row - 1) // cols_per_row # ceiling division for the row count
# Render the clips in a grid of containers
for row in range(rows):
cols = st.columns(cols_per_row)
for col in range(cols_per_row):
index = row * cols_per_row + col
if index < num_videos:
with cols[col]:
video_info = video_list[index]
video_path = video_info.get('path')
if video_path is not None:
initial_narration = video_info['narration']
initial_picture = video_info['picture']
initial_timestamp = video_info['timestamp']
with open(video_path, 'rb') as video_file:
video_bytes = video_file.read()
st.video(video_bytes)
# Editable fields
text_panels = st.columns(2)
with text_panels[0]:
text1 = st.text_area("时间戳", value=initial_timestamp, height=20)
with text_panels[1]:
text2 = st.text_area("画面描述", value=initial_picture, height=20)
text3 = st.text_area("解说旁白", value=initial_narration, height=100)
# Regenerate button (regeneration is not implemented yet)
if st.button("重新生成", key=f"button_{index}"):
logger.debug(f"regenerate clicked for clip {index}")
# with st.spinner(tr("大模型生成中...")):
start_button = st.button(tr("Generate Video"), use_container_width=True, type="primary")
if start_button:
config.save_config()
task_id = st.session_state['task_id']
if not params.video_clip_json:
st.error(tr("脚本文件不能为空"))
scroll_to_bottom()
st.stop()
if not params.video_origin_path:
st.error(tr("视频文件不能为空"))
scroll_to_bottom()
st.stop()
if llm_provider != 'g4f' and not config.app.get(f"{llm_provider}_api_key", ""):
st.error(tr("请输入 LLM API 密钥"))
scroll_to_bottom()
st.stop()
log_container = st.empty()
log_records = []
def log_received(msg):
with log_container:
log_records.append(msg)
st.code("\n".join(log_records))
logger.add(log_received)
st.toast(tr("生成视频"))
logger.info(tr("开始生成视频"))
logger.info(utils.to_json(params))
scroll_to_bottom()
result = tm.start_subclip(task_id=task_id, params=params, subclip_path_videos=st.session_state.subclip_videos)
video_files = result.get("videos", [])
st.success(tr("视频生成完成"))
try:
if video_files:
# Center the video players
player_cols = st.columns(len(video_files) * 2 + 1)
for i, url in enumerate(video_files):
player_cols[i * 2 + 1].video(url)
except Exception as e:
logger.error(f"播放视频失败: {e}")
open_task_folder(task_id)
logger.info(tr("视频生成完成"))
scroll_to_bottom()
config.save_config()

79
webui/i18n/de.json Normal file
View File

@ -0,0 +1,79 @@
{
"Language": "German",
"Translation": {
"Video Script Settings": "**Drehbuch / Topic des Videos**",
"Video Subject": "Worum soll es in dem Video gehen? (Geben Sie ein Keyword an, :red[Dank KI wird automatisch ein Drehbuch generieren])",
"Script Language": "Welche Sprache soll zum Generieren von Drehbüchern verwendet werden? :red[KI generiert anhand dieses Begriffs das Drehbuch]",
"Generate Video Script and Keywords": "Klicken Sie hier, um mithilfe von KI ein [Video Drehbuch] und [Video Keywords] basierend auf dem **Keyword** zu generieren.",
"Auto Detect": "Automatisch erkennen",
"Video Script": "Drehbuch (Storybook) (:blue[① Optional, KI generiert ② Die richtige Zeichensetzung hilft bei der Erstellung von Untertiteln])",
"Generate Video Keywords": "Klicken Sie, um KI zum Generieren zu verwenden [Video Keywords] basierend auf dem **Drehbuch**",
"Please Enter the Video Subject": "Bitte geben Sie zuerst das Drehbuch an",
"Generating Video Script and Keywords": "KI generiert ein Drehbuch und Schlüsselwörter...",
"Generating Video Keywords": "AI is generating video keywords...",
"Video Keywords": "Video Schlüsselwörter (:blue[① Optional, KI generiert ② Verwende **, (Kommas)** zur Trennung der Wörter, in englischer Sprache])",
"Video Settings": "**Video Einstellungen**",
"Video Concat Mode": "Videoverkettungsmodus",
"Random": "Zufällige Verkettung (empfohlen)",
"Sequential": "Sequentielle Verkettung",
"Video Ratio": "Video-Seitenverhältnis",
"Portrait": "Portrait 9:16",
"Landscape": "Landschaft 16:9",
"Clip Duration": "Maximale Dauer einzelner Videoclips in sekunden",
"Number of Videos Generated Simultaneously": "Anzahl der parallel generierten Videos",
"Audio Settings": "**Audio Einstellungen**",
"Speech Synthesis": "Sprachausgabe",
"Speech Region": "Region(:red[Required[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key(:red[Required[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Lautstärke der Sprachausgabe",
"Speech Rate": "Lesegeschwindigkeit (1,0 bedeutet 1x)",
"Male": "Männlich",
"Female": "Weiblich",
"Background Music": "Hintergrundmusik",
"No Background Music": "Ohne Hintergrundmusik",
"Random Background Music": "Zufällig erzeugte Hintergrundmusik",
"Custom Background Music": "Benutzerdefinierte Hintergrundmusik",
"Custom Background Music File": "Bitte gib den Pfad zur Musikdatei an:",
"Background Music Volume": "Lautstärke: (0.2 entspricht 20%, sollte nicht zu laut sein)",
"Subtitle Settings": "**Untertitel-Einstellungen**",
"Enable Subtitles": "Untertitel aktivieren (Wenn diese Option deaktiviert ist, werden die Einstellungen nicht genutzt)",
"Font": "Schriftart des Untertitels",
"Position": "Ausrichtung des Untertitels",
"Top": "Oben",
"Center": "Mittig",
"Bottom": "Unten (empfohlen)",
"Custom": "Benutzerdefinierte Position (70, was 70% von oben bedeutet)",
"Font Size": "Schriftgröße für Untertitel",
"Font Color": "Schriftfarbe",
"Stroke Color": "Kontur",
"Stroke Width": "Breite der Untertitelkontur",
"Generate Video": "Generiere Videos durch KI",
"Video Script and Subject Cannot Both Be Empty": "Das Video-Thema und Drehbuch dürfen nicht beide leer sein",
"Generating Video": "Video wird erstellt, bitte warten...",
"Start Generating Video": "Beginne mit der Generierung",
"Video Generation Completed": "Video erfolgreich generiert",
"Video Generation Failed": "Video Generierung fehlgeschlagen",
"You can download the generated video from the following links": "Sie können das generierte Video über die folgenden Links herunterladen",
"Basic Settings": "**Grunde Instellungen**",
"Pexels API Key": "Pexels API Key ([Get API Key](https://www.pexels.com/api/))",
"Pixabay API Key": "Pixabay API Key ([Get API Key](https://pixabay.com/api/docs/#api_search_videos))",
"Language": "Language",
"LLM Provider": "LLM Provider",
"API Key": "API Key (:red[Required])",
"Base Url": "Base Url",
"Model Name": "Model Name",
"Please Enter the LLM API Key": "Please Enter the **LLM API Key**",
"Please Enter the Pexels API Key": "Please Enter the **Pexels API Key**",
"Please Enter the Pixabay API Key": "Please Enter the **Pixabay API Key**",
"Get Help": "If you need help, or have any questions, you can join discord for help ",
"Video Source": "Video Source",
"TikTok": "TikTok (TikTok support is coming soon)",
"Bilibili": "Bilibili (Bilibili support is coming soon)",
"Xiaohongshu": "Xiaohongshu (Xiaohongshu support is coming soon)",
"Local file": "Local file",
"Play Voice": "Play Voice",
"Voice Example": "This is an example text for testing speech synthesis",
"Synthesizing Voice": "Synthesizing voice, please wait...",
"TTS Provider": "Select the voice synthesis provider"
}
}

81
webui/i18n/en.json Normal file
View File

@ -0,0 +1,81 @@
{
"Language": "English",
"Translation": {
"Video Script Settings": "**Video Script Settings**",
"Video Subject": "Video Subject (Provide a keyword, :red[AI will automatically generate] video script)",
"Script Language": "Language for Generating Video Script (AI will automatically output based on the language of your subject)",
"Generate Video Script and Keywords": "Click to use AI to generate [Video Script] and [Video Keywords] based on **subject**",
"Auto Detect": "Auto Detect",
"Video Script": "Video Script (:blue[① Optional, AI generated ② Proper punctuation helps with subtitle generation])",
"Generate Video Keywords": "Click to use AI to generate [Video Keywords] based on **script**",
"Please Enter the Video Subject": "Please Enter the Video Script First",
"Generating Video Script and Keywords": "AI is generating video script and keywords...",
"Generating Video Keywords": "AI is generating video keywords...",
"Video Keywords": "Video Keywords (:blue[① Optional, AI generated ② Use **English commas** for separation, English only])",
"Video Settings": "**Video Settings**",
"Video Concat Mode": "Video Concatenation Mode",
"Random": "Random Concatenation (Recommended)",
"Sequential": "Sequential Concatenation",
"Video Ratio": "Video Aspect Ratio",
"Portrait": "Portrait 9:16",
"Landscape": "Landscape 16:9",
"Clip Duration": "Maximum Duration of Video Clips (seconds)",
"Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
"Audio Settings": "**Audio Settings**",
"Speech Synthesis": "Speech Synthesis Voice",
"Speech Region": "Region(:red[Required[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key(:red[Required[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Speech Volume (1.0 represents 100%)",
"Speech Rate": "Speech Rate (1.0 means 1x speed)",
"Male": "Male",
"Female": "Female",
"Background Music": "Background Music",
"No Background Music": "No Background Music",
"Random Background Music": "Random Background Music",
"Custom Background Music": "Custom Background Music",
"Custom Background Music File": "Please enter the file path for custom background music:",
"Background Music Volume": "Background Music Volume (0.2 represents 20%, background music should not be too loud)",
"Subtitle Settings": "**Subtitle Settings**",
"Enable Subtitles": "Enable Subtitles (If unchecked, the settings below will not take effect)",
"Font": "Subtitle Font",
"Position": "Subtitle Position",
"Top": "Top",
"Center": "Center",
"Bottom": "Bottom (Recommended)",
"Custom": "Custom position (70, indicating 70% down from the top)",
"Font Size": "Subtitle Font Size",
"Font Color": "Subtitle Font Color",
"Stroke Color": "Subtitle Outline Color",
"Stroke Width": "Subtitle Outline Width",
"Generate Video": "Generate Video",
"Video Script and Subject Cannot Both Be Empty": "Video Subject and Video Script cannot both be empty",
"Generating Video": "Generating video, please wait...",
"Start Generating Video": "Start Generating Video",
"Video Generation Completed": "Video Generation Completed",
"Video Generation Failed": "Video Generation Failed",
"You can download the generated video from the following links": "You can download the generated video from the following links",
"Pexels API Key": "Pexels API Key ([Get API Key](https://www.pexels.com/api/))",
"Pixabay API Key": "Pixabay API Key ([Get API Key](https://pixabay.com/api/docs/#api_search_videos))",
"Basic Settings": "**Basic Settings** (:blue[Click to expand])",
"Language": "Language",
"LLM Provider": "LLM Provider",
"API Key": "API Key (:red[Required])",
"Base Url": "Base Url",
"Account ID": "Account ID (Get from Cloudflare dashboard)",
"Model Name": "Model Name",
"Please Enter the LLM API Key": "Please Enter the **LLM API Key**",
"Please Enter the Pexels API Key": "Please Enter the **Pexels API Key**",
"Please Enter the Pixabay API Key": "Please Enter the **Pixabay API Key**",
"Get Help": "If you need help, or have any questions, you can join discord for help ",
"Video Source": "Video Source",
"TikTok": "TikTok (TikTok support is coming soon)",
"Bilibili": "Bilibili (Bilibili support is coming soon)",
"Xiaohongshu": "Xiaohongshu (Xiaohongshu support is coming soon)",
"Local file": "Local file",
"Play Voice": "Play Voice",
"Voice Example": "This is an example text for testing speech synthesis",
"Synthesizing Voice": "Synthesizing voice, please wait...",
"TTS Provider": "Select the voice synthesis provider",
"Hide Log": "Hide Log"
}
}

80
webui/i18n/vi.json Normal file
View File

@ -0,0 +1,80 @@
{
"Language": "Tiếng Việt",
"Translation": {
"Video Script Settings": "**Cài Đặt Kịch Bản Video**",
"Video Subject": "Chủ Đề Video (Cung cấp một từ khóa, :red[AI sẽ tự động tạo ra] kịch bản video)",
"Script Language": "Ngôn Ngữ cho Việc Tạo Kịch Bản Video (AI sẽ tự động xuất ra dựa trên ngôn ngữ của chủ đề của bạn)",
"Generate Video Script and Keywords": "Nhấn để sử dụng AI để tạo [Kịch Bản Video] và [Từ Khóa Video] dựa trên **chủ đề**",
"Auto Detect": "Tự Động Phát Hiện",
"Video Script": "Kịch Bản Video (:blue[① Tùy chọn, AI tạo ra ② Dấu câu chính xác giúp việc tạo phụ đề)",
"Generate Video Keywords": "Nhấn để sử dụng AI để tạo [Từ Khóa Video] dựa trên **kịch bản**",
"Please Enter the Video Subject": "Vui lòng Nhập Kịch Bản Video Trước",
"Generating Video Script and Keywords": "AI đang tạo kịch bản video và từ khóa...",
"Generating Video Keywords": "AI đang tạo từ khóa video...",
"Video Keywords": "Từ Khóa Video (:blue[① Tùy chọn, AI tạo ra ② Sử dụng dấu phẩy **Tiếng Anh** để phân tách, chỉ sử dụng Tiếng Anh])",
"Video Settings": "**Cài Đặt Video**",
"Video Concat Mode": "Chế Độ Nối Video",
"Random": "Nối Ngẫu Nhiên (Được Khuyến Nghị)",
"Sequential": "Nối Theo Thứ Tự",
"Video Ratio": "Tỷ Lệ Khung Hình Video",
"Portrait": "Dọc 9:16",
"Landscape": "Ngang 16:9",
"Clip Duration": "Thời Lượng Tối Đa Của Đoạn Video (giây)",
"Number of Videos Generated Simultaneously": "Số Video Được Tạo Ra Đồng Thời",
"Audio Settings": "**Cài Đặt Âm Thanh**",
"Speech Synthesis": "Giọng Đọc Văn Bản",
"Speech Region": "Vùng(:red[Bắt Buộc[Lấy Vùng](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "Khóa API(:red[Bắt Buộc[Lấy Khóa API](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Âm Lượng Giọng Đọc (1.0 đại diện cho 100%)",
"Speech Rate": "Tốc độ đọc (1.0 biểu thị tốc độ gốc)",
"Male": "Nam",
"Female": "Nữ",
"Background Music": "Âm Nhạc Nền",
"No Background Music": "Không Có Âm Nhạc Nền",
"Random Background Music": "Âm Nhạc Nền Ngẫu Nhiên",
"Custom Background Music": "Âm Nhạc Nền Tùy Chỉnh",
"Custom Background Music File": "Vui lòng nhập đường dẫn tệp cho âm nhạc nền tùy chỉnh:",
"Background Music Volume": "Âm Lượng Âm Nhạc Nền (0.2 đại diện cho 20%, âm nhạc nền không nên quá to)",
"Subtitle Settings": "**Cài Đặt Phụ Đề**",
"Enable Subtitles": "Bật Phụ Đề (Nếu không chọn, các cài đặt dưới đây sẽ không có hiệu lực)",
"Font": "Phông Chữ Phụ Đề",
"Position": "Vị Trí Phụ Đề",
"Top": "Trên",
"Center": "Giữa",
"Bottom": "Dưới (Được Khuyến Nghị)",
"Custom": "Vị trí tùy chỉnh (70, chỉ ra là cách đầu trang 70%)",
"Font Size": "Cỡ Chữ Phụ Đề",
"Font Color": "Màu Chữ Phụ Đề",
"Stroke Color": "Màu Viền Phụ Đề",
"Stroke Width": "Độ Rộng Viền Phụ Đề",
"Generate Video": "Tạo Video",
"Video Script and Subject Cannot Both Be Empty": "Chủ Đề Video và Kịch Bản Video không thể cùng trống",
"Generating Video": "Đang tạo video, vui lòng đợi...",
"Start Generating Video": "Bắt Đầu Tạo Video",
"Video Generation Completed": "Hoàn Tất Tạo Video",
"Video Generation Failed": "Tạo Video Thất Bại",
"You can download the generated video from the following links": "Bạn có thể tải video được tạo ra từ các liên kết sau",
"Pexels API Key": "Khóa API Pexels ([Lấy Khóa API](https://www.pexels.com/api/))",
"Pixabay API Key": "Pixabay API Key ([Get API Key](https://pixabay.com/api/docs/#api_search_videos))",
"Basic Settings": "**Cài Đặt Cơ Bản** (:blue[Nhấp để mở rộng])",
"Language": "Ngôn Ngữ",
"LLM Provider": "Nhà Cung Cấp LLM",
"API Key": "Khóa API (:red[Bắt Buộc])",
"Base Url": "Url Cơ Bản",
"Account ID": "ID Tài Khoản (Lấy từ bảng điều khiển Cloudflare)",
"Model Name": "Tên Mô Hình",
"Please Enter the LLM API Key": "Vui lòng Nhập **Khóa API LLM**",
"Please Enter the Pexels API Key": "Vui lòng Nhập **Khóa API Pexels**",
"Please Enter the Pixabay API Key": "Vui lòng Nhập **Pixabay API Key**",
"Get Help": "Nếu bạn cần giúp đỡ hoặc có bất kỳ câu hỏi nào, bạn có thể tham gia discord để được giúp đỡ ",
"Video Source": "Video Source",
"TikTok": "TikTok (TikTok support is coming soon)",
"Bilibili": "Bilibili (Bilibili support is coming soon)",
"Xiaohongshu": "Xiaohongshu (Xiaohongshu support is coming soon)",
"Local file": "Local file",
"Play Voice": "Play Voice",
"Voice Example": "This is an example text for testing speech synthesis",
"Synthesizing Voice": "Synthesizing voice, please wait...",
"TTS Provider": "Select the voice synthesis provider"
}
}

88
webui/i18n/zh.json Normal file
View File

@ -0,0 +1,88 @@
{
"Language": "简体中文",
"Translation": {
"Video Script Configuration": "**视频脚本配置**",
"Video Script Generate": "生成视频脚本",
"Video Subject": "视频主题(给定一个关键词,:red[AI自动生成]视频文案)",
"Script Language": "生成视频脚本的语言一般情况AI会自动根据你输入的主题语言输出",
"Script Files": "脚本文件",
"Generate Video Script and Keywords": "点击使用AI根据**主题**生成 【视频文案】 和 【视频关键词】",
"Auto Detect": "自动检测",
"Auto Generate": "自动生成",
"Video Script": "视频脚本(:blue[①可不填使用AI生成 ②合理使用标点断句,有助于生成字幕]",
"Save Script": "保存脚本",
"Crop Video": "裁剪视频",
"Video File": "视频文件(:blue",
"Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])",
"Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键词】",
"Please Enter the Video Subject": "请先填写视频文案",
"Generating Video Script and Keywords": "AI正在生成视频文案和关键词...",
"Generating Video Keywords": "AI正在生成视频关键词...",
"Video Keywords": "视频关键词(:blue[①可不填使用AI生成 ②用**英文逗号**分隔,只支持英文]",
"Video Settings": "**视频设置**",
"Video Concat Mode": "视频拼接模式",
"Random": "随机拼接(推荐)",
"Sequential": "顺序拼接",
"Video Ratio": "视频比例",
"Portrait": "竖屏 9:16抖音视频",
"Landscape": "横屏 16:9西瓜视频",
"Clip Duration": "视频片段最大时长(秒)**不是视频总长度**,是指每个**合成片段**的长度)",
"Number of Videos Generated Simultaneously": "同时生成视频数量",
"Audio Settings": "**音频设置**",
"Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意V2版效果更好但是需要API KEY]",
"Speech Region": "服务区域 (:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key (:red[必填密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "朗读音量1.0表示100%",
"Speech Rate": "朗读速度1.0表示1倍速",
"Male": "男性",
"Female": "女性",
"Background Music": "背景音乐",
"No Background Music": "无背景音乐",
"Random Background Music": "随机背景音乐",
"Custom Background Music": "自定义背景音乐",
"Custom Background Music File": "请输入自定义背景音乐的文件路径",
"Background Music Volume": "背景音乐音量0.2表示20%,背景声音不宜过高)",
"Subtitle Settings": "**字幕设置**",
"Enable Subtitles": "启用字幕(若取消勾选,下面的设置都将不生效)",
"Font": "字幕字体",
"Position": "字幕位置",
"Top": "顶部",
"Center": "中间",
"Bottom": "底部(推荐)",
"Custom": "自定义位置70表示离顶部70%的位置)",
"Font Size": "字幕大小",
"Font Color": "字幕颜色",
"Stroke Color": "描边颜色",
"Stroke Width": "描边粗细",
"Generate Video": "生成视频",
"Video Script and Subject Cannot Both Be Empty": "视频主题 和 视频文案,不能同时为空",
"Generating Video": "正在生成视频,请稍候...",
"Start Generating Video": "开始生成视频",
"Video Generation Completed": "视频生成完成",
"Video Generation Failed": "视频生成失败",
"You can download the generated video from the following links": "你可以从以下链接下载生成的视频",
"Basic Settings": "**基础设置** (:blue[点击展开])",
"Language": "界面语言",
"Pexels API Key": "Pexels API Key ([点击获取](https://www.pexels.com/api/)) :red[推荐使用]",
"Pixabay API Key": "Pixabay API Key ([点击获取](https://pixabay.com/api/docs/#api_search_videos)) :red[可以不用配置,如果 Pexels 无法使用再选择Pixabay]",
"LLM Provider": "大模型提供商",
"API Key": "API Key (:red[必填,需要到大模型提供商的后台申请])",
"Base Url": "Base Url (可选)",
"Account ID": "账户ID (Cloudflare的dash面板url中获取)",
"Model Name": "模型名称 (:blue[需要到大模型提供商的后台确认被授权的模型名称])",
"Please Enter the LLM API Key": "请先填写大模型 **API Key**",
"Please Enter the Pexels API Key": "请先填写 **Pexels API Key**",
"Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**",
"Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议可以加入 **社区频道** 求助或讨论https://discord.gg/WBKChhmZ",
"Video Source": "视频来源",
"TikTok": "抖音 (TikTok 支持中,敬请期待)",
"Bilibili": "哔哩哔哩 (Bilibili 支持中,敬请期待)",
"Xiaohongshu": "小红书 (Xiaohongshu 支持中,敬请期待)",
"Local file": "本地文件",
"Play Voice": "试听语音合成",
"Voice Example": "这是一段测试语音合成的示例文本",
"Synthesizing Voice": "语音合成中,请稍候...",
"TTS Provider": "语音合成提供商",
"Hide Log": "隐藏日志"
}
}
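
One thing these locale files make easy to miss: their "Translation" key sets have drifted (for example, vi.json has no "Hide Log" entry, while en.json and zh.json both define one). A small consistency check along these lines could flag that; it assumes the files live in webui/i18n and treats en.json as the baseline:

import json
from pathlib import Path

# Load every locale file keyed by its language code (file stem).
files = {p.stem: json.loads(p.read_text(encoding="utf-8"))
         for p in Path("webui/i18n").glob("*.json")}

reference = set(files["en"]["Translation"])  # en.json as the baseline key set
for code, data in sorted(files.items()):
    keys = set(data["Translation"])
    missing, extra = reference - keys, keys - reference
    if missing or extra:
        print(f"{code}.json missing={sorted(missing)} extra={sorted(extra)}")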